diff --git a/.devops/intel.Dockerfile b/.devops/intel.Dockerfile index 8cad660523ecc..9ce80a71eb950 100644 --- a/.devops/intel.Dockerfile +++ b/.devops/intel.Dockerfile @@ -49,19 +49,23 @@ COPY --from=build /app/full /app WORKDIR /app -RUN apt-get update \ - && apt-get install -y \ - git \ - python3 \ - python3-pip \ - && pip install --upgrade pip setuptools wheel \ - && pip install -r requirements.txt \ - && apt autoremove -y \ - && apt clean -y \ - && rm -rf /tmp/* /var/tmp/* \ - && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ - && find /var/cache -type f -delete - +RUN apt-get update && \ + apt-get install -y \ + git \ + python3 \ + python3-pip \ + python3-venv && \ + python3 -m venv /opt/venv && \ + . /opt/venv/bin/activate && \ + pip install --upgrade pip setuptools wheel && \ + pip install -r requirements.txt && \ + apt autoremove -y && \ + apt clean -y && \ + rm -rf /tmp/* /var/tmp/* && \ + find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \ + find /var/cache -type f -delete + +ENV PATH="/opt/venv/bin:$PATH" ENTRYPOINT ["/app/tools.sh"] diff --git a/.github/labeler.yml b/.github/labeler.yml index 278032ef2e1a4..3c2f67707b024 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -86,3 +86,10 @@ nix: embedding: - changed-files: - any-glob-to-any-file: examples/embedding/ + +Ascend NPU: + - changed-files: + - any-glob-to-any-file: + - ggml/include/ggml-cann.h + - ggml/src/ggml-cann/** + - docs/backend/CANN.md diff --git a/.github/workflows/build-cmake-pkg.yml b/.github/workflows/build-cmake-pkg.yml new file mode 100644 index 0000000000000..fee2ab96bd0e8 --- /dev/null +++ b/.github/workflows/build-cmake-pkg.yml @@ -0,0 +1,51 @@ +name: Build relocatable cmake package +on: + workflow_dispatch: + workflow_call: + +jobs: + linux: + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Install dependencies + run: | + sudo apt update + sudo apt install -y build-essential tcl + + - name: Build + run: | + PREFIX="$(pwd)"/inst + cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX" \ + -DLLAMA_CURL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \ + -DLLAMA_BUILD_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release + cmake --build build --config Release + cmake --install build --prefix "$PREFIX" --config Release + + export LLAMA_CONFIG="$PREFIX"/lib/cmake/llama/llama-config.cmake + tclsh <<'EOF' + set build(commit) [string trim [exec git rev-parse --short HEAD]] + set build(number) [string trim [exec git rev-list --count HEAD]] + set build(version) "0.0.$build(number)" + + set llamaconfig [read [open "$env(LLAMA_CONFIG)" r]] + set checks [list "set\\(LLAMA_VERSION \\s+$build(version)\\)" \ + "set\\(LLAMA_BUILD_COMMIT\\s+$build(commit)\\)" \ + "set\\(LLAMA_BUILD_NUMBER\\s+$build(number)\\)"] + + puts -nonewline "Checking llama-config.cmake version... " + foreach check $checks { + if {![regexp -expanded -- $check $llamaconfig]} { + puts "\"$check\" failed!" + exit 1 + } + } + puts "success." + EOF + + cd examples/simple-cmake-pkg + cmake -S . 
-B build -DCMAKE_PREFIX_PATH="$PREFIX"/lib/cmake + cmake --build build diff --git a/.github/workflows/build-linux-cross.yml b/.github/workflows/build-linux-cross.yml index 92dc41f9d729c..7cfc82ba4e277 100644 --- a/.github/workflows/build-linux-cross.yml +++ b/.github/workflows/build-linux-cross.yml @@ -231,3 +231,116 @@ jobs: -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH cmake --build build --config Release -j $(nproc) + + debian-13-loongarch64-cpu-cross: + runs-on: ubuntu-24.04 + container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671 + + steps: + - uses: actions/checkout@v4 + - name: Setup LoongArch + run: | + rm -f /etc/apt/sources.list.d/* + cat << EOF | tee /etc/apt/sources.list.d/debian-ports.list + deb http://snapshot.debian.org/archive/debian/20250515T202920Z/ trixie main + EOF + ( echo 'quiet "true";'; \ + echo 'APT::Get::Assume-Yes "true";'; \ + echo 'APT::Install-Recommends "false";'; \ + echo 'Acquire::Check-Valid-Until "false";'; \ + echo 'Acquire::Retries "5";'; \ + ) > /etc/apt/apt.conf.d/99snapshot-repos + + apt-get update + apt-get install -y ca-certificates debian-ports-archive-keyring cmake git zip + dpkg --add-architecture loong64 + + # Add arch-specific repositories for non-amd64 architectures + cat << EOF | tee /etc/apt/sources.list.d/loong64-ports.list + deb [arch=loong64] http://snapshot.debian.org/archive/debian-ports/20250515T194251Z/ sid main + EOF + + apt-get update || true ;# Prevent failure due to missing URLs. + + apt-get install -y --no-install-recommends \ + build-essential \ + gcc-14-loongarch64-linux-gnu \ + g++-14-loongarch64-linux-gnu + + - name: Build + run: | + cmake -B build -DLLAMA_CURL=OFF \ + -DCMAKE_BUILD_TYPE=Release \ + -DGGML_OPENMP=OFF \ + -DLLAMA_BUILD_EXAMPLES=ON \ + -DLLAMA_BUILD_TOOLS=ON \ + -DLLAMA_BUILD_TESTS=OFF \ + -DCMAKE_SYSTEM_NAME=Linux \ + -DCMAKE_SYSTEM_PROCESSOR=loongarch64 \ + -DCMAKE_C_COMPILER=loongarch64-linux-gnu-gcc-14 \ + -DCMAKE_CXX_COMPILER=loongarch64-linux-gnu-g++-14 \ + -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ + -DCMAKE_FIND_ROOT_PATH=/usr/lib/loongarch64-linux-gnu \ + -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ + -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ + -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH + + cmake --build build --config Release -j $(nproc) + + debian-13-loongarch64-vulkan-cross: + runs-on: ubuntu-24.04 + container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671 + + steps: + - uses: actions/checkout@v4 + - name: Setup LoongArch + run: | + rm -f /etc/apt/sources.list.d/* + cat << EOF | tee /etc/apt/sources.list.d/debian-ports.list + deb http://snapshot.debian.org/archive/debian/20250515T202920Z/ trixie main + EOF + ( echo 'quiet "true";'; \ + echo 'APT::Get::Assume-Yes "true";'; \ + echo 'APT::Install-Recommends "false";'; \ + echo 'Acquire::Check-Valid-Until "false";'; \ + echo 'Acquire::Retries "5";'; \ + ) > /etc/apt/apt.conf.d/99snapshot-repos + + apt-get update + apt-get install -y ca-certificates debian-ports-archive-keyring cmake git zip + dpkg --add-architecture loong64 + + # Add arch-specific repositories for non-amd64 architectures + cat << EOF | tee /etc/apt/sources.list.d/loong64-ports.list + deb [arch=loong64] http://snapshot.debian.org/archive/debian-ports/20250515T194251Z/ sid main + EOF + + apt-get update || true ;# Prevent failure due to missing URLs. 
+ + apt-get install -y --no-install-recommends \ + build-essential \ + glslc \ + gcc-14-loongarch64-linux-gnu \ + g++-14-loongarch64-linux-gnu \ + libvulkan-dev:loong64 + + - name: Build + run: | + cmake -B build -DLLAMA_CURL=OFF \ + -DCMAKE_BUILD_TYPE=Release \ + -DGGML_VULKAN=ON \ + -DGGML_OPENMP=OFF \ + -DLLAMA_BUILD_EXAMPLES=ON \ + -DLLAMA_BUILD_TOOLS=ON \ + -DLLAMA_BUILD_TESTS=OFF \ + -DCMAKE_SYSTEM_NAME=Linux \ + -DCMAKE_SYSTEM_PROCESSOR=loongarch64 \ + -DCMAKE_C_COMPILER=loongarch64-linux-gnu-gcc-14 \ + -DCMAKE_CXX_COMPILER=loongarch64-linux-gnu-g++-14 \ + -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ + -DCMAKE_FIND_ROOT_PATH=/usr/lib/loongarch64-linux-gnu \ + -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ + -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ + -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH + + cmake --build build --config Release -j $(nproc) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index ee76d1799e6f4..4feccf21e9e3e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -5,10 +5,43 @@ on: push: branches: - master - paths: ['.github/workflows/build.yml', '.github/workflows/build-linux-cross.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp'] + paths: [ + '.github/workflows/build.yml', + '.github/workflows/build-linux-cross.yml', + '.github/workflows/build-cmake-pkg.yml', + '**/CMakeLists.txt', + '**/.cmake', + '**/*.h', + '**/*.hpp', + '**/*.c', + '**/*.cpp', + '**/*.cu', + '**/*.cuh', + '**/*.swift', + '**/*.m', + '**/*.metal', + '**/*.comp' + ] + pull_request: types: [opened, synchronize, reopened] - paths: ['.github/workflows/build.yml', '.github/workflows/build-linux-cross.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp'] + paths: [ + '.github/workflows/build.yml', + '.github/workflows/build-linux-cross.yml', + '.github/workflows/build-cmake-pkg.yml', + '**/CMakeLists.txt', + '**/.cmake', + '**/*.h', + '**/*.hpp', + '**/*.c', + '**/*.cpp', + '**/*.cu', + '**/*.cuh', + '**/*.swift', + '**/*.m', + '**/*.metal', + '**/*.comp' + ] concurrency: group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} @@ -306,6 +339,7 @@ jobs: id: cmake_test run: | cd build + export GGML_VK_VISIBLE_DEVICES=0 # This is using llvmpipe and runs slower than other backends ctest -L main --verbose --timeout 3600 @@ -477,6 +511,9 @@ jobs: build-linux-cross: uses: ./.github/workflows/build-linux-cross.yml + build-cmake-pkg: + uses: ./.github/workflows/build-cmake-pkg.yml + macOS-latest-cmake-ios: runs-on: macos-latest @@ -682,17 +719,17 @@ jobs: env: OPENBLAS_VERSION: 0.3.23 SDE_VERSION: 9.33.0-2024-01-07 - VULKAN_VERSION: 1.4.309.0 + VULKAN_VERSION: 1.4.313.2 strategy: matrix: include: - - build: 'cpu-x64' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF' + - build: 'cpu-x64 (static)' + defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF' - build: 'openblas-x64' defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON 
-DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' - build: 'vulkan-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON' + defines: '-DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON' - build: 'llvm-arm64' defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON' - build: 'llvm-arm64-opencl-adreno' @@ -735,7 +772,7 @@ jobs: id: get_vulkan if: ${{ matrix.build == 'kompute-x64' || matrix.build == 'vulkan-x64' }} run: | - curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe" + curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe" & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}" Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin" @@ -777,6 +814,7 @@ jobs: cmake -S . -B build ${{ matrix.defines }} ` -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} + cp $env:CURL_PATH/bin/libcurl-*.dll build/bin/Release - name: Add libopenblas.dll id: add_libopenblas_dll @@ -839,12 +877,12 @@ jobs: -DGGML_CUDA=ON cmake --build build - windows-2019-cmake-cuda: - runs-on: windows-2019 + windows-2022-cmake-cuda: + runs-on: windows-2022 strategy: matrix: - cuda: ['12.4', '11.7'] + cuda: ['12.4'] steps: - name: Clone @@ -878,7 +916,7 @@ jobs: env: CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} run: | - call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat" + call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 cmake -S . 
-B build -G "Ninja Multi-Config" ^ -DLLAMA_BUILD_SERVER=ON ^ -DGGML_NATIVE=OFF ^ diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 65ed244657e4f..64fff175e227b 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -131,8 +131,9 @@ jobs: include: - build: 'x64' os: ubuntu-22.04 - - build: 'arm64' - os: ubuntu-22.04-arm + # GGML_BACKEND_DL and GGML_CPU_ALL_VARIANTS are not currently supported on arm + # - build: 'arm64' + # os: ubuntu-22.04-arm runs-on: ${{ matrix.os }} @@ -159,6 +160,9 @@ jobs: id: cmake_build run: | cmake -B build \ + -DGGML_BACKEND_DL=ON \ + -DGGML_NATIVE=OFF \ + -DGGML_CPU_ALL_VARIANTS=ON \ -DLLAMA_FATAL_WARNINGS=ON \ ${{ env.CMAKE_ARGS }} cmake --build build --config Release -j $(nproc) @@ -207,6 +211,9 @@ jobs: id: cmake_build run: | cmake -B build \ + -DGGML_BACKEND_DL=ON \ + -DGGML_NATIVE=OFF \ + -DGGML_CPU_ALL_VARIANTS=ON \ -DGGML_VULKAN=ON \ ${{ env.CMAKE_ARGS }} cmake --build build --config Release -j $(nproc) @@ -295,7 +302,7 @@ jobs: env: OPENBLAS_VERSION: 0.3.23 - VULKAN_VERSION: 1.4.309.0 + VULKAN_VERSION: 1.4.313.2 strategy: matrix: @@ -325,7 +332,7 @@ jobs: id: get_vulkan if: ${{ matrix.backend == 'vulkan' }} run: | - curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe" + curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe" & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}" Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin" @@ -373,11 +380,11 @@ jobs: name: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip windows-cuda: - runs-on: windows-2019 + runs-on: windows-2022 strategy: matrix: - cuda: ['12.4', '11.7'] + cuda: ['12.4'] steps: - name: Clone @@ -405,7 +412,7 @@ jobs: id: cmake_build shell: cmd run: | - call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat" + call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 cmake -S . 
-B build -G "Ninja Multi-Config" ^ -DGGML_BACKEND_DL=ON ^ -DGGML_NATIVE=OFF ^ diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 4baf6f6c755ee..f6da488576937 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -180,7 +180,7 @@ jobs: server-windows: - runs-on: windows-2019 + runs-on: windows-2022 steps: - name: Clone diff --git a/.gitignore b/.gitignore index f8ceb1560a1df..803a3b1d31d8a 100644 --- a/.gitignore +++ b/.gitignore @@ -146,3 +146,10 @@ poetry.toml # Local scripts /run-vim.sh /run-chat.sh + +HEXAGON_Tools/ +prebuilts/QNN_SDK/qairt/2.35.0.250530/ +prebuilts/QNN_SDK/v2.35.0.250530.zip +prebuilts/Hexagon_SDK/minimal-hexagon-sdk-6.2.0.1.xz + + diff --git a/CMakeLists.txt b/CMakeLists.txt index ac3e9090336d9..29e68ac82cbbd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,6 +7,21 @@ set(CMAKE_WARN_UNUSED_CLI YES) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +if(CMAKE_SYSTEM_NAME STREQUAL "Android") + set(CMAKE_VERBOSE_MAKEFILE ON) + if(DEFINED HTP_ARCH_VERSION) + if (${HTP_ARCH_VERSION} STREQUAL "v75" OR ${HTP_ARCH_VERSION} STREQUAL "v79") + #works fine on Snapdragon 8Gen3&8Elite with 1.5x - 3x performance gains with the default ggml backend + set(OPT_FLAG " -O3 -march=armv8.7-a -mcpu=cortex-x1 -mtune=cortex-x1 -ffp-model=fast -fno-finite-math-only") + message("OPT_FLAG:${OPT_FLAG}") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DGGML_USE_HEXAGON -DGGML_USE_LLAMAFILE ${DEBUG_FLAG} ${OPT_FLAG}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGGML_USE_HEXAGON -DGGML_USE_LLAMAFILE ${DEBUG_FLAG} ${OPT_FLAG}") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -DGGML_USE_HEXAGON -DGGML_USE_LLAMAFILE ${DEBUG_FLAG} ${OPT_FLAG}") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DGGML_USE_HEXAGON -DGGML_USE_LLAMAFILE ${DEBUG_FLAG} ${OPT_FLAG}") + endif() + endif() +endif() + if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") @@ -89,6 +104,14 @@ option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake) include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake) +if (NOT DEFINED LLAMA_BUILD_NUMBER) + set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER}) +endif() +if (NOT DEFINED LLAMA_BUILD_COMMIT) + set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT}) +endif() +set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_BUILD_NUMBER}) + # override ggml options set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS}) set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS}) @@ -120,6 +143,7 @@ llama_option_depr(WARNING LLAMA_RPC GGML_RPC) llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL) llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16) llama_option_depr(WARNING LLAMA_CANN GGML_CANN) +llama_option_depr(WARNING LLAMA_HEXAGON GGML_HEXAGON) if (NOT MSVC) if (LLAMA_SANITIZE_THREAD) @@ -155,10 +179,17 @@ if (LLAMA_USE_SYSTEM_GGML) endif() if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML) + set(GGML_BUILD_NUMBER ${LLAMA_BUILD_NUMBER}) + set(GGML_BUILD_COMMIT ${LLAMA_BUILD_COMMIT}) add_subdirectory(ggml) # ... 
otherwise assume ggml is added by a parent CMakeLists.txt endif() +if (MINGW) + # Target Windows 8 for PrefetchVirtualMemory + add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER}) +endif() + # # build the library # @@ -199,10 +230,6 @@ endif() include(GNUInstallDirs) include(CMakePackageConfigHelpers) -set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER}) -set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT}) -set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER}) - set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files") set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files") set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files") diff --git a/Makefile b/Makefile index 958ad8f2fcc0a..ac442aec095d6 100644 --- a/Makefile +++ b/Makefile @@ -367,7 +367,7 @@ ifdef LLAMA_SERVER_SSL endif ifndef GGML_NO_CPU_AARCH64 - MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64 + MK_CPPFLAGS += -DGGML_USE_CPU_REPACK endif # warnings @@ -970,7 +970,7 @@ OBJ_GGML = \ $(DIR_GGML)/src/ggml-threading.o \ $(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \ $(DIR_GGML)/src/ggml-cpu/ggml-cpu_cpp.o \ - $(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \ + $(DIR_GGML)/src/ggml-cpu/repack.o \ $(DIR_GGML)/src/ggml-cpu/ggml-cpu-hbm.o \ $(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \ $(DIR_GGML)/src/ggml-cpu/ggml-cpu-traits.o \ diff --git a/README.md b/README.md index 540c29a4f1847..90c7364dfcba0 100644 --- a/README.md +++ b/README.md @@ -3,9 +3,10 @@ ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png) [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) +[![Release](https://img.shields.io/github/v/release/ggml-org/llama.cpp)](https://github.com/ggml-org/llama.cpp/releases) [![Server](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml) -[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggml-org/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml) +[Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml) Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++ @@ -17,7 +18,6 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) ## Hot topics - 🔥 Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md) -- **GGML developer experience survey (organized and reviewed by NVIDIA):** [link](https://forms.gle/Gasw3cRgyhNEnrwK9) - A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141](https://github.com/ggml-org/llama.cpp/pull/13141)), `libllava` will be deprecated - VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode - Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639 @@ -28,6 +28,30 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) ---- +## Quick start + +Getting started with llama.cpp is straightforward. 
Here are several ways to install it on your machine: + +- Install `llama.cpp` using [brew, nix or winget](docs/install.md) +- Run with Docker - see our [Docker documentation](docs/docker.md) +- Download pre-built binaries from the [releases page](https://github.com/ggml-org/llama.cpp/releases) +- Build from source by cloning this repository - check out [our build guide](docs/build.md) + +Once installed, you'll need a model to work with. Head to the [Obtaining and quantizing models](#obtaining-and-quantizing-models) section to learn more. + +Example command: + +```sh +# Use a local model file +llama-cli -m my_model.gguf + +# Or download and run a model directly from Hugging Face +llama-cli -hf ggml-org/gemma-3-1b-it-GGUF + +# Launch OpenAI-compatible API server +llama-server -hf ggml-org/gemma-3-1b-it-GGUF +``` + ## Description The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide @@ -130,6 +154,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
Bindings +- Python: [ddh0/easy-llama](https://github.com/ddh0/easy-llama) - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python) - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp) - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp) @@ -229,6 +254,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
+
+
 ## Supported backends

 | Backend | Target devices |
@@ -245,16 +271,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
 | [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |

-## Building the project
-
-The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](include/llama.h).
-The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server. Possible methods for obtaining the binaries:
-
-- Clone this repository and build locally, see [how to build](docs/build.md)
-- On MacOS or Linux, install `llama.cpp` via [brew, flox or nix](docs/install.md)
-- Use a Docker image, see [documentation for Docker](docs/docker.md)
-- Download pre-built binaries from [releases](https://github.com/ggml-org/llama.cpp/releases)
-
 ## Obtaining and quantizing models

 The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](https://huggingface.co/models?library=gguf&sort=trending) compatible with `llama.cpp`:
@@ -262,7 +278,11 @@ The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](htt
 - [Trending](https://huggingface.co/models?library=gguf&sort=trending)
 - [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf)

-You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`.
+You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`. For example:
+
+```sh
+llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
+```

 By default, the CLI would download from Hugging Face, you can switch to other options with the environment variable `MODEL_ENDPOINT`. For example, you may opt to downloading model checkpoints from ModelScope or other model sharing communities by setting the environment variable, e.g. `MODEL_ENDPOINT=https://www.modelscope.cn/`.
diff --git a/ci/run.sh b/ci/run.sh
index b49a3a5f82357..e1b777c304eaf 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -39,14 +39,27 @@ sd=`dirname $0`
 cd $sd/../
 SRC=`pwd`

-CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=OFF"
+CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON"

 if [ ! -z ${GG_BUILD_METAL} ]; then
     CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON"
 fi

 if [ ! -z ${GG_BUILD_CUDA} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON"
+
+    if command -v nvidia-smi >/dev/null 2>&1; then
+        CUDA_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits 2>/dev/null | head -1 | tr -d '.')
+        if [[ -n "$CUDA_ARCH" && "$CUDA_ARCH" =~ ^[0-9]+$ ]]; then
+            CMAKE_EXTRA="${CMAKE_EXTRA} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH}"
+        else
+            echo "Warning: Using fallback CUDA architectures"
+            CMAKE_EXTRA="${CMAKE_EXTRA} -DCMAKE_CUDA_ARCHITECTURES=61;70;75;80;86;89"
+        fi
+    else
+        echo "Error: nvidia-smi not found, cannot build with CUDA"
+        exit 1
+    fi
 fi

 if [ ! -z ${GG_BUILD_SYCL} ]; then
@@ -766,7 +779,7 @@ function gg_run_rerank_tiny {
     model_f16="${path_models}/ggml-model-f16.gguf"

     # for this model, the SEP token is "</s>"
-    (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?</s></s>hi\nwhat is panda?</s></s>it's a bear\nwhat is panda?</s></s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
+    (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log

     # sample output
     # rerank score 0: 0.029
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index 564af1448f95a..f43a630c900ff 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -7,8 +7,8 @@ llama_add_compile_flags()
 # Build info header
 #

-if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
-    set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
+if(EXISTS "${PROJECT_SOURCE_DIR}/.git")
+    set(GIT_DIR "${PROJECT_SOURCE_DIR}/.git")

     # Is git submodule
     if(NOT IS_DIRECTORY "${GIT_DIR}")
@@ -18,36 +18,26 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
         if (SLASH_POS EQUAL 0)
             set(GIT_DIR "${REAL_GIT_DIR}")
         else()
-            set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
+            set(GIT_DIR "${PROJECT_SOURCE_DIR}/${REAL_GIT_DIR}")
         endif()
     endif()

     if(EXISTS "${GIT_DIR}/index")
-        set(GIT_INDEX "${GIT_DIR}/index")
+        # For build-info.cpp below
+        set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS "${GIT_DIR}/index")
     else()
         message(WARNING "Git index not found in git repository.")
-        set(GIT_INDEX "")
     endif()
 else()
     message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
-    set(GIT_INDEX "")
 endif()

-# Add a custom command to rebuild build-info.cpp when .git/index changes
-add_custom_command(
-    OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp"
-    COMMENT "Generating build details from Git"
-    COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
-            -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
-            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-            -DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME} -DCMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}
-            -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
-    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
- DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX} - VERBATIM -) +set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in") +set(OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/build-info.cpp") +configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE}) + set(TARGET build_info) -add_library(${TARGET} OBJECT build-info.cpp) +add_library(${TARGET} OBJECT ${OUTPUT_FILE}) if (BUILD_SHARED_LIBS) set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) endif() diff --git a/common/arg.cpp b/common/arg.cpp index cfa9878f90730..c4ad85c47b61b 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -988,10 +988,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context params.tensor_buft_overrides.push_back({nullptr, nullptr}); } - if (params.reranking && params.embedding) { - throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both"); - } - if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) { throw std::runtime_error(string_format( "error: the supplied chat template is not supported: %s%s\n", @@ -2710,6 +2706,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.embd_sep = value; } ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + add_opt(common_arg( + {"--cls-separator"}, "STRING", + "separator of classification sequences (default \\t) for example \"<#seq#>\"", + [](common_params & params, const std::string & value) { + params.cls_sep = value; + } + ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); add_opt(common_arg( {"--host"}, "HOST", string_format("ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: %s)", params.hostname.c_str()), @@ -2747,9 +2750,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS")); add_opt(common_arg( {"--reranking", "--rerank"}, - string_format("enable reranking endpoint on server (default: %s)", params.reranking ? 
"enabled" : "disabled"), + string_format("enable reranking endpoint on server (default: %s)", "disabled"), [](common_params & params) { - params.reranking = true; + params.embedding = true; + params.pooling_type = LLAMA_POOLING_TYPE_RANK; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING")); add_opt(common_arg( @@ -2869,6 +2873,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "(default: deepseek)", [](common_params & params, const std::string & value) { /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; } + else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; } else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; } else { throw std::invalid_argument("invalid value"); } } @@ -3212,6 +3217,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.speculative.model.path = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT")); + add_opt(common_arg( + {"-ctkd", "--cache-type-k-draft"}, "TYPE", + string_format( + "KV cache data type for K for the draft model\n" + "allowed values: %s\n" + "(default: %s)", + get_all_kv_cache_types().c_str(), + ggml_type_name(params.speculative.cache_type_k) + ), + [](common_params & params, const std::string & value) { + params.speculative.cache_type_k = kv_cache_type_from_str(value); + } + ).set_env("LLAMA_ARG_CACHE_TYPE_K_DRAFT")); + add_opt(common_arg( + {"-ctvd", "--cache-type-v-draft"}, "TYPE", + string_format( + "KV cache data type for V for the draft model\n" + "allowed values: %s\n" + "(default: %s)", + get_all_kv_cache_types().c_str(), + ggml_type_name(params.speculative.cache_type_v) + ), + [](common_params & params, const std::string & value) { + params.speculative.cache_type_v = kv_cache_type_from_str(value); + } + ).set_env("LLAMA_ARG_CACHE_TYPE_V_DRAFT")); add_opt(common_arg( {"-mv", "--model-vocoder"}, "FNAME", diff --git a/common/build-info.cpp.in b/common/build-info.cpp.in index 0b945aa68fff3..aee9d7eafd681 100644 --- a/common/build-info.cpp.in +++ b/common/build-info.cpp.in @@ -1,4 +1,4 @@ -int LLAMA_BUILD_NUMBER = @BUILD_NUMBER@; -char const *LLAMA_COMMIT = "@BUILD_COMMIT@"; +int LLAMA_BUILD_NUMBER = @LLAMA_BUILD_NUMBER@; +char const *LLAMA_COMMIT = "@LLAMA_BUILD_COMMIT@"; char const *LLAMA_COMPILER = "@BUILD_COMPILER@"; char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@"; diff --git a/common/chat-parser.cpp b/common/chat-parser.cpp index 65b664cb37da4..18a30e49aa578 100644 --- a/common/chat-parser.cpp +++ b/common/chat-parser.cpp @@ -49,6 +49,7 @@ bool common_chat_msg_parser::add_tool_call(const std::string & name, const std:: // LOG_DBG("Tool call arguments:\n\traw: %s\n\tresult: %s\n", arguments.c_str(), tool_call.arguments.c_str()); result_.tool_calls.emplace_back(tool_call); + return true; } bool common_chat_msg_parser::add_tool_call(const json & tool_call) { @@ -378,3 +379,7 @@ std::optional common_chat_msg_parse /* .is_partial = */ found_healing_marker, }; } + +void common_chat_msg_parser::clear_tools() { + result_.tool_calls.clear(); +} diff --git a/common/chat-parser.h b/common/chat-parser.h index 7ee355056b30a..0e64c341a50aa 100644 --- a/common/chat-parser.h +++ b/common/chat-parser.h @@ -115,4 +115,6 @@ class common_chat_msg_parser { const std::vector> & args_paths = {}, const std::vector> & content_paths = {} ); + + void clear_tools(); }; diff --git a/common/chat.cpp 
index f1ab4c85a913e..7d9aaeb12a190 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -82,10 +82,10 @@ json common_chat_msg::to_json_oaicompat() const

 std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg) {
     std::vector<common_chat_msg_diff> diffs;
-    // if (previous_msg.reasoning_content != current.reasoning_content) {
-    //     auto & diff = diffs.emplace_back();
-    //     diff.reasoning_content_delta = string_diff(previous_msg.reasoning_content, current.reasoning_content);
-    // }
+    if (previous_msg.reasoning_content != new_msg.reasoning_content) {
+        auto & diff = diffs.emplace_back();
+        diff.reasoning_content_delta = string_diff(previous_msg.reasoning_content, new_msg.reasoning_content);
+    }
     if (previous_msg.content != new_msg.content) {
         auto & diff = diffs.emplace_back();
         diff.content_delta = string_diff(previous_msg.content, new_msg.content);
@@ -385,9 +385,9 @@ json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & t
 template <> json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) {
     json delta = json::object();
-    // if (!diff.reasoning_content_delta.empty()) {
-    //     delta["reasoning_content"] = msg.reasoning_content;
-    // }
+    if (!diff.reasoning_content_delta.empty()) {
+        delta["reasoning_content"] = diff.reasoning_content_delta;
+    }
     if (!diff.content_delta.empty()) {
         delta["content"] = diff.content_delta;
     }
@@ -598,6 +598,7 @@ const char * common_reasoning_format_name(common_reasoning_format format) {
     switch (format) {
         case COMMON_REASONING_FORMAT_NONE: return "none";
         case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
+        case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
         default:
             throw std::runtime_error("Unknown reasoning format");
     }
@@ -1837,7 +1838,7 @@ static common_chat_params common_chat_templates_apply_legacy(
     if (res < 0) {
         // if the custom "tmpl" is not supported, we throw an error
         // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
-        throw std::runtime_error("this custom template is not supported");
+        throw std::runtime_error("this custom template is not supported, try using --jinja");
     }

     // if it turns out that our buffer is too small, we resize it
@@ -1920,7 +1921,9 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
     } catch (const common_chat_msg_partial_exception & ex) {
         LOG_DBG("Partial parse: %s\n", ex.what());
         if (!is_partial) {
-            throw std::runtime_error(ex.what());
+            builder.clear_tools();
+            builder.move_to(0);
+            common_chat_parse_content_only(builder);
         }
     }
     auto msg = builder.result();
diff --git a/common/chat.h b/common/chat.h
index f6b1d0ffcc989..9f59e6b08738d 100644
--- a/common/chat.h
+++ b/common/chat.h
@@ -70,7 +70,7 @@ struct common_chat_msg {
 };

 struct common_chat_msg_diff {
-    // std::string reasoning_content_delta;
+    std::string reasoning_content_delta;
     std::string content_delta;
     size_t tool_call_index = std::string::npos;
     common_chat_tool_call tool_call_delta;
diff --git a/common/cmake/build-info-gen-cpp.cmake b/common/cmake/build-info-gen-cpp.cmake
deleted file mode 100644
index fbc92b52cc4fe..0000000000000
--- a/common/cmake/build-info-gen-cpp.cmake
+++ /dev/null
@@ -1,24 +0,0 @@
-include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
-
-set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in")
-set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
-
-# Only write the build info if it changed
-if(EXISTS ${OUTPUT_FILE})
-    file(READ ${OUTPUT_FILE} CONTENTS)
-    string(REGEX MATCH "LLAMA_COMMIT = \"([^\"]*)\";" _ ${CONTENTS})
-    set(OLD_COMMIT ${CMAKE_MATCH_1})
-    string(REGEX MATCH "LLAMA_COMPILER = \"([^\"]*)\";" _ ${CONTENTS})
-    set(OLD_COMPILER ${CMAKE_MATCH_1})
-    string(REGEX MATCH "LLAMA_BUILD_TARGET = \"([^\"]*)\";" _ ${CONTENTS})
-    set(OLD_TARGET ${CMAKE_MATCH_1})
-    if (
-        NOT OLD_COMMIT STREQUAL BUILD_COMMIT OR
-        NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR
-        NOT OLD_TARGET STREQUAL BUILD_TARGET
-    )
-        configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
-    endif()
-else()
-    configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
-endif()
diff --git a/common/common.cpp b/common/common.cpp
index 4cc40ed8b37a4..e4e71ad13fb59 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -466,7 +466,7 @@ size_t string_find_partial_stop(const std::string_view & str, const std::string_

 std::string regex_escape(const std::string & s) {
     static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
-    return std::regex_replace(s, special_chars, "\\$0");
+    return std::regex_replace(s, special_chars, "\\$&");
 }

 std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
@@ -706,11 +706,17 @@ bool fs_validate_filename(const std::string & filename) {
         // disable C++17 deprecation warning for std::codecvt_utf8
 #    pragma clang diagnostic push
 #    pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#elif defined(__GNUC__)
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 #endif
+
     std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;

 #if defined(__clang__)
 #    pragma clang diagnostic pop
+#elif defined(__GNUC__)
+#    pragma GCC diagnostic pop
 #endif

     filename_utf32 = converter.from_bytes(filename);
@@ -767,6 +773,9 @@ bool fs_validate_filename(const std::string & filename) {
     return true;
 }

+#include
+
+
 // returns true if successful, false otherwise
 bool fs_create_directory_with_parents(const std::string & path) {
 #ifdef _WIN32
@@ -784,9 +793,16 @@ bool fs_create_directory_with_parents(const std::string & path) {
     // process path from front to back, procedurally creating directories
     while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
         const std::wstring subpath = wpath.substr(0, pos_slash);
-        const wchar_t * test = subpath.c_str();
-        const bool success = CreateDirectoryW(test, NULL);
+        pos_slash += 1;
+
+        // skip the drive letter, in some systems it can return an access denied error
+        if (subpath.length() == 2 && subpath[1] == ':') {
+            continue;
+        }
+
+        const bool success = CreateDirectoryW(subpath.c_str(), NULL);
+
         if (!success) {
             const DWORD error = GetLastError();

@@ -800,8 +816,6 @@ bool fs_create_directory_with_parents(const std::string & path) {
                 return false;
             }
         }
-
-        pos_slash += 1;
     }

     return true;
@@ -897,34 +911,6 @@ struct common_init_result common_init_from_params(common_params & params) {

     const llama_vocab * vocab = llama_model_get_vocab(model);

-    if (params.reranking) {
-        bool ok = true;
-
-        if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
-            ok = false;
-        }
-
-        bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
-        bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
-
-        if (!has_eos && !has_sep) {
-            LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
-            ok = false;
-        } else if (!has_eos) {
-            LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
-        } else if (!has_sep) {
-            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
-            ok = false;
-        }
-
-        if (!ok) {
-            llama_model_free(model);
-
-            return iparams;
-        }
-    }
-
     auto cparams = common_context_params_to_llama(params);

     llama_context * lctx = llama_init_from_model(model, cparams);
@@ -934,7 +920,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         return iparams;
     }

-    if (params.ctx_shift && !llama_kv_self_can_shift(lctx)) {
+    if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
         LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
         params.ctx_shift = false;
     }
@@ -966,6 +952,35 @@ struct common_init_result common_init_from_params(common_params & params) {
         }
     }

+    if (llama_pooling_type(lctx) == LLAMA_POOLING_TYPE_RANK) {
+        bool ok = true;
+
+        if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
+            ok = false;
+        }
+
+        bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
+        bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
+
+        if (!has_eos && !has_sep) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
+            ok = false;
+        } else if (!has_eos) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
+        } else if (!has_sep) {
+            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
+            ok = false;
+        }
+
+        if (!ok) {
+            llama_free(lctx);
+            llama_model_free(model);
+
+            return iparams;
+        }
+    }
+
     // load and optionally apply lora adapters
     for (auto & la : params.lora_adapters) {
         llama_adapter_lora_ptr lora;
@@ -1041,7 +1056,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         if (llama_model_has_decoder(model)) {
             llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
         }
-        llama_kv_self_clear(lctx);
+        llama_memory_clear(llama_get_memory(lctx), true);
         llama_synchronize(lctx);
         llama_perf_context_reset(lctx);
         llama_set_warmup(lctx, false);
@@ -1143,11 +1158,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.op_offload = !params.no_op_offload;
     cparams.swa_full = params.swa_full;

-    if (params.reranking) {
-        cparams.embeddings = true;
-        cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
-    }
-
     cparams.type_k = params.cache_type_k;
     cparams.type_v = params.cache_type_v;

@@ -1280,6 +1290,9 @@ std::vector<llama_token> common_tokenize(
     int n_tokens = text.length() + 2 * add_special;
     std::vector<llama_token> result(n_tokens);
     n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+    if (n_tokens == std::numeric_limits<int32_t>::min()) {
+        throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
+    }
     if (n_tokens < 0) {
         result.resize(-n_tokens);
         int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
diff --git a/common/common.h b/common/common.h
index cee1e3039cf9e..e08a59eae7543 100644
--- a/common/common.h
+++ b/common/common.h
@@ -199,6 +199,9 @@ struct common_params_speculative {
     float p_split = 0.1f; // speculative decoding split probability
     float p_min = 0.75f; // minimum speculative decoding probability (greedy)

+    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
+    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
+
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;

@@ -215,7 +218,8 @@ struct common_params_vocoder {

 enum common_reasoning_format {
     COMMON_REASONING_FORMAT_NONE,
-    COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
+    COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
+    COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
 };

 struct common_params {
@@ -354,7 +358,7 @@ struct common_params {
     int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
     std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
     std::string embd_sep = "\n"; // separator of embeddings
-    bool reranking = false; // enable reranking support on server
+    std::string cls_sep = "\t"; // separator of classification sequences

     // server params
     int32_t port = 8080; // server listens on this network port
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index d38a74f95c213..637891f50699c 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -41,49 +41,6 @@ static std::string build_repetition(const std::string & item_rule, int min_items
     return result;
 }

-/* Minimalistic replacement for std::string_view, which is only available from C++17 onwards */
-class string_view {
-    const std::string & _str;
-    const size_t _start;
-    const size_t _end;
-public:
-    string_view(const std::string & str, size_t start = 0, size_t end = std::string::npos) : _str(str), _start(start), _end(end == std::string::npos ? str.length() : end) {}
-
-    size_t size() const {
-        return _end - _start;
-    }
-
-    size_t length() const {
-        return size();
-    }
-
-    operator std::string() const {
-        return str();
-    }
-
-    std::string str() const {
-        return _str.substr(_start, _end - _start);
-    }
-
-    string_view substr(size_t pos, size_t len = std::string::npos) const {
-        return string_view(_str, _start + pos, len == std::string::npos ? _end : _start + pos + len);
-    }
-
-    char operator[](size_t pos) const {
-        auto index = _start + pos;
-        if (index >= _end) {
-            throw std::out_of_range("string_view index out of range");
-        }
-        return _str[_start + pos];
-    }
-
-    bool operator==(const string_view & other) const {
-        std::string this_str = *this;
-        std::string other_str = other;
-        return this_str == other_str;
-    }
-};
-
 static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
     auto has_min = min_value != std::numeric_limits<int>::min();
     auto has_max = max_value != std::numeric_limits<int>::max();
@@ -112,14 +69,14 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
         }
         out << "}";
     };
-    std::function<void(const string_view &, const string_view &)> uniform_range =
-        [&](const string_view & from, const string_view & to) {
+    std::function<void(const std::string_view &, const std::string_view &)> uniform_range =
+        [&](const std::string_view & from, const std::string_view & to) {
             size_t i = 0;
             while (i < from.length() && i < to.length() && from[i] == to[i]) {
                 i++;
             }
             if (i > 0) {
-                out << "\"" << from.substr(0, i).str() << "\"";
+                out << "\"" << from.substr(0, i) << "\"";
             }
             if (i < from.length() && i < to.length()) {
                 if (i > 0) {
diff --git a/common/speculative.cpp b/common/speculative.cpp
index ccad70fa9ed85..843bd1ddbdbd7 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -144,6 +144,8 @@ llama_tokens common_speculative_gen_draft(
     auto & smpl = spec->smpl;
     auto & prompt = spec->prompt;

+    auto * mem = llama_get_memory(ctx);
+
     int reuse_i = 0;
     int reuse_n = 0;

@@ -173,7 +175,7 @@ llama_tokens common_speculative_gen_draft(
     result.reserve(params.n_draft);

     if (reuse_n == 0) {
-        llama_kv_self_clear(ctx);
+        llama_memory_clear(mem, false);

         prompt.clear();
     } else {
@@ -192,14 +194,14 @@ llama_tokens common_speculative_gen_draft(
     }

     if (reuse_i > 0) {
-        llama_kv_self_seq_rm (ctx, 0, 0, reuse_i);
-        llama_kv_self_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
+        llama_memory_seq_rm (mem, 0, 0, reuse_i);
+        llama_memory_seq_add(mem, 0, reuse_i, -1, -reuse_i);

         prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
     }

     if (reuse_n < (int) prompt.size()) {
-        llama_kv_self_seq_rm (ctx, 0, reuse_n, -1);
+        llama_memory_seq_rm (mem, 0, reuse_n, -1);

         prompt.erase(prompt.begin() + reuse_n, prompt.end());
     }
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index ab0f0e0ea087e..4f2339a02a13c 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -310,6 +310,8 @@ def prepare_tensors(self):
                     gguf.MODEL_TENSOR.POSNET_NORM2,
                     gguf.MODEL_TENSOR.V_ENC_EMBD_POS,
                     gguf.MODEL_TENSOR.A_ENC_EMBD_POS,
+                    gguf.MODEL_TENSOR.ALTUP_CORRECT_COEF,
+                    gguf.MODEL_TENSOR.ALTUP_PREDICT_COEF,
                 )
             )
             or not new_name.endswith(".weight")
@@ -320,7 +322,11 @@ def prepare_tensors(self):
                 self.match_model_tensor_name(new_name, key, bid)
                 for key in (
                     gguf.MODEL_TENSOR.TOKEN_EMBD,
+                    gguf.MODEL_TENSOR.PER_LAYER_TOKEN_EMBD,
                     gguf.MODEL_TENSOR.OUTPUT,
+                    gguf.MODEL_TENSOR.ALTUP_ROUTER,
+                    gguf.MODEL_TENSOR.LAUREL_L,
+                    gguf.MODEL_TENSOR.LAUREL_R,
                 )
             ):
                 if self.ftype in (
@@ -519,7 +525,7 @@ def prepare_metadata(self, vocab_only: bool):
     def set_gguf_parameters(self):
         self.gguf_writer.add_block_count(self.block_count)

-        if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions"], optional=True)) is not None:
+        if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions", "max_length"], optional=True)) is not None:
             self.gguf_writer.add_context_length(n_ctx)
             logger.info(f"gguf: context length = {n_ctx}")

@@ -921,13 +927,16 @@ def
_create_vocab_sentencepiece(self): tokenizer = SentencePieceProcessor() tokenizer.LoadFromFile(str(tokenizer_path)) - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + vocab_size = self.find_hparam([ + "vocab_size_per_layer_input", # gemma3n + "vocab_size", + ], optional=True) or tokenizer.vocab_size() tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] scores: list[float] = [-10000.0] * vocab_size toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size - for token_id in range(tokenizer.vocab_size()): + for token_id in range(vocab_size): piece = tokenizer.IdToPiece(token_id) text = piece.encode("utf-8") score = tokenizer.GetScore(token_id) @@ -942,6 +951,10 @@ def _create_vocab_sentencepiece(self): elif tokenizer.IsByte(token_id): toktype = SentencePieceTokenTypes.BYTE + if token_id >= vocab_size: + logger.warning(f'ignore tokens from {token_id}: id is out of range, max={vocab_size - 1}') + break + tokens[token_id] = text scores[token_id] = score toktypes[token_id] = toktype @@ -1898,9 +1911,7 @@ def set_gguf_parameters(self): hparams = self.hparams self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - if "head_dim" in hparams: - rope_dim = hparams["head_dim"] - else: + if (rope_dim := hparams.get("head_dim")) is None: rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(rope_dim) @@ -1982,7 +1993,8 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): if rope_scaling.get("rope_type", '').lower() == "llama3": base = self.hparams.get("rope_theta", 10000.0) - dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + if (dim := self.hparams.get("head_dim")) is None: + dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) factor = rope_scaling.get("factor", 8.0) @@ -2017,6 +2029,20 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") +@ModelBase.register("ArceeForCausalLM") +class ArceeModel(LlamaModel): + model_arch = gguf.MODEL_ARCH.ARCEE + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self._try_set_pooling_type() + rope_scaling = self.hparams.get("rope_scaling") or {} + if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) + + @ModelBase.register( "LlavaForConditionalGeneration", # pixtral "Mistral3ForConditionalGeneration", # mistral small 3.1 @@ -2132,7 +2158,6 @@ def __init__(self, *args, **kwargs): def set_vocab(self): self._set_vocab_gpt2() - self.gguf_writer.add_add_bos_token(True) def set_gguf_parameters(self): super().set_gguf_parameters() @@ -2181,7 +2206,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter name += ".weight" if "multi_modal_projector.linear_1" in name: # despite the name with number postfix, this is a single fully connected layer - return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC], data_torch)] + return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC] + '.weight', data_torch)] return [(self.map_tensor_name(name), data_torch)] return [] @@ -2304,9 +2329,7 @@ 
def set_gguf_parameters(self): hparams = self.hparams self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - if "head_dim" in hparams: - rope_dim = hparams["head_dim"] - else: + if (rope_dim := hparams.get("head_dim")) is None: rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(rope_dim) @@ -2346,7 +2369,8 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): if rope_scaling.get("rope_type", '').lower() == "llama3": base = self.hparams.get("rope_theta", 10000.0) - dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + if (dim := self.hparams.get("head_dim")) is None: + dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) factor = rope_scaling.get("factor", 8.0) @@ -3664,9 +3688,7 @@ def set_gguf_parameters(self): hparams = self.hparams self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - if "head_dim" in hparams: - rope_dim = hparams["head_dim"] - else: + if (rope_dim := hparams.get("head_dim")) is None: rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(rope_dim) @@ -3709,8 +3731,7 @@ def set_gguf_parameters(self): self._try_set_pooling_type() if self.cls_out_labels: - key_name = gguf.Keys.Classifier.OUTPUT_LABELS.format(arch = gguf.MODEL_ARCH_NAMES[self.model_arch]) - self.gguf_writer.add_array(key_name, [v for k, v in sorted(self.cls_out_labels.items())]) + self.gguf_writer.add_classifier_output_labels([v for k, v in sorted(self.cls_out_labels.items())]) def set_vocab(self): tokens, toktypes, tokpre = self.get_vocab_base() @@ -3814,7 +3835,7 @@ def _xlmroberta_set_vocab(self) -> None: remove_whitespaces = tokenizer.clean_up_tokenization_spaces precompiled_charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"]) - vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size) + vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size) else: sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) @@ -3827,7 +3848,7 @@ def _xlmroberta_set_vocab(self) -> None: tokenizer = SentencePieceProcessor() tokenizer.LoadFromFile(str(tokenizer_path)) - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size()) tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] scores: list[float] = [-10000.0] * vocab_size @@ -3857,33 +3878,26 @@ def _xlmroberta_set_vocab(self) -> None: unk_token = tokenizer_config_json.get("unk_token") unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3)) - for token_id in range(vocab_size): + for token_id in range(tokenizer.vocab_size): piece = tokenizer._convert_id_to_token(token_id) - text = piece.encode("utf-8") - score = tokenizer_json["model"]["vocab"][token_id][1] - - toktype = SentencePieceTokenTypes.NORMAL - if token_id == unk_token_id: - toktype = SentencePieceTokenTypes.UNKNOWN - elif token_id in tokenizer.all_special_ids: - toktype = SentencePieceTokenTypes.CONTROL - elif token_id in added_vocab.values(): - toktype = SentencePieceTokenTypes.USER_DEFINED - # No reliable way to detect this, but jina doesn't have any - # elif 
tokenizer.IsByte(token_id): - # toktype = SentencePieceTokenTypes.BYTE - - tokens[token_id] = text - scores[token_id] = score - toktypes[token_id] = toktype - - if vocab_size > len(tokens): - pad_count = vocab_size - len(tokens) - logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") - for i in range(1, pad_count + 1): - tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) - scores.append(-1000.0) - toktypes.append(SentencePieceTokenTypes.UNUSED) + if (piece := tokenizer._convert_id_to_token(token_id)) is not None: + text = piece.encode("utf-8") + score = tokenizer_json["model"]["vocab"][token_id][1] + + toktype = SentencePieceTokenTypes.NORMAL + if token_id == unk_token_id: + toktype = SentencePieceTokenTypes.UNKNOWN + elif token_id in tokenizer.all_special_ids: + toktype = SentencePieceTokenTypes.CONTROL + elif token_id in added_vocab.values(): + toktype = SentencePieceTokenTypes.USER_DEFINED + # No reliable way to detect this, but jina doesn't have any + # elif tokenizer.IsByte(token_id): + # toktype = SentencePieceTokenTypes.BYTE + + tokens[token_id] = text + scores[token_id] = score + toktypes[token_id] = toktype if isinstance(tokenizer, SentencePieceProcessor): # realign tokens (see HF tokenizer code) @@ -3896,6 +3910,12 @@ def _xlmroberta_set_vocab(self) -> None: SentencePieceTokenTypes.UNKNOWN, ] + toktypes[3:-1] + if self.model_arch == gguf.MODEL_ARCH.NOMIC_BERT_MOE: + # Add mask token missing from sentencepiece.bpe.model + tokens[250001] = b'' + scores[250001] = 0.0 + toktypes[250001] = SentencePieceTokenTypes.CONTROL + self.gguf_writer.add_tokenizer_model("t5") self.gguf_writer.add_tokenizer_pre("default") self.gguf_writer.add_token_list(tokens) @@ -3910,9 +3930,6 @@ def _xlmroberta_set_vocab(self) -> None: special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) special_vocab.add_to_gguf(self.gguf_writer) - self.gguf_writer.add_add_bos_token(True) - self.gguf_writer.add_add_eos_token(True) - @ModelBase.register("DistilBertModel", "DistilBertForMaskedLM", "DistilBertForSequenceClassification") class DistilBertModel(BertModel): @@ -3954,8 +3971,6 @@ def set_vocab(self): bpe_tok_path = self.dir_model / "tokenizer.json" if bpe_tok_path.exists(): self._set_vocab_gpt2() - self.gguf_writer.add_add_bos_token(True) - self.gguf_writer.add_add_eos_token(True) # we need this to validate the size of the token_type embeddings # though currently we are passing all zeros to the token_type embeddings @@ -4061,6 +4076,34 @@ def _is_tokenizer_xlmroberta(self) -> bool: raise ValueError(f"unknown tokenizer: {toktyp}") +@ModelBase.register("NeoBERT", "NeoBERTLMHead", "NeoBERTForSequenceClassification") +class NeoBert(BertModel): + model_arch = gguf.MODEL_ARCH.NEO_BERT + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + # NeoBERT uses 2/3 of the intermediate size as feed forward length + self.gguf_writer.add_feed_forward_length(int(2 * self.hparams["intermediate_size"] / 3)) + self.gguf_writer.add_rope_freq_base(10000.0) # default value for NeoBERT + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + + f_rms_eps = self.hparams.get("norm_eps", 1e-6) # default value for NeoBERT + self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) + logger.info(f"gguf: rms norm epsilon = {f_rms_eps}") + + self.gguf_writer.add_pooling_type(gguf.PoolingType.CLS) # https://huggingface.co/chandar-lab/NeoBERT#how-to-use + + def modify_tensors(self, data_torch, name, bid): + if name.startswith("decoder."): + return [] + + if 
name.startswith("model."): + name = name[6:] + + return super().modify_tensors(data_torch, name, bid) + + @ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification") class XLMRobertaModel(BertModel): model_arch = gguf.MODEL_ARCH.BERT @@ -4187,6 +4230,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @ModelBase.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration") class Gemma3Model(TextModel): model_arch = gguf.MODEL_ARCH.GEMMA3 + norm_shift = 1.0 # Gemma3RMSNorm adds 1.0 to the norm value def set_vocab(self): self._set_vocab_sentencepiece() @@ -4208,9 +4252,8 @@ def set_gguf_parameters(self): self.gguf_writer.add_value_length(hparams.get("head_dim", 256)) self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1_000_000.0)) # for global layers - # both attn_logit_softcapping and final_logit_softcapping are removed in Gemma3 + # attn_logit_softcapping is removed in Gemma3 assert hparams.get("attn_logit_softcapping") is None - assert hparams.get("final_logit_softcapping") is None self.gguf_writer.add_sliding_window(hparams["sliding_window"]) self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4)) if hparams.get("rope_scaling") is not None: @@ -4222,7 +4265,7 @@ def set_gguf_parameters(self): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused - if name.startswith("language_model."): + if "language_model." in name: name = name.replace("language_model.", "") elif name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \ @@ -4237,8 +4280,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # ref code in Gemma3RMSNorm # output = output * (1.0 + self.weight.float()) + # note: this is not the case on gemma3n if name.endswith("norm.weight"): - data_torch = data_torch + 1 + data_torch = data_torch + self.norm_shift return [(self.map_tensor_name(name), data_torch)] @@ -4295,6 +4339,104 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [] # skip other tensors +@ModelBase.register("Gemma3nForConditionalGeneration") +class Gemma3NModel(Gemma3Model): + model_arch = gguf.MODEL_ARCH.GEMMA3N + norm_shift = 0.0 # same value with Gemma3p5RMSNorm scale_shift on python code + + _altup_proj: list[Tensor] = [] + _altup_unembd: list[Tensor] = [] + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams["altup_num_inputs"] == 4, "Current conversion only supports 4 altup inputs" + self._altup_proj = [ + torch.Tensor(), # to be replaced + torch.Tensor(), # to be replaced + torch.Tensor(), # to be replaced + ] + self._altup_unembd = [ + torch.Tensor(), # to be replaced + torch.Tensor(), # to be replaced + torch.Tensor(), # to be replaced + ] + + def set_vocab(self): + with open(self.dir_model / "chat_template.jinja") as f: + # quick hack to make sure chat template is added + self.gguf_writer.add_chat_template(f.read()) + super().set_vocab() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_altup_active_idx(self.hparams["altup_active_idx"]) + self.gguf_writer.add_altup_num_inputs(self.hparams["altup_num_inputs"]) + self.gguf_writer.add_embedding_length_per_layer_input(self.hparams["hidden_size_per_layer_input"]) + self.gguf_writer.add_shared_kv_layers(self.hparams["num_kv_shared_layers"]) + + activation_sparsity_scale = [] + for s in 
self.hparams["activation_sparsity_pattern"]: + normal_dist = torch.distributions.normal.Normal(0, 1) + std_multiplier = normal_dist.icdf(torch.tensor(s, dtype=torch.float32)) + activation_sparsity_scale.append(std_multiplier.item()) + self.gguf_writer.add_activation_sparsity_scale(activation_sparsity_scale) + + sliding_window_pattern = [] + for t in self.hparams["layer_types"]: + sliding_window_pattern.append(t == "sliding_attention") + self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern) + + def _stack_matrices(self, matrices: list[Tensor]) -> Tensor | None: + has_all = all(m.numel() > 0 for m in matrices) + if not has_all: + return None + else: + return torch.stack(matrices, dim=0) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.endswith("_scale"): + name = name + ".weight" + + # TODO: implement self.prediction_coefs.weight.clamp_(...) + + if "language_model." not in name: + return [] # skip non-language model tensors + + if "altup_unembed_projections" in name: + data_torch = data_torch.to(device="cpu") + if ".0." in name: + self._altup_unembd[0] = data_torch + elif ".1." in name: + self._altup_unembd[1] = data_torch + elif ".2." in name: + self._altup_unembd[2] = data_torch + else: + raise ValueError(f"Unknown name: {name}") + out = self._stack_matrices(self._altup_unembd) + if out is not None: + return [(self.map_tensor_name("model.altup_unembed_projections.weight"), out)] + else: + return [] + + if "altup_projections" in name: + data_torch = data_torch.to(device="cpu") + if ".0." in name: + self._altup_proj[0] = data_torch + elif ".1." in name: + self._altup_proj[1] = data_torch + elif ".2." in name: + self._altup_proj[2] = data_torch + else: + raise ValueError(f"Unknown name: {name}") + out = self._stack_matrices(self._altup_proj) + if out is not None: + return [(self.map_tensor_name("model.altup_projections.weight"), out)] + else: + return [] + + return super().modify_tensors(data_torch, name, bid) + + @ModelBase.register("Starcoder2ForCausalLM") class StarCoder2Model(TextModel): model_arch = gguf.MODEL_ARCH.STARCODER2 @@ -4800,25 +4942,6 @@ def prepare_tensors(self): class JinaBertV2Model(BertModel): model_arch = gguf.MODEL_ARCH.JINA_BERT_V2 - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.intermediate_size = self.hparams["intermediate_size"] - - def get_tensors(self): - for name, data in super().get_tensors(): - if 'gated_layer' in name: - d1 = data[:self.intermediate_size, :] - name1 = name.replace('gated_layers', 'gated_layers_w') - name1 = name1.replace('up_gated_layer', 'gated_layers_v') - d2 = data[self.intermediate_size:, :] - name2 = name.replace('gated_layers', 'gated_layers_v') - name2 = name2.replace('up_gated_layer', 'gated_layers_w') - yield name1, d1 - yield name2, d2 - continue - - yield name, data - def set_vocab(self): tokenizer_class = 'BertTokenizer' with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f: @@ -4831,16 +4954,6 @@ def set_vocab(self): self.gguf_writer.add_token_type_count(2) else: raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel') - self.gguf_writer.add_add_bos_token(True) - self.gguf_writer.add_add_eos_token(True) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # if name starts with "bert.", remove the prefix - # e.g. 
https://huggingface.co/jinaai/jina-reranker-v1-tiny-en - if name.startswith("bert."): - name = name[5:] - - return super().modify_tensors(data_torch, name, bid) @ModelBase.register("OpenELMForCausalLM") @@ -5082,9 +5195,7 @@ def set_vocab(self): def set_gguf_parameters(self): super().set_gguf_parameters() hparams = self.hparams - if "head_dim" in hparams: - rope_dim = hparams["head_dim"] - else: + if (rope_dim := hparams.get("head_dim")) is None: rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(rope_dim) @@ -5288,6 +5399,34 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") +@ModelBase.register("Dots1ForCausalLM") +class Dots1Model(Qwen2MoeModel): + model_arch = gguf.MODEL_ARCH.DOTS1 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.hparams["num_experts"] = self.hparams["n_routed_experts"] + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"]) + self.gguf_writer.add_expert_shared_count(self.hparams["n_shared_experts"]) + self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"]) + self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"]) + + if self.hparams["scoring_func"] == "noaux_tc": + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) + else: + raise ValueError(f"Unsupported scoring_func value: {self.hparams['scoring_func']}") + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): + if name.endswith("e_score_correction_bias"): + name = name.replace("e_score_correction_bias", "e_score_correction.bias") + if "shared_experts" in name: + return [(self.map_tensor_name(name), data_torch)] + return super().modify_tensors(data_torch, name, bid) + + @ModelBase.register("PLMForCausalLM") class PLMModel(TextModel): model_arch = gguf.MODEL_ARCH.PLM @@ -5416,9 +5555,6 @@ def set_vocab(self): special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) special_vocab.add_to_gguf(self.gguf_writer) - self.gguf_writer.add_add_bos_token(False) - self.gguf_writer.add_add_eos_token(True) - def set_gguf_parameters(self): if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None: logger.warning("Couldn't find context length in config.json, assuming default value of 512") @@ -5556,9 +5692,6 @@ def set_vocab(self): special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) special_vocab.add_to_gguf(self.gguf_writer) - self.gguf_writer.add_add_bos_token(False) - self.gguf_writer.add_add_eos_token(True) - def set_gguf_parameters(self): if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None: logger.warning("Couldn't find context length in config.json, assuming default value of 512") @@ -5946,7 +6079,8 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): if rope_scaling.get("rope_type", '').lower() == "llama3": base = self.hparams.get("rope_theta", 10000.0) - dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + if (dim := self.hparams.get("head_dim")) is None: + dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) factor = rope_scaling.get("factor", 8.0) @@ -6058,7 +6192,8 @@ def set_vocab(self): def set_gguf_parameters(self): 
super().set_gguf_parameters() hparams = self.hparams - rope_dim = hparams.get("head_dim") or hparams["hidden_size"] // hparams["num_attention_heads"] + if (rope_dim := hparams.get("head_dim")) is None: + rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(rope_dim) rope_scaling = self.hparams.get("rope_scaling") or {} @@ -6090,7 +6225,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams.get("num_key_value_heads") n_embd = self.hparams["hidden_size"] - head_dim = self.hparams.get("head_dim") or n_embd // n_head + if (head_dim := self.hparams.get("head_dim")) is None: + head_dim = n_embd // n_head output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT) @@ -6351,8 +6487,8 @@ def parse_args() -> argparse.Namespace: help="model is executed on big endian machine", ) parser.add_argument( - "model", type=Path, - help="directory containing model file", + "model", type=str, + help="directory containing model file or huggingface repository ID (if --remote)", nargs="?", ) parser.add_argument( @@ -6455,18 +6591,20 @@ def main() -> None: else: logging.basicConfig(level=logging.INFO) - dir_model = args.model - if args.remote: + hf_repo_id = args.model from huggingface_hub import snapshot_download local_dir = snapshot_download( - repo_id=str(dir_model), + repo_id=hf_repo_id, allow_patterns=["LICENSE", "*.json", "*.md", "*.txt", "tokenizer.model"]) dir_model = Path(local_dir) logger.info(f"Downloaded config and tokenizer to {local_dir}") + else: + hf_repo_id = None + dir_model = Path(args.model) if not dir_model.is_dir(): - logger.error(f'Error: {args.model} is not a directory') + logger.error(f'Error: {dir_model} is not a directory') sys.exit(1) ftype_map: dict[str, gguf.LlamaFileType] = { @@ -6486,9 +6624,9 @@ def main() -> None: if args.outfile is not None: fname_out = args.outfile - elif args.remote: + elif hf_repo_id: # if remote, use the model ID as the output file name - fname_out = Path("./" + str(args.model).replace("/", "-") + "-{ftype}.gguf") + fname_out = Path("./" + hf_repo_id.replace("/", "-") + "-{ftype}.gguf") else: fname_out = dir_model @@ -6517,7 +6655,7 @@ def main() -> None: split_max_tensors=args.split_max_tensors, split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run, small_first_shard=args.no_tensor_first_split, - remote_hf_model_id=str(args.model) if args.remote else None) + remote_hf_model_id=hf_repo_id) if args.vocab_only: logger.info("Exporting model vocab...") diff --git a/docs/backend/CANN.md b/docs/backend/CANN.md index a5ba617ca7bab..2b001f09abe45 100755 --- a/docs/backend/CANN.md +++ b/docs/backend/CANN.md @@ -8,6 +8,7 @@ - [DataType Supports](#datatype-supports) - [Docker](#docker) - [Linux](#linux) + - [Environment variable setup](#environment-variable-setup) - [TODO](#todo) @@ -290,5 +291,24 @@ Authors from Peking University: Bizhao Shi (bshi@pku.edu.cn), Yuxin Yang (yxyang We would like to thank Tuo Dai, Shanni Li, and all of the project maintainers from Huawei Technologies Co., Ltd for their help during the code development and pull request. +## Environment variable setup + +### GGML_CANN_ASYNC_MODE + +Enables asynchronous operator submission. Disabled by default. + +### GGML_CANN_MEM_POOL + +Specifies the memory pool management strategy: + +- vmm: Utilizes a virtual memory manager pool. If hardware support for VMM is unavailable, falls back to the legacy (leg) memory pool. 
+
+- prio: Employs priority-queue-based memory pool management.
+- leg: Uses a fixed-size buffer pool.
+
+### GGML_CANN_DISABLE_BUF_POOL_CLEAN
+
+Disables automatic cleanup of the memory pool when set. This option is only effective when using the prio or leg memory pool strategies.
+
 ## TODO
 - Support more models and data types.
diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md
index 249e73451e66b..6e9b88935da97 100644
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -757,7 +757,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 | Name | Value | Function |
 |-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
 | GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG |
-| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features based on Intel GPU type, to compare the performance increase |
+| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimized features for Intel GPUs (recommended: set to 1 for Intel devices older than Gen 10) |
 | GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because graph performance isn't yet better than non-graph performance. |
 | GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
 | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support querying the free memory of the GPU via sycl::aspect::ext_intel_free_memory. Recommended when using --split-mode = layer |
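As a concrete illustration of the table above, here is a minimal sketch of launching an inference run with these variables set; the binary path, model path, and arguments are placeholder assumptions, not files shipped with this change:

```python
# Minimal sketch: run llama-cli with SYCL tuning variables set.
# The binary path, model path, and prompt below are placeholder assumptions.
import os
import subprocess

env = os.environ.copy()
env["GGML_SYCL_DISABLE_OPT"] = "1"  # per the table: recommended for Intel devices older than Gen 10
env["ZES_ENABLE_SYSMAN"] = "1"      # allow free-memory queries; useful with --split-mode layer

subprocess.run(
    ["./build/bin/llama-cli", "-m", "model.gguf", "--split-mode", "layer", "-p", "Hello"],
    env=env,
    check=True,
)
```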
diff --git a/docs/build-s390x.md b/docs/build-s390x.md
new file mode 100644
index 0000000000000..4c9ebb271cee2
--- /dev/null
+++ b/docs/build-s390x.md
@@ -0,0 +1,246 @@
+> [!IMPORTANT]
+> This build documentation is specific only to IBM Z & LinuxONE mainframes (s390x). You can find the build documentation for other architectures in [build.md](build.md).
+
+# Build llama.cpp locally (for s390x)
+
+The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](../include/llama.h).
+
+The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server.
+
+**To get the code:**
+
+```bash
+git clone https://github.com/ggml-org/llama.cpp
+cd llama.cpp
+```
+
+## CPU Build with BLAS
+
+Building llama.cpp with BLAS support is highly recommended, as it has been shown to provide performance improvements. Make sure to have OpenBLAS installed in your environment.
+
+```bash
+cmake -S . -B build \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DGGML_BLAS=ON \
+    -DGGML_BLAS_VENDOR=OpenBLAS
+
+cmake --build build --config Release -j $(nproc)
+```
+
+**Notes**:
+
+- For faster repeated compilation, install [ccache](https://ccache.dev/)
+- By default, VXE/VXE2 is enabled. To disable it (not recommended):
+
+  ```bash
+  cmake -S . -B build \
+      -DCMAKE_BUILD_TYPE=Release \
+      -DGGML_BLAS=ON \
+      -DGGML_BLAS_VENDOR=OpenBLAS \
+      -DGGML_VXE=OFF
+
+  cmake --build build --config Release -j $(nproc)
+  ```
+
+- By default, NNPA is enabled when available. To disable it (not recommended):
+
+  ```bash
+  cmake -S . -B build \
+      -DCMAKE_BUILD_TYPE=Release \
+      -DGGML_BLAS=ON \
+      -DGGML_BLAS_VENDOR=OpenBLAS \
+      -DGGML_NNPA=OFF
+
+  cmake --build build --config Release -j $(nproc)
+  ```
+
+- For debug builds:
+
+  ```bash
+  cmake -S . -B build \
+      -DCMAKE_BUILD_TYPE=Debug \
+      -DGGML_BLAS=ON \
+      -DGGML_BLAS_VENDOR=OpenBLAS
+  cmake --build build --config Debug -j $(nproc)
+  ```
+
+- For static builds, add `-DBUILD_SHARED_LIBS=OFF`:
+
+  ```bash
+  cmake -S . -B build \
+      -DCMAKE_BUILD_TYPE=Release \
+      -DGGML_BLAS=ON \
+      -DGGML_BLAS_VENDOR=OpenBLAS \
+      -DBUILD_SHARED_LIBS=OFF
+
+  cmake --build build --config Release -j $(nproc)
+  ```
+
+## Getting GGUF Models
+
+All models need to be converted to Big-Endian. You can achieve this in one of three ways:
+
+1. **Use pre-converted models verified for use on IBM Z & LinuxONE (easiest)**
+
+   ![File Type - gguf](https://img.shields.io/badge/File_Type-gguf-fff)
+
+   You can find popular models pre-converted and verified at [s390x Ready Models](https://huggingface.co/collections/taronaeo/s390x-ready-models-672765393af438d0ccb72a08).
+
+   These models have already been converted from `safetensors` to `GGUF Big-Endian` and their respective tokenizers verified to run correctly on IBM z15 and later systems.
+
+2. **Convert safetensors model to GGUF Big-Endian directly (recommended)**
+
+   ![File Type - safetensors](https://img.shields.io/badge/File_Type-safetensors-da1e28)
+
+   The model you are trying to convert must be in `safetensors` file format (for example [IBM Granite 3.3 2B](https://huggingface.co/ibm-granite/granite-3.3-2b-instruct)). Make sure you have downloaded the model repository for this case.
+
+   ```bash
+   python3 convert_hf_to_gguf.py \
+       --outfile model-name-be.f16.gguf \
+       --outtype f16 \
+       --bigendian \
+       model-directory/
+   ```
+
+   For example,
+
+   ```bash
+   python3 convert_hf_to_gguf.py \
+       --outfile granite-3.3-2b-instruct-be.f16.gguf \
+       --outtype f16 \
+       --bigendian \
+       granite-3.3-2b-instruct/
+   ```
+
+3. **Convert existing GGUF Little-Endian model to Big-Endian**
+
+   ![File Type - gguf](https://img.shields.io/badge/File_Type-gguf-fff)
+
+   The model you are trying to convert must be in `gguf` file format (for example [IBM Granite 3.3 2B](https://huggingface.co/ibm-granite/granite-3.3-2b-instruct-GGUF)). Make sure you have downloaded the model file for this case.
+
+   ```bash
+   python3 gguf-py/gguf/scripts/gguf_convert_endian.py model-name.f16.gguf BIG
+   ```
+
+   For example,
+
+   ```bash
+   python3 gguf-py/gguf/scripts/gguf_convert_endian.py granite-3.3-2b-instruct-le.f16.gguf BIG
+   mv granite-3.3-2b-instruct-le.f16.gguf granite-3.3-2b-instruct-be.f16.gguf
+   ```
+
+   **Notes:**
+
+   - The GGUF endian conversion script may not support all data types at the moment and may fail for some models/quantizations. When that happens, please try manually converting the safetensors model to GGUF Big-Endian via Step 2.
+
+## IBM Accelerators
+
+### 1. SIMD Acceleration
+
+Only available on IBM z15 or later systems, with the `-DGGML_VXE=ON` (turned on by default) compile flag. No hardware acceleration is possible with llama.cpp on older systems such as IBM z14/arch12; on such systems, the APIs still run but fall back to a scalar implementation.
+
+### 2. NNPA Vector Intrinsics Acceleration
+
+Only available on IBM z16 or later systems, with the `-DGGML_NNPA=ON` (turned on when available) compile flag. No hardware acceleration is possible with llama.cpp on older systems such as IBM z15/arch13; on such systems, the APIs still run but fall back to a scalar implementation.
+
+### 3. zDNN Accelerator
+
+_Only available on IBM z16 or later systems. No direction at the moment._
+
+### 4. Spyre Accelerator
+
+_No direction at the moment._
+
+## Performance Tuning
+
+### 1. Virtualization Setup
+
+It is strongly recommended to use only LPAR (Type-1) virtualization to get the most performance.
+
+Note: Type-2 virtualization is not supported at the moment; while you can get it running, the performance will not be the best.
+
+### 2. IFL (Core) Count
+
+It is recommended to allocate a minimum of 8 shared IFLs to the LPAR. Increasing the IFL count past 8 shared IFLs improves only Prompt Processing performance, not Token Generation.
+
+Note: IFL count does not equate to vCPU count.
+
+### 3. SMT vs NOSMT (Simultaneous Multithreading)
+
+It is strongly recommended to disable SMT via the kernel boot parameters, as it negatively affects performance. Please refer to your Linux distribution's guide on disabling SMT via kernel boot parameters.
+
+### 4. BLAS vs NOBLAS
+
+IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongly recommended to use BLAS.
+
+## Frequently Asked Questions (FAQ)
+
+1. I'm getting the following error message while trying to load a model: `gguf_init_from_file_impl: failed to load model: this GGUF file version 50331648 is extremely large, is there a mismatch between the host and model endianness?`
+
+   Answer: Please ensure that the model you have downloaded/converted is GGUFv3 Big-Endian. These models are usually denoted with the `-be` suffix, i.e., `granite-3.3-2b-instruct-be.F16.gguf`.
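As a quick check (an illustrative sketch, not a script shipped in this patch), the GGUF header stores the magic bytes followed by a 4-byte version, so the `50331648` in the message above is simply version 3 read with swapped bytes (3 << 24 = 50331648):

```python
# Illustrative sketch: report the byte order of a GGUF file from its header.
# Assumes the magic stays b"GGUF" and only numeric fields are byte-swapped.
import struct
import sys

def gguf_byteorder(path: str) -> str:
    with open(path, "rb") as f:
        magic = f.read(4)
        if magic != b"GGUF":
            raise ValueError(f"{path} is not a GGUF file (magic={magic!r})")
        (version,) = struct.unpack("<I", f.read(4))
    if version < 0xFFFF:
        return f"little-endian, version {version}"
    swapped = struct.unpack(">I", struct.pack("<I", version))[0]
    if swapped < 0xFFFF:
        return f"big-endian, version {swapped}"  # e.g. 50331648 -> 3
    return "unknown"

if __name__ == "__main__":
    print(gguf_byteorder(sys.argv[1]))
```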
+
+   You may refer to the [Getting GGUF Models](#getting-gguf-models) section to manually convert a `safetensors` model to `GGUF` Big-Endian.
+
+2. I'm getting extremely poor performance when running inference on a model.
+
+   Answer: Please refer to the [Appendix B: SIMD Support Matrix](#appendix-b-simd-support-matrix) to check if your model quantization is supported by SIMD acceleration.
+
+3. I'm building on IBM z17 and getting the following error message: `invalid switch -march=z17`
+
+   Answer: Please ensure that your GCC compiler is at least version 15.1.0 and that `binutils` is updated to the latest version. If this does not fix the problem, kindly open an issue.
+
+## Getting Help on IBM Z & LinuxONE
+
+1. **Bugs, Feature Requests**
+
+   Please file an issue in llama.cpp and ensure that the title contains "s390x".
+
+2. **Other Questions**
+
+   Please reach out directly to [aionz@us.ibm.com](mailto:aionz@us.ibm.com).
+
+## Appendix A: Hardware Support Matrix
+
+|         | Support | Minimum Compiler Version |
+| ------- | ------- | ------------------------ |
+| IBM z15 | ✅ |  |
+| IBM z16 | ✅ |  |
+| IBM z17 | ✅ | GCC 15.1.0 |
+
+- ✅ - supported and verified to run as intended
+- 🚫 - unsupported, we are unlikely to be able to provide support
+
+## Appendix B: SIMD Support Matrix
+
+|            | VX/VXE/VXE2 | NNPA | zDNN | Spyre |
+| ---------- | ----------- | ---- | ---- | ----- |
+| FP32       | ✅ | ✅ | ❓ | ❓ |
+| FP16       | ✅ | ✅ | ❓ | ❓ |
+| BF16       | 🚫 | 🚫 | ❓ | ❓ |
+| Q4_0       | ✅ | ✅ | ❓ | ❓ |
+| Q4_1       | ✅ | ✅ | ❓ | ❓ |
+| Q5_0       | 🚫 | 🚫 | ❓ | ❓ |
+| Q5_1       | 🚫 | 🚫 | ❓ | ❓ |
+| Q8_0       | ✅ | ✅ | ❓ | ❓ |
+| Q2_K       | 🚫 | 🚫 | ❓ | ❓ |
+| Q3_K       | ✅ | ✅ | ❓ | ❓ |
+| Q4_K       | ✅ | ✅ | ❓ | ❓ |
+| Q5_K       | ✅ | ✅ | ❓ | ❓ |
+| Q6_K       | ✅ | ✅ | ❓ | ❓ |
+| TQ1_0      | 🚫 | 🚫 | ❓ | ❓ |
+| TQ2_0      | 🚫 | 🚫 | ❓ | ❓ |
+| IQ2_XXS    | 🚫 | 🚫 | ❓ | ❓ |
+| IQ2_XS     | 🚫 | 🚫 | ❓ | ❓ |
+| IQ2_S      | 🚫 | 🚫 | ❓ | ❓ |
+| IQ3_XXS    | 🚫 | 🚫 | ❓ | ❓ |
+| IQ3_S      | 🚫 | 🚫 | ❓ | ❓ |
+| IQ1_S      | 🚫 | 🚫 | ❓ | ❓ |
+| IQ1_M      | 🚫 | 🚫 | ❓ | ❓ |
+| IQ4_NL     | ✅ | ✅ | ❓ | ❓ |
+| IQ4_XS     | ✅ | ✅ | ❓ | ❓ |
+| FP32->FP16 | 🚫 | ✅ | ❓ | ❓ |
+| FP16->FP32 | 🚫 | ✅ | ❓ | ❓ |
+
+- ✅ - acceleration available
+- 🚫 - acceleration unavailable, will still run using scalar implementation
+- ❓ - acceleration unknown, please contribute if you can test it yourself
diff --git a/docs/build.md b/docs/build.md
index 32717a793ffad..2e0b5d970c91a 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -1,5 +1,9 @@
 # Build llama.cpp locally
 
+The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](../include/llama.h).
+
+The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server.
+
 **To get the Code:**
 
 ```bash
@@ -553,6 +557,10 @@ ninja
 
 To read documentation for how to build on Android, [click here](./android.md)
 
+## IBM Z & LinuxONE
+
+To read documentation for how to build on IBM Z & LinuxONE, [click here](./build-s390x.md)
+
 ## Notes about GPU-accelerated backends
 
 The GPU may still be used to accelerate some parts of the computation even when using the `-ngl 0` option. You can fully disable GPU acceleration by using `--device none`.
diff --git a/docs/function-calling.md b/docs/function-calling.md
index fd3db9bd16a92..37eacaf3100c1 100644
--- a/docs/function-calling.md
+++ b/docs/function-calling.md
@@ -11,7 +11,7 @@ Function calling is supported for all models (see https://github.com/ggml-org/ll
   - Llama 3.1 / 3.3 (including builtin tools support - tool names for `wolfram_alpha`, `web_search` / `brave_search`, `code_interpreter`), Llama 3.2
   - Functionary v3.1 / v3.2
   - Hermes 2/3, Qwen 2.5
-  - Qwen 2.5 Coder (WIP: https://github.com/ggml-org/llama.cpp/pull/12034)
+  - Qwen 2.5 Coder
   - Mistral Nemo
   - Firefunction v2
   - Command R7B
diff --git a/docs/install.md b/docs/install.md
index 4971c18281cc9..7200bf9b7b91d 100644
--- a/docs/install.md
+++ b/docs/install.md
@@ -1,28 +1,42 @@
 # Install pre-built version of llama.cpp
 
-## Homebrew
+| Install via | Windows | Mac | Linux |
+|-------------|---------|-----|-------|
+| Winget | ✅ | | |
+| Homebrew | | ✅ | ✅ |
+| MacPorts | | ✅ | |
+| Nix | | ✅ | ✅ |
 
-On Mac and Linux, the homebrew package manager can be used via
+## Winget (Windows)
+
+```sh
+winget install llama.cpp
+```
+
+The package is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggml-org/llama.cpp/issues/8188
+
+## Homebrew (Mac and Linux)
 
 ```sh
 brew install llama.cpp
 ```
+
 The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggml-org/llama.cpp/discussions/7668
 
-## MacPorts
+## MacPorts (Mac)
 
 ```sh
 sudo port install llama.cpp
 ```
-see also: https://ports.macports.org/port/llama.cpp/details/
 
-## Nix
+See also: https://ports.macports.org/port/llama.cpp/details/
 
-On Mac and Linux, the Nix package manager can be used via
+## Nix (Mac and Linux)
 
 ```sh
 nix profile install nixpkgs#llama-cpp
 ```
+
 For flake enabled installs.
 
 Or
 
@@ -34,13 +48,3 @@ nix-env --file '<nixpkgs>' --install --attr llama-cpp
 
 For non-flake enabled installs.
 
 This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/llama-cpp/package.nix#L164).
-
-## Flox
-
-On Mac and Linux, Flox can be used to install llama.cpp within a Flox environment via
-
-```sh
-flox install llama-cpp
-```
-
-Flox follows the nixpkgs build of llama.cpp.
diff --git a/docs/multimodal.md b/docs/multimodal.md index e849c2a0b8ba1..edbd081df7969 100644 --- a/docs/multimodal.md +++ b/docs/multimodal.md @@ -107,3 +107,7 @@ NOTE: some models may require large context window, for example: `-c 8192` (tool_name) -hf ggml-org/Qwen2.5-Omni-3B-GGUF (tool_name) -hf ggml-org/Qwen2.5-Omni-7B-GGUF ``` + +## Finding more models: + +GGUF models on Huggingface with vision capabilities can be found here: https://huggingface.co/models?pipeline_tag=image-text-to-text&sort=trending&search=gguf diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift index 514989e340e2c..fd90bbec5f751 100644 --- a/examples/batched.swift/Sources/main.swift +++ b/examples/batched.swift/Sources/main.swift @@ -116,7 +116,7 @@ if llama_decode(context, batch) != 0 { } for i in 1 ..< n_parallel { - llama_kv_self_seq_cp(context, 0, Int32(i), 0, batch.n_tokens) + llama_memory_seq_cp(llama_get_memory(context), 0, Int32(i), 0, batch.n_tokens) } if n_parallel > 1 { diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 71f700877a3b9..0ec2999a0c8e9 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -37,7 +37,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); // clear previous kv_cache values (irrelevant for embeddings) - llama_kv_self_clear(ctx); + llama_memory_clear(llama_get_memory(ctx), true); // run model LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); @@ -133,10 +133,36 @@ int main(int argc, char ** argv) { // max batch size const uint64_t n_batch = params.n_batch; + // get added sep and eos token, if any + const std::string added_sep_token = llama_vocab_get_add_sep(vocab) ? llama_vocab_get_text(vocab, llama_vocab_sep(vocab)) : ""; + const std::string added_eos_token = llama_vocab_get_add_eos(vocab) ? 
llama_vocab_get_text(vocab, llama_vocab_eos(vocab)) : "";
+
     // tokenize the prompts and trim
     std::vector<std::vector<int32_t>> inputs;
     for (const auto & prompt : prompts) {
-        auto inp = common_tokenize(ctx, prompt, true, true);
+        std::vector<int32_t> inp;
+
+        // split classification pairs and insert expected separator tokens
+        if (pooling_type == LLAMA_POOLING_TYPE_RANK && prompt.find(params.cls_sep) != std::string::npos) {
+            std::vector<std::string> pairs = split_lines(prompt, params.cls_sep);
+            std::string final_prompt;
+
+            for (size_t i = 0; i < pairs.size(); i++) {
+                final_prompt += pairs[i];
+                if (i != pairs.size() - 1) {
+                    if (!added_eos_token.empty()) {
+                        final_prompt += added_eos_token;
+                    }
+                    if (!added_sep_token.empty()) {
+                        final_prompt += added_sep_token;
+                    }
+                }
+            }
+
+            inp = common_tokenize(ctx, final_prompt, true, true);
+        } else {
+            inp = common_tokenize(ctx, prompt, true, true);
+        }
 
         if (inp.size() > n_batch) {
             LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
                     __func__, (long long int) inp.size(), (long long int) n_batch);
@@ -145,11 +171,11 @@
         inputs.push_back(inp);
     }
 
-    // check if the last token is SEP
+    // check if the last token is SEP/EOS
     // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
     for (auto & inp : inputs) {
-        if (inp.empty() || inp.back() != llama_vocab_sep(vocab)) {
-            LOG_WRN("%s: last token in the prompt is not SEP\n", __func__);
+        if (inp.empty() || (inp.back() != llama_vocab_sep(vocab) && inp.back() != llama_vocab_eos(vocab))) {
+            LOG_WRN("%s: last token in the prompt is not SEP or EOS\n", __func__);
             LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
         }
     }
@@ -236,9 +262,24 @@ int main(int argc, char ** argv) {
                 LOG("\n");
             }
         }
     } else if (pooling_type == LLAMA_POOLING_TYPE_RANK) {
+        const uint32_t n_cls_out = llama_model_n_cls_out(model);
+        std::vector<std::string> cls_out_labels;
+
+        for (uint32_t i = 0; i < n_cls_out; i++) {
+            const char * label = llama_model_cls_label(model, i);
+            const std::string label_i(label == nullptr ? "" : label);
+            cls_out_labels.emplace_back(label_i.empty() ?
std::to_string(i) : label_i); + } + for (int j = 0; j < n_embd_count; j++) { - // NOTE: if you change this log - update the tests in ci/run.sh - LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]); + for (uint32_t i = 0; i < n_cls_out; i++) { + // NOTE: if you change this log - update the tests in ci/run.sh + if (n_cls_out == 1) { + LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]); + } else { + LOG("rerank score %d: %8.3f [%s]\n", j, emb[j * n_embd + i], cls_out_labels[i].c_str()); + } + } } } else { // print the first part of the embeddings or for a single prompt, the full embedding diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index 539bc4d6027fb..bdab052c3390f 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -41,12 +41,11 @@ static std::vector> encode(llama_context * ctx, const std::ve // add input to batch (this increments n_tokens) for (int32_t j = 0; j < n_toks; j++) { - common_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst); + common_batch_add(batch, inputs[j], j, { 0 }, true); } // clear previous kv_cache values (irrelevant for embeddings) - llama_kv_self_clear(ctx); - llama_set_embeddings(ctx, true); + llama_memory_clear(llama_get_memory(ctx), true); llama_set_causal_attn(ctx, false); // run model @@ -102,8 +101,7 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std llama_token eos_token = llama_vocab_eos(vocab); - llama_kv_self_clear(ctx); - llama_set_embeddings(ctx, false); + llama_memory_clear(llama_get_memory(ctx), true); llama_set_causal_attn(ctx, true); llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1); @@ -166,6 +164,8 @@ int main(int argc, char * argv[]) { llama_model_params mparams = common_model_params_to_llama(params); llama_context_params cparams = common_context_params_to_llama(params); + cparams.embeddings = true; + llama_backend_init(); llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams); @@ -213,6 +213,8 @@ int main(int argc, char * argv[]) { std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[1].c_str(), documents[1].c_str(), cosine_sim_q1_d1); } + llama_set_embeddings(ctx, false); + // ### Generation ### // GritLM models are not finetuned with system prompts, as you can just include system-like instructions together with your user instruction { diff --git a/examples/llama.android/llama/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp index 9654cd53cf8d5..711ddc5d19587 100644 --- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp +++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp @@ -194,7 +194,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model( } batch->logits[batch->n_tokens - 1] = true; - llama_kv_self_clear(context); + llama_memory_clear(llama_get_memory(context), false); const auto t_pp_start = ggml_time_us(); if (llama_decode(context, *batch) != 0) { @@ -206,7 +206,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model( LOGi("Benchmark text generation (tg)"); - llama_kv_self_clear(context); + llama_memory_clear(llama_get_memory(context), false); const auto t_tg_start = ggml_time_us(); for (i = 0; i < tg; i++) { @@ -223,7 +223,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model( const auto t_tg_end = ggml_time_us(); - llama_kv_self_clear(context); + llama_memory_clear(llama_get_memory(context), false); const auto t_pp = double(t_pp_end - t_pp_start) / 1000000.0; const auto t_tg = double(t_tg_end - t_tg_start) / 1000000.0; @@ 
-448,5 +448,5 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
 extern "C"
 JNIEXPORT void JNICALL
 Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
-    llama_kv_self_clear(reinterpret_cast<llama_context *>(context));
+    llama_memory_clear(llama_get_memory(reinterpret_cast<llama_context *>(context)), true);
 }
diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
index f6e31abc93c09..dc2bafc88b175 100644
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -210,7 +210,7 @@ actor LlamaContext {
         }
         batch.logits[Int(batch.n_tokens) - 1] = 1 // true
 
-        llama_kv_self_clear(context)
+        llama_memory_clear(llama_get_memory(context), false)
 
         let t_pp_start = DispatchTime.now().uptimeNanoseconds / 1000;
 
@@ -223,7 +223,7 @@ actor LlamaContext {
 
         // bench text generation
 
-        llama_kv_self_clear(context)
+        llama_memory_clear(llama_get_memory(context), false)
 
         let t_tg_start = DispatchTime.now().uptimeNanoseconds / 1000;
 
@@ -242,7 +242,7 @@ actor LlamaContext {
 
         let t_tg_end = DispatchTime.now().uptimeNanoseconds / 1000;
 
-        llama_kv_self_clear(context)
+        llama_memory_clear(llama_get_memory(context), false)
 
         let t_pp = Double(t_pp_end - t_pp_start) / 1000000.0
         let t_tg = Double(t_tg_end - t_tg_start) / 1000000.0
 
@@ -292,7 +292,7 @@ actor LlamaContext {
     func clear() {
         tokens_list.removeAll()
        temporary_invalid_cchars.removeAll()
-        llama_kv_self_clear(context)
+        llama_memory_clear(llama_get_memory(context), true)
     }
 
     private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp
index 5f8620973f40e..1e26d8221b86b 100644
--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@@ -60,6 +60,8 @@ int main(int argc, char ** argv) {
     llama_model * model = llama_init.model.get();
     llama_context * ctx = llama_init.context.get();
 
+    auto * mem = llama_get_memory(ctx);
+
     const llama_vocab * vocab = llama_model_get_vocab(model);
 
     // Tokenize the prompt
@@ -94,7 +96,7 @@
     llama_decode(ctx, llama_batch_get_one(&inp.back(), 1));
 
     for (int s = 1; s < W + G + 1; ++s) {
-        llama_kv_self_seq_cp(ctx, 0, s, -1, -1);
+        llama_memory_seq_cp(mem, 0, s, -1, -1);
     }
 
     const auto t_enc_end = ggml_time_us();
 
@@ -427,17 +429,17 @@
             // KV cache management
             // if no verification token matched, we simply remove all cells from this batch -> no fragmentation
-            llama_kv_self_seq_rm(ctx, -1, n_past, -1);
+            llama_memory_seq_rm(mem, -1, n_past, -1);
 
             if (seq_id_best != 0) {
                 // if a verification token matched, we keep the best sequence and remove the rest
                 // this leads to some KV cache fragmentation
-                llama_kv_self_seq_keep(ctx, seq_id_best);
-                llama_kv_self_seq_cp (ctx, seq_id_best, 0, -1, -1);
-                llama_kv_self_seq_rm (ctx, seq_id_best, -1, -1);
+                llama_memory_seq_keep(mem, seq_id_best);
+                llama_memory_seq_cp (mem, seq_id_best, 0, -1, -1);
+                llama_memory_seq_rm (mem, seq_id_best, -1, -1);
 
                 for (int s = 1; s < W + G + 1; ++s) {
-                    llama_kv_self_seq_cp(ctx, 0, s, -1, -1);
+                    llama_memory_seq_cp(mem, 0, s, -1, -1);
                 }
             }
         }
diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp
index 2ee502939d554..2bfa26b55f0a6 100644
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -181,7 +181,7 @@ int main(int argc, char ** argv){
 
         // KV cache management
         // clean the cache of draft tokens that weren't accepted
-        llama_kv_self_seq_rm(ctx, 0,
n_past, -1); + llama_memory_seq_rm(llama_get_memory(ctx), 0, n_past, -1); common_batch_clear(batch_tgt); common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true); diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index d7b269df0dea2..d53e089a4cbc2 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -158,7 +158,7 @@ int main(int argc, char ** argv) { common_params params; params.n_predict = 128; - params.n_junk = 0; + params.n_junk = 1; if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) { return 1; @@ -182,7 +182,7 @@ int main(int argc, char ** argv) { const bool is_sp_shared = params.is_pp_shared; // extra text to insert in each client's prompt in order to make it larger - const int32_t n_junk = params.n_junk; + const int32_t n_junk = std::max(1, params.n_junk); // init llama.cpp llama_backend_init(); @@ -194,6 +194,8 @@ int main(int argc, char ** argv) { llama_model * model = llama_init.model.get(); llama_context * ctx = llama_init.context.get(); + auto * mem = llama_get_memory(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); // load the prompts from an external file if there are any @@ -259,7 +261,7 @@ int main(int argc, char ** argv) { // assign the system KV cache to all parallel sequences for (int32_t i = 1; i <= n_clients; ++i) { - llama_kv_self_seq_cp(ctx, 0, i, -1, -1); + llama_memory_seq_cp(mem, 0, i, -1, -1); } LOG_INF("\n"); @@ -286,9 +288,9 @@ int main(int argc, char ** argv) { if (batch.n_tokens == 0) { // all sequences have ended - clear the entire KV cache for (int i = 1; i <= n_clients; ++i) { - llama_kv_self_seq_rm(ctx, i, -1, -1); + llama_memory_seq_rm(mem, i, -1, -1); // but keep the system prompt - llama_kv_self_seq_cp(ctx, 0, i, -1, -1); + llama_memory_seq_cp(mem, 0, i, -1, -1); } LOG_INF("%s: clearing the KV cache\n", __func__); @@ -447,8 +449,8 @@ int main(int argc, char ** argv) { } // delete only the generated part of the sequence, i.e. 
keep the system prompt in the cache - llama_kv_self_seq_rm(ctx, client.id + 1, -1, -1); - llama_kv_self_seq_cp(ctx, 0, client.id + 1, -1, -1); + llama_memory_seq_rm(mem, client.id + 1, -1, -1); + llama_memory_seq_cp(mem, 0, client.id + 1, -1, -1); const auto t_main_end = ggml_time_us(); diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index 5ac881b45e268..8a4faa383bf32 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -126,6 +126,8 @@ int main(int argc, char ** argv) { int n_past = 0; + auto * mem = llama_get_memory(ctx); + // fill the KV cache for (int i = 0; i < n_ctx; i += n_batch) { if (i > 0 && n_grp > 1) { @@ -133,10 +135,10 @@ int main(int argc, char ** argv) { const int ib = i/n_batch - 1; const int bd = n_batch_grp*(n_grp - 1); - llama_kv_self_seq_add(ctx, 0, n_past - n_batch, n_past, ib*bd); - llama_kv_self_seq_div(ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); + llama_memory_seq_add(mem, 0, n_past - n_batch, n_past, ib*bd); + llama_memory_seq_div(mem, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); - n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1; + n_past = llama_memory_seq_pos_max(mem, 0) + 1; } common_batch_clear(batch); @@ -166,10 +168,10 @@ int main(int argc, char ** argv) { LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard); - llama_kv_self_seq_rm (ctx, 0, n_keep , n_keep + n_discard); - llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); + llama_memory_seq_rm (mem, 0, n_keep , n_keep + n_discard); + llama_memory_seq_add(mem, 0, n_keep + n_discard, n_ctx, -n_discard); - n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1; + n_past = llama_memory_seq_pos_max(mem, 0) + 1; common_batch_clear(batch); @@ -195,10 +197,10 @@ int main(int argc, char ** argv) { if (n_discard > 0) { LOG_INF("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard); - llama_kv_self_seq_rm (ctx, 0, n_keep , n_keep + n_discard); - llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); + llama_memory_seq_rm (mem, 0, n_keep , n_keep + n_discard); + llama_memory_seq_add(mem, 0, n_keep + n_discard, n_ctx, -n_discard); - n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1; + n_past = llama_memory_seq_pos_max(mem, 0) + 1; } } diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index 754da1411bcc1..042e12c2bf83a 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -83,7 +83,7 @@ static void batch_add_seq(llama_batch & batch, const std::vector & toke static void batch_process(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) { // clear previous kv_cache values (irrelevant for embeddings) - llama_kv_self_clear(ctx); + llama_memory_clear(llama_get_memory(ctx), false); // run model LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index 760ebbbf08788..db79588f1a5a4 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -196,7 +196,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy); // erase whole kv - llama_kv_self_clear(ctx3); + llama_memory_clear(llama_get_memory(ctx3), true); fprintf(stderr, "%s : kv cache cleared\n", __func__); // restore kv into seq 1 diff --git a/examples/simple-chat/simple-chat.cpp b/examples/simple-chat/simple-chat.cpp index 
6608d4bea05c8..cf1178043d8d1 100644 --- a/examples/simple-chat/simple-chat.cpp +++ b/examples/simple-chat/simple-chat.cpp @@ -98,7 +98,7 @@ int main(int argc, char ** argv) { auto generate = [&](const std::string & prompt) { std::string response; - const bool is_first = llama_kv_self_seq_pos_max(ctx, 0) == 0; + const bool is_first = llama_memory_seq_pos_max(llama_get_memory(ctx), 0) == -1; // tokenize the prompt const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true); @@ -113,7 +113,7 @@ int main(int argc, char ** argv) { while (true) { // check if we have enough space in the context to evaluate this batch int n_ctx = llama_n_ctx(ctx); - int n_ctx_used = llama_kv_self_seq_pos_max(ctx, 0); + int n_ctx_used = llama_memory_seq_pos_max(llama_get_memory(ctx), 0); if (n_ctx_used + batch.n_tokens > n_ctx) { printf("\033[0m\n"); fprintf(stderr, "context size exceeded\n"); diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp index 0783ed4a4c43e..99196c9d047e4 100644 --- a/examples/speculative-simple/speculative-simple.cpp +++ b/examples/speculative-simple/speculative-simple.cpp @@ -217,7 +217,7 @@ int main(int argc, char ** argv) { { LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past); - llama_kv_self_seq_rm(ctx_tgt, 0, n_past, -1); + llama_memory_seq_rm(llama_get_memory(ctx_tgt), 0, n_past, -1); } if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) { diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 561c308830351..0adffdb006bcf 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -142,6 +142,8 @@ int main(int argc, char ** argv) { } } + auto * mem_tgt = llama_get_memory(ctx_tgt); + auto * mem_dft = llama_get_memory(ctx_dft); // Tokenize the prompt std::vector inp; @@ -420,14 +422,14 @@ int main(int argc, char ** argv) { { LOG_DBG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft); - llama_kv_self_seq_keep(ctx_dft, s_keep); - llama_kv_self_seq_cp (ctx_dft, s_keep, 0, -1, -1); - llama_kv_self_seq_keep(ctx_dft, 0); + llama_memory_seq_keep(mem_dft, s_keep); + llama_memory_seq_cp (mem_dft, s_keep, 0, -1, -1); + llama_memory_seq_keep(mem_dft, 0); - llama_kv_self_seq_rm (ctx_tgt, s_keep, n_past_tgt, -1); - llama_kv_self_seq_keep(ctx_tgt, s_keep); - llama_kv_self_seq_cp (ctx_tgt, s_keep, 0, -1, -1); - llama_kv_self_seq_keep(ctx_tgt, 0); + llama_memory_seq_rm (mem_tgt, s_keep, n_past_tgt, -1); + llama_memory_seq_keep(mem_tgt, s_keep); + llama_memory_seq_cp (mem_tgt, s_keep, 0, -1, -1); + llama_memory_seq_keep(mem_tgt, 0); } for (int s = 0; s < n_seq_dft; ++s) { @@ -444,7 +446,7 @@ int main(int argc, char ** argv) { common_batch_clear(batch_dft); common_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true); - llama_kv_self_seq_rm(ctx_dft, 0, n_past_dft, -1); + llama_memory_seq_rm(mem_dft, 0, n_past_dft, -1); // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); llama_decode(ctx_dft, batch_dft); @@ -503,8 +505,8 @@ int main(int argc, char ** argv) { if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_draft_split) { LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur); - llama_kv_self_seq_rm(ctx_dft, n_seq_cur, -1, -1); - llama_kv_self_seq_cp(ctx_dft, s, n_seq_cur, -1, -1); + llama_memory_seq_rm(mem_dft, n_seq_cur, -1, -1); + llama_memory_seq_cp(mem_dft, s, n_seq_cur, -1, -1); // all previous tokens 
from this branch are now also part of the new branch for (int t = 0; t < batch_tgt.n_tokens; ++t) { @@ -585,9 +587,9 @@ int main(int argc, char ** argv) { // evaluate the target model on the drafted tokens { - llama_kv_self_seq_keep(ctx_tgt, 0); + llama_memory_seq_keep(mem_tgt, 0); for (int s = 1; s < n_seq_dft; ++s) { - llama_kv_self_seq_cp(ctx_tgt, 0, s, -1, -1); + llama_memory_seq_cp(mem_tgt, 0, s, -1, -1); } // LOG_DBG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str()); diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 3d01184a2ee6b..fdc76808ada6a 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -105,7 +105,7 @@ message(DEBUG "GGML_NATIVE_DEFAULT : ${GGML_NATIVE_DEFAULT}") message(DEBUG "INS_ENB : ${INS_ENB}") option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF) -option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON) +option(GGML_CPU_REPACK "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON) option(GGML_CPU_KLEIDIAI "ggml: use KleidiAI optimized kernels if applicable" OFF) option(GGML_SSE42 "ggml: enable SSE 4.2" ${INS_ENB}) option(GGML_AVX "ggml: enable AVX" ${INS_ENB}) @@ -131,13 +131,14 @@ option(GGML_RVV "ggml: enable rvv" ON) option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF) option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF) option(GGML_VXE "ggml: enable vxe" ON) +option(GGML_NNPA "ggml: enable nnpa" ON) option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF) set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM") set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC") -if (WIN32) +if (MINGW) set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows version") endif() @@ -172,6 +173,7 @@ option(GGML_HIP "ggml: use HIP" option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF) option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON) option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF) +option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF) option(GGML_VULKAN "ggml: use Vulkan" OFF) option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF) option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF) @@ -205,6 +207,7 @@ option(GGML_OPENCL_EMBED_KERNELS "ggml: embed kernels" option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adreno" ON) set (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING "gmml: OpenCL API version to target") +option(GGML_HEXAGON "ggml: use HEXAGON" OFF) # toolchain for vulkan-shaders-gen set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen") @@ -270,6 +273,7 @@ set(GGML_PUBLIC_HEADERS include/ggml-rpc.h include/ggml-sycl.h include/ggml-vulkan.h + include/ggml-hexagon.h include/gguf.h) set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") @@ -367,6 +371,8 @@ if (MSVC) /wd4005 # Macro redefinition /wd4244 # Conversion from one type to another type, possible loss of data /wd4267 # Conversion from 'size_t' to a smaller type, possible loss of data + /wd4305 # Conversion from 'type1' to 'type2', possible loss of data + /wd4566 # Conversion from 'char' to 'wchar_t', possible loss of data /wd4996 # Disable POSIX deprecation warnings /wd4702 # Unreachable code warnings ) @@ -386,4 +392,46 @@ if (MSVC) disable_msvc_warnings(ggml-cpu-skylakex) disable_msvc_warnings(ggml-cpu-icelake) disable_msvc_warnings(ggml-cpu-alderlake) + + 
if (GGML_BUILD_EXAMPLES)
+        disable_msvc_warnings(common-ggml)
+        disable_msvc_warnings(common)
+
+        disable_msvc_warnings(mnist-common)
+        disable_msvc_warnings(mnist-eval)
+        disable_msvc_warnings(mnist-train)
+
+        disable_msvc_warnings(gpt-2-ctx)
+        disable_msvc_warnings(gpt-2-alloc)
+        disable_msvc_warnings(gpt-2-backend)
+        disable_msvc_warnings(gpt-2-sched)
+        disable_msvc_warnings(gpt-2-quantize)
+        disable_msvc_warnings(gpt-2-batched)
+
+        disable_msvc_warnings(gpt-j)
+        disable_msvc_warnings(gpt-j-quantize)
+
+        disable_msvc_warnings(magika)
+        disable_msvc_warnings(yolov3-tiny)
+        disable_msvc_warnings(sam)
+
+        disable_msvc_warnings(simple-ctx)
+        disable_msvc_warnings(simple-backend)
+    endif()
+
+    if (GGML_BUILD_TESTS)
+        disable_msvc_warnings(test-mul-mat)
+        disable_msvc_warnings(test-arange)
+        disable_msvc_warnings(test-backend-ops)
+        disable_msvc_warnings(test-cont)
+        disable_msvc_warnings(test-conv-transpose)
+        disable_msvc_warnings(test-conv-transpose-1d)
+        disable_msvc_warnings(test-conv1d)
+        disable_msvc_warnings(test-conv2d)
+        disable_msvc_warnings(test-conv2d-dw)
+        disable_msvc_warnings(test-customop)
+        disable_msvc_warnings(test-dup)
+        disable_msvc_warnings(test-opt)
+        disable_msvc_warnings(test-pool)
+    endif ()
 endif()
diff --git a/ggml/cmake/common.cmake b/ggml/cmake/common.cmake
index bb1ec9b37a7f0..cb66388332040 100644
--- a/ggml/cmake/common.cmake
+++ b/ggml/cmake/common.cmake
@@ -36,8 +36,7 @@ function(ggml_get_system_arch)
             (NOT CMAKE_OSX_ARCHITECTURES AND
              NOT CMAKE_GENERATOR_PLATFORM_LWR AND
              CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64|amd64)$"))
         set(GGML_SYSTEM_ARCH "x86" PARENT_SCOPE)
-    elseif ("${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "ppc64le " OR
-            "${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ")
+    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc|power")
         set(GGML_SYSTEM_ARCH "PowerPC" PARENT_SCOPE)
     elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
         set(GGML_SYSTEM_ARCH "loongarch64" PARENT_SCOPE)
diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h
index de77a875ec533..e3b79d09bb66f 100644
--- a/ggml/include/ggml-cpu.h
+++ b/ggml/include/ggml-cpu.h
@@ -101,6 +101,7 @@ extern "C" {
     GGML_BACKEND_API int ggml_cpu_has_riscv_v     (void);
     GGML_BACKEND_API int ggml_cpu_has_vsx         (void);
     GGML_BACKEND_API int ggml_cpu_has_vxe         (void);
+    GGML_BACKEND_API int ggml_cpu_has_nnpa        (void);
     GGML_BACKEND_API int ggml_cpu_has_wasm_simd   (void);
     GGML_BACKEND_API int ggml_cpu_has_llamafile   (void);
diff --git a/ggml/include/ggml-hexagon.h b/ggml/include/ggml-hexagon.h
new file mode 100644
index 0000000000000..fe9d4d8e588ba
--- /dev/null
+++ b/ggml/include/ggml-hexagon.h
@@ -0,0 +1,51 @@
+ /*
+ * Copyright (c) 2024-2025 The ggml authors
+ */
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define GGML_HEXAGON_MAX_DEVICES 4
+#define GGML_HEXAGON_BACKEND_NAME "hexagon"
+
+enum HEXAGONBackend {
+    HEXAGON_BACKEND_QNNCPU = 0,
+    HEXAGON_BACKEND_QNNGPU = 1,
+    HEXAGON_BACKEND_QNNNPU = 2,
+    HEXAGON_BACKEND_CDSP   = 3,
+    HEXAGON_BACKEND_GGML   = 4, // "fake" HEXAGON backend, used to compare performance between the HEXAGON backend and the ggml backend
+};
+
+// 0: general approach through QNN: offload ggml ops to QNN (QNNCPU, QNNGPU, QNNNPU)
+// 1: special approach through QNN-SINGLEGRAPH: map an entire ggml cgraph to a single QNN graph
+// 2: general approach through Hexagon cDSP: offload ggml ops to the Hexagon cDSP directly
+enum hwaccel_approach_type {
+    HWACCEL_QNN            = 0,
+    HWACCEL_QNN_SINGLEGRAPH= 1,
+    HWACCEL_CDSP           = 2,
+};
+
+GGML_BACKEND_API ggml_backend_t
ggml_backend_hexagon_init(size_t dev_num, const char * qnn_lib_path); + +GGML_BACKEND_API bool ggml_backend_is_hexagon(ggml_backend_t backend); + +GGML_BACKEND_API int ggml_backend_hexagon_get_device_count(void); + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_hexagon_reg(void); + +GGML_BACKEND_API const char * ggml_backend_hexagon_get_devname(size_t dev_num); + +GGML_BACKEND_API void ggml_backend_hexagon_set_cfg(int new_hexagon_backend, int new_hwaccel_approach); + +GGML_BACKEND_API int ggml_backend_hexagon_get_mulmat_algotype(void); + +GGML_BACKEND_API void ggml_backend_hexagon_set_mulmat_algotype(int new_mulmat_algotype); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 2226aadcff893..9c4e24023b5ad 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -489,6 +489,7 @@ extern "C" { GGML_OP_UPSCALE, // nearest interpolate GGML_OP_PAD, GGML_OP_PAD_REFLECT_1D, + GGML_OP_ROLL, GGML_OP_ARANGE, GGML_OP_TIMESTEP_EMBEDDING, GGML_OP_ARGSORT, @@ -1801,6 +1802,17 @@ extern "C" { int p0, int p1); + // Move tensor elements by an offset given for each dimension. Elements that + // are shifted beyond the last position are wrapped around to the beginning. + GGML_API struct ggml_tensor * ggml_roll( + struct ggml_context * ctx, + struct ggml_tensor * a, + int shift0, + int shift1, + int shift2, + int shift3); + + // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151 // timesteps: [N,] // return: [N, dim] @@ -2095,9 +2107,6 @@ extern "C" { GGML_API struct ggml_tensor * ggml_graph_get_grad (const struct ggml_cgraph * cgraph, const struct ggml_tensor * node); GGML_API struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node); - GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname); - GGML_API struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval); - // print info and performance information for the graph GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph); diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 5681ecddba782..8af27a1f753a6 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -125,7 +125,6 @@ if (NOT MSVC) endif() if (MINGW) - # Target Windows 8 for PrefetchVirtualMemory add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER}) endif() @@ -196,6 +195,7 @@ add_library(ggml-base ../include/ggml-opt.h ../include/gguf.h ggml.c + ggml.cpp ggml-alloc.c ggml-backend.cpp ggml-opt.cpp @@ -212,6 +212,7 @@ endif() add_library(ggml ggml-backend-reg.cpp) +add_library(ggml::ggml ALIAS ggml) target_link_libraries(ggml PUBLIC ggml-base) @@ -226,6 +227,7 @@ function(ggml_add_backend_library backend) set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL) add_dependencies(ggml ${backend}) + install(TARGETS ${backend} LIBRARY DESTINATION ${CMAKE_INSTALL_BINDIR}) else() add_library(${backend} ${ARGN}) target_link_libraries(ggml PUBLIC ${backend}) @@ -268,17 +270,27 @@ endfunction() function(ggml_add_cpu_backend_variant tag_name) set(GGML_CPU_TAG_NAME ${tag_name}) # other: OPENMP LLAMAFILE CPU_HBM - foreach (feat NATIVE - SSE42 - AVX AVX2 BMI2 AVX_VNNI FMA F16C - AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 - AMX_TILE AMX_INT8 AMX_BF16) - set(GGML_${feat} OFF) - endforeach() - - foreach (feat ${ARGN}) - 
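The ggml_roll() operator added to ggml.h above wraps shifted elements around to the beginning, in the spirit of np.roll. A worked sketch of the semantics, assuming a 1-D tensor and the positive-shift convention the comment implies:

    // with a 1-D tensor holding [a, b, c, d]:
    struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    struct ggml_tensor * r = ggml_roll(ctx, t, 1, 0, 0, 0);  // shift0 = 1 -> [d, a, b, c]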
set(GGML_${feat} ON) - endforeach() + if (GGML_SYSTEM_ARCH STREQUAL "x86") + foreach (feat NATIVE + SSE42 + AVX AVX2 BMI2 AVX_VNNI FMA F16C + AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 + AMX_TILE AMX_INT8 AMX_BF16) + set(GGML_${feat} OFF) + endforeach() + + foreach (feat ${ARGN}) + set(GGML_${feat} ON) + endforeach() + elseif (GGML_SYSTEM_ARCH STREQUAL "ARM") + foreach (feat ${ARGN}) + set(GGML_INTERNAL_${feat} ON) + endforeach() + elseif (GGML_SYSTEM_ARCH STREQUAL "PowerPC") + foreach (feat ${ARGN}) + set(GGML_INTERNAL_${feat} ON) + endforeach() + endif() ggml_add_cpu_backend_variant_impl(${tag_name}) endfunction() @@ -288,6 +300,8 @@ ggml_add_backend(CPU) if (GGML_CPU_ALL_VARIANTS) if (NOT GGML_BACKEND_DL) message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL") + elseif (GGML_CPU_ARM_ARCH) + message(FATAL_ERROR "Cannot use both GGML_CPU_ARM_ARCH and GGML_CPU_ALL_VARIANTS") endif() if (GGML_SYSTEM_ARCH STREQUAL "x86") ggml_add_cpu_backend_variant(x64) @@ -301,8 +315,47 @@ if (GGML_CPU_ALL_VARIANTS) # MSVC doesn't support AMX ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8) endif() + elseif(GGML_SYSTEM_ARCH STREQUAL "ARM") + if (CMAKE_SYSTEM_NAME MATCHES "Linux") + # Many of these features are optional so we build versions with popular + # combinations and name the backends based on the version they were + # first released with + ggml_add_cpu_backend_variant(armv8.0_1) + ggml_add_cpu_backend_variant(armv8.2_1 DOTPROD) + ggml_add_cpu_backend_variant(armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC) + ggml_add_cpu_backend_variant(armv8.2_3 DOTPROD FP16_VECTOR_ARITHMETIC SVE) + ggml_add_cpu_backend_variant(armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8) + ggml_add_cpu_backend_variant(armv8.6_2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2) + ggml_add_cpu_backend_variant(armv9.2_1 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SME) + ggml_add_cpu_backend_variant(armv9.2_2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2 SME) + elseif (CMAKE_SYSTEM_NAME MATCHES "Android") + # Android-specific backends with SoC-compatible feature sets + ggml_add_cpu_backend_variant(android_armv8.0_1) + ggml_add_cpu_backend_variant(android_armv8.2_1 DOTPROD) + ggml_add_cpu_backend_variant(android_armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC) + ggml_add_cpu_backend_variant(android_armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC MATMUL_INT8) + elseif (APPLE) + ggml_add_cpu_backend_variant(apple_m1 DOTPROD) + ggml_add_cpu_backend_variant(apple_m2_m3 DOTPROD MATMUL_INT8) + ggml_add_cpu_backend_variant(apple_m4 DOTPROD MATMUL_INT8 NOSVE SME) + else() + message(FATAL_ERROR "Unsupported ARM target OS: ${CMAKE_SYSTEM_NAME}") + endif() + elseif (GGML_SYSTEM_ARCH STREQUAL "PowerPC") + if (CMAKE_SYSTEM_NAME MATCHES "Linux") + ggml_add_cpu_backend_variant(power0) + ggml_add_cpu_backend_variant(power7_1 POWER7) + ggml_add_cpu_backend_variant(power7_2 POWER7 VSX) + ggml_add_cpu_backend_variant(power8_1 POWER8) + ggml_add_cpu_backend_variant(power8_2 POWER8 VSX) + ggml_add_cpu_backend_variant(power9 POWER9 VSX) + ggml_add_cpu_backend_variant(power10 POWER10 VSX) + ggml_add_cpu_backend_variant(power11 POWER11 VSX) + else() + message(FATAL_ERROR "Unsupported PowerPC target OS: ${CMAKE_SYSTEM_NAME}") + endif() else() - message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported on ${GGML_SYSTEM_ARCH}") + message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported with ${GGML_SYSTEM_ARCH} on ${CMAKE_SYSTEM_NAME}") endif() elseif 
(GGML_CPU)
    ggml_add_cpu_backend_variant_impl("")
@@ -319,6 +372,7 @@ ggml_add_backend(RPC)
 ggml_add_backend(SYCL)
 ggml_add_backend(Vulkan)
 ggml_add_backend(OpenCL)
+ggml_add_backend(HEXAGON)

 foreach (target ggml-base ggml)
     target_include_directories(${target} PUBLIC $ $)
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 405d8e31514b5..0a39ef7945888 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -65,10 +65,17 @@
 #include "ggml-kompute.h"
 #endif

+#ifdef GGML_USE_HEXAGON
+#include "ggml-hexagon.h"
+#endif
+
 // disable C++17 deprecation warning for std::codecvt_utf8
 #if defined(__clang__)
 # pragma clang diagnostic push
 # pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#elif defined(__GNUC__)
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 #endif

 namespace fs = std::filesystem;
@@ -91,6 +98,8 @@ static std::string path_str(const fs::path & path) {

 #if defined(__clang__)
 # pragma clang diagnostic pop
+#elif defined(__GNUC__)
+# pragma GCC diagnostic pop
 #endif

 #ifdef _WIN32
@@ -187,6 +196,9 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_KOMPUTE
         register_backend(ggml_backend_kompute_reg());
 #endif
+#ifdef GGML_USE_HEXAGON
+        register_backend(ggml_backend_hexagon_reg());
+#endif
 #ifdef GGML_USE_CPU
         register_backend(ggml_backend_cpu_reg());
 #endif
@@ -577,6 +589,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
     ggml_backend_load_best("vulkan", silent, dir_path);
     ggml_backend_load_best("opencl", silent, dir_path);
     ggml_backend_load_best("musa", silent, dir_path);
+    ggml_backend_load_best("hexagon", silent, dir_path);
     ggml_backend_load_best("cpu", silent, dir_path);
     // check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend
     const char * backend_path = std::getenv("GGML_BACKEND_PATH");
diff --git a/ggml/src/ggml-blas/CMakeLists.txt b/ggml/src/ggml-blas/CMakeLists.txt
index 0bf3c05d93a89..76064c3fd1fe8 100644
--- a/ggml/src/ggml-blas/CMakeLists.txt
+++ b/ggml/src/ggml-blas/CMakeLists.txt
@@ -81,7 +81,7 @@ if (BLAS_FOUND)
     target_link_libraries     (ggml-blas PRIVATE ${BLAS_LIBRARIES})
     target_include_directories(ggml-blas PRIVATE ${BLAS_INCLUDE_DIRS})
 else()
-    message(ERROR "BLAS not found, please refer to "
-                  "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
-                  " to set correct GGML_BLAS_VENDOR")
+    message(FATAL_ERROR "BLAS not found, please refer to "
+                        "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
+                        " to set correct GGML_BLAS_VENDOR")
 endif()
diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h
index 7ef80a4793314..ba2cef0c25fb2 100755
--- a/ggml/src/ggml-cann/common.h
+++ b/ggml/src/ggml-cann/common.h
@@ -37,6 +37,7 @@
 #include
 #include
 #include
+#include <optional>

 #include "../include/ggml-cann.h"
 #include "../include/ggml.h"
@@ -103,6 +104,9 @@ const ggml_cann_device_info& ggml_cann_info();
 void ggml_cann_set_device(int32_t device);
 int32_t ggml_cann_get_device();

+std::optional<std::string> get_env(const std::string& name);
+bool parse_bool(const std::string& value);
+
 /**
  * @brief Abstract base class for memory pools used by CANN.
 */
@@ -354,7 +358,8 @@ struct ggml_backend_cann_context {
         : device(device), name("CANN" + std::to_string(device)), task_queue(1024, device) {
         ggml_cann_set_device(device);
         description = aclrtGetSocName();
-        async_mode = (getenv("GGML_CANN_ASYNC_MODE") != nullptr);
+
+        async_mode = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or(""));
         GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__,
                       device, async_mode ? "ON" : "OFF");
     }
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index c0ea26002196f..d1a0ad374d691 100755
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -31,6 +31,8 @@
 #include
 #include
 #include
+#include <optional>
+#include <unordered_set>

 #include "ggml-impl.h"
 #include "ggml-backend-impl.h"
@@ -93,6 +95,26 @@ int32_t ggml_cann_get_device() {
     return id;
 }

+/**
+ * @brief Get the value of the specified environment variable (name).
+ *        If the variable is set, returns its value lowercased as a std::string.
+ */
+std::optional<std::string> get_env(const std::string& name) {
+    const char* val = std::getenv(name.c_str());
+    if (!val) return std::nullopt;
+    std::string res = std::string(val);
+    std::transform(res.begin(), res.end(), res.begin(), ::tolower);
+    return res;
+}
+
+/**
+ * @brief Check whether an environment variable value represents a truthy setting.
+ */
+bool parse_bool(const std::string& value) {
+    std::unordered_set<std::string> valid_values = {"on", "1", "yes", "y", "enable", "true"};
+    return valid_values.find(value) != valid_values.end();
+}
+
 /**
  * @brief Initialize the CANN device information.
  *
@@ -214,7 +236,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
      * @param device The device ID to associate with this buffer pool.
      */
     explicit ggml_cann_pool_buf_prio(int device) : device(device) {
-        disable_clean = getenv("GGML_CANN_DISABLE_BUF_POOL_CLEAN") != nullptr;
+        disable_clean = parse_bool(get_env("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
     }
@@ -410,7 +432,7 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
      * @param device The device ID to associate with this buffer pool.
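The helpers defined above centralize boolean env-var handling: get_env() lowercases the raw value and parse_bool() accepts a fixed truthy set. A minimal sketch of the resulting behaviour:

    // with GGML_CANN_ASYNC_MODE=ON  : get_env() yields "on",  parse_bool() -> true
    // with GGML_CANN_ASYNC_MODE=off : get_env() yields "off", parse_bool() -> false
    // with the variable unset       : value_or("") kicks in,  parse_bool() -> false
    const bool async = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or(""));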
 */
    explicit ggml_cann_pool_buf(int device) : device(device) {
-        disable_clean = getenv("GGML_CANN_DISABLE_BUF_POOL_CLEAN") != nullptr;
+        disable_clean = parse_bool(get_env("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
     }
@@ -731,16 +753,18 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
  */
 std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
     int device) {
-    bool disable_vmm = (getenv("GGML_CANN_DISABLE_VMM_POOL") != nullptr);
-    if (!disable_vmm && ggml_cann_info().devices[device].vmm) {
-        GGML_LOG_INFO("%s: device %d use vmm pool\n", __func__, device);
-        return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
-    }
-    bool enable_buf_prio = (getenv("GGML_CANN_ENABLE_BUF_PRIO_POOL") != nullptr);
-    if (enable_buf_prio) {
+    std::string mem_pool_type = get_env("GGML_CANN_MEM_POOL").value_or("");
+
+    if (mem_pool_type == "prio") {
         GGML_LOG_INFO("%s: device %d use buffer pool with priority queue\n", __func__, device);
         return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf_prio(device));
     }
+
+    if (ggml_cann_info().devices[device].vmm && mem_pool_type != "leg") {
+        GGML_LOG_INFO("%s: device %d use vmm pool\n", __func__, device);
+        return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
+    }
+
     GGML_LOG_INFO("%s: device %d use buffer pool\n", __func__, device);
     return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf(device));
 }
diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h
index 086c822d73a89..fbb04426abe7e 100644
--- a/ggml/src/ggml-common.h
+++ b/ggml/src/ggml-common.h
@@ -1074,6 +1074,10 @@ GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512)
     0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
 GGML_TABLE_END()

+GGML_TABLE_BEGIN(int8_t, kvalues_iq4nl, 16)
+    -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
+GGML_TABLE_END()
+
 #define NGRID_IQ1S 2048
 #define IQ1S_DELTA 0.125f
 #define IQ1M_DELTA 0.125f
diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt
index b3237eeadd22b..671fad4d228d4 100644
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -1,3 +1,17 @@
+function(ggml_add_cpu_backend_features cpu_name arch)
+    # The feature detection code is compiled as a separate target so that
+    # it can be built without the architecture flags
+    # Since multiple variants of the CPU backend may be included in the same
+    # build, using set_source_files_properties() to set the arch flags is not possible
+    set(GGML_CPU_FEATS_NAME ${cpu_name}-feats)
+    add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/arch/${arch}/cpu-feats.cpp)
+    target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . ..
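    # Summary of the rewritten new_pool_for_device() above, which collapses the two old
    # on/off variables into a single GGML_CANN_MEM_POOL selector ("leg" is the literal
    # string the code checks, presumably short for the legacy buffer pool):
    #   prio          -> buffer pool with priority queue
    #   leg           -> plain buffer pool, even when VMM is available
    #   unset / other -> VMM pool if the device supports it, else plain buffer pool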
../include) + target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARGN}) + target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED) + set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_link_libraries(${cpu_name} PRIVATE ${GGML_CPU_FEATS_NAME}) +endfunction() + function(ggml_add_cpu_backend_variant_impl tag_name) if (tag_name) set(GGML_CPU_NAME ggml-cpu-${tag_name}) @@ -10,14 +24,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name) list (APPEND GGML_CPU_SOURCES ggml-cpu/ggml-cpu.c ggml-cpu/ggml-cpu.cpp - ggml-cpu/ggml-cpu-aarch64.cpp - ggml-cpu/ggml-cpu-aarch64.h - ggml-cpu/ggml-cpu-hbm.cpp - ggml-cpu/ggml-cpu-hbm.h - ggml-cpu/ggml-cpu-quants.c - ggml-cpu/ggml-cpu-quants.h - ggml-cpu/ggml-cpu-traits.cpp - ggml-cpu/ggml-cpu-traits.h + ggml-cpu/repack.cpp + ggml-cpu/repack.h + ggml-cpu/hbm.cpp + ggml-cpu/hbm.h + ggml-cpu/quants.c + ggml-cpu/quants.h + ggml-cpu/traits.cpp + ggml-cpu/traits.h ggml-cpu/amx/amx.cpp ggml-cpu/amx/amx.h ggml-cpu/amx/mmq.cpp @@ -84,6 +98,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name) if (GGML_SYSTEM_ARCH STREQUAL "ARM") message(STATUS "ARM detected") + list(APPEND GGML_CPU_SOURCES + ggml-cpu/arch/arm/quants.c + ggml-cpu/arch/arm/repack.cpp + ) + if (MSVC AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang") message(FATAL_ERROR "MSVC is not supported for ARM, use clang") else() @@ -138,6 +157,49 @@ function(ggml_add_cpu_backend_variant_impl tag_name) else() if (GGML_CPU_ARM_ARCH) list(APPEND ARCH_FLAGS -march=${GGML_CPU_ARM_ARCH}) + elseif(GGML_CPU_ALL_VARIANTS) + # Begin with the lowest baseline + set(ARM_MCPU "armv8-a") + set(ARCH_TAGS "") + set(ARCH_DEFINITIONS "") + + # When a feature is selected, bump the MCPU to the first + # version that supported it + if (GGML_INTERNAL_DOTPROD) + set(ARM_MCPU "armv8.2-a") + set(ARCH_TAGS "${ARCH_TAGS}+dotprod") + list(APPEND ARCH_DEFINITIONS GGML_USE_DOTPROD) + endif() + if (GGML_INTERNAL_FP16_VECTOR_ARITHMETIC) + set(ARM_MCPU "armv8.2-a") + set(ARCH_TAGS "${ARCH_TAGS}+fp16") + list(APPEND ARCH_DEFINITIONS GGML_USE_FP16_VECTOR_ARITHMETIC) + endif() + if (GGML_INTERNAL_SVE) + set(ARM_MCPU "armv8.2-a") + set(ARCH_TAGS "${ARCH_TAGS}+sve") + list(APPEND ARCH_DEFINITIONS GGML_USE_SVE) + endif() + if (GGML_INTERNAL_MATMUL_INT8) + set(ARM_MCPU "armv8.6-a") + set(ARCH_TAGS "${ARCH_TAGS}+i8mm") + list(APPEND ARCH_DEFINITIONS GGML_USE_MATMUL_INT8) + endif() + if (GGML_INTERNAL_SVE2) + set(ARM_MCPU "armv8.6-a") + set(ARCH_TAGS "${ARCH_TAGS}+sve2") + list(APPEND ARCH_DEFINITIONS GGML_USE_SVE2) + endif() + if (GGML_INTERNAL_NOSVE) + set(ARCH_TAGS "${ARCH_TAGS}+nosve") + endif() + if (GGML_INTERNAL_SME) + set(ARM_MCPU "armv9.2-a") + set(ARCH_TAGS "${ARCH_TAGS}+sme") + list(APPEND ARCH_DEFINITIONS GGML_USE_SME) + endif() + list(APPEND ARCH_FLAGS "-march=${ARM_MCPU}${ARCH_TAGS}") + ggml_add_cpu_backend_features(${GGML_CPU_NAME} arm ${ARCH_DEFINITIONS}) endif() endif() @@ -167,6 +229,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name) endif() elseif (GGML_SYSTEM_ARCH STREQUAL "x86") message(STATUS "x86 detected") + list(APPEND GGML_CPU_SOURCES + ggml-cpu/arch/x86/quants.c + ggml-cpu/arch/x86/repack.cpp + ) + if (MSVC) # instruction set detection for MSVC only if (GGML_NATIVE) @@ -296,21 +363,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name) # the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using 
GGML_CPU_ALL_VARIANTS") endif() - - # The feature detection code is compiled as a separate target so that - # it can be built without the architecture flags - # Since multiple variants of the CPU backend may be included in the same - # build, using set_source_files_properties() to set the arch flags is not possible - set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats) - add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/cpu-feats-x86.cpp) - target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include) - target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARCH_DEFINITIONS}) - target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED) - set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON) - target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME}) + ggml_add_cpu_backend_features(${GGML_CPU_NAME} x86 ${ARCH_DEFINITIONS}) endif() elseif (GGML_SYSTEM_ARCH STREQUAL "PowerPC") message(STATUS "PowerPC detected") + list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/powerpc/quants.c) if (GGML_NATIVE) if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64") file(READ "/proc/cpuinfo" POWER10_M) @@ -318,7 +375,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name) execute_process(COMMAND bash -c "prtconf |grep 'Implementation' | head -n 1" OUTPUT_VARIABLE POWER10_M) endif() - string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M}") + string(TOUPPER "${POWER10_M}" POWER10_M_UPPER) + string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M_UPPER}") string(REGEX REPLACE "POWER *([0-9]+)" "\\1" EXTRACTED_NUMBER "${MATCHED_STRING}") if (EXTRACTED_NUMBER GREATER_EQUAL 10) @@ -330,6 +388,27 @@ function(ggml_add_cpu_backend_variant_impl tag_name) else() list(APPEND ARCH_FLAGS -mcpu=native -mtune=native -mpowerpc64) endif() + elseif(GGML_CPU_ALL_VARIANTS) + # Begin with the lowest baseline + set(ARCH_DEFINITIONS "") + + # When a feature is selected, bump the MCPU to the first + # version that supported it + foreach(PVER RANGE 7 11) + if(DEFINED GGML_INTERNAL_POWER${PVER}) + set(POWERPC_MCPU "power${PVER}") + list(APPEND ARCH_DEFINITIONS GGML_USE_POWER${PVER}) + endif() + endforeach() + if (GGML_INTERNAL_VSX) + list(APPEND ARCH_DEFINITIONS GGML_USE_VSX) + list(APPEND ARCH_FLAGS -mvsx) + endif() + + if (DEFINED POWERPC_MCPU) + list(APPEND ARCH_FLAGS -mcpu=${POWERPC_MCPU}) + endif() + ggml_add_cpu_backend_features(${GGML_CPU_NAME} powerpc ${ARCH_DEFINITIONS}) else() if (GGML_CPU_POWERPC_CPUTYPE) list(APPEND ARCH_FLAGS -mcpu=${GGML_CPU_POWERPC_CPUTYPE}) @@ -337,6 +416,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name) endif() elseif (GGML_SYSTEM_ARCH STREQUAL "loongarch64") message(STATUS "loongarch64 detected") + list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/loongarch/quants.c) + list(APPEND ARCH_FLAGS -march=loongarch64) if (GGML_LASX) list(APPEND ARCH_FLAGS -mlasx) @@ -346,6 +427,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name) endif() elseif (GGML_SYSTEM_ARCH STREQUAL "riscv64") message(STATUS "riscv64 detected") + list(APPEND GGML_CPU_SOURCES + ggml-cpu/arch/riscv/quants.c + ggml-cpu/arch/riscv/repack.cpp + ) if (GGML_RVV) if (GGML_XTHEADVECTOR) list(APPEND ARCH_FLAGS -march=rv64gc_xtheadvector -mabi=lp64d) @@ -357,11 +442,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name) endif() elseif (GGML_SYSTEM_ARCH STREQUAL "s390x") message(STATUS "s390x detected") + list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/s390/quants.c) file(READ "/proc/cpuinfo" CPUINFO_CONTENTS) 
string(REGEX REPLACE "machine[ \t\r\n]*=[ \t\r\n]*([0-9]+)" "\\1" S390X_M ${CPUINFO_CONTENTS}) # TODO: Separation to determine activation of VX/VXE/VXE2 if (${S390X_M} MATCHES "8561|8562") + set(GGML_NNPA OFF) message(STATUS "z15 target") list(APPEND ARCH_FLAGS -march=z15) elseif (${S390X_M} MATCHES "3931") @@ -378,14 +465,25 @@ function(ggml_add_cpu_backend_variant_impl tag_name) endif() if (GGML_VXE) + message(STATUS "VX/VXE/VXE2 enabled") list(APPEND ARCH_FLAGS -mvx -mzvector) + list(APPEND ARCH_DEFINITIONS GGML_VXE) + endif() + + if (GGML_NNPA) + message(STATUS "NNPA enabled") + list(APPEND ARCH_DEFINITIONS GGML_NNPA) endif() + elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm") + message(STATUS "Wasm detected") + list (APPEND GGML_CPU_SOURCES ggml-cpu/arch/wasm/quants.c) else() - message(STATUS "Unknown architecture") + message(WARNING "Unknown CPU architecture. Falling back to generic implementations.") + list(APPEND ARCH_FLAGS -DGGML_CPU_GENERIC) endif() - if (GGML_CPU_AARCH64) - target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_AARCH64) + if (GGML_CPU_REPACK) + target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_REPACK) endif() if (GGML_CPU_KLEIDIAI) @@ -396,9 +494,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name) # Fetch KleidiAI sources: include(FetchContent) - set(KLEIDIAI_COMMIT_TAG "v1.6.0") + set(KLEIDIAI_COMMIT_TAG "v1.9.0") set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz") - set(KLEIDIAI_ARCHIVE_MD5 "75b4ad68f25ab673dcc01065e5a0b05f") + set(KLEIDIAI_ARCHIVE_MD5 "2a8e1bb55d201557553545536489a017") if (POLICY CMP0135) cmake_policy(SET CMP0135 NEW) diff --git a/ggml/src/ggml-cpu/amx/amx.cpp b/ggml/src/ggml-cpu/amx/amx.cpp index 0f067137df006..258857b00754a 100644 --- a/ggml/src/ggml-cpu/amx/amx.cpp +++ b/ggml/src/ggml-cpu/amx/amx.cpp @@ -5,7 +5,7 @@ #include "ggml-backend.h" #include "ggml-impl.h" #include "ggml-cpu.h" -#include "ggml-cpu-traits.h" +#include "traits.h" #if defined(__gnu_linux__) #include diff --git a/ggml/src/ggml-cpu/amx/mmq.cpp b/ggml/src/ggml-cpu/amx/mmq.cpp index 0ea91596bc7e2..47c61b88164b8 100644 --- a/ggml/src/ggml-cpu/amx/mmq.cpp +++ b/ggml/src/ggml-cpu/amx/mmq.cpp @@ -8,7 +8,8 @@ #include "mmq.h" #include "ggml-impl.h" #include "ggml-cpu-impl.h" -#include "ggml-cpu-quants.h" +#include "simd-mappings.h" +#include "quants.h" #include "ggml-quants.h" #include #include @@ -453,7 +454,7 @@ void quantize_row_q8_K_vnni(const float * RESTRICT x, void * RESTRICT vy, int64_ // Quantize these floats const float iscale = 127.f / amax; - y[i].d = GGML_FP32_TO_FP16(1 / iscale); + y[i].d = GGML_CPU_FP32_TO_FP16(1 / iscale); const float id = ( amax != 0.0f ) ? 
iscale : 0.f; const __m512 vscale = _mm512_set1_ps(id); @@ -1090,7 +1091,7 @@ struct acc_C { const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset))); for (int m = 0; m < nr; ++m) { - const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d)); + const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d)); const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N)); __m512 vsum; @@ -1113,8 +1114,8 @@ struct acc_C { const __m512 vm0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset + TILE_N * sizeof(ggml_half)))); for (int m = 0; m < nr; ++m) { - const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d)); - const __m512 vs1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].s)); + const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d)); + const __m512 vs1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].s)); const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N)); __m512 vsum; @@ -1137,7 +1138,7 @@ struct acc_C { const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset))); for (int m = 0; m < nr; ++m) { - const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d)); + const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d)); const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N)); __m512 vsum; @@ -1437,7 +1438,7 @@ struct tinygemm_kernel_vnni for (int k = 0; k < 8; ++k) { va[k] = _mm512_set1_epi32(a_ptr[k]); } - vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d)); - vs1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].s)); + vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d)); + vs1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].s)); } // load b @@ -1571,7 +1572,7 @@ struct tinygemm_kernel_vnni +#elif defined(__APPLE__) +#include +#endif + +#if !defined(HWCAP2_I8MM) +#define HWCAP2_I8MM (1 << 13) +#endif + +#if !defined(HWCAP2_SME) +#define HWCAP2_SME (1 << 23) +#endif + +struct aarch64_features { + // has_neon not needed, aarch64 has NEON guaranteed + bool has_dotprod = false; + bool has_fp16_va = false; + bool has_sve = false; + bool has_sve2 = false; + bool has_i8mm = false; + bool has_sme = false; + + aarch64_features() { +#if defined(__linux__) + uint32_t hwcap = getauxval(AT_HWCAP); + uint32_t hwcap2 = getauxval(AT_HWCAP2); + + has_dotprod = !!(hwcap & HWCAP_ASIMDDP); + has_fp16_va = !!(hwcap & HWCAP_FPHP); + has_sve = !!(hwcap & HWCAP_SVE); + has_sve2 = !!(hwcap2 & HWCAP2_SVE2); + has_i8mm = !!(hwcap2 & HWCAP2_I8MM); + has_sme = !!(hwcap2 & HWCAP2_SME); +#elif defined(__APPLE__) + int oldp = 0; + size_t size = sizeof(oldp); + + if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) == 0) { + has_dotprod = static_cast(oldp); + } + + if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) == 0) { + has_i8mm = static_cast(oldp); + } + + if (sysctlbyname("hw.optional.arm.FEAT_SME", &oldp, &size, NULL, 0) == 0) { + has_sme = static_cast(oldp); + } + + // Apple apparently does not implement SVE yet +#endif + } +}; + +static int ggml_backend_cpu_aarch64_score() { + int score = 1; + aarch64_features af; + +#ifdef GGML_USE_DOTPROD + if (!af.has_dotprod) { return 0; } + score += 1<<1; +#endif +#ifdef GGML_USE_FP16_VECTOR_ARITHMETIC + if (!af.has_fp16_va) { return 0; } + score += 1<<2; +#endif +#ifdef GGML_USE_SVE + if (!af.has_sve) { return 0; } + score += 1<<3; +#endif +#ifdef 
GGML_USE_MATMUL_INT8 + if (!af.has_i8mm) { return 0; } + score += 1<<4; +#endif +#ifdef GGML_USE_SVE2 + if (!af.has_sve2) { return 0; } + score += 1<<5; +#endif +#ifdef GGML_USE_SME + if (!af.has_sme) { return 0; } + score += 1<<6; +#endif + + return score; +} + +GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_aarch64_score) + +# endif // defined(__aarch64__) diff --git a/ggml/src/ggml-cpu/arch/arm/quants.c b/ggml/src/ggml-cpu/arch/arm/quants.c new file mode 100644 index 0000000000000..3e2d3d03d67ec --- /dev/null +++ b/ggml/src/ggml-cpu/arch/arm/quants.c @@ -0,0 +1,4114 @@ +#define GGML_COMMON_IMPL_C +#include "ggml-common.h" +#include "ggml-quants.h" +#include "ggml-impl.h" +#include "ggml-cpu.h" +#include "simd-mappings.h" + +#include "../../quants.h" +#include "../../ggml-cpu-impl.h" + +#include +#include +#include +#include +#include // for qsort +#include // for GGML_ASSERT + +#define GROUP_MAX_EPS 1e-15f +#define GROUP_MAX_EPS_IQ3_XXS 1e-8f +#define GROUP_MAX_EPS_IQ2_S 1e-8f +#define GROUP_MAX_EPS_IQ1_M 1e-7f +#define GROUP_MAX_EPS_IQ1_S 1e-12f + +#define UNUSED GGML_UNUSED + +#if defined(__ARM_NEON) +#define B1(c,s,n) 0x ## n ## c , 0x ## n ## s +#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s) +#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s) +#define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s) +#define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s) +#define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s) +#define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s) +#define B8(c,s ) B7(c,s, c), B7(c,s, s) + +// precomputed tables for expanding 8bits to 8 bytes: +static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4 +static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 +#endif + +void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(QK8_0 == 32); + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; + + block_q8_0 * GGML_RESTRICT y = vy; + +#if defined(__ARM_NEON) + for (int i = 0; i < nb; i++) { + float32x4_t srcv [8]; + float32x4_t asrcv[8]; + float32x4_t amaxv[8]; + + for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]); + + for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]); + + const float amax = vmaxvq_f32(amaxv[0]); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 
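The scoring scheme in cpu-feats.cpp above works in two parts: every feature the variant was compiled with must be present at runtime (otherwise the variant scores 0 and is rejected), and each present feature adds a power-of-two weight so that more specialized variants outrank generic ones. A worked example, assuming a hypothetical CPU with dotprod, fp16, sve and i8mm but no sve2/sme:

    // armv8.0_1 (no features)                     -> 1
    // armv8.2_3 (DOTPROD, FP16, SVE)              -> 1 + 2 + 4 + 8      = 15
    // armv8.6_1 (DOTPROD, FP16, SVE, MATMUL_INT8) -> 1 + 2 + 4 + 8 + 16 = 31  <- best, gets loaded
    // armv8.6_2 (... + SVE2)                      -> 0 (SVE2 missing, variant rejected)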
1.0f/d : 0.0f; + + y[i].d = GGML_CPU_FP32_TO_FP16(d); + + for (int j = 0; j < 8; j++) { + const float32x4_t v = vmulq_n_f32(srcv[j], id); + const int32x4_t vi = vcvtnq_s32_f32(v); + + y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); + } + } +#else + GGML_UNUSED(nb); + // scalar + quantize_row_q8_0_ref(x, y, k); +#endif +} + +void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(k % QK8_1 == 0); + const int nb = k / QK8_1; + + block_q8_1 * GGML_RESTRICT y = vy; +#if defined(__ARM_NEON) + for (int i = 0; i < nb; i++) { + float32x4_t srcv [8]; + float32x4_t asrcv[8]; + float32x4_t amaxv[8]; + + for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]); + + for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]); + + const float amax = vmaxvq_f32(amaxv[0]); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = GGML_CPU_FP32_TO_FP16(d); + + int32x4_t accv = vdupq_n_s32(0); + + for (int j = 0; j < 8; j++) { + const float32x4_t v = vmulq_n_f32(srcv[j], id); + const int32x4_t vi = vcvtnq_s32_f32(v); + + y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); + + accv = vaddq_s32(accv, vi); + } + + y[i].s = GGML_CPU_FP32_TO_FP16(d * vaddvq_s32(accv)); + } +#else + GGML_UNUSED(nb); + // scalar + quantize_row_q8_1_ref(x, y, k); +#endif +} + +// placeholder implementation for Apple targets +void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { + quantize_row_q8_K_ref(x, y, k); +} + +//===================================== Dot products ================================= + +void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); +#if defined(__ARM_FEATURE_MATMUL_INT8) + assert((nrc == 2) || (nrc == 1)); +#else + assert(nrc == 1); +#endif + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + +#if defined(__ARM_FEATURE_MATMUL_INT8) + if (nrc == 2) { + const block_q4_0 * GGML_RESTRICT vx0 = vx; + const block_q4_0 * GGML_RESTRICT vx1 = (const block_q4_0 *) ((const uint8_t*)vx + bx); + const block_q8_0 * GGML_RESTRICT vy0 = vy; + const block_q8_0 * GGML_RESTRICT vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by); + + float32x4_t sumv0 = vdupq_n_f32(0.0f); + + for (int i = 0; i < nb; i++) { + const block_q4_0 * GGML_RESTRICT b_x0 = &vx0[i]; + const block_q4_0 * GGML_RESTRICT b_x1 = &vx1[i]; + const block_q8_0 * GGML_RESTRICT b_y0 = &vy0[i]; + const block_q8_0 * GGML_RESTRICT b_y1 = &vy1[i]; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + const int8x16_t s8b = vdupq_n_s8(0x8); + + const uint8x16_t v0_0 = vld1q_u8(b_x0->qs); + const uint8x16_t v0_1 = vld1q_u8(b_x1->qs); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const 
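A scalar sketch of what the NEON quantizers above compute per 32-element block (mirroring quantize_row_q8_0_ref; the helper name and <math.h>/<stdint.h> dependencies are assumptions of this sketch):

    static void q8_0_block_sketch(const float * x, int8_t * qs, float * d_out) {
        float amax = 0.0f;                                     // max |x| over the block
        for (int j = 0; j < 32; ++j) amax = fmaxf(amax, fabsf(x[j]));
        const float d  = amax / 127.0f;                        // block scale, stored as fp16
        const float id = d ? 1.0f / d : 0.0f;
        for (int j = 0; j < 32; ++j) {
            qs[j] = (int8_t) roundf(x[j] * id);                // NEON uses vcvtnq (ties-to-even)
        }
        *d_out = d;
    }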
int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // sub 8 + const int8x16_t x0_l = vsubq_s8(v0_0l, s8b); + const int8x16_t x0_h = vsubq_s8(v0_0h, s8b); + const int8x16_t x1_l = vsubq_s8(v0_1l, s8b); + const int8x16_t x1_h = vsubq_s8(v0_1h, s8b); + + // load y + const int8x16_t y0_l = vld1q_s8(b_y0->qs); + const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16); + const int8x16_t y1_l = vld1q_s8(b_y1->qs); + const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16); + + float32_t _scale[4] = { + GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y0->d), + GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y1->d), + GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y0->d), + GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y1->d) + }; + float32x4_t scale = vld1q_f32(_scale); + + int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); + int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); + + int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); + int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); + + int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); + int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); + + int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); + int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); + + sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)), + l1, r1)), l2, r2)), l3, r3))), scale); + } + + float32x4_t sumv1 = vextq_f32 (sumv0, sumv0, 2); + float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1); + + vst1_f32(s, vget_low_f32 (sumv2)); + vst1_f32(s + bs, vget_high_f32(sumv2)); + + return; + } +#endif + + int ib = 0; + float sumf = 0; + +#if defined(__ARM_FEATURE_SVE) + svfloat32_t sumv0 = svdup_n_f32(0.0f); + svfloat32_t sumv1 = svdup_n_f32(0.0f); + + const int vector_length = ggml_cpu_get_sve_cnt()*8; + + // VLA Implementation using switch case + switch (vector_length) { + case 128: + { + // predicate for activating higher lanes for 4 float32 elements + const svbool_t ph4 = svptrue_pat_b32(SV_VL4); + + for (; ib + 1 < nb; ib += 2) { + const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; + + // load x + const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs); + const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs); + + // 4-bit -> 8-bit + const svint8_t qx0l = svreinterpret_s8_u8(svand_n_u8_m(svptrue_b8(), qx0r, 0x0F)); + const svint8_t qx0h = svreinterpret_s8_u8(svlsr_n_u8_m(svptrue_b8(), qx0r, 0x04)); + const svint8_t qx1l = svreinterpret_s8_u8(svand_n_u8_m(svptrue_b8(), qx1r, 0x0F)); + const svint8_t qx1h = svreinterpret_s8_u8(svlsr_n_u8_m(svptrue_b8(), qx1r, 0x04)); + + // sub 8 + const svint8_t qx0ls = svsub_n_s8_x(svptrue_b8(), qx0h, 8); + const svint8_t qx0hs = svsub_n_s8_x(svptrue_b8(), qx0l, 8); + const svint8_t qx1ls = svsub_n_s8_x(svptrue_b8(), qx1h, 8); + const svint8_t qx1hs = svsub_n_s8_x(svptrue_b8(), qx1l, 8); + + // load y + const svint8_t qy0h = 
svld1_s8(svptrue_b8(), y0->qs); + const svint8_t qy0l = svld1_s8(svptrue_b8(), y0->qs + 16); + const svint8_t qy1h = svld1_s8(svptrue_b8(), y1->qs); + const svint8_t qy1l = svld1_s8(svptrue_b8(), y1->qs + 16); + + // dot product + sumv0 = svmla_n_f32_x(ph4, sumv0, svcvt_f32_s32_x(ph4, svadd_x(ph4, + svdot_s32(svdup_n_s32(0), qx0ls, qy0l), + svdot_s32(svdup_n_s32(0), qx0hs, qy0h))), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d)); + sumv1 = svmla_n_f32_x(ph4, sumv1, svcvt_f32_s32_x(ph4, svadd_x(ph4, + svdot_s32(svdup_n_s32(0), qx1ls, qy1l), + svdot_s32(svdup_n_s32(0), qx1hs, qy1h))), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d)); + } + + sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); + } break; + case 256: + { + // predicate for activating higher lanes for 16 int8 elements + const svbool_t ph16 = svptrue_pat_b8(SV_VL16); + // predicate for activating lower lanes for 16 int8 elements + const svbool_t pl16 = svnot_b_z(svptrue_b8(), ph16); + + for (; ib + 1 < nb; ib += 2) { + const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; + + // load x + const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs); + const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs); + + // 4-bit -> 8-bit + const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx0r, 0x0F), 0x04)); + const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx1r, 0x0F), 0x04)); + + // sub 8 + const svint8_t qx0s = svsub_n_s8_x(svptrue_b8(), qx0, 8); + const svint8_t qx1s = svsub_n_s8_x(svptrue_b8(), qx1, 8); + + // load y + const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs); + const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs); + + // dot product + sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), + svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d)); + sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), + svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d)); + } + + sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); + } break; + case 512: + { + // predicate for activating higher lanes for 32 int8 elements + const svbool_t ph32 = svptrue_pat_b8(SV_VL32); + + // predicate for activating higher lanes for 16 int8 elements + const svbool_t ph16 = svptrue_pat_b8(SV_VL16); + // predicate for activating lower lanes for 16 int8 elements from first 32 int8 activated lanes + const svbool_t pl16 = svnot_b_z(ph32, ph16); + + for (; ib + 1 < nb; ib += 2) { + const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; + + // load x + const svuint8_t qx0r = svld1rq_u8(ph32, x0->qs); + const svuint8_t qx1r = svld1rq_u8(ph32, x1->qs); + + // 4-bit -> 8-bit + const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx0r, 0x0F), 0x04)); + const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx1r, 0x0F), 0x04)); + + // sub 8 + const svint8_t qx0s = svsub_n_s8_x(ph32, qx0, 8); + const svint8_t qx1s = svsub_n_s8_x(ph32, qx1, 8); + + // load y + const svint8_t qy0 = svld1_s8(ph32, y0->qs); + const svint8_t qy1 = svld1_s8(ph32, y1->qs); + + // dot product + 
sumv0 = svmla_n_f32_x(ph32, sumv0, svcvt_f32_s32_x(ph32, + svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d)); + sumv1 = svmla_n_f32_x(ph32, sumv1, svcvt_f32_s32_x(ph32, + svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d)); + } + + sumf = svaddv_f32(ph32, svadd_f32_x(ph32, sumv0, sumv1)); + } break; + default: + assert(false && "Unsupported vector length"); + break; + } + +#elif defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + for (; ib + 1 < nb; ib += 2) { + const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + const int8x16_t s8b = vdupq_n_s8(0x8); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // sub 8 + const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b); + const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b); + const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b); + const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + + // dot product into int32x4_t + const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h); + const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d)); + } + + sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); +#endif + for (; ib < nb; ++ib) { + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[ib].qs[j] & 0x0F) - 8; + const int v1 = (x[ib].qs[j] >> 4) - 8; + + sumi0 += (v0 * y[ib].qs[j]); + sumi1 += (v1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d); + } + + *s = sumf; +} + +void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_1; + const int nb = n / qk; + + assert(n % qk == 0); +#if defined(__ARM_FEATURE_MATMUL_INT8) + assert((nrc == 2) || (nrc == 1)); +#else + assert(nrc == 1); +#endif + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_1 * GGML_RESTRICT x = vx; + const block_q8_1 * GGML_RESTRICT y = vy; + +#if defined(__ARM_FEATURE_MATMUL_INT8) + if (nrc == 2) { + const block_q4_1 * GGML_RESTRICT vx0 = vx; + const block_q4_1 * GGML_RESTRICT vx1 = (const block_q4_1 *) ((const uint8_t*)vx + bx); + const block_q8_1 * GGML_RESTRICT vy0 = vy; + const block_q8_1 * GGML_RESTRICT vy1 = (const block_q8_1 *) ((const uint8_t*)vy + by); + + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t summs0 = vdupq_n_f32(0.0f); + + for (int i = 0; i < nb; i++) { 
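The scalar tail of ggml_vec_dot_q4_0_q8_0 above makes the q4_0 layout explicit: each block stores an fp16 scale d plus 32 weights packed into 16 bytes, where byte j carries element j in its low nibble and element j+16 in its high nibble, both biased by 8:

    const int v0 = (qs[j] & 0x0F) - 8;   // low nibble  -> element j,      range [-8, 7]
    const int v1 = (qs[j] >>   4) - 8;   // high nibble -> element j + 16
    // dequantized weight: w = GGML_CPU_FP16_TO_FP32(d) * v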
+ const block_q4_1 * GGML_RESTRICT b_x0 = &vx0[i]; + const block_q4_1 * GGML_RESTRICT b_x1 = &vx1[i]; + const block_q8_1 * GGML_RESTRICT b_y0 = &vy0[i]; + const block_q8_1 * GGML_RESTRICT b_y1 = &vy1[i]; + + float32_t summs_t[4] = { + GGML_CPU_FP16_TO_FP32(b_x0->m) * GGML_CPU_FP16_TO_FP32(b_y0->s), + GGML_CPU_FP16_TO_FP32(b_x1->m) * GGML_CPU_FP16_TO_FP32(b_y0->s), + GGML_CPU_FP16_TO_FP32(b_x0->m) * GGML_CPU_FP16_TO_FP32(b_y1->s), + GGML_CPU_FP16_TO_FP32(b_x1->m) * GGML_CPU_FP16_TO_FP32(b_y1->s) + }; + summs0 = vaddq_f32(summs0, vld1q_f32(summs_t)); + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + + const uint8x16_t v0_0 = vld1q_u8(b_x0->qs); + const uint8x16_t v0_1 = vld1q_u8(b_x1->qs); + + // 4-bit -> 8-bit + const int8x16_t x0_l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t x0_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t x1_l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t x1_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // load y + const int8x16_t y0_l = vld1q_s8(b_y0->qs); + const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16); + const int8x16_t y1_l = vld1q_s8(b_y1->qs); + const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16); + + // mmla into int32x4_t + float32_t _scale[4] = { + GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y0->d), + GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y1->d), + GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y0->d), + GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y1->d) + }; + float32x4_t scale = vld1q_f32(_scale); + + int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); + int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); + + int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); + int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); + + int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); + int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); + + int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); + int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); + sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)), + l1, r1)), l2, r2)), l3, r3))), scale); + } + + float32x4_t sumv1 = vextq_f32 (sumv0, sumv0, 2); + float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1); + + sumv2 = vaddq_f32(sumv2, summs0); + + vst1_f32(s, vget_low_f32 (sumv2)); + vst1_f32(s + bs, vget_high_f32(sumv2)); + + return; + } +#endif + + int ib = 0; + float sumf = 0; + +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + float summs = 0; + + for (; ib + 1 < nb; ib += 2) { + const block_q4_1 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q4_1 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_1 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1]; + + summs += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s) + GGML_CPU_FP16_TO_FP32(x1->m) * GGML_CPU_FP16_TO_FP32(y1->s); + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = 
vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + + // dot product into int32x4_t + const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h); + const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d)); + } + + sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs; + +#endif + for (; ib < nb; ++ib) { + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[ib].qs[j] & 0x0F); + const int v1 = (x[ib].qs[j] >> 4); + + sumi0 += (v0 * y[ib].qs[j]); + sumi1 += (v1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); + } + + *s = sumf; +} + +void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + int ib = 0; + float sumf = 0; + + assert(n % qk == 0); + assert(qk == QK5_0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + uint32_t qh0; + uint32_t qh1; + + uint64_t tmp0[4]; + uint64_t tmp1[4]; + + for (; ib + 1 < nb; ib += 2) { + const block_q5_0 * GGML_RESTRICT x0 = &x[ib]; + const block_q5_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + + // extract the 5th bit via lookup table ((!b) << 4) + memcpy(&qh0, x0->qh, sizeof(qh0)); + memcpy(&qh1, x1->qh, sizeof(qh1)); + + tmp0[0] = table_b2b_1[(qh0 >> 0) & 0xFF]; + tmp0[1] = table_b2b_1[(qh0 >> 8) & 0xFF]; + tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF]; + tmp0[3] = table_b2b_1[(qh0 >> 24) ]; + + tmp1[0] = table_b2b_1[(qh1 >> 0) & 0xFF]; + tmp1[1] = table_b2b_1[(qh1 >> 8) & 0xFF]; + tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF]; + tmp1[3] = table_b2b_1[(qh1 >> 24) ]; + + const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0)); + const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2)); + const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0)); + const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2)); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + + // 4-bit -> 8-bit + int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero) + const int8x16_t v0_0lf = 
vsubq_s8(v0_0l, qhl0); + const int8x16_t v0_0hf = vsubq_s8(v0_0h, qhh0); + const int8x16_t v0_1lf = vsubq_s8(v0_1l, qhl1); + const int8x16_t v0_1hf = vsubq_s8(v0_1h, qhh1); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( + ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), + ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( + ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), + ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d)); + } + + sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); + +#endif + for (; ib < nb; ++ib) { + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); + + const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16); + const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16); + + sumi0 += (x0 * y[ib].qs[j]); + sumi1 += (x1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi; + } + + *s = sumf; +} + +void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_1; + const int nb = n / qk; + + int ib = 0; + float sumf = 0; + + assert(n % qk == 0); + assert(qk == QK5_1); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_1 * GGML_RESTRICT x = vx; + const block_q8_1 * GGML_RESTRICT y = vy; + +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + float summs0 = 0.0f; + float summs1 = 0.0f; + + uint32_t qh0; + uint32_t qh1; + + uint64_t tmp0[4]; + uint64_t tmp1[4]; + + for (; ib + 1 < nb; ib += 2) { + const block_q5_1 * GGML_RESTRICT x0 = &x[ib]; + const block_q5_1 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_1 * GGML_RESTRICT y0 = &y[ib]; + const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1]; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + + summs0 += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s); + summs1 += GGML_CPU_FP16_TO_FP32(x1->m) * GGML_CPU_FP16_TO_FP32(y1->s); + + // extract the 5th bit via lookup table ((b) << 4) + memcpy(&qh0, x0->qh, sizeof(qh0)); + memcpy(&qh1, x1->qh, sizeof(qh1)); + + tmp0[0] = table_b2b_0[(qh0 >> 0) & 0xFF]; + tmp0[1] = table_b2b_0[(qh0 >> 8) & 0xFF]; + tmp0[2] = table_b2b_0[(qh0 >> 16) & 0xFF]; + tmp0[3] = table_b2b_0[(qh0 >> 24) ]; + + tmp1[0] = table_b2b_0[(qh1 >> 0) & 0xFF]; + tmp1[1] = table_b2b_0[(qh1 >> 8) & 0xFF]; + tmp1[2] = table_b2b_0[(qh1 >> 16) & 0xFF]; + tmp1[3] = table_b2b_0[(qh1 >> 24) ]; + + const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0)); + const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2)); + const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0)); + const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2)); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t v0_0h = 
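Why the q5_0 path above can fuse the high bit and the -16 bias into a single vsubq_s8: with b the fifth bit of an element, (nibble | b<<4) - 16 = nibble - ((!b) << 4), which is exactly the value the table_b2b_1 expansion feeds to the subtraction. The scalar tail spells out the same bit plumbing per element j:

    const uint8_t xh_0 = ((qh & (1u << (j +  0))) >> (j +  0)) << 4;  // qh bit j      -> bit 4
    const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));       // qh bit j + 16 -> bit 4
    const int32_t v0   = (int8_t)(((qs[j] & 0x0F) | xh_0) - 16);      // 5-bit value in [-16, 15]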
vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // add high bit + const int8x16_t v0_0lf = vorrq_s8(v0_0l, qhl0); + const int8x16_t v0_0hf = vorrq_s8(v0_0h, qhh0); + const int8x16_t v0_1lf = vorrq_s8(v0_1l, qhl1); + const int8x16_t v0_1hf = vorrq_s8(v0_1h, qhh1); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( + ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), + ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( + ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), + ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d)); + } + + sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1; + +#endif + for (; ib < nb; ++ib) { + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + + const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0; + const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1; + + sumi0 += (x0 * y[ib].qs[j]); + sumi1 += (x1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); + } + + *s = sumf; +} + +void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); +#if defined(__ARM_FEATURE_MATMUL_INT8) + assert((nrc == 2) || (nrc == 1)); +#else + assert(nrc == 1); +#endif + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q8_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + +#if defined(__ARM_FEATURE_MATMUL_INT8) + if (nrc == 2) { + const block_q8_0 * GGML_RESTRICT vx0 = vx; + const block_q8_0 * GGML_RESTRICT vx1 = (const block_q8_0 *) ((const uint8_t*)vx + bx); + const block_q8_0 * GGML_RESTRICT vy0 = vy; + const block_q8_0 * GGML_RESTRICT vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by); + + float32x4_t sumv0 = vdupq_n_f32(0.0f); + + for (int i = 0; i < nb; i++) { + const block_q8_0 * GGML_RESTRICT b_x0 = &vx0[i]; + const block_q8_0 * GGML_RESTRICT b_y0 = &vy0[i]; + + const block_q8_0 * GGML_RESTRICT b_x1 = &vx1[i]; + const block_q8_0 * GGML_RESTRICT b_y1 = &vy1[i]; + + const int8x16_t x0_l = vld1q_s8(b_x0->qs); + const int8x16_t x0_h = vld1q_s8(b_x0->qs + 16); + const int8x16_t x1_l = vld1q_s8(b_x1->qs); + const int8x16_t x1_h = vld1q_s8(b_x1->qs + 16); + + // load y + const int8x16_t y0_l = vld1q_s8(b_y0->qs); + const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16); + const int8x16_t y1_l = vld1q_s8(b_y1->qs); + const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16); + + float32_t _scale[4] = { + GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y0->d), + GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y1->d), + GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y0->d), + GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y1->d) + }; + float32x4_t 
scale = vld1q_f32(_scale); + + int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); + int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); + + int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); + int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); + + int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); + int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); + + int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); + int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); + + sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)), + l1, r1)), l2, r2)), l3, r3))), scale); + } + + float32x4_t sumv1 = vextq_f32 (sumv0, sumv0, 2); + float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1); + + vst1_f32(s, vget_low_f32 (sumv2)); + vst1_f32(s + bs, vget_high_f32(sumv2)); + + return; + } +#endif + + int ib = 0; + float sumf = 0; + +#if defined(__ARM_FEATURE_SVE) + svfloat32_t sumv0 = svdup_n_f32(0.0f); + svfloat32_t sumv1 = svdup_n_f32(0.0f); + + const int vector_length = ggml_cpu_get_sve_cnt()*8; + + // VLA implementation for SVE + switch (vector_length) { + case 128: + { + // predicate for activating lanes for 16 Int8 elements + const svbool_t ph16 = svptrue_pat_b8 (SV_VL16); + const svbool_t pl16 = svptrue_pat_b32(SV_VL4); + + for (; ib + 1 < nb; ib += 2) { + const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; + + // load x + const svint8_t qx0_0 = svld1_s8(ph16, x0->qs); + const svint8_t qx0_1 = svld1_s8(ph16, x0->qs+16); + const svint8_t qx1_0 = svld1_s8(ph16, x1->qs); + const svint8_t qx1_1 = svld1_s8(ph16, x1->qs+16); + + // load y + const svint8_t qy0_0 = svld1_s8(ph16, y0->qs); + const svint8_t qy0_1 = svld1_s8(ph16, y0->qs+16); + const svint8_t qy1_0 = svld1_s8(ph16, y1->qs); + const svint8_t qy1_1 = svld1_s8(ph16, y1->qs+16); + + sumv0 = svmla_n_f32_x(pl16, sumv0, svcvt_f32_s32_x(pl16, svadd_x(pl16, + svdot_s32(svdup_n_s32(0), qx0_0, qy0_0), + svdot_s32(svdup_n_s32(0), qx0_1, qy0_1))), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d)); + sumv1 = svmla_n_f32_x(pl16, sumv1, svcvt_f32_s32_x(pl16, svadd_x(pl16, + svdot_s32(svdup_n_s32(0), qx1_0, qy1_0), + svdot_s32(svdup_n_s32(0), qx1_1, qy1_1))), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d)); + } + + sumf = svaddv_f32(pl16, svadd_f32_x(pl16, sumv0, sumv1)); + } break; + case 256: + { + for (; ib + 1 < nb; ib += 2) { + const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; + + // load x + const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs); + const svint8_t qx1 = svld1_s8(svptrue_b8(), x1->qs); + + // load y + const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs); + const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs); + + sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), + svdot_s32(svdup_n_s32(0), qx0, qy0)), 
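The `nrc == 2` path above leans on the i8mm SMMLA instruction: `vmmlaq_s32` treats each 16-byte operand as a 2x8 int8 matrix and accumulates the 2x2 product into the four int32 lanes, which is why the rows of x0/x1 and y0/y1 are first interleaved 64 bits at a time with `vzip1q_s64`/`vzip2q_s64`. A scalar model of one such step (`smmla_2x8` is a hypothetical helper, not the intrinsic):

```c
#include <stdint.h>

// Hypothetical scalar model of vmmlaq_s32(acc, a, b): 'a' and 'b' are
// 2x8 int8 matrices (row-major) and the 2x2 product a * b^T is
// accumulated into acc's four int32 lanes.
static void smmla_2x8(int32_t acc[4], const int8_t a[16], const int8_t b[16]) {
    for (int r = 0; r < 2; ++r) {
        for (int c = 0; c < 2; ++c) {
            int32_t sum = 0;
            for (int k = 0; k < 8; ++k) {
                sum += (int32_t) a[r*8 + k] * (int32_t) b[c*8 + k];
            }
            acc[r*2 + c] += sum;
        }
    }
}
```

One instruction therefore yields all four row-column dot products over an 8-element slice, and the `_scale` vector is laid out in the same 2x2 order so a single `vmlaq_f32` applies the right `d` product to each lane.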
GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d)); + sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), + svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d)); + } + + sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); + } break; + case 512: + { + // predicate for activating high 256 bit + const svbool_t ph32 = svptrue_pat_b8(SV_VL32); + // predicate for activating low 256 bit + const svbool_t pl32 = svnot_b_z(svptrue_b8(), ph32); + + // predicate for activating high lanes for 8 float32 elements + const svbool_t ph8 = svptrue_pat_b32(SV_VL8); + // predicate for activating low lanes for 8 float32 elements + const svbool_t pl8 = svnot_b_z(svptrue_b32(), ph8); + + svfloat32_t sumv00 = svdup_n_f32(0.0f); + + for (; ib + 1 < nb; ib += 2) { + const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; + + //load 32 int8_t in first half of vector and put another 32 int8_t in second vector lower bits + // and add them to make one 64 element vector + // load x + const svint8_t qx_32 = svld1_s8(ph32, x0->qs); + svint8_t qx_64 = svld1_s8(pl32, x0->qs + 2); + + qx_64 = svadd_s8_x(svptrue_b8(), qx_32, qx_64); + + // load y + const svint8_t qy_32 = svld1_s8(ph32, y0->qs); + svint8_t qy_64 = svld1_s8(pl32, y0->qs + 2); + + qy_64 = svadd_s8_x(svptrue_b8(), qy_32, qy_64); + + // scale creation + const float32_t deq1 = GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d); + const float32_t deq2 = GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d); + + // duplicate deq1 in first half of vector and deq2 in second half of vector + const svfloat32_t temp = svdup_f32_m(svdup_f32_z(ph8, deq1), pl8, deq2); + + const svfloat32_t sumvt = svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx_64, qy_64)); + + sumv00 = svmla_f32_m(svptrue_b32(), sumv00, sumvt, temp); + } + + sumf = svaddv_f32(svptrue_b32(), sumv00); + break; + } + default: + assert(false && "Unsupported vector length"); + break; + } +#elif defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + for (; ib + 1 < nb; ib += 2) { + const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; + + const int8x16_t x0_0 = vld1q_s8(x0->qs); + const int8x16_t x0_1 = vld1q_s8(x0->qs + 16); + const int8x16_t x1_0 = vld1q_s8(x1->qs); + const int8x16_t x1_1 = vld1q_s8(x1->qs + 16); + + // load y + const int8x16_t y0_0 = vld1q_s8(y0->qs); + const int8x16_t y0_1 = vld1q_s8(y0->qs + 16); + const int8x16_t y1_0 = vld1q_s8(y1->qs); + const int8x16_t y1_1 = vld1q_s8(y1->qs + 16); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( + ggml_vdotq_s32(vdupq_n_s32(0), x0_0, y0_0), + ggml_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d)); + + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( + ggml_vdotq_s32(vdupq_n_s32(0), x1_0, y1_0), + ggml_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d)); + } + + sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); +#endif + for (; ib < nb; ++ib) { + int sumi = 0; + + for (int j = 0; j < qk; j++) { + sumi += x[ib].qs[j]*y[ib].qs[j]; + } + + sumf += 
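The SVE path is written vector-length agnostic and dispatches on the width the hardware reports. Note the 512-bit case above: a `block_q8_0` is 34 bytes (a 2-byte `d` plus 32 quants), so the predicated load from `x0->qs + 2` lands the upper 32 lanes exactly on `x1->qs`, and the `svadd` of the two zeroing loads merges both blocks' quants into one 64-element register. A sketch of the width query (assumes an SVE-capable toolchain and target):

```c
#include <arm_sve.h>  // requires an SVE-enabled compiler/target

// svcntb() reports the SVE register width in bytes at run time; the
// kernel above multiplies by 8 and switches on 128/256/512 bits.
static int sve_vector_bits(void) {
    return (int) svcntb() * 8;
}
```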
sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)); + } + + *s = sumf; +} + +void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_tq1_0 * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__ARM_NEON) + float sumf = 0.0f; + + uint8_t k_shift[16] = {1, 1, 1, 1, 3, 3, 3, 3, 9, 9, 9, 9, 27, 27, 27, 27}; + + const uint8x16_t shift = vld1q_u8(k_shift); + + for (int i = 0; i < nb; ++i) { +#if defined(__ARM_FEATURE_DOTPROD) + int32x4_t sumi0 = vdupq_n_s32(0); + int32x4_t sumi1 = vdupq_n_s32(0); +#else + int16x8_t sumi0 = vdupq_n_s16(0); + int16x8_t sumi1 = vdupq_n_s16(0); +#endif + + // first 32 bytes of 5 elements + { + uint8x16_t qx0 = vld1q_u8(x[i].qs + 0); + uint8x16_t qx1 = vld1q_u8(x[i].qs + 16); + uint8x16_t qx2 = vmulq_u8(qx0, vdupq_n_u8(3)); + uint8x16_t qx3 = vmulq_u8(qx1, vdupq_n_u8(3)); + uint8x16_t qx4 = vmulq_u8(qx0, vdupq_n_u8(9)); + uint8x16_t qx5 = vmulq_u8(qx1, vdupq_n_u8(9)); + uint8x16_t qx6 = vmulq_u8(qx0, vdupq_n_u8(27)); + uint8x16_t qx7 = vmulq_u8(qx1, vdupq_n_u8(27)); + uint8x16_t qx8 = vmulq_u8(qx0, vdupq_n_u8(81)); + uint8x16_t qx9 = vmulq_u8(qx1, vdupq_n_u8(81)); + + // multiply by 3 and keep the 2 bits above 8 bits + int8x16_t sqx0 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx0, vshrq_n_u8(qx0, 1)), 6)); + int8x16_t sqx1 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx1, vshrq_n_u8(qx1, 1)), 6)); + int8x16_t sqx2 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx2, vshrq_n_u8(qx2, 1)), 6)); + int8x16_t sqx3 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx3, vshrq_n_u8(qx3, 1)), 6)); + int8x16_t sqx4 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx4, vshrq_n_u8(qx4, 1)), 6)); + int8x16_t sqx5 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx5, vshrq_n_u8(qx5, 1)), 6)); + int8x16_t sqx6 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx6, vshrq_n_u8(qx6, 1)), 6)); + int8x16_t sqx7 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx7, vshrq_n_u8(qx7, 1)), 6)); + int8x16_t sqx8 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx8, vshrq_n_u8(qx8, 1)), 6)); + int8x16_t sqx9 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx9, vshrq_n_u8(qx9, 1)), 6)); + + const int8x16_t qy0 = vld1q_s8(y[i].qs + 0); + const int8x16_t qy1 = vld1q_s8(y[i].qs + 16); + const int8x16_t qy2 = vld1q_s8(y[i].qs + 32); + const int8x16_t qy3 = vld1q_s8(y[i].qs + 48); + const int8x16_t qy4 = vld1q_s8(y[i].qs + 64); + const int8x16_t qy5 = vld1q_s8(y[i].qs + 80); + const int8x16_t qy6 = vld1q_s8(y[i].qs + 96); + const int8x16_t qy7 = vld1q_s8(y[i].qs + 112); + const int8x16_t qy8 = vld1q_s8(y[i].qs + 128); + const int8x16_t qy9 = vld1q_s8(y[i].qs + 144); + +#if defined(__ARM_FEATURE_DOTPROD) + sumi0 = vdotq_s32(sumi0, sqx0, qy0); + sumi1 = vdotq_s32(sumi1, sqx1, qy1); + sumi0 = vdotq_s32(sumi0, sqx2, qy2); + sumi1 = vdotq_s32(sumi1, sqx3, qy3); + sumi0 = vdotq_s32(sumi0, sqx4, qy4); + sumi1 = vdotq_s32(sumi1, sqx5, qy5); + sumi0 = vdotq_s32(sumi0, sqx6, qy6); + sumi1 = vdotq_s32(sumi1, sqx7, qy7); + sumi0 = vdotq_s32(sumi0, sqx8, qy8); + sumi1 = vdotq_s32(sumi1, sqx9, qy9); +#else + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx0), vget_low_s8(qy0)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx0), vget_high_s8(qy0)); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx1), vget_low_s8(qy1)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx1), vget_high_s8(qy1)); + sumi0 = vmlal_s8(sumi0, 
vget_low_s8(sqx2), vget_low_s8(qy2)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx2), vget_high_s8(qy2)); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx3), vget_low_s8(qy3)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx3), vget_high_s8(qy3)); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx4), vget_low_s8(qy4)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx4), vget_high_s8(qy4)); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx5), vget_low_s8(qy5)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx5), vget_high_s8(qy5)); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx6), vget_low_s8(qy6)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx6), vget_high_s8(qy6)); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx7), vget_low_s8(qy7)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx7), vget_high_s8(qy7)); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx8), vget_low_s8(qy8)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx8), vget_high_s8(qy8)); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx9), vget_low_s8(qy9)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx9), vget_high_s8(qy9)); +#endif + } + + // last 16 bytes of 5-element, along with the 4 bytes of 4 elements + { + uint8x16_t qx0 = vld1q_u8(x[i].qs + 32); + uint8x16_t qx1 = vmulq_u8(qx0, vdupq_n_u8(3)); + uint8x16_t qx2 = vmulq_u8(qx0, vdupq_n_u8(9)); + uint8x16_t qx3 = vmulq_u8(qx0, vdupq_n_u8(27)); + uint8x16_t qx4 = vmulq_u8(qx0, vdupq_n_u8(81)); + uint32_t qh; + memcpy(&qh, x[i].qh, sizeof(qh)); // potentially unaligned + uint8x16_t qx5 = vreinterpretq_u8_u32(vdupq_n_u32(qh)); + qx5 = vmulq_u8(qx5, shift); + + // multiply by 3 and keep the 2 bits above 8 bits + int8x16_t sqx0 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx0, vshrq_n_u8(qx0, 1)), 6)); + int8x16_t sqx1 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx1, vshrq_n_u8(qx1, 1)), 6)); + int8x16_t sqx2 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx2, vshrq_n_u8(qx2, 1)), 6)); + int8x16_t sqx3 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx3, vshrq_n_u8(qx3, 1)), 6)); + int8x16_t sqx4 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx4, vshrq_n_u8(qx4, 1)), 6)); + int8x16_t sqx5 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx5, vshrq_n_u8(qx5, 1)), 6)); + + const int8x16_t qy0 = vld1q_s8(y[i].qs + 160); + const int8x16_t qy1 = vld1q_s8(y[i].qs + 176); + const int8x16_t qy2 = vld1q_s8(y[i].qs + 192); + const int8x16_t qy3 = vld1q_s8(y[i].qs + 208); + const int8x16_t qy4 = vld1q_s8(y[i].qs + 224); + const int8x16_t qy5 = vld1q_s8(y[i].qs + 240); + +#if defined(__ARM_FEATURE_DOTPROD) + sumi0 = vdotq_s32(sumi0, sqx0, qy0); + sumi1 = vdotq_s32(sumi1, sqx1, qy1); + sumi0 = vdotq_s32(sumi0, sqx2, qy2); + sumi1 = vdotq_s32(sumi1, sqx3, qy3); + sumi0 = vdotq_s32(sumi0, sqx4, qy4); + sumi1 = vdotq_s32(sumi1, sqx5, qy5); +#else + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx0), vget_low_s8(qy0)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx0), vget_high_s8(qy0)); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx1), vget_low_s8(qy1)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx1), vget_high_s8(qy1)); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx2), vget_low_s8(qy2)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx2), vget_high_s8(qy2)); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx3), vget_low_s8(qy3)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx3), vget_high_s8(qy3)); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx4), vget_low_s8(qy4)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx4), vget_high_s8(qy4)); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx5), vget_low_s8(qy5)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx5), vget_high_s8(qy5)); +#endif + } + + const int16x8_t ysum0 = vld1q_s16(y[i].bsums); + const int16x8_t ysum1 = 
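Both the NEON blocks above and the scalar fallback below rely on TQ1_0's fixed-point base-3 packing: five trits go into one byte as `ceil(v*256/243)`, multiplying by `3^l` rotates digit `l` to the top, and `(q*3) >> 8` extracts it. The NEON code computes that last step without overflow as `vshrq_n_u8(vhaddq_u8(q, q >> 1), 6)`, i.e. `((q + q/2)/2) >> 6 == (3*q) >> 8`. A self-contained worked example of the round trip (a sketch; the packing formula is inferred from the decode above):

```c
#include <stdint.h>
#include <stdio.h>

int main(void) {
    static const uint16_t pow3[5] = {1, 3, 9, 27, 81};
    const uint8_t trits[5] = {2, 1, 0, 1, 2};          // digits to pack, each in {0,1,2}

    uint16_t v = 0;
    for (int l = 0; l < 5; ++l) v = (uint16_t)(v*3 + trits[l]); // base-3 value, 0..242
    const uint8_t q = (uint8_t)((v*256 + 242) / 243);           // ceil(v*256/243)

    for (int l = 0; l < 5; ++l) {
        const uint8_t shifted = (uint8_t)(q * pow3[l]);         // digit l moves to the top
        printf("digit %d = %u\n", l, (unsigned)(((uint16_t)shifted * 3) >> 8));
    }
    return 0;
}
```

Running this should print the digits 2 1 0 1 2 back out; the ceiling in the scale step is what makes the truncating extraction exact.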
vld1q_s16(y[i].bsums + 8); + + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + +#if defined(__ARM_FEATURE_DOTPROD) + sumi0 = vaddq_s32(sumi0, sumi1); + sumi0 = vsubq_s32(sumi0, vpaddlq_s16(vaddq_s16(ysum0, ysum1))); + + sumf += d * (float) vaddvq_s32(sumi0); +#else + sumi0 = vaddq_s16(sumi0, sumi1); + sumi0 = vsubq_s16(sumi0, vaddq_s16(ysum0, ysum1)); + + sumf += d * (float) vaddlvq_s16(sumi0); +#endif + } + + *s = sumf; + +#else + const uint8_t pow3[6] = {1, 3, 9, 27, 81, 243}; + + float sumf = 0.0f; + + for (int i = 0; i < nb; ++i) { + int sum = 0; + + for (size_t j = 0; j < sizeof(x->qs) - sizeof(x->qs) % 32; j += 32) { + for (size_t l = 0; l < 5; ++l) { + for (size_t m = 0; m < 32; ++m) { + uint8_t q = x[i].qs[j + m] * pow3[l]; + uint16_t xi = ((uint16_t) q * 3) >> 8; + sum += (xi - 1) * y[i].qs[j*5 + l*32 + m]; + } + } + } + for (size_t j = sizeof(x->qs) - sizeof(x->qs) % 32; j < sizeof(x->qs); j += 16) { + for (size_t l = 0; l < 5; ++l) { + for (size_t m = 0; m < 16; ++m) { + uint8_t q = x[i].qs[j + m] * pow3[l]; + uint16_t xi = ((uint16_t) q * 3) >> 8; + sum += (xi - 1) * y[i].qs[j*5 + l*16 + m]; + } + } + } + + for (size_t l = 0; l < 4; ++l) { + for (size_t j = 0; j < sizeof(x->qh); ++j) { + uint8_t q = x[i].qh[j] * pow3[l]; + uint16_t xi = ((uint16_t) q * 3) >> 8; + sum += (xi - 1) * y[i].qs[sizeof(x->qs)*5 + l*sizeof(x->qh) + j]; + } + } + + sumf += (float) sum * (GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d); + } + + *s = sumf; +#endif +} + +void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_tq2_0 * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__ARM_NEON) + float sumf = 0.0f; + + const uint8x16_t m3 = vdupq_n_u8(3); + + for (int i = 0; i < nb; ++i) { +#if defined(__ARM_FEATURE_DOTPROD) + int32x4_t sumi0 = vdupq_n_s32(0); + int32x4_t sumi1 = vdupq_n_s32(0); +#else + int16x8_t sumi0 = vdupq_n_s16(0); + int16x8_t sumi1 = vdupq_n_s16(0); +#endif + + for (size_t j = 0; j < sizeof(x->qs); j += 32) { + uint8x16_t qx0 = vld1q_u8(x[i].qs + j); + uint8x16_t qx1 = vld1q_u8(x[i].qs + j + 16); + uint8x16_t qx2 = vshrq_n_u8(qx0, 2); + uint8x16_t qx3 = vshrq_n_u8(qx1, 2); + uint8x16_t qx4 = vshrq_n_u8(qx0, 4); + uint8x16_t qx5 = vshrq_n_u8(qx1, 4); + uint8x16_t qx6 = vshrq_n_u8(qx0, 6); + uint8x16_t qx7 = vshrq_n_u8(qx1, 6); + + int8x16_t sqx0 = vreinterpretq_s8_u8(vandq_u8(qx0, m3)); + int8x16_t sqx1 = vreinterpretq_s8_u8(vandq_u8(qx1, m3)); + int8x16_t sqx2 = vreinterpretq_s8_u8(vandq_u8(qx2, m3)); + int8x16_t sqx3 = vreinterpretq_s8_u8(vandq_u8(qx3, m3)); + int8x16_t sqx4 = vreinterpretq_s8_u8(vandq_u8(qx4, m3)); + int8x16_t sqx5 = vreinterpretq_s8_u8(vandq_u8(qx5, m3)); + int8x16_t sqx6 = vreinterpretq_s8_u8(vandq_u8(qx6, m3)); + int8x16_t sqx7 = vreinterpretq_s8_u8(vandq_u8(qx7, m3)); + + const int8x16_t qy0 = vld1q_s8(y[i].qs + j*4 + 0); + const int8x16_t qy1 = vld1q_s8(y[i].qs + j*4 + 16); + const int8x16_t qy2 = vld1q_s8(y[i].qs + j*4 + 32); + const int8x16_t qy3 = vld1q_s8(y[i].qs + j*4 + 48); + const int8x16_t qy4 = vld1q_s8(y[i].qs + j*4 + 64); + const int8x16_t qy5 = vld1q_s8(y[i].qs + j*4 + 80); + const int8x16_t qy6 = vld1q_s8(y[i].qs + j*4 + 96); + const int8x16_t qy7 = vld1q_s8(y[i].qs + j*4 + 112); + +#if defined(__ARM_FEATURE_DOTPROD) + sumi0 = vdotq_s32(sumi0, sqx0, qy0); + sumi1 = vdotq_s32(sumi1, 
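The `vsubq` of the `bsums` above and the `(xi - 1)` in the scalar code are the same correction seen from two sides: the stored codes are the signed values plus one, so `sum(code*q8) = sum(value*q8) + sum(q8)`, and subtracting the precomputed per-16-element sums from `y->bsums` recovers the signed dot product. A scalar sketch (hypothetical helper):

```c
#include <stdint.h>

// Recover a signed ternary dot product from unsigned codes (code = value + 1)
// using the q8_K per-16-element sums in bsums.
static int32_t signed_dot_from_codes(const uint8_t codes[256], const int8_t q8[256],
                                     const int16_t bsums[16]) {
    int32_t dot = 0, bias = 0;
    for (int k = 0; k < 256; ++k) dot  += codes[k] * q8[k];
    for (int j = 0; j <  16; ++j) bias += bsums[j];
    return dot - bias; // == sum((codes[k] - 1) * q8[k])
}
```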
sqx1, qy1); + sumi0 = vdotq_s32(sumi0, sqx2, qy2); + sumi1 = vdotq_s32(sumi1, sqx3, qy3); + sumi0 = vdotq_s32(sumi0, sqx4, qy4); + sumi1 = vdotq_s32(sumi1, sqx5, qy5); + sumi0 = vdotq_s32(sumi0, sqx6, qy6); + sumi1 = vdotq_s32(sumi1, sqx7, qy7); +#else + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx0), vget_low_s8(qy0)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx0), vget_high_s8(qy0)); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx1), vget_low_s8(qy1)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx1), vget_high_s8(qy1)); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx2), vget_low_s8(qy2)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx2), vget_high_s8(qy2)); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx3), vget_low_s8(qy3)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx3), vget_high_s8(qy3)); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx4), vget_low_s8(qy4)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx4), vget_high_s8(qy4)); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx5), vget_low_s8(qy5)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx5), vget_high_s8(qy5)); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx6), vget_low_s8(qy6)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx6), vget_high_s8(qy6)); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx7), vget_low_s8(qy7)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx7), vget_high_s8(qy7)); +#endif + } + + const int16x8_t ysum0 = vld1q_s16(y[i].bsums); + const int16x8_t ysum1 = vld1q_s16(y[i].bsums + 8); + + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + +#if defined(__ARM_FEATURE_DOTPROD) + sumi0 = vaddq_s32(sumi0, sumi1); + sumi0 = vsubq_s32(sumi0, vpaddlq_s16(vaddq_s16(ysum0, ysum1))); + + sumf += d * (float) vaddvq_s32(sumi0); +#else + sumi0 = vaddq_s16(sumi0, sumi1); + sumi0 = vsubq_s16(sumi0, vaddq_s16(ysum0, ysum1)); + + sumf += d * (float) vaddlvq_s16(sumi0); +#endif + } + + *s = sumf; + +#else + float sumf = 0.0f; + + for (int i = 0; i < nb; ++i) { + int32_t sumi = 0; + + for (size_t j = 0; j < sizeof(x->qs); j += 32) { + for (size_t l = 0; l < 4; ++l) { + for (size_t k = 0; k < 32; ++k) { + sumi += y[i].qs[j*4 + l*32 + k] * (((x[i].qs[j + k] >> (l*2)) & 3) - 1); + } + } + } + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + + sumf += (float) sumi * d; + } + + *s = sumf; +#endif +} + +void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q2_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#ifdef __ARM_FEATURE_SVE + const int vector_length = svcntb()*8; + const svuint8_t m3s = svdup_n_u8(0x3); + const svuint32_t m4s = svdup_n_u32(0xF); + const svint32_t vzero_sv = svdup_n_s32(0); + svfloat32_t acc_sum = svdup_n_f32(0); + svbool_t pred_s32 = svptrue_pat_b32(SV_VL4); + + switch (vector_length) { + case 128: + for (int i = 0; i < nb; ++i) { + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + svfloat32_t d_broad = svdup_n_f32((float32_t)d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + svfloat32_t dmin_broad = svdup_n_f32((float32_t)dmin); + + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8_sv = y[i].qs; + const uint8_t * GGML_RESTRICT sc = x[i].scales; + + svuint32_t mins_and_scales_sve = svld1ub_u32(svptrue_b32(), sc); + const svint32_t mins_sv_1 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), mins_and_scales_sve, 4)); + + mins_and_scales_sve = 
svld1ub_u32(svptrue_b32(), sc+4); + const svint32_t mins_sv_2 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), mins_and_scales_sve, 4)); + + svint32_t q8sums_sv_1 = svld1sh_s32(svptrue_b32(), y[i].bsums); + svint32_t q8sums_sv_2 = svld1sh_s32(svptrue_b32(), y[i].bsums+4); + + const svint32_t s0 = svadd_s32_x(svptrue_b32(), svmul_s32_x(svptrue_b32(), mins_sv_1, q8sums_sv_1), svmul_s32_x(svptrue_b32(), mins_sv_2, q8sums_sv_2)); + + mins_and_scales_sve = svld1ub_u32(svptrue_b32(), sc+8); + const svint32_t mins_sv_3 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), mins_and_scales_sve, 4)); + + mins_and_scales_sve = svld1ub_u32(svptrue_b32(), sc+12); + const svint32_t mins_sv_4 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), mins_and_scales_sve, 4)); + + q8sums_sv_1 = svld1sh_s32(svptrue_b32(), y[i].bsums+8); + q8sums_sv_2 = svld1sh_s32(svptrue_b32(), y[i].bsums+12); + + svint32_t s1 = svadd_s32_x(svptrue_b32(), svmul_s32_x(svptrue_b32(), mins_sv_3, q8sums_sv_1), svmul_s32_x(svptrue_b32(), mins_sv_4, q8sums_sv_2)); + + svfloat32_t temp = svcvt_f32_s32_x(svptrue_b32(), svadd_s32_x(svptrue_b32(), s0, s1)); + + acc_sum = svmla_f32_m(svptrue_b32(), acc_sum, temp, dmin_broad); + + svint32_t sumi1 = svdup_n_s32(0); + + { + const svuint8_t q2bits_1 = svld1_u8(svptrue_b8(), q2); + svint8_t q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), q2bits_1, m3s)); + svint8_t q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + const svint32_t scales_sv = svreinterpret_s32_u32(svand_u32_m(svptrue_b32(), svld1ub_u32(svptrue_b32(), sc), m4s)); + + sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv, 0)); + + const svuint8_t q2bits_3 = svld1_u8(svptrue_b8(), q2+16); + q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), q2bits_3, m3s)); + q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + + sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv, 1)); + + q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_1, 2), m3s)); + q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + + sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv, 2)); + + q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_3, 2), m3s)); + q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + + sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv, 3)); + + + const svint32_t scales_sv_1 = svreinterpret_s32_u32(svand_u32_m(svptrue_b32(), svld1ub_u32(svptrue_b32(), sc+4), m4s)); + + q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_1, 4), m3s)); + q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + + sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_1, 0)); + + q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_3, 4), m3s)); + q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + + sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_1, 1)); + + q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_1, 6), m3s)); + q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + + sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, 
q8bytes_sv), svdup_lane_s32(scales_sv_1, 2)); + + q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_3, 6), m3s)); + q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + + sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_1, 3)); + + //------------------------------- + + q2 += 32; + const svint32_t scales_sv_2 = svreinterpret_s32_u32(svand_u32_m(svptrue_b32(), svld1ub_u32(svptrue_b32(), sc+8), m4s)); + const svuint8_t q2bits_2 = svld1_u8(svptrue_b8(), q2); + + q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), q2bits_2, m3s)); + q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + + sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_2, 0)); + + const svuint8_t q2bits_4 = svld1_u8(svptrue_b8(), q2+16); + q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), q2bits_4, m3s)); + q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + + sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_2, 1)); + + + q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_2, 2), m3s)); + q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + + sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_2, 2)); + + q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_4, 2), m3s)); + q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + + sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_2, 3)); + + + const svint32_t scales_sv_3 = svreinterpret_s32_u32(svand_u32_m(svptrue_b32(), svld1ub_u32(svptrue_b32(), sc+12), m4s)); + + q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_2, 4), m3s)); + q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + + sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_3, 0)); + + q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_4, 4), m3s)); + q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + + sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_3, 1)); + + + + q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_2, 6), m3s)); + q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + + sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_3, 2)); + + q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_4, 6), m3s)); + q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + + sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_3, 3)); + } + acc_sum = svmla_f32_m(svptrue_b32(), acc_sum, svcvt_f32_s32_x(svptrue_b32(), sumi1), d_broad); + } + *s = svaddv_f32(svptrue_b32(), acc_sum); + break; + + case 256: + case 512: + for (int i = 0; i < nb; ++i) { + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + svfloat32_t d_broad = svdup_n_f32((float32_t)d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + svfloat32_t dmin_broad = svdup_n_f32((float32_t)dmin); + + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t 
* GGML_RESTRICT q8_sv = y[i].qs; + const uint8_t * GGML_RESTRICT sc = x[i].scales; + + const svuint32_t mins_and_scales_sve = svld1ub_u32(svptrue_pat_b32(SV_VL8), sc); sc += 8; + const svint32_t scales_sv = svreinterpret_s32_u32(svand_u32_m(svptrue_pat_b32(SV_VL8), mins_and_scales_sve, m4s)); + const svint32_t mins_sv_1 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_pat_b32(SV_VL8), mins_and_scales_sve, 4)); + svint32_t q8sums_sv_1 = svld1sh_s32(svptrue_pat_b32(SV_VL8), y[i].bsums); + + const svuint32_t mins_and_scales_sve_1 = svld1ub_u32(svptrue_pat_b32(SV_VL8), sc); + const svint32_t scales_sv_1 = svreinterpret_s32_u32(svand_u32_m(svptrue_pat_b32(SV_VL8), mins_and_scales_sve_1, m4s)); + const svint32_t mins_sv_2 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_pat_b32(SV_VL8), mins_and_scales_sve_1, 4)); + + svint32_t q8sums_sv_2 = svld1sh_s32(svptrue_pat_b32(SV_VL8), y[i].bsums+8); + + svfloat32_t temp = svcvt_f32_s32_x(svptrue_pat_b32(SV_VL8), svadd_s32_x(svptrue_pat_b32(SV_VL8), svmul_s32_x(svptrue_pat_b32(SV_VL8), mins_sv_1, q8sums_sv_1), svmul_s32_x(svptrue_pat_b32(SV_VL8), mins_sv_2, q8sums_sv_2))); + + acc_sum = svmla_f32_m(svptrue_pat_b32(SV_VL8), acc_sum, temp, dmin_broad); + + svint32_t sumi1 = svdup_n_s32(0); + + { + const svuint8_t q2bits_1 = svld1_u8(svptrue_pat_b8(SV_VL32), q2); + svint8_t q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), q2bits_1, m3s)); + svint8_t q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; + + svint32_t scale_1 = svsel(pred_s32, svdup_lane_s32(scales_sv, 0), svdup_lane_s32(scales_sv, 1)); + sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_1); + + q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_1, 2), m3s)); + q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; + + svint32_t scale_2 = svsel(pred_s32, svdup_lane_s32(scales_sv, 2), svdup_lane_s32(scales_sv, 3)); + sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(svdup_n_s32(0), q2bytes_sv, q8bytes_sv), scale_2); + + q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_1, 4), m3s)); + q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; + + scale_1 = svsel(pred_s32, svdup_lane_s32(scales_sv, 4), svdup_lane_s32(scales_sv, 5)); + sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_1); + + q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_1, 6), m3s)); + q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; + + scale_2 = svsel(pred_s32, svdup_lane_s32(scales_sv, 6), svdup_lane_s32(scales_sv, 7)); + sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_2); + + q2 += 32; + + const svuint8_t q2bits_2 = svld1_u8(svptrue_pat_b8(SV_VL32), q2); + q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), q2bits_2, m3s)); + q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; + + scale_1 = svsel(pred_s32, svdup_lane_s32(scales_sv_1, 0), svdup_lane_s32(scales_sv_1, 1)); + sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_1); + + q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_2, 2), m3s)); + q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); 
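In the 256-bit q2_K case above, one `svdot_s32` over 32 quants spans two 16-quant sub-blocks, so the eight int32 partial sums need two different scales: `svsel` with the 4-lane predicate `pred_s32` splices `scales[2j]` into the low half and `scales[2j+1]` into the high half before the multiply-accumulate. A scalar model (sketch):

```c
#include <stdint.h>

// Scalar model of the svsel()-based scale pairing above: the first four
// int32 lanes take scale 'a', the last four take scale 'b'.
static void mla_paired_scales(int32_t acc[8], const int32_t dots[8], int32_t a, int32_t b) {
    for (int l = 0; l < 8; ++l) {
        acc[l] += dots[l] * (l < 4 ? a : b);
    }
}
```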
q8_sv += 32; + + scale_2 = svsel(pred_s32, svdup_lane_s32(scales_sv_1, 2), svdup_lane_s32(scales_sv_1, 3)); + sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_2); + + q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_2, 4), m3s)); + q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; + + scale_1 = svsel(pred_s32, svdup_lane_s32(scales_sv_1, 4), svdup_lane_s32(scales_sv_1, 5)); + sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_1); + + q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_2, 6), m3s)); + q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; + + scale_2 = svsel(pred_s32, svdup_lane_s32(scales_sv_1, 6), svdup_lane_s32(scales_sv_1, 7)); + sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_2); + } + acc_sum = svmla_f32_m(svptrue_pat_b32(SV_VL8), acc_sum, svcvt_f32_s32_x(svptrue_pat_b32(SV_VL8), sumi1), d_broad); + } + *s = svaddv_f32(svptrue_pat_b32(SV_VL8), acc_sum); + break; + + default: + assert(false && "Unsupported vector length"); + break; + } + +#elif __ARM_NEON + const uint8x16_t m3 = vdupq_n_u8(0x3); + const uint8x16_t m4 = vdupq_n_u8(0xF); + + const int32x4_t vzero = vdupq_n_s32(0); + + ggml_int8x16x2_t q2bytes; + uint8_t aux[16]; + + float sum = 0; + + for (int i = 0; i < nb; ++i) { + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + const uint8_t * GGML_RESTRICT sc = x[i].scales; + + const uint8x16_t mins_and_scales = vld1q_u8(sc); + const uint8x16_t scales = vandq_u8(mins_and_scales, m4); + vst1q_u8(aux, scales); + + const uint8x16_t mins = vshrq_n_u8(mins_and_scales, 4); + const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums); + const ggml_int16x8x2_t mins16 = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))}}; + const int32x4_t s0 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[0]), vget_low_s16 (q8sums.val[0])), + vmull_s16(vget_high_s16(mins16.val[0]), vget_high_s16(q8sums.val[0]))); + const int32x4_t s1 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[1]), vget_low_s16 (q8sums.val[1])), + vmull_s16(vget_high_s16(mins16.val[1]), vget_high_s16(q8sums.val[1]))); + sum += dmin * vaddvq_s32(vaddq_s32(s0, s1)); + + int isum = 0; + int is = 0; + +// We use this macro instead of a function call because for some reason +// the code runs 2-3% slower, even if the function is declared inline +#define MULTIPLY_ACCUM_WITH_SCALE(index)\ + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\ + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)]; + +#define SHIFT_MULTIPLY_ACCUM_WITH_SCALE(shift, index)\ + q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;\ + q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[0], (shift)), m3));\ + q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[1], (shift)), m3));\ + MULTIPLY_ACCUM_WITH_SCALE((index)); + + for (int j = 0; j < QK_K/128; ++j) { + const ggml_uint8x16x2_t q2bits = ggml_vld1q_u8_x2(q2); q2 += 32; + + ggml_int8x16x2_t q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32; + q2bytes.val[0] = 
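Macro plumbing aside, every q2_K path computes the same per-super-block reduction: each `scales` byte carries a 4-bit scale in the low nibble and a 4-bit min in the high nibble, and the min term folds in through `bsums`. A compact scalar sketch of the reduction (hypothetical helper; quants assumed already unpacked to 0..3):

```c
#include <stdint.h>

// One q2_K super-block: scale in the low nibble, min in the high nibble of
// each scales[] byte; bsums[] holds the q8 side's per-16-element sums.
static float q2k_superblock_dot(const uint8_t scales[16], const uint8_t q2u[256],
                                const int8_t q8[256], const int16_t bsums[16],
                                float d, float dmin) {
    int32_t isum = 0, summs = 0;
    for (int j = 0; j < 16; ++j) {
        int32_t dotj = 0;
        for (int l = 0; l < 16; ++l) {
            dotj += q2u[16*j + l] * q8[16*j + l];
        }
        isum  += (scales[j] & 0xF) * dotj;     // per-sub-block scale
        summs += (scales[j] >>  4) * bsums[j]; // per-sub-block min correction
    }
    return d * isum - dmin * summs;
}
```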
vreinterpretq_s8_u8(vandq_u8(q2bits.val[0], m3)); + q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[1], m3)); + + MULTIPLY_ACCUM_WITH_SCALE(0); + + SHIFT_MULTIPLY_ACCUM_WITH_SCALE(2, 2); + SHIFT_MULTIPLY_ACCUM_WITH_SCALE(4, 4); + SHIFT_MULTIPLY_ACCUM_WITH_SCALE(6, 6); + + is += 8; + } + + sum += d * isum; + } + + *s = sum; + +#else + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + + const uint8_t * q2 = x[i].qs; + const int8_t * q8 = y[i].qs; + const uint8_t * sc = x[i].scales; + + int summs = 0; + for (int j = 0; j < 16; ++j) { + summs += y[i].bsums[j] * (sc[j] >> 4); + } + + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + int isum = 0; + int is = 0; + int d; + for (int k = 0; k < QK_K/128; ++k) { + int shift = 0; + for (int j = 0; j < 4; ++j) { + d = sc[is++] & 0xF; + int isuml = 0; + for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3); + isum += d * isuml; + d = sc[is++] & 0xF; + isuml = 0; + for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3); + isum += d * isuml; + shift += 2; + q8 += 32; + } + q2 += 32; + } + sumf += dall * isum - dmin * summs; + } + *s = sumf; +#endif +} + +void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const uint32_t kmask1 = 0x03030303; + const uint32_t kmask2 = 0x0f0f0f0f; + + const block_q3_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__ARM_FEATURE_SVE) + + uint32_t aux[3]; + uint32_t utmp[4]; + + const int8_t m32 = 32; + const int vector_length = svcntb()*8; + const svuint8_t m3b_sv = svdup_n_u8(0x3); + const svint32_t vzero_sv = svdup_n_s32(0); + + const svuint8_t m0_sv = svdup_n_u8(1); + const svuint8_t m1_sv = svlsl_n_u8_x(svptrue_b8(), m0_sv, 1); + const svuint8_t m2_sv = svlsl_n_u8_x(svptrue_b8(), m0_sv, 2); + const svuint8_t m3_sv = svlsl_n_u8_x(svptrue_b8(), m0_sv, 3); + + float sum = 0; + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + + const uint8_t * GGML_RESTRICT q3_sv = x[i].qs; + const uint8_t * GGML_RESTRICT qh_sv = x[i].hmask; + const int8_t * GGML_RESTRICT q8_sv = y[i].qs; + + // Set up scales + memcpy(aux, x[i].scales, 12); + utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); + utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4); + utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4); + utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4); + + int8_t * scale = (int8_t *)utmp; + + for (int j = 0; j < 16; ++j) scale[j] -= m32; + + switch (vector_length) { + case 128: + { + svuint8_t qhbits_sv_1 = svld1_u8(svptrue_b8(), qh_sv); + svuint8_t qhbits_sv_2 = svld1_u8(svptrue_b8(), qh_sv+16); + svuint8_t q3h_sv; + + svint32_t sumi1_1 = svdup_n_s32(0); + svint8_t q3bytes_sv; + + for (int j = 0; j < QK_K/128; ++j) { + + const svuint8_t q3bits_sv = svld1_u8(svptrue_b8(), q3_sv); q3_sv += 16; + const svuint8_t q3bits_sv_1 = svld1_u8(svptrue_b8(), q3_sv); q3_sv += 16; + svint8_t q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + svint8_t q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + + q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m0_sv, qhbits_sv_1), 2); + q3bytes_sv = svsub_s8_x(svptrue_b8(), 
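The `kmask1`/`kmask2` shuffle above unpacks 16 six-bit q3_K scales from 12 bytes: the low 4 bits come from the nibbles of `scales[0..7]`, the top 2 bits from `scales[8..11]`, all stored with a bias of 32. An equivalent per-index decode (sketch, derived from the word operations above):

```c
#include <stdint.h>

// Hypothetical per-index form of the q3_K scale decode: 16 six-bit scales
// packed in 12 bytes, biased by 32.
static int8_t q3k_scale(const uint8_t sc[12], int j) { // j in 0..15
    const uint8_t lo = (j < 8) ? (uint8_t)(sc[j] & 0xF) : (uint8_t)(sc[j - 8] >> 4);
    const uint8_t hi = (uint8_t)((sc[8 + (j % 4)] >> (2 * (j / 4))) & 3);
    return (int8_t)((lo | (hi << 4)) - 32);
}
```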
svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), q3bits_sv, m3b_sv)), svreinterpret_s8_u8(q3h_sv)); + + sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[0])); + + q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m0_sv, qhbits_sv_2), 2); + q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), q3bits_sv_1, m3b_sv)), svreinterpret_s8_u8(q3h_sv)); + + sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[1])); + + q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + + q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m1_sv, qhbits_sv_1), 1); + q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv, 2), m3b_sv)), svreinterpret_s8_u8(q3h_sv)); + + sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[2])); + + q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m1_sv, qhbits_sv_2), 1); + q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv_1, 2), m3b_sv)), svreinterpret_s8_u8(q3h_sv)); + + sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[3])); + + + scale += 4; + q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + + q3h_sv = svbic_u8_x(svptrue_b8(), m2_sv, qhbits_sv_1); + q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv, 4), m3b_sv)), svreinterpret_s8_u8(q3h_sv)); + + sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[0])); + + q3h_sv = svbic_u8_x(svptrue_b8(), m2_sv, qhbits_sv_2); + q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv_1, 4), m3b_sv)), svreinterpret_s8_u8(q3h_sv)); + + sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[1])); + + + q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + + q3h_sv = svlsr_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m3_sv, qhbits_sv_1), 1); + q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv, 6), m3b_sv)), svreinterpret_s8_u8(q3h_sv)); + + sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[2])); + + q3h_sv = svlsr_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m3_sv, qhbits_sv_2), 1); + q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv_1, 6), m3b_sv)), svreinterpret_s8_u8(q3h_sv)); + + sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[3])); + + if (j == 0) { + qhbits_sv_1 = svlsr_n_u8_x(svptrue_b8(), qhbits_sv_1, 4); + qhbits_sv_2 = svlsr_n_u8_x(svptrue_b8(), qhbits_sv_2, 4); + } + + scale += 4; + } + + sum += d * (svaddv_s32(svptrue_b32(), sumi1_1)); + } break; + case 256: + case 512: + { + svuint8_t qhbits_sv = 
svld1_u8(svptrue_pat_b8(SV_VL32), qh_sv); + svuint8_t q3h_sv; + + svint32_t sumi1_1 = svdup_n_s32(0); + svint8_t q3bytes_sv; + + for (int j = 0; j < QK_K/128; ++j) { + + const svuint8_t q3bits_sv = svld1_u8(svptrue_pat_b8(SV_VL32), q3_sv); q3_sv += 32; + svint8_t q8bytes_1_sv_1 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; + svint8_t q8bytes_1_sv_2 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; + + q3h_sv = svlsl_n_u8_x(svptrue_pat_b8(SV_VL32), svbic_u8_x(svptrue_pat_b8(SV_VL32), m0_sv, qhbits_sv), 2); + q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), q3bits_sv, m3b_sv)), svreinterpret_s8_u8(q3h_sv)); + + + svint32_t scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[0]), svdup_n_s32((int32_t)scale[1])); + sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), scale_1); + + q3h_sv = svlsl_n_u8_x(svptrue_pat_b8(SV_VL32), svbic_u8_x(svptrue_pat_b8(SV_VL32), m1_sv, qhbits_sv), 1); + q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q3bits_sv, 2), m3b_sv)), svreinterpret_s8_u8(q3h_sv)); + + scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[2]), svdup_n_s32((int32_t)scale[3])); + sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), scale_1); + + scale += 4; + q8bytes_1_sv_1 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; + q8bytes_1_sv_2 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; + + q3h_sv = svbic_u8_x(svptrue_pat_b8(SV_VL32), m2_sv, qhbits_sv); + q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q3bits_sv, 4), m3b_sv)), svreinterpret_s8_u8(q3h_sv)); + + scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[0]), svdup_n_s32((int32_t)scale[1])); + sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), scale_1); + + q3h_sv = svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), svbic_u8_x(svptrue_pat_b8(SV_VL32), m3_sv, qhbits_sv), 1); + q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q3bits_sv, 6), m3b_sv)), svreinterpret_s8_u8(q3h_sv)); + + scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[2]), svdup_n_s32((int32_t)scale[3])); + sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), scale_1); + + if (j == 0) { + qhbits_sv = svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), qhbits_sv, 4); + } + + scale += 4; + } + + sum += d * (svaddv_s32(svptrue_pat_b32(SV_VL8), sumi1_1)); + } break; + default: + assert(false && "Unsupported vector length"); + break; + } + } + *s = sum; + +#elif __ARM_NEON + + uint32_t aux[3]; + uint32_t utmp[4]; + + const uint8x16_t m3b = vdupq_n_u8(0x3); + const int32x4_t vzero = vdupq_n_s32(0); + + const uint8x16_t m0 = vdupq_n_u8(1); + const uint8x16_t m1 = vshlq_n_u8(m0, 1); + const uint8x16_t m2 = vshlq_n_u8(m0, 2); + const uint8x16_t m3 = vshlq_n_u8(m0, 3); + const int8_t m32 = 32; + + ggml_int8x16x4_t q3bytes; + + float sum = 0; + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].hmask; + const int8_t * 
GGML_RESTRICT q8 = y[i].qs; + + ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); + + ggml_uint8x16x4_t q3h; + + int32_t isum = 0; + + // Set up scales + memcpy(aux, x[i].scales, 12); + utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); + utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4); + utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4); + utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4); + + int8_t * scale = (int8_t *)utmp; + for (int j = 0; j < 16; ++j) scale[j] -= m32; + + for (int j = 0; j < QK_K/128; ++j) { + + const ggml_uint8x16x2_t q3bits = ggml_vld1q_u8_x2(q3); q3 += 32; + const ggml_int8x16x4_t q8bytes_1 = ggml_vld1q_s8_x4(q8); q8 += 64; + const ggml_int8x16x4_t q8bytes_2 = ggml_vld1q_s8_x4(q8); q8 += 64; + + q3h.val[0] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[0]), 2); + q3h.val[1] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[1]), 2); + q3h.val[2] = vshlq_n_u8(vbicq_u8(m1, qhbits.val[0]), 1); + q3h.val[3] = vshlq_n_u8(vbicq_u8(m1, qhbits.val[1]), 1); + + q3bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q3bits.val[0], m3b)), vreinterpretq_s8_u8(q3h.val[0])); + q3bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q3bits.val[1], m3b)), vreinterpretq_s8_u8(q3h.val[1])); + q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 2), m3b)), vreinterpretq_s8_u8(q3h.val[2])); + q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 2), m3b)), vreinterpretq_s8_u8(q3h.val[3])); + + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0]; + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1]; + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2]; + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3]; + + scale += 4; + + q3h.val[0] = vbicq_u8(m2, qhbits.val[0]); + q3h.val[1] = vbicq_u8(m2, qhbits.val[1]); + q3h.val[2] = vshrq_n_u8(vbicq_u8(m3, qhbits.val[0]), 1); + q3h.val[3] = vshrq_n_u8(vbicq_u8(m3, qhbits.val[1]), 1); + + q3bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 4), m3b)), vreinterpretq_s8_u8(q3h.val[0])); + q3bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 4), m3b)), vreinterpretq_s8_u8(q3h.val[1])); + q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 6), m3b)), vreinterpretq_s8_u8(q3h.val[2])); + q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 6), m3b)), vreinterpretq_s8_u8(q3h.val[3])); + + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0]; + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1]; + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2]; + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3]; + + scale += 4; + + if (j == 0) { + qhbits.val[0] = vshrq_n_u8(qhbits.val[0], 4); + qhbits.val[1] = vshrq_n_u8(qhbits.val[1], 4); + } + + } + sum += d * isum; + + } + + *s = sum; + +#else + // scalar version + // This function is written like this so the compiler can manage to vectorize most of it + // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the + // manually vectorized version above. Every other version I tried would run at least 4 times slower. 
+ // The ideal situation would be if we could just write the code once, and the compiler would + // automatically produce the best possible set of machine instructions, instead of us having to manually + // write vectorized versions for AVX, ARM_NEON, etc. + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + uint32_t auxs[4]; + const int8_t * scales = (const int8_t*)auxs; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + uint8_t m = 1; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + q3 += 32; + } + a = aux8; + + memcpy(auxs, x[i].scales, 12); + uint32_t tmp = auxs[2]; + auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); + auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); + auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4); + auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); + for (int j = 0; j < QK_K/16; ++j) { + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; + +#endif + +} + +void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); +#ifdef __ARM_FEATURE_MATMUL_INT8 + assert((nrc == 2) || (nrc == 1)); +#else + assert(nrc == 1); +#endif + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + uint32_t utmp[4]; + +#if defined(__ARM_FEATURE_MATMUL_INT8) + if (nrc == 2) { + const block_q4_K * GGML_RESTRICT x0 = x; + const block_q4_K * GGML_RESTRICT x1 = (const block_q4_K *) ((const uint8_t *)vx + bx); + const block_q8_K * GGML_RESTRICT y0 = y; + const block_q8_K * GGML_RESTRICT y1 = (const block_q8_K *) ((const uint8_t *)vy + by); + + const uint8x16_t m4b = vdupq_n_u8(0x0f); + + float32x4_t vfsum = vdupq_n_f32(0.0f); + + for (int i = 0; i < nb; ++i, ++x0, ++x1, ++y0, ++y1) { + const uint8_t * GGML_RESTRICT qx0 = x0->qs; + const uint8_t * GGML_RESTRICT qx1 = x1->qs; + const int8_t * GGML_RESTRICT qy0 = y0->qs; + const int8_t * GGML_RESTRICT qy1 = y1->qs; + + // decode scales and mins + int8_t x0_scales[8], x1_scales[8]; + int16x8_t 
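All q3_K paths above reconstruct a quant from two low bits in `qs` plus a third bit in `hmask`, where a set mask bit means "do not subtract 4"; hence the `vbicq_u8` on the masks in the NEON code and the `hm[l] & m ? 0 : 4` in the scalar version. A per-element sketch (hypothetical helper, indexing inferred from the scalar loop above):

```c
#include <stdint.h>

// q3_K quant reconstruction for element l in 0..255: qs[64] holds the 2-bit
// fields, hmask[32] the third bit; a set hmask bit selects the upper half.
static int8_t q3k_quant(const uint8_t qs[64], const uint8_t hmask[32], int l) {
    const int blk = l / 128;          // which 128-element half
    const int g   = (l % 128) / 32;   // which 2-bit field within a byte
    const int pos = l % 32;           // byte index within the half
    const int lo  = (qs[32*blk + pos] >> (2*g)) & 3;
    const int hi  = (hmask[pos] >> (4*blk + g)) & 1;
    return (int8_t)(lo - (hi ? 0 : 4));
}
```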
x0_mins, x1_mins; + { + uint32_t scales_mins[3]; + memcpy(scales_mins, x0->scales, 12); + const uint32_t mins_0_3 = scales_mins[1] & kmask1; + const uint32_t mins_4_7 = ((scales_mins[2] >> 4) & kmask2) | (((scales_mins[1] >> 6) & kmask3) << 4); + const uint32x2_t mins = {mins_0_3, mins_4_7}; + x0_mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins))); + uint32_t scales[2]; + scales[0] = scales_mins[0] & kmask1; // scales 0~3 + scales[1] = (scales_mins[2] & kmask2) | (((scales_mins[0] >> 6) & kmask3) << 4); // scales 4~7 + memcpy(x0_scales, scales, 8); + } + { + uint32_t scales_mins[3]; + memcpy(scales_mins, x1->scales, 12); + const uint32_t mins_0_3 = scales_mins[1] & kmask1; + const uint32_t mins_4_7 = ((scales_mins[2] >> 4) & kmask2) | (((scales_mins[1] >> 6) & kmask3) << 4); + const uint32x2_t mins = {mins_0_3, mins_4_7}; + x1_mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins))); + uint32_t scales[2]; + scales[0] = scales_mins[0] & kmask1; // scales 0~3 + scales[1] = (scales_mins[2] & kmask2) | (((scales_mins[0] >> 6) & kmask3) << 4); // scales 4~7 + memcpy(x1_scales, scales, 8); + } + + int32x4_t visum = {0}; + + // process 64 data points per iteration, totally 256 data points + for (int j = 0; j < QK_K / 64; ++j, qx0 += 32, qx1 += 32, qy0 += 64, qy1 += 64) { + const int8x16x4_t vy0 = vld1q_s8_x4(qy0); + const int8x16x4_t vy1 = vld1q_s8_x4(qy1); + + int8x16_t vx0[4], vx1[4]; + { + const uint8x16x2_t vv = vld1q_u8_x2(qx0); + vx0[0] = vreinterpretq_s8_u8(vandq_u8(vv.val[0], m4b)); + vx0[1] = vreinterpretq_s8_u8(vandq_u8(vv.val[1], m4b)); + vx0[2] = vreinterpretq_s8_u8(vshrq_n_u8(vv.val[0], 4)); + vx0[3] = vreinterpretq_s8_u8(vshrq_n_u8(vv.val[1], 4)); + } + { + const uint8x16x2_t vv = vld1q_u8_x2(qx1); + vx1[0] = vreinterpretq_s8_u8(vandq_u8(vv.val[0], m4b)); + vx1[1] = vreinterpretq_s8_u8(vandq_u8(vv.val[1], m4b)); + vx1[2] = vreinterpretq_s8_u8(vshrq_n_u8(vv.val[0], 4)); + vx1[3] = vreinterpretq_s8_u8(vshrq_n_u8(vv.val[1], 4)); + } + + // process 32 data points (share same block scale) per iteration + for (int k = 0; k < 2; ++k) { + const int blk = j * 2 + k; + const int32x4_t block_scale = { + x0_scales[blk], + x0_scales[blk], + x1_scales[blk], + x1_scales[blk], + }; + + int32x4_t vr = {0}; + for (int l = 0; l < 2; ++l) { + const int idx = k * 2 + l; + const int64x2_t vx0_s64 = vreinterpretq_s64_s8(vx0[idx]); + const int64x2_t vx1_s64 = vreinterpretq_s64_s8(vx1[idx]); + const int64x2_t vy0_s64 = vreinterpretq_s64_s8(vy0.val[idx]); + const int64x2_t vy1_s64 = vreinterpretq_s64_s8(vy1.val[idx]); + const int8x16_t vx_l = vreinterpretq_s8_s64(vzip1q_s64(vx0_s64, vx1_s64)); + const int8x16_t vx_h = vreinterpretq_s8_s64(vzip2q_s64(vx0_s64, vx1_s64)); + const int8x16_t vy_l = vreinterpretq_s8_s64(vzip1q_s64(vy0_s64, vy1_s64)); + const int8x16_t vy_h = vreinterpretq_s8_s64(vzip2q_s64(vy0_s64, vy1_s64)); + vr = vmmlaq_s32(vr, vx_l, vy_l); + vr = vmmlaq_s32(vr, vx_h, vy_h); + } + // apply block scale, will NOT overflow + // block_scale * sum_256(int4*int8) <= 2^(8+8+4+8) = 28 bits + visum = vmlaq_s32(visum, vr, block_scale); + } + } + + // adjust bias, apply superblock scale + { + int32_t bias[4]; + // no obvious uplift from sve sdot-16, just use neon mul add + const int16x8_t y0_sums = vpaddq_s16(vld1q_s16(y0->bsums), vld1q_s16(y0->bsums+8)); + const int16x8_t y1_sums = vpaddq_s16(vld1q_s16(y1->bsums), vld1q_s16(y1->bsums+8)); + bias[0] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y0_sums), vget_low_s16(x0_mins)), + vmull_s16(vget_high_s16(y0_sums), 
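The 12-byte q4_K `scales` field packs eight 6-bit scales and eight 6-bit mins; the `kmask1/2/3` word operations above are a vectorizable form of the per-index decode sketched below (hypothetical helper, consistent with the masks above):

```c
#include <stdint.h>

// Per-index q4_K scale/min decode: eight 6-bit scales and eight 6-bit mins
// packed into 12 bytes.
static void q4k_scale_min(const uint8_t sc[12], int j, uint8_t * d, uint8_t * m) {
    if (j < 4) {
        *d = sc[j]     & 63;
        *m = sc[j + 4] & 63;
    } else {
        *d = (uint8_t)((sc[j + 4] & 0xF) | ((sc[j - 4] >> 6) << 4));
        *m = (uint8_t)((sc[j + 4] >>  4) | ((sc[j    ] >> 6) << 4));
    }
}
```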
vget_high_s16(x0_mins)))); + bias[1] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y1_sums), vget_low_s16(x0_mins)), + vmull_s16(vget_high_s16(y1_sums), vget_high_s16(x0_mins)))); + bias[2] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y0_sums), vget_low_s16(x1_mins)), + vmull_s16(vget_high_s16(y0_sums), vget_high_s16(x1_mins)))); + bias[3] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y1_sums), vget_low_s16(x1_mins)), + vmull_s16(vget_high_s16(y1_sums), vget_high_s16(x1_mins)))); + const float32x4_t dmins = { + GGML_CPU_FP16_TO_FP32(x0->dmin) * y0->d, + GGML_CPU_FP16_TO_FP32(x0->dmin) * y1->d, + GGML_CPU_FP16_TO_FP32(x1->dmin) * y0->d, + GGML_CPU_FP16_TO_FP32(x1->dmin) * y1->d, + }; + vfsum = vmlsq_f32(vfsum, vcvtq_f32_s32(vld1q_s32(bias)), dmins); + + const float32x4_t superblock_scale = { + GGML_CPU_FP16_TO_FP32(x0->d) * y0->d, + GGML_CPU_FP16_TO_FP32(x0->d) * y1->d, + GGML_CPU_FP16_TO_FP32(x1->d) * y0->d, + GGML_CPU_FP16_TO_FP32(x1->d) * y1->d, + }; + vfsum = vmlaq_f32(vfsum, vcvtq_f32_s32(visum), superblock_scale); + } + } + + // vfsum = ABCD -> ACBD + // AC -> s, BD -> (s+bs) + vfsum = vzip1q_f32(vfsum, vextq_f32(vfsum, vfsum, 2)); + vst1_f32(s, vget_low_f32 (vfsum)); + vst1_f32(s + bs, vget_high_f32(vfsum)); + + return; + } +#endif + +#ifdef __ARM_FEATURE_SVE + float sumf = 0; + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8)); + + memcpy(utmp, x[i].scales, K_SCALE_SIZE); + + uint32x2_t mins8 = { 0 }; + mins8 = vset_lane_u32(utmp[1] & kmask1, mins8, 0); + mins8 = vset_lane_u32(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), mins8, 1); + + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[0] &= kmask1; + + const int16x8_t mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins8))); + const int32x4_t prod = vaddq_s32(vmull_s16(vget_low_s16 (q8sums), vget_low_s16 (mins)), + vmull_s16(vget_high_s16(q8sums), vget_high_s16(mins))); + sumf -= dmin * vaddvq_s32(prod); + + const uint8_t * scales = (const uint8_t *)utmp; + + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + const int vector_length = ggml_cpu_get_sve_cnt()*8; + const svuint8_t m4b = svdup_n_u8(0xf); + const svint32_t mzero = svdup_n_s32(0); + svint32_t sumi1 = svdup_n_s32(0); + svint32_t sumi1_1 = svdup_n_s32(0); + svint32_t sumi1_2 = svdup_n_s32(0); + svint32_t sumi2 = svdup_n_s32(0); + svint32_t sumi2_1 = svdup_n_s32(0); + svint32_t sumi2_2 = svdup_n_s32(0); + switch (vector_length) { + case 128: + { + for (int j = 0; j < QK_K/64; ++j) { + svint8_t q4bytes = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svld1_u8(svptrue_b8(), q4), m4b)); + svint8_t q8bytes = svld1_s8(svptrue_b8(), q8); q8 += 16; + sumi1_1 = svmla_n_s32_x(svptrue_b32(), sumi1_1, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+0]); + q4bytes = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svld1_u8(svptrue_b8(), q4+16), m4b)); + q8bytes = svld1_s8(svptrue_b8(), q8); q8 += 16; + sumi1_2 = svmla_n_s32_x(svptrue_b32(), sumi1_2, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+0]); + + q4bytes = svreinterpret_s8_u8(svlsr_n_u8_x(svptrue_b8(), svld1_u8(svptrue_b8(), q4), 4)); + q8bytes = svld1_s8(svptrue_b8(), q8); q8 += 16; + sumi2_1 = svmla_n_s32_x(svptrue_b32(), sumi2_1, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+1]); + q4bytes = svreinterpret_s8_u8(svlsr_n_u8_x(svptrue_b8(), 
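After the `nrc == 2` loop, `vfsum` holds the four products `{x0.y0, x0.y1, x1.y0, x1.y1}`; the `vextq_f32`/`vzip1q_f32` pair reorders this ABCD to ACBD so one two-lane store writes the y0 column at `s` and the other writes the y1 column at `s + bs`. A scalar equivalent of the store (sketch):

```c
#include <stddef.h>

// Scalar equivalent of the ABCD -> ACBD shuffle and split store above;
// v = {x0.y0, x0.y1, x1.y0, x1.y1}.
static void store_2x2(float * s, size_t bs, const float v[4]) {
    s[0]      = v[0];   // x0 . y0
    s[1]      = v[2];   // x1 . y0
    s[bs + 0] = v[1];   // x0 . y1
    s[bs + 1] = v[3];   // x1 . y1
}
```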
svld1_u8(svptrue_b8(), q4+16), 4)); + q8bytes = svld1_s8(svptrue_b8(), q8); q8 += 16; + sumi2_2 = svmla_n_s32_x(svptrue_b32(), sumi2_2, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+1]); + q4 += 32; + } + sumi1 = svadd_s32_x(svptrue_b32(), sumi1_1, sumi1_2); + sumi2 = svadd_s32_x(svptrue_b32(), sumi2_1, sumi2_2); + sumf += d * (svaddv_s32(svptrue_b32(), svadd_s32_x(svptrue_b32(), sumi1, sumi2))); + } break; + case 256: + case 512: + { + for (int j = 0; j < QK_K/64; ++j) { + const svuint8_t q4bits = svld1_u8(svptrue_pat_b8(SV_VL32), q4); q4 += 32; + svint8_t q4bytes = svreinterpret_s8_u8(svand_u8_x(svptrue_pat_b8(SV_VL32), q4bits, m4b)); + svint8_t q8bytes = svld1_s8(svptrue_pat_b8(SV_VL32), q8); q8 += 32; + sumi1 = svmla_n_s32_x(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+0]); + + q4bytes = svreinterpret_s8_u8(svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q4bits, 4)); + q8bytes = svld1_s8(svptrue_pat_b8(SV_VL32), q8); q8 += 32; + sumi2 = svmla_n_s32_x(svptrue_pat_b32(SV_VL8), sumi2, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+1]); + } + sumf += d * (svaddv_s32(svptrue_pat_b32(SV_VL8), svadd_s32_x(svptrue_pat_b32(SV_VL8), sumi1, sumi2))); + } break; + default: + assert(false && "Unsupported vector length"); + break; + } + } + *s = sumf; +#elif defined __ARM_NEON + const uint8x16_t m4b = vdupq_n_u8(0xf); + const int32x4_t mzero = vdupq_n_s32(0); + + ggml_int8x16x2_t q4bytes; + ggml_int8x16x2_t q8bytes; + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8)); + + memcpy(utmp, x[i].scales, 12); + + uint32x2_t mins8 = { 0 }; + mins8 = vset_lane_u32(utmp[1] & kmask1, mins8, 0); + mins8 = vset_lane_u32(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), mins8, 1); + + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[0] &= kmask1; + + const int16x8_t mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins8))); + const int32x4_t prod = vaddq_s32(vmull_s16(vget_low_s16 (q8sums), vget_low_s16 (mins)), + vmull_s16(vget_high_s16(q8sums), vget_high_s16(mins))); + sumf -= dmin * vaddvq_s32(prod); + + const uint8_t * scales = (const uint8_t *)utmp; + + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + int32_t sumi1 = 0; + int32_t sumi2 = 0; + + for (int j = 0; j < QK_K/64; ++j) { + const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4); q4 += 32; + + q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32; + q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b)); + q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b)); + + const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]); + sumi1 += vaddvq_s32(p1) * scales[2*j+0]; + + q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32; + q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4)); + q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4)); + + const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]); + + sumi2 += vaddvq_s32(p2) * scales[2*j+1]; + } + + sumf += d * (sumi1 + sumi2); + + } + + *s = sumf; + +#else + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + 
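+    // Portable fallback: expand each 256-value superblock to plain int8 in aux8,
+    // accumulate q8*aux8 products across 8 int32 lanes (aux32), and only touch
+    // floating point once per superblock when folding in d and dmin.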
int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + for (int j = 0; j < QK_K/64; ++j) { + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); + a += 32; + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); + a += 32; q4 += 32; + } + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/32; ++j) { + int32_t scale = scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + uint32_t utmp[4]; + + +#ifdef __ARM_NEON + const uint8x16_t m4b = vdupq_n_u8(0xf); + const uint8x16_t mone = vdupq_n_u8(1); + const uint8x16_t mtwo = vdupq_n_u8(2); + const int32x4_t mzero = vdupq_n_s32(0); + + ggml_int8x16x4_t q5bytes; + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8)); + + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + const uint8x8_t mins8 = vld1_u8((const uint8_t*)utmp + 8); + const int16x8_t mins = vreinterpretq_s16_u16(vmovl_u8(mins8)); + const int32x4_t prod = vaddq_s32(vmull_s16(vget_low_s16 (q8sums), vget_low_s16 (mins)), + vmull_s16(vget_high_s16(q8sums), vget_high_s16(mins))); + int32_t sumi_mins = vaddvq_s32(prod); + + const uint8_t * scales = (const uint8_t *)utmp; + + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + ggml_uint8x16x2_t qhbits = 
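+        // q5_K scalar view: q5 = (qs & 0xf) | (((qh >> plane) & 1) << 4). mone/mtwo
+        // select two consecutive qh bit planes per 64-value step and vshlq places
+        // them at bit 4; qhbits is shifted right by 2 below to expose the next pair.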
ggml_vld1q_u8_x2(qh); + + ggml_uint8x16x4_t q5h; + + int32_t sumi = 0; + + for (int j = 0; j < QK_K/64; ++j) { + + const ggml_uint8x16x2_t q5bits = ggml_vld1q_u8_x2(q5); q5 += 32; + const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64; + + q5h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4); + q5h.val[1] = vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4); + q5h.val[2] = vshlq_n_u8(vandq_u8(mtwo, qhbits.val[0]), 3); + q5h.val[3] = vshlq_n_u8(vandq_u8(mtwo, qhbits.val[1]), 3); + qhbits.val[0] = vshrq_n_u8(qhbits.val[0], 2); + qhbits.val[1] = vshrq_n_u8(qhbits.val[1], 2); + + q5bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q5bits.val[0], m4b), q5h.val[0])); + q5bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q5bits.val[1], m4b), q5h.val[1])); + q5bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[0], 4), q5h.val[2])); + q5bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[1], 4), q5h.val[3])); + + sumi += vaddvq_s32(ggml_vdotq_s32(ggml_vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]), q5bytes.val[1], q8bytes.val[1])) * *scales++; + sumi += vaddvq_s32(ggml_vdotq_s32(ggml_vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]), q5bytes.val[3], q8bytes.val[3])) * *scales++; + } + + sumf += d * sumi - dmin * sumi_mins; + } + + *s = sumf; + +#else + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + uint8_t m = 1; + for (int j = 0; j < QK_K/64; ++j) { + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); + for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); + for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 
16 : 0); + a += 32; m <<= 1; + q4 += 32; + } + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/32; ++j) { + int32_t scale = scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); +#ifdef __ARM_FEATURE_MATMUL_INT8 + assert((nrc == 2) || (nrc == 1)); +#else + assert(nrc == 1); +#endif + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q6_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__ARM_FEATURE_MATMUL_INT8) + if (nrc == 2) { + const block_q6_K * GGML_RESTRICT x0 = x; + const block_q6_K * GGML_RESTRICT x1 = (const block_q6_K *) ((const uint8_t *)vx + bx); + const block_q8_K * GGML_RESTRICT y0 = y; + const block_q8_K * GGML_RESTRICT y1 = (const block_q8_K *) ((const uint8_t *)vy + by); + + float32x4_t vfsum = vdupq_n_f32(0.0f); + + for (int i = 0; i < nb; ++i, ++x0, ++x1, ++y0, ++y1) { + const uint8_t * GGML_RESTRICT ql0 = x0->ql; + const uint8_t * GGML_RESTRICT ql1 = x1->ql; + const uint8_t * GGML_RESTRICT qh0 = x0->qh; + const uint8_t * GGML_RESTRICT qh1 = x1->qh; + const int8_t * GGML_RESTRICT qy0 = y0->qs; + const int8_t * GGML_RESTRICT qy1 = y1->qs; + + const uint8x16_t mone = vdupq_n_u8(0x30); + const uint8x16_t m4b = vdupq_n_u8(0x0f); + + int32x4_t visum = vdupq_n_s32(0); + + // process 8 blocks per iteration, totally 16 blocks + for (int j = 0; j < 2; ++j, qh0 += 32, ql0 += 64, qh1 += 32, ql1 += 64) { + int8x16_t vx0[8], vx1[8]; + + // de-quantize vx0[8] + { + const uint8x16x2_t qh_bits = vld1q_u8_x2(qh0); + const uint8x16x4_t ql_bits = vld1q_u8_x4(ql0); + + uint8x16_t q6h_0 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[0], 4)); + uint8x16_t q6h_1 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[1], 4)); + uint8x16_t q6h_2 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[0], 2)); + uint8x16_t q6h_3 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[1], 2)); + + vx0[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[0], m4b), q6h_0)); + vx0[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[1], m4b), q6h_1)); + vx0[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[2], m4b), q6h_2)); + vx0[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[3], m4b), q6h_3)); + + q6h_0 = vandq_u8(mone, qh_bits.val[0]); + q6h_1 = vandq_u8(mone, 
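+                    // q6_K scalar view: q6 = (ql & 0xf) | (((qh >> 2k) & 3) << 4), with the
+                    // real value being q6 - 32. mone == 0x30 keeps the two high bits after
+                    // qh is shifted into place; the -32 offset is not applied here but is
+                    // folded into the bias (32 * scales.bsums) subtracted after accumulation.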
qh_bits.val[1]); + q6h_2 = vandq_u8(mone, vshrq_n_u8(qh_bits.val[0], 2)); + q6h_3 = vandq_u8(mone, vshrq_n_u8(qh_bits.val[1], 2)); + + vx0[4] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[0], 4), q6h_0)); + vx0[5] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[1], 4), q6h_1)); + vx0[6] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[2], 4), q6h_2)); + vx0[7] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[3], 4), q6h_3)); + } + + // de-quantize vx1[8] + { + const uint8x16x2_t qh_bits = vld1q_u8_x2(qh1); + const uint8x16x4_t ql_bits = vld1q_u8_x4(ql1); + + uint8x16_t q6h_0 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[0], 4)); + uint8x16_t q6h_1 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[1], 4)); + uint8x16_t q6h_2 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[0], 2)); + uint8x16_t q6h_3 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[1], 2)); + + vx1[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[0], m4b), q6h_0)); + vx1[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[1], m4b), q6h_1)); + vx1[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[2], m4b), q6h_2)); + vx1[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[3], m4b), q6h_3)); + + q6h_0 = vandq_u8(mone, qh_bits.val[0]); + q6h_1 = vandq_u8(mone, qh_bits.val[1]); + q6h_2 = vandq_u8(mone, vshrq_n_u8(qh_bits.val[0], 2)); + q6h_3 = vandq_u8(mone, vshrq_n_u8(qh_bits.val[1], 2)); + + vx1[4] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[0], 4), q6h_0)); + vx1[5] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[1], 4), q6h_1)); + vx1[6] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[2], 4), q6h_2)); + vx1[7] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[3], 4), q6h_3)); + } + + // process 16 elements (one block with same scale) per iteration + // - vx = concat(ql, qh) - 32 + // - r1,r2,r3,r4 = smmla(vx, vy) + for (int k = 0; k < 8; ++k) { + const int blk = j * 8 + k; + + const int8x16_t vy0 = vld1q_s8(qy0); + const int8x16_t vy1 = vld1q_s8(qy1); + qy0 += 16; + qy1 += 16; + + const int32x4_t block_scale = { + x0->scales[blk], + x0->scales[blk], + x1->scales[blk], + x1->scales[blk], + }; + + // calculate four results at once with outer product + const int8x16_t vx_l = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(vx0[k]), vreinterpretq_s64_s8(vx1[k]))); + const int8x16_t vx_h = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(vx0[k]), vreinterpretq_s64_s8(vx1[k]))); + const int8x16_t vy_l = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(vy0), vreinterpretq_s64_s8(vy1))); + const int8x16_t vy_h = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(vy0), vreinterpretq_s64_s8(vy1))); + int32x4_t vr = vdupq_n_s32(0); + vr = vmmlaq_s32(vr, vx_l, vy_l); + vr = vmmlaq_s32(vr, vx_h, vy_h); + + // apply block scale, will NOT overflow + // block_scale * sum_256(int6*int8) <= 2^(8+8+6+8) = 30 bits + visum = vmlaq_s32(visum, vr, block_scale); + } + } + + // adjust bias, apply superblock scale + { + int32_t bias[4]; +#ifdef __ARM_FEATURE_SVE + const svbool_t pg16_8 = svptrue_pat_b16(SV_VL8); + const svbool_t pg8_8 = svptrue_pat_b8(SV_VL8); + const svint16_t y0_q8sums_0 = svld1_s16(pg16_8, y0->bsums); + const svint16_t y0_q8sums_1 = svld1_s16(pg16_8, y0->bsums + 8); + const svint16_t y1_q8sums_0 = svld1_s16(pg16_8, y1->bsums); + const svint16_t y1_q8sums_1 = svld1_s16(pg16_8, y1->bsums + 8); + const svint16_t x0_q6scales_0 = svunpklo_s16(svld1_s8(pg8_8, x0->scales)); + const svint16_t x0_q6scales_1 = svunpklo_s16(svld1_s8(pg8_8, x0->scales + 8)); + const svint16_t x1_q6scales_0 = 
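+                // svdot_s64 accumulates four int16*int16 products into each 64-bit lane,
+                // so a single dot plus a horizontal add yields sum_b(bsum_b * scale_b)
+                // for 8 sub-blocks at a time; NEON has no 16-bit dot product, hence the
+                // widen-and-multiply fallback in the #else branch below.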
svunpklo_s16(svld1_s8(pg8_8, x1->scales)); + const svint16_t x1_q6scales_1 = svunpklo_s16(svld1_s8(pg8_8, x1->scales + 8)); + const svint64_t zero = svdup_n_s64(0); + bias[0] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y0_q8sums_0, x0_q6scales_0), + svdot_s64(zero, y0_q8sums_1, x0_q6scales_1))); + bias[1] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y1_q8sums_0, x0_q6scales_0), + svdot_s64(zero, y1_q8sums_1, x0_q6scales_1))); + bias[2] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y0_q8sums_0, x1_q6scales_0), + svdot_s64(zero, y0_q8sums_1, x1_q6scales_1))); + bias[3] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y1_q8sums_0, x1_q6scales_0), + svdot_s64(zero, y1_q8sums_1, x1_q6scales_1))); +#else + // NEON doesn't support int16 dot product, fallback to separated mul and add + const int16x8x2_t q8sums0 = vld1q_s16_x2(y0->bsums); + const int16x8x2_t q8sums1 = vld1q_s16_x2(y1->bsums); + + int8x16_t scales_s8 = vld1q_s8(x0->scales); + const int16x8x2_t q6scales0 = {{vmovl_s8(vget_low_s8(scales_s8)), vmovl_s8(vget_high_s8(scales_s8))}}; + scales_s8 = vld1q_s8(x1->scales); + const int16x8x2_t q6scales1 = {{vmovl_s8(vget_low_s8(scales_s8)), vmovl_s8(vget_high_s8(scales_s8))}}; + + int32x4_t prod; + prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums0.val[0]), vget_low_s16 (q6scales0.val[0])), + vmull_s16(vget_high_s16(q8sums0.val[0]), vget_high_s16(q6scales0.val[0]))), + vaddq_s32(vmull_s16(vget_low_s16 (q8sums0.val[1]), vget_low_s16 (q6scales0.val[1])), + vmull_s16(vget_high_s16(q8sums0.val[1]), vget_high_s16(q6scales0.val[1])))); + bias[0] = vaddvq_s32(prod); + prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums1.val[0]), vget_low_s16 (q6scales0.val[0])), + vmull_s16(vget_high_s16(q8sums1.val[0]), vget_high_s16(q6scales0.val[0]))), + vaddq_s32(vmull_s16(vget_low_s16 (q8sums1.val[1]), vget_low_s16 (q6scales0.val[1])), + vmull_s16(vget_high_s16(q8sums1.val[1]), vget_high_s16(q6scales0.val[1])))); + bias[1] = vaddvq_s32(prod); + prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums0.val[0]), vget_low_s16 (q6scales1.val[0])), + vmull_s16(vget_high_s16(q8sums0.val[0]), vget_high_s16(q6scales1.val[0]))), + vaddq_s32(vmull_s16(vget_low_s16 (q8sums0.val[1]), vget_low_s16 (q6scales1.val[1])), + vmull_s16(vget_high_s16(q8sums0.val[1]), vget_high_s16(q6scales1.val[1])))); + bias[2] = vaddvq_s32(prod); + prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums1.val[0]), vget_low_s16 (q6scales1.val[0])), + vmull_s16(vget_high_s16(q8sums1.val[0]), vget_high_s16(q6scales1.val[0]))), + vaddq_s32(vmull_s16(vget_low_s16 (q8sums1.val[1]), vget_low_s16 (q6scales1.val[1])), + vmull_s16(vget_high_s16(q8sums1.val[1]), vget_high_s16(q6scales1.val[1])))); + bias[3] = vaddvq_s32(prod); + +#endif + const int32x4_t vibias = vmulq_n_s32(vld1q_s32(bias), 32); + + const float32x4_t superblock_scale = { + GGML_CPU_FP16_TO_FP32(x0->d) * y0->d, + GGML_CPU_FP16_TO_FP32(x0->d) * y1->d, + GGML_CPU_FP16_TO_FP32(x1->d) * y0->d, + GGML_CPU_FP16_TO_FP32(x1->d) * y1->d, + }; + + visum = vsubq_s32(visum, vibias); + vfsum = vmlaq_f32(vfsum, vcvtq_f32_s32(visum), superblock_scale); + } + } + + // vfsum = ABCD -> ACBD + // AC -> s, BD -> (s+bs) + vfsum = vzip1q_f32(vfsum, vextq_f32(vfsum, vfsum, 2)); + vst1_f32(s, vget_low_f32 (vfsum)); + vst1_f32(s + bs, vget_high_f32(vfsum)); + + return; + } +#endif + +#ifdef __ARM_FEATURE_SVE + const int vector_length = ggml_cpu_get_sve_cnt()*8; + float sum = 0; + svuint8_t m4b = 
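+    // vector_length is the SVE register width in bits; the 128-bit case mirrors the
+    // NEON data layout with 16-byte predicated loads, while the 256/512-bit cases
+    // consume a whole 32-byte row per load and carry per-lane scale vectors instead
+    // of scalar multipliers.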
svdup_n_u8(0xf); + svint32_t vzero = svdup_n_s32(0); + svuint8_t mone = svdup_n_u8(0x30); + svint8_t q6bytes_1, q6bytes_2, q6bytes_3, q6bytes_4; + svuint8_t q6h_1, q6h_2, q6h_3, q6h_4; + + for (int i = 0; i < nb; ++i) { + const float d_all = GGML_CPU_FP16_TO_FP32(x[i].d); + + const uint8_t * GGML_RESTRICT q6 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + const int8_t * GGML_RESTRICT scale = x[i].scales; + + const svbool_t pg16_8 = svptrue_pat_b16(SV_VL8); + const svint16_t q8sums_1 = svld1_s16(pg16_8, y[i].bsums); + const svint16_t q8sums_2 = svld1_s16(pg16_8, y[i].bsums + 8); + const svint16_t q6scales_1 = svunpklo_s16(svld1_s8(svptrue_pat_b8(SV_VL8), scale)); + const svint16_t q6scales_2 = svunpklo_s16(svld1_s8(svptrue_pat_b8(SV_VL8), scale + 8)); + const svint64_t prod = svdup_n_s64(0); + int32_t isum_mins = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(prod, q8sums_1, q6scales_1), + svdot_s64(prod, q8sums_2, q6scales_2))); + int32_t isum = 0; + + switch (vector_length) { + case 128: + { + const svbool_t pg32_4 = svptrue_pat_b32(SV_VL4); + const svbool_t pg8_16 = svptrue_pat_b8(SV_VL16); + svint32_t isum_tmp = svdup_n_s32(0); + for (int j = 0; j < QK_K/128; ++j) { + svuint8_t qhbits_1 = svld1_u8(pg8_16, qh); + svuint8_t qhbits_2 = svld1_u8(pg8_16, qh+16); + qh += 32; + svuint8_t q6bits_1 = svld1_u8(pg8_16, q6); + svuint8_t q6bits_2 = svld1_u8(pg8_16, q6+16); + svuint8_t q6bits_3 = svld1_u8(pg8_16, q6+32); + svuint8_t q6bits_4 = svld1_u8(pg8_16, q6+48); + q6 += 64; + svint8_t q8bytes_1 = svld1_s8(pg8_16, q8); + svint8_t q8bytes_2 = svld1_s8(pg8_16, q8+16); + svint8_t q8bytes_3 = svld1_s8(pg8_16, q8+32); + svint8_t q8bytes_4 = svld1_s8(pg8_16, q8+48); + q8 += 64; + + q6h_1 = svand_u8_x(pg16_8, mone, svlsl_n_u8_x(pg16_8, qhbits_1, 4)); + q6h_2 = svand_u8_x(pg16_8, mone, svlsl_n_u8_x(pg16_8, qhbits_2, 4)); + q6h_3 = svand_u8_x(pg16_8, mone, svlsl_n_u8_x(pg16_8, qhbits_1, 2)); + q6h_4 = svand_u8_x(pg16_8, mone, svlsl_n_u8_x(pg16_8, qhbits_2, 2)); + q6bytes_1 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svand_u8_x(pg8_16, q6bits_1, m4b), q6h_1)); + q6bytes_2 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svand_u8_x(pg8_16, q6bits_2, m4b), q6h_2)); + q6bytes_3 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svand_u8_x(pg8_16, q6bits_3, m4b), q6h_3)); + q6bytes_4 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svand_u8_x(pg8_16, q6bits_4, m4b), q6h_4)); + isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_1, q8bytes_1), scale[0]); + isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_2, q8bytes_2), scale[1]); + isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_3, q8bytes_3), scale[2]); + isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_4, q8bytes_4), scale[3]); + + scale += 4; + q8bytes_1 = svld1_s8(pg8_16, q8); + q8bytes_2 = svld1_s8(pg8_16, q8+16); + q8bytes_3 = svld1_s8(pg8_16, q8+32); + q8bytes_4 = svld1_s8(pg8_16, q8+48); + q8 += 64; + + q6h_1 = svand_u8_x(pg16_8, mone, qhbits_1); + q6h_2 = svand_u8_x(pg16_8, mone, qhbits_2); + q6h_3 = svand_u8_x(pg16_8, mone, svlsr_n_u8_x(pg16_8, qhbits_1, 2)); + q6h_4 = svand_u8_x(pg16_8, mone, svlsr_n_u8_x(pg16_8, qhbits_2, 2)); + q6bytes_1 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svlsr_n_u8_x(pg8_16, q6bits_1, 4), q6h_1)); + q6bytes_2 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svlsr_n_u8_x(pg8_16, q6bits_2, 4), q6h_2)); + q6bytes_3 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svlsr_n_u8_x(pg8_16, q6bits_3, 4), q6h_3)); + q6bytes_4 = 
svreinterpret_s8_u8(svorr_u8_x(pg8_16, svlsr_n_u8_x(pg8_16, q6bits_4, 4), q6h_4)); + isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_1, q8bytes_1), scale[0]); + isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_2, q8bytes_2), scale[1]); + isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_3, q8bytes_3), scale[2]); + isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_4, q8bytes_4), scale[3]); + scale += 4; + } + isum += svaddv_s32(pg32_4, isum_tmp); + sum += d_all * y[i].d * (isum - 32 * isum_mins); + } + break; + case 256: + case 512: + { + const svbool_t pg8_2 = svptrue_pat_b8(SV_VL2); + const svbool_t pg32_8 = svptrue_pat_b32(SV_VL8); + const svbool_t pg8_32 = svptrue_pat_b8(SV_VL32); + svint32_t isum_tmp = svdup_n_s32(0); + for (int j = 0; j < QK_K/128; j++) { + svuint8_t qhbits_1 = svld1_u8(pg8_32, qh); + qh += 32; + svuint8_t q6bits_1 = svld1_u8(pg8_32, q6); + svuint8_t q6bits_2 = svld1_u8(pg8_32, q6+32); + q6 += 64; + svint8_t q8bytes_1 = svld1_s8(pg8_32, q8); + svint8_t q8bytes_2 = svld1_s8(pg8_32, q8+32); + svint8_t q8bytes_3 = svld1_s8(pg8_32, q8+64); + svint8_t q8bytes_4 = svld1_s8(pg8_32, q8+96); + q8 += 128; + q6h_1 = svand_u8_x(pg8_32, mone, svlsl_n_u8_x(pg8_32, qhbits_1, 4)); + q6h_2 = svand_u8_x(pg8_32, mone, svlsl_n_u8_x(pg8_32, qhbits_1, 2)); + q6h_3 = svand_u8_x(pg8_32, mone, qhbits_1); + q6h_4 = svand_u8_x(pg8_32, mone, svlsr_n_u8_x(pg8_32, qhbits_1, 2)); + q6bytes_1 = svreinterpret_s8_u8(svorr_u8_x(pg8_32, svand_u8_x(pg8_32, q6bits_1, m4b), q6h_1)); + q6bytes_2 = svreinterpret_s8_u8(svorr_u8_x(pg8_32, svand_u8_x(pg8_32, q6bits_2, m4b), q6h_2)); + q6bytes_3 = svreinterpret_s8_u8(svorr_u8_x(pg8_32, svlsr_n_u8_x(pg8_32, q6bits_1, 4), q6h_3)); + q6bytes_4 = svreinterpret_s8_u8(svorr_u8_x(pg8_32, svlsr_n_u8_x(pg8_32, q6bits_2, 4), q6h_4)); + + svint8_t scale_lane_1_tmp = svld1_s8(pg8_2, scale); + scale_lane_1_tmp= svzip1_s8(scale_lane_1_tmp, scale_lane_1_tmp); + scale_lane_1_tmp= svzip1_s8(scale_lane_1_tmp, scale_lane_1_tmp); + svint8_t scale_lane_2_tmp = svld1_s8(pg8_2, scale+2); + scale_lane_2_tmp = svzip1_s8(scale_lane_2_tmp, scale_lane_2_tmp); + scale_lane_2_tmp = svzip1_s8(scale_lane_2_tmp, scale_lane_2_tmp); + svint8_t scale_lane_3_tmp = svld1_s8(pg8_2, scale+4); + scale_lane_3_tmp = svzip1_s8(scale_lane_3_tmp, scale_lane_3_tmp); + scale_lane_3_tmp = svzip1_s8(scale_lane_3_tmp, scale_lane_3_tmp); + svint8_t scale_lane_4_tmp = svld1_s8(pg8_2, scale+6); + scale_lane_4_tmp = svzip1_s8(scale_lane_4_tmp, scale_lane_4_tmp); + scale_lane_4_tmp = svzip1_s8(scale_lane_4_tmp, scale_lane_4_tmp); + svint32_t scale_lane_1 = svunpklo_s32(svunpklo_s16(scale_lane_1_tmp)); + svint32_t scale_lane_2 = svunpklo_s32(svunpklo_s16(scale_lane_2_tmp)); + svint32_t scale_lane_3 = svunpklo_s32(svunpklo_s16(scale_lane_3_tmp)); + svint32_t scale_lane_4 = svunpklo_s32(svunpklo_s16(scale_lane_4_tmp)); + + isum_tmp = svmla_s32_x(pg32_8, isum_tmp, svdot_s32(vzero, q6bytes_1, q8bytes_1), scale_lane_1); + isum_tmp = svmla_s32_x(pg32_8, isum_tmp, svdot_s32(vzero, q6bytes_2, q8bytes_2), scale_lane_2); + isum_tmp = svmla_s32_x(pg32_8, isum_tmp, svdot_s32(vzero, q6bytes_3, q8bytes_3), scale_lane_3); + isum_tmp = svmla_s32_x(pg32_8, isum_tmp, svdot_s32(vzero, q6bytes_4, q8bytes_4), scale_lane_4); + scale += 8; + } + isum += svaddv_s32(pg32_8, isum_tmp); + sum += d_all * y[i].d * (isum - 32 * isum_mins); + } + break; + default: + assert(false && "Unsupported vector length"); + break; + } + } + + *s = sum; + +#elif __ARM_NEON + float sum = 
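+    // The commented-out m32s lines below are the alternative that subtracts the
+    // q6_K +32 offset from every dequantized byte; this version keeps the biased
+    // values and removes the offset once per superblock via isum_mins
+    // (sum += d_all * y[i].d * (isum - 32 * isum_mins)), saving a vsubq_s8 per 16 values.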
0; + + const uint8x16_t m4b = vdupq_n_u8(0xF); + const int32x4_t vzero = vdupq_n_s32(0); + //const int8x16_t m32s = vdupq_n_s8(32); + + const uint8x16_t mone = vdupq_n_u8(3); + + ggml_int8x16x4_t q6bytes; + ggml_uint8x16x4_t q6h; + + for (int i = 0; i < nb; ++i) { + + const float d_all = GGML_CPU_FP16_TO_FP32(x[i].d); + + const uint8_t * GGML_RESTRICT q6 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + const int8_t * GGML_RESTRICT scale = x[i].scales; + + const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums); + const int8x16_t scales = vld1q_s8(scale); + const ggml_int16x8x2_t q6scales = {{vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))}}; + + const int32x4_t prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[0]), vget_low_s16 (q6scales.val[0])), + vmull_s16(vget_high_s16(q8sums.val[0]), vget_high_s16(q6scales.val[0]))), + vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[1]), vget_low_s16 (q6scales.val[1])), + vmull_s16(vget_high_s16(q8sums.val[1]), vget_high_s16(q6scales.val[1])))); + int32_t isum_mins = vaddvq_s32(prod); + + int32_t isum = 0; + + for (int j = 0; j < QK_K/128; ++j) { + + ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); qh += 32; + ggml_uint8x16x4_t q6bits = ggml_vld1q_u8_x4(q6); q6 += 64; + ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64; + + q6h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4); + q6h.val[1] = vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4); + uint8x16_t shifted = vshrq_n_u8(qhbits.val[0], 2); + q6h.val[2] = vshlq_n_u8(vandq_u8(mone, shifted), 4); + shifted = vshrq_n_u8(qhbits.val[1], 2); + q6h.val[3] = vshlq_n_u8(vandq_u8(mone, shifted), 4); + + //q6bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[0], m4b), q6h.val[0])), m32s); + //q6bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[1], m4b), q6h.val[1])), m32s); + //q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2])), m32s); + //q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3])), m32s); + q6bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[0], m4b), q6h.val[0])); + q6bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[1], m4b), q6h.val[1])); + q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2])); + q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3])); + + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] + + vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] + + vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] + + vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3]; + + scale += 4; + + q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64; + + shifted = vshrq_n_u8(qhbits.val[0], 4); + q6h.val[0] = vshlq_n_u8(vandq_u8(mone, shifted), 4); + shifted = vshrq_n_u8(qhbits.val[1], 4); + q6h.val[1] = vshlq_n_u8(vandq_u8(mone, shifted), 4); + shifted = vshrq_n_u8(qhbits.val[0], 6); + q6h.val[2] = vshlq_n_u8(vandq_u8(mone, shifted), 4); + shifted = vshrq_n_u8(qhbits.val[1], 6); + q6h.val[3] = vshlq_n_u8(vandq_u8(mone, shifted), 4); + + //q6bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[0])), m32s); + //q6bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[1])), m32s); + //q6bytes.val[2] = 
vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2])), m32s); + //q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3])), m32s); + q6bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[0])); + q6bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[1])); + q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2])); + q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3])); + + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] + + vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] + + vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] + + vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3]; + scale += 4; + } + //sum += isum * d_all * y[i].d; + sum += d_all * y[i].d * (isum - 32 * isum_mins); + + } + *s = sum; +#else + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) { + a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; + a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; + a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; + a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; + } + a += 128; + q4 += 64; + qh += 32; + } + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/16; ++j) { + int scale = x[i].scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +#if defined (__ARM_NEON) +static const int8_t keven_signs_q2xs[1024] = { + 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, + 1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1, + 1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, -1, + 1, 1, -1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, 1, + 1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, -1, + 1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, 1, + 1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, 1, + 1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, -1, + 1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, -1, + 1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, 
-1, 1, 1, + 1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, 1, + 1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, -1, + 1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, 1, + 1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, -1, + 1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, -1, + 1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, 1, + 1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, -1, + 1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, 1, + 1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, 1, + 1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, -1, + 1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, 1, + 1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, -1, + 1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, -1, + 1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, 1, + 1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, 1, + 1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, -1, + 1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, -1, + 1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, 1, + 1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, -1, + 1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, 1, + 1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1, + 1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, +}; +#endif + +void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq2_xxs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__ARM_NEON) + + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + uint32_t aux32[4]; + const uint8_t * aux8 = (const uint8_t *)aux32; + + ggml_int8x16x4_t q2u; + ggml_int8x16x4_t q2s; + ggml_int8x16x4_t q8b; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + float sumf1 = 0, sumf2 = 0; + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + q8b = ggml_vld1q_s8_x4(q8); q8 += 64; + memcpy(aux32, q2, 
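+            // Block layout: aux32[0] holds four byte indices into iq2xxs_grid (eight
+            // int8 magnitudes per entry) and aux32[1] holds four 7-bit indices into the
+            // even-parity sign table plus a 4-bit scale in bits 28..31. Note that
+            // (0.5f + s) * 0.25f == 0.125f * (2*s + 1), matching the scalar ls below.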
4*sizeof(uint32_t)); q2 += 8; + q2u.val[0] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 0])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 1]))); + q2u.val[1] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 2])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 3]))); + q2u.val[2] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 8])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 9]))); + q2u.val[3] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[10])), vld1_s8((const void *)(iq2xxs_grid + aux8[11]))); + q2s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 7) & 127)))); + q2s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 21) & 127)))); + q2s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[3] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[3] >> 7) & 127)))); + q2s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[3] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[3] >> 21) & 127)))); + q2u.val[0] = vmulq_s8(q2u.val[0], q2s.val[0]); + q2u.val[1] = vmulq_s8(q2u.val[1], q2s.val[1]); + q2u.val[2] = vmulq_s8(q2u.val[2], q2s.val[2]); + q2u.val[3] = vmulq_s8(q2u.val[3], q2s.val[3]); + const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[0], q8b.val[0]), q2u.val[1], q8b.val[1]); + const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[2], q8b.val[2]), q2u.val[3], q8b.val[3]); + sumf1 += vaddvq_s32(p1) * (0.5f + (aux32[1] >> 28)); + sumf2 += vaddvq_s32(p2) * (0.5f + (aux32[3] >> 28)); + } + sumf += d*(sumf1 + sumf2); + } + *s = 0.25f * sumf; + +#else + + uint32_t aux32[2]; + const uint8_t * aux8 = (const uint8_t *)aux32; + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + memcpy(aux32, q2, 2*sizeof(uint32_t)); + q2 += 4; + const uint32_t ls = 2*(aux32[1] >> 28) + 1; + int32_t sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]); + const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127]; + for (int j = 0; j < 8; ++j) { + sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? 
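+                    // ksigns_iq2xs[i] is i with bit 7 chosen so the popcount is even, and
+                    // kmask_iq2xs[j] == (1 << j): bit j of the sign byte negates lane j.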
-1 : 1); + } + q8 += 8; + } + bsum += sumi * ls; + } + sumf += d * bsum; + } + *s = 0.125f * sumf; +#endif +} + +void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq2_xs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__ARM_NEON) + + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + ggml_int8x16x4_t q2u; + ggml_int8x16x4_t q2s; + ggml_int8x16x4_t q8b; + + int32x4x4_t scales32; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + const uint8x8_t scales8 = vld1_u8(x[i].scales); + const uint8x8_t scales_l = vand_u8(scales8, vdup_n_u8(0xf)); + const uint8x8_t scales_h = vshr_n_u8(scales8, 4); + uint8x16_t scales = vcombine_u8(vzip1_u8(scales_l, scales_h), vzip2_u8(scales_l, scales_h)); + scales = vaddq_u8(vshlq_n_u8(scales, 1), vdupq_n_u8(1)); + const uint16x8_t scales1 = vmovl_u8(vget_low_u8(scales)); + const uint16x8_t scales2 = vmovl_u8(vget_high_u8(scales)); + scales32.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales1))); + scales32.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales1))); + scales32.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales2))); + scales32.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales2))); + int32x4_t sumi = vdupq_n_s32(0); + for (int ib64 = 0; ib64 < QK_K/64; ++ib64) { + q8b = ggml_vld1q_s8_x4(q8); q8 += 64; + q2u.val[0] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[0] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[1] & 511)))); + q2u.val[1] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[2] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[3] & 511)))); + q2u.val[2] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[4] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[5] & 511)))); + q2u.val[3] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[6] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[7] & 511)))); + q2s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[0] >> 9))), vld1_s8((const void *)(signs64 + (q2[1] >> 9)))); + q2s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[2] >> 9))), vld1_s8((const void *)(signs64 + (q2[3] >> 9)))); + q2s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[4] >> 9))), vld1_s8((const void *)(signs64 + (q2[5] >> 9)))); + q2s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[6] >> 9))), vld1_s8((const void *)(signs64 + (q2[7] >> 9)))); + q2u.val[0] = vmulq_s8(q2u.val[0], q2s.val[0]); + q2u.val[1] = vmulq_s8(q2u.val[1], q2s.val[1]); + q2u.val[2] = vmulq_s8(q2u.val[2], q2s.val[2]); + q2u.val[3] = vmulq_s8(q2u.val[3], q2s.val[3]); + const int32x4_t p1 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[0], q8b.val[0]); + const int32x4_t p2 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[1], q8b.val[1]); + const int32x4_t p3 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[2], q8b.val[2]); + const int32x4_t p4 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[3], q8b.val[3]); + const int32x4_t p = vpaddq_s32(vpaddq_s32(p1, p2), vpaddq_s32(p3, p4)); + sumi = vmlaq_s32(sumi, p, scales32.val[ib64]); + q2 += 8; + } + sumf += d*vaddvq_s32(sumi); + } + *s = 0.125f * sumf; + +#else + + float sumf = 0.f; + for 
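+    // iq2_xs stores one 16-bit word per 8 values: bits 0..8 index the 512-entry grid
+    // (q2[l] & 511) and bits 9..15 index the sign table (q2[l] >> 9); the per-32
+    // scales move out to x[i].scales as two 4-bit fields per byte, ls = 2*s + 1 as before.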
(int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const uint8_t * GGML_RESTRICT sc = x[i].scales; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1; + const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1; + int32_t sumi = 0; + for (int l = 0; l < 2; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); + const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; + for (int j = 0; j < 8; ++j) { + sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + bsum += sumi * ls1; + sumi = 0; + for (int l = 2; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); + const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; + for (int j = 0; j < 8; ++j) { + sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + bsum += sumi * ls2; + q2 += 4; + } + sumf += d * bsum; + } + *s = 0.125f * sumf; +#endif +} + +void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq2_s * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__ARM_NEON) + + static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 + }; + + static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,}; + + const ggml_uint8x16x2_t mask1 = ggml_vld1q_u8_x2(k_mask1); + const uint8x16_t mask2 = vld1q_u8(k_mask2); + const uint8x16_t m1 = vdupq_n_u8(1); + const int32x4_t vzero = vdupq_n_s32(0); + + uint8x16x2_t vs; + ggml_int8x16x4_t q2s; + ggml_int8x16x4_t q8b; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8); + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + int sumi1 = 0, sumi2 = 0; + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + q8b = ggml_vld1q_s8_x4(q8); q8 += 64; + q2s.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[0] | ((qh[ib32+0] << 8) & 0x300)))), + vld1_s8((const int8_t *)(iq2s_grid + (qs[1] | ((qh[ib32+0] << 6) & 0x300))))); + q2s.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[2] | ((qh[ib32+0] << 4) & 0x300)))), + vld1_s8((const int8_t *)(iq2s_grid + (qs[3] | ((qh[ib32+0] << 2) & 0x300))))); + q2s.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[4] | ((qh[ib32+1] << 8) & 0x300)))), + vld1_s8((const int8_t *)(iq2s_grid + (qs[5] | ((qh[ib32+1] << 6) & 0x300))))); + q2s.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[6] | ((qh[ib32+1] << 4) & 0x300)))), + vld1_s8((const int8_t *)(iq2s_grid + (qs[7] | ((qh[ib32+1] << 2) & 0x300))))); + qs += 8; + + vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | ((uint32_t) signs[1] << 16))); + vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2); + vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], 
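+            // Sign expansion without precomputed +/-1 rows: the 32 sign bits are
+            // broadcast to every lane, tbl (mask1) replicates source byte j across lanes
+            // 8j..8j+7, AND with the per-lane bit in mask2 isolates one bit per lane, and
+            // vceqq turns "bit set" into 0xFF. OR with 1 then maps {0x00, 0xFF} to
+            // {+1, -1} as int8 multipliers.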
mask1.val[0]), mask2); + vs.val[0] = vceqq_u8(vs.val[0], mask2); + vs.val[1] = vceqq_u8(vs.val[1], mask2); + + q2s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[0], m1)), q2s.val[0]); + q2s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[1], m1)), q2s.val[1]); + + vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | ((uint32_t) signs[3] << 16))); + vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2); + vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2); + vs.val[0] = vceqq_u8(vs.val[0], mask2); + vs.val[1] = vceqq_u8(vs.val[1], mask2); + + signs += 4; + + q2s.val[2] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[0], m1)), q2s.val[2]); + q2s.val[3] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[1], m1)), q2s.val[3]); + + const int32x4_t p1 = ggml_vdotq_s32(vzero, q2s.val[0], q8b.val[0]); + const int32x4_t p2 = ggml_vdotq_s32(vzero, q2s.val[1], q8b.val[1]); + const int32x4_t p3 = ggml_vdotq_s32(vzero, q2s.val[2], q8b.val[2]); + const int32x4_t p4 = ggml_vdotq_s32(vzero, q2s.val[3], q8b.val[3]); + + sumi1 += vaddvq_s32(p1) * (1 + 2*(x[i].scales[ib32+0] & 0xf)); + sumi2 += vaddvq_s32(p2) * (1 + 2*(x[i].scales[ib32+0] >> 4)); + sumi1 += vaddvq_s32(p3) * (1 + 2*(x[i].scales[ib32+1] & 0xf)); + sumi2 += vaddvq_s32(p4) * (1 + 2*(x[i].scales[ib32+1] >> 4)); + } + sumf += d*(sumi1 + sumi2); + } + + *s = 0.125f * sumf; + +#else + + float sumf = 0; + for (int i = 0; i < nb; i++) { + + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint8_t * qh = x[i].qh; + const uint8_t * signs = qs + QK_K/8; + + int bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf); + int ls2 = 1 + 2*(x[i].scales[ib32] >> 4); + int sumi1 = 0, sumi2 = 0; + for (int l = 0; l < 2; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300))); + for (int j = 0; j < 8; ++j) { + sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + for (int l = 2; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300))); + for (int j = 0; j < 8; ++j) { + sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? 
-1 : 1); + } + q8 += 8; + } + bsum += ls1 * sumi1 + ls2 * sumi2; + qs += 4; + signs += 4; + } + + sumf += d * bsum; + } + + *s = 0.125f * sumf; + +#endif + +} + +void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq3_xxs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__ARM_NEON) + + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + uint32_t aux32[2]; + + ggml_int8x16x4_t q3s; + ggml_int8x16x4_t q8b; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + float sumf1 = 0, sumf2 = 0; + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + q8b = ggml_vld1q_s8_x4(q8); q8 += 64; + memcpy(aux32, gas, 2*sizeof(uint32_t)); gas += 2*sizeof(uint32_t); + const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]); + const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]); + const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]); + const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]); + q3 += 16; + q3s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 7) & 127)))); + q3s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 21) & 127)))); + q3s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 7) & 127)))); + q3s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 21) & 127)))); + q3s.val[0] = vmulq_s8(q3s.val[0], vreinterpretq_s8_u32(aux32x4_0)); + q3s.val[1] = vmulq_s8(q3s.val[1], vreinterpretq_s8_u32(aux32x4_1)); + q3s.val[2] = vmulq_s8(q3s.val[2], vreinterpretq_s8_u32(aux32x4_2)); + q3s.val[3] = vmulq_s8(q3s.val[3], vreinterpretq_s8_u32(aux32x4_3)); + const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]); + const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]); + sumf1 += vaddvq_s32(p1) * (0.5f + (aux32[0] >> 28)); + sumf2 += vaddvq_s32(p2) * (0.5f + (aux32[1] >> 28)); + } + sumf += d*(sumf1 + sumf2); + } + *s = 0.5f * sumf; + +#else + + uint32_t aux32; + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t); + const uint32_t ls = 2*(aux32 >> 28) + 1; + int32_t sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid1 = (const uint8_t 
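+                // iq3_xxs spends one byte per 4 values: a row of iq3xxs_grid holds four
+                // int8 magnitudes, so each 7-bit sign word covers the grid1/grid2 pair of
+                // rows (8 values). As in iq2_xxs, the NEON side's 0.5f*(0.5f + s) equals
+                // the 0.25f*(2*s + 1) computed here.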
*)(iq3xxs_grid + q3[2*l+0]); + const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]); + const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127]; + for (int j = 0; j < 4; ++j) { + sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1); + sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1); + } + q8 += 8; + } + q3 += 8; + bsum += sumi * ls; + } + sumf += d * bsum; + } + *s = 0.25f * sumf; +#endif +} + +void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq3_s * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__ARM_NEON) + + typedef union { + uint16x8_t vec_index; + uint16_t index[8]; + } vec_index_t; + + static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 + }; + + static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,}; + + static const int16_t k_shift[8] = {8, 7, 6, 5, 4, 3, 2, 1}; + + const ggml_uint8x16x2_t mask1 = ggml_vld1q_u8_x2(k_mask1); + const uint8x16_t mask2 = vld1q_u8(k_mask2); + + const int16x8_t hshift = vld1q_s16(k_shift); + const uint16x8_t m256 = vdupq_n_u16(256); + const uint8x16_t m1 = vdupq_n_u8(1); + + uint8x16x2_t vs; + ggml_int8x16x4_t q3s; + ggml_int8x16x4_t q8b; + vec_index_t idx; + + uint32_t scales32[2]; + const uint8_t * scales8 = (const uint8_t *)scales32; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + memcpy(scales32, x[i].scales, 4); + scales32[1] = (((scales32[0] >> 4) & 0x0f0f0f0f) << 1) | 0x01010101; + scales32[0] = ((scales32[0] & 0x0f0f0f0f) << 1) | 0x01010101; + + int sumi1 = 0, sumi2 = 0; + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + q8b = ggml_vld1q_s8_x4(q8); q8 += 64; + + const uint8x16_t idx_l = vld1q_u8(qs); qs += 16; + idx.vec_index = vorrq_u16(vmovl_u8(vget_low_u8 (idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+0]), hshift), m256)); + const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]], + iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]); + const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]], + iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]); + idx.vec_index = vorrq_u16(vmovl_u8(vget_high_u8(idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+1]), hshift), m256)); + const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]], + iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]); + const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]], + iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]); + + + vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | ((uint32_t) signs[1] << 16))); + vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2); + vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2); + vs.val[0] = 
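+                // scales8 reads precomputed (2*s + 1) bytes: the two SWAR lines after the
+                // memcpy spread the packed nibbles of x[i].scales so that
+                //   scales8[k]     = 2*(scales[k] & 0xf) + 1   (k = 0..3, low nibbles)
+                //   scales8[k + 4] = 2*(scales[k] >> 4) + 1    (high nibbles)
+                // and the hshift/m256 pair above builds the 9th grid-index bit per lane:
+                //   idx[l] = qs[l] | (((qh >> l) & 1) << 8).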
vorrq_u8(vceqq_u8(vs.val[0], mask2), m1); + vs.val[1] = vorrq_u8(vceqq_u8(vs.val[1], mask2), m1); + + q3s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vs.val[0]), vreinterpretq_s8_u32(aux32x4_0)); + q3s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vs.val[1]), vreinterpretq_s8_u32(aux32x4_1)); + + vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | ((uint32_t) signs[3] << 16))); + vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2); + vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2); + vs.val[0] = vorrq_u8(vceqq_u8(vs.val[0], mask2), m1); + vs.val[1] = vorrq_u8(vceqq_u8(vs.val[1], mask2), m1); + + signs += 4; + + q3s.val[2] = vmulq_s8(vreinterpretq_s8_u8(vs.val[0]), vreinterpretq_s8_u32(aux32x4_2)); + q3s.val[3] = vmulq_s8(vreinterpretq_s8_u8(vs.val[1]), vreinterpretq_s8_u32(aux32x4_3)); + + const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]); + const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]); + + sumi1 += vaddvq_s32(p1) * scales8[ib32/2+0]; + sumi2 += vaddvq_s32(p2) * scales8[ib32/2+4]; + } + sumf += d*(sumi1 + sumi2); + } + *s = sumf; + +#else + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint8_t * GGML_RESTRICT signs = x[i].signs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1; + const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1; + int32_t sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256))); + const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256))); + for (int j = 0; j < 4; ++j) { + sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1); + sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1); + } + q8 += 8; + } + qs += 8; + signs += 4; + bsum += sumi * ls1; + sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256))); + const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256))); + for (int j = 0; j < 4; ++j) { + sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1); + sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? 
-1 : 1); + } + q8 += 8; + } + qs += 8; + signs += 4; + bsum += sumi * ls2; + } + sumf += d * bsum; + } + *s = sumf; +#endif +} + +void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq1_s * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined __ARM_NEON + + ggml_int8x16x4_t q1b; + ggml_int8x16x4_t q8b; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint16_t * qh = x[i].qh; + + int sumi1 = 0, sumi2 = 0, sumi3 = 0; + + for (int ib = 0; ib < QK_K/32; ib += 2) { + + q1b.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[0] | ((qh[ib+0] << 8) & 0x700)))), + vld1_s8((const int8_t *)(iq1s_grid + (qs[1] | ((qh[ib+0] << 5) & 0x700))))); + q1b.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[2] | ((qh[ib+0] << 2) & 0x700)))), + vld1_s8((const int8_t *)(iq1s_grid + (qs[3] | ((qh[ib+0] >> 1) & 0x700))))); + q1b.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[4] | ((qh[ib+1] << 8) & 0x700)))), + vld1_s8((const int8_t *)(iq1s_grid + (qs[5] | ((qh[ib+1] << 5) & 0x700))))); + q1b.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[6] | ((qh[ib+1] << 2) & 0x700)))), + vld1_s8((const int8_t *)(iq1s_grid + (qs[7] | ((qh[ib+1] >> 1) & 0x700))))); + qs += 8; + + q8b = ggml_vld1q_s8_x4(q8); q8 += 64; + + const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q1b.val[0], q8b.val[0]), q1b.val[1], q8b.val[1]); + const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q1b.val[2], q8b.val[2]), q1b.val[3], q8b.val[3]); + + const int ls1 = 2*((qh[ib+0] >> 12) & 7) + 1; + const int ls2 = 2*((qh[ib+1] >> 12) & 7) + 1; + sumi1 += vaddvq_s32(p1) * ls1; + sumi2 += vaddvq_s32(p2) * ls2; + sumi3 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * ls1 * (qh[ib+0] & 0x8000 ? -1 : 1) + + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * ls2 * (qh[ib+1] & 0x8000 ? -1 : 1); + + } + + sumf += y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d) * (sumi1 + sumi2 + IQ1S_DELTA * sumi3); + } + + *s = sumf; + +#else + + float sumf = 0; + for (int i = 0; i < nb; i++) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint16_t * qh = x[i].qh; + + int sumi = 0, sumi1 = 0; + for (int ib = 0; ib < QK_K/32; ++ib) { + const int ls = 2*((qh[ib] >> 12) & 7) + 1; + const int delta = qh[ib] & 0x8000 ? 
-1 : 1; + int lsum = 0; + for (int l = 0; l < 4; ++l) { + const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8))); + for (int j = 0; j < 8; ++j) { + lsum += q8[j] * grid[j]; + } + q8 += 8; + } + sumi += ls * lsum; + sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]); + qs += 4; + } + + sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1); + } + + *s = sumf; + +#endif +} + +void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq1_m * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + iq1m_scale_t scale; + +#if defined __ARM_NEON + const int32x4_t mask = vdupq_n_s32(0x7); + const int32x4_t mone = vdupq_n_s32(1); + const int32x4_t mzero = vdupq_n_s32(0); + + ggml_int8x16x4_t deltas; + deltas.val[0] = vcombine_s8(vdup_n_s8(+1), vdup_n_s8(+1)); + deltas.val[1] = vcombine_s8(vdup_n_s8(-1), vdup_n_s8(+1)); + deltas.val[2] = vcombine_s8(vdup_n_s8(+1), vdup_n_s8(-1)); + deltas.val[3] = vcombine_s8(vdup_n_s8(-1), vdup_n_s8(-1)); + + ggml_int8x16x4_t q1b; + ggml_int8x16x4_t q8b; + + uint32_t aux32; + const uint8_t * aux8 = (const uint8_t *)&aux32; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint8_t * qh = x[i].qh; + const uint16_t * sc = (const uint16_t *)x[i].scales; + + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + + int32x4_t sumi1 = mzero; + int32x4_t sumi2 = mzero; + + for (int ib = 0; ib < QK_K/32; ib += 2) { + + q1b.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[0] | ((qh[0] << 8) & 0x700)))), + vld1_s8((const int8_t *)(iq1s_grid + (qs[1] | ((qh[0] << 4) & 0x700))))); + q1b.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[2] | ((qh[1] << 8) & 0x700)))), + vld1_s8((const int8_t *)(iq1s_grid + (qs[3] | ((qh[1] << 4) & 0x700))))); + q1b.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[4] | ((qh[2] << 8) & 0x700)))), + vld1_s8((const int8_t *)(iq1s_grid + (qs[5] | ((qh[2] << 4) & 0x700))))); + q1b.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[6] | ((qh[3] << 8) & 0x700)))), + vld1_s8((const int8_t *)(iq1s_grid + (qs[7] | ((qh[3] << 4) & 0x700))))); + + q8b = ggml_vld1q_s8_x4(q8); q8 += 64; + + const int32x4_t p1 = vpaddq_s32(ggml_vdotq_s32(mzero, q1b.val[0], q8b.val[0]), ggml_vdotq_s32(mzero, q1b.val[1], q8b.val[1])); + const int32x4_t p2 = vpaddq_s32(ggml_vdotq_s32(mzero, q1b.val[2], q8b.val[2]), ggml_vdotq_s32(mzero, q1b.val[3], q8b.val[3])); + const int32x4_t p12 = vpaddq_s32(p1, p2); + + const uint32_t * qh32 = (const uint32_t *)qh; // we are 4-byte aligned, so we can do that + aux32 = ((qh32[0] >> 3) & 0x01010101) | ((qh32[0] >> 6) & 0x02020202); + + const int32x4_t p3 = vpaddq_s32(ggml_vdotq_s32(mzero, deltas.val[aux8[0]], q8b.val[0]), ggml_vdotq_s32(mzero, deltas.val[aux8[1]], q8b.val[1])); + const int32x4_t p4 = vpaddq_s32(ggml_vdotq_s32(mzero, deltas.val[aux8[2]], q8b.val[2]), ggml_vdotq_s32(mzero, deltas.val[aux8[3]], q8b.val[3])); + const int32x4_t p34 = vpaddq_s32(p3, p4); + + int32x4_t scales_4 = ggml_vld1q_u32(sc[ib/2] >> 0, sc[ib/2] >> 3, sc[ib/2] >> 6, sc[ib/2] >> 9); + + scales_4 = vaddq_s32(vshlq_n_s32(vandq_s32(scales_4, mask), 1), mone); + + 
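+            // at this point scales_4 holds the four odd 3-bit sub-block scales (2*s + 1),
+            // matching the scalar path below; they weight both the grid dot products (p12)
+            // and the per-group delta sums (p34)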
sumi1 = vmlaq_s32(sumi1, scales_4, p12); + sumi2 = vmlaq_s32(sumi2, scales_4, p34); + + qs += 8; qh += 4; + + } + + sumf += y[i].d * GGML_CPU_FP16_TO_FP32(scale.f16) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2)); + } + + *s = sumf; + +#else + + int sum1[2], sum2[2], delta[4]; + + float sumf = 0; + for (int i = 0; i < nb; i++) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint8_t * qh = x[i].qh; + const uint16_t * sc = (const uint16_t *)x[i].scales; + + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + + int sumi1 = 0, sumi2 = 0; + for (int ib = 0; ib < QK_K/32; ++ib) { + delta[0] = qh[0] & 0x08 ? -1 : 1; + delta[1] = qh[0] & 0x80 ? -1 : 1; + delta[2] = qh[1] & 0x08 ? -1 : 1; + delta[3] = qh[1] & 0x80 ? -1 : 1; + sum1[0] = sum1[1] = sum2[0] = sum2[1] = 0; + for (int l = 0; l < 4; ++l) { + const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((uint16_t)qh[l/2] << (8 - 4*(l%2))) & 0x700))); + int lsum1 = 0, lsum2 = 0; + for (int j = 0; j < 8; ++j) { + lsum1 += q8[j] * grid[j]; + lsum2 += q8[j]; + } + q8 += 8; + sum1[l/2] += lsum1; + sum2[l/2] += lsum2*delta[l]; + } + + const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1; + const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1; + + sumi1 += sum1[0] * ls1 + sum1[1] * ls2; + sumi2 += sum2[0] * ls1 + sum2[1] * ls2; + qs += 4; + qh += 2; + } + + sumf += GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2); + } + + *s = sumf; + +#endif +} + +void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + assert(n % QK4_NL == 0); + static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same"); + + const block_iq4_nl * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + const int nb = n / QK4_NL; + + int ib = 0; + float sumf = 0; + +#if defined __ARM_NEON + const int8x16_t values = vld1q_s8(kvalues_iq4nl); + const uint8x16_t m4b = vdupq_n_u8(0x0f); + uint8x16x2_t q4bits; + int8x16x4_t q4b; + int8x16x4_t q8b; + int32x4_t prod_1, prod_2; + + for (; ib + 1 < nb; ib += 2) { + + q4bits.val[0] = vld1q_u8(x[ib + 0].qs); + q4bits.val[1] = vld1q_u8(x[ib + 1].qs); + q8b.val[0] = vld1q_s8(y[ib + 0].qs); + q8b.val[1] = vld1q_s8(y[ib + 0].qs + 16); + q8b.val[2] = vld1q_s8(y[ib + 1].qs); + q8b.val[3] = vld1q_s8(y[ib + 1].qs + 16); + + q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[0], m4b)); + q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4)); + q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[1], m4b)); + q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4)); + + prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]); + prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]); + + sumf += + GGML_CPU_FP16_TO_FP32(x[ib+0].d) * GGML_CPU_FP16_TO_FP32(y[ib + 0].d) * vaddvq_s32(prod_1) + + GGML_CPU_FP16_TO_FP32(x[ib+1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) * vaddvq_s32(prod_2); + } + +#endif + for (; ib < nb; ++ib) { + const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d); + int sumi1 = 0, sumi2 = 0; + for (int j = 0; j < QK4_NL/2; ++j) { + sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf]; + sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4]; + } + sumf 
+= d * (sumi1 + sumi2);
+    }
+    *s = sumf;
+}
+
+void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+    assert(n % QK_K == 0);
+
+    const block_iq4_xs * GGML_RESTRICT x = vx;
+    const block_q8_K * GGML_RESTRICT y = vy;
+
+    const int nb = n / QK_K;
+
+#if defined __ARM_NEON
+    const int8x16_t values = vld1q_s8(kvalues_iq4nl);
+    const uint8x16_t m4b = vdupq_n_u8(0x0f);
+    ggml_uint8x16x2_t q4bits;
+    ggml_int8x16x4_t q4b;
+    ggml_int8x16x4_t q8b;
+    int32x4_t prod_1, prod_2;
+
+    float sumf = 0;
+
+    for (int ibl = 0; ibl < nb; ++ibl) {
+
+        const int8_t * q8 = y[ibl].qs;
+        const uint8_t * q4 = x[ibl].qs;
+        uint16_t h = x[ibl].scales_h;
+
+        int sumi1 = 0, sumi2 = 0;
+        for (int ib = 0; ib < QK_K/64; ++ib) {
+
+            q4bits = ggml_vld1q_u8_x2(q4); q4 += 32;
+            q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
+
+            q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[0], m4b));
+            q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
+            q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[1], m4b));
+            q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));
+
+            prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
+            prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
+
+            int ls1 = ((x[ibl].scales_l[ib] & 0xf) | ((h << 4) & 0x30)) - 32;
+            int ls2 = ((x[ibl].scales_l[ib] >> 4) | ((h << 2) & 0x30)) - 32;
+            h >>= 4;
+            sumi1 += vaddvq_s32(prod_1) * ls1;
+            sumi2 += vaddvq_s32(prod_2) * ls2;
+
+        }
+
+        sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
+    }
+
+    *s = sumf;
+
+#else
+    float sumf = 0;
+    for (int ibl = 0; ibl < nb; ++ibl) {
+        const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
+        uint16_t h = x[ibl].scales_h;
+        const uint8_t * qs = x[ibl].qs;
+        const int8_t * q8 = y[ibl].qs;
+        for (int ib = 0; ib < QK_K/32; ib += 2) {
+            const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
+            const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
+            h >>= 4;
+            const float d1 = d4d8*(ls1 - 32);
+            const float d2 = d4d8*(ls2 - 32);
+            int sumi1 = 0, sumi2 = 0;
+            for (int j = 0; j < 16; ++j) {
+                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
+                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
+            }
+            sumf += d1 * (sumi1 + sumi2);
+            qs += 16;
+            q8 += 32;
+            sumi1 = sumi2 = 0;
+            for (int j = 0; j < 16; ++j) {
+                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
+                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
+            }
+            sumf += d2 * (sumi1 + sumi2);
+            qs += 16;
+            q8 += 32;
+        }
+    }
+    *s = sumf;
+#endif
+}
+
diff --git a/ggml/src/ggml-cpu/arch/arm/repack.cpp b/ggml/src/ggml-cpu/arch/arm/repack.cpp
new file mode 100644
index 0000000000000..2f8bc9e251735
--- /dev/null
+++ b/ggml/src/ggml-cpu/arch/arm/repack.cpp
@@ -0,0 +1,2163 @@
+#define GGML_COMMON_IMPL_CPP
+#define GGML_COMMON_DECL_CPP
+#include "ggml-common.h"
+#include "ggml-backend-impl.h"
+
+#include "ggml-impl.h"
+#include "ggml-cpu.h"
+#include "ggml-cpu-impl.h"
+#include "simd-mappings.h"
+#include "traits.h"
+
+#include <cmath>
+#include <cstring>
+#include <cassert>
+#include <cstdlib> // for qsort
+#include <cstdio>  // for GGML_ASSERT
+
+#define GGML_CPU_CLANG_WORKAROUND
+#include "../../repack.h"
+
+#if defined(__GNUC__)
+#pragma GCC diagnostic ignored "-Woverlength-strings"
+#endif
+
+#define UNUSED GGML_UNUSED
+
+void ggml_quantize_mat_q8_0_4x4(const float * 
GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(QK8_0 == 32); + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; + + block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy; + +#if defined(__ARM_NEON) + float32x4_t srcv[4][8]; + float id[4]; + + for (int i = 0; i < nb; i++) { + float32x4_t asrcv[8]; + float32x4_t amaxv[8]; + + for (int row_iter = 0; row_iter < 4; row_iter++) { + for (int j = 0; j < 8; j++) srcv[row_iter][j] = vld1q_f32(x + row_iter * k + i * 32 + 4 * j); + for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[row_iter][j]); + + for (int j = 0; j < 4; j++) amaxv[2 * j] = vmaxq_f32(asrcv[2 * j], asrcv[2 * j + 1]); + for (int j = 0; j < 2; j++) amaxv[4 * j] = vmaxq_f32(amaxv[4 * j], amaxv[4 * j + 2]); + for (int j = 0; j < 1; j++) amaxv[8 * j] = vmaxq_f32(amaxv[8 * j], amaxv[8 * j + 4]); + + const float amax = vmaxvq_f32(amaxv[0]); + + const float d = amax / ((1 << 7) - 1); + id[row_iter] = d ? 1.0f / d : 0.0f; + + y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d); + } + + for (int j = 0; j < 8; j++) { + float32x4_t v = vmulq_n_f32(srcv[0][j], id[0]); + int32x4_t vi = vcvtnq_s32_f32(v); + y[i].qs[16 * j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[16 * j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[16 * j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[16 * j + 3] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[1][j], id[1]); + vi = vcvtnq_s32_f32(v); + y[i].qs[16 * j + 4] = vgetq_lane_s32(vi, 0); + y[i].qs[16 * j + 5] = vgetq_lane_s32(vi, 1); + y[i].qs[16 * j + 6] = vgetq_lane_s32(vi, 2); + y[i].qs[16 * j + 7] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[2][j], id[2]); + vi = vcvtnq_s32_f32(v); + y[i].qs[16 * j + 8] = vgetq_lane_s32(vi, 0); + y[i].qs[16 * j + 9] = vgetq_lane_s32(vi, 1); + y[i].qs[16 * j + 10] = vgetq_lane_s32(vi, 2); + y[i].qs[16 * j + 11] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[3][j], id[3]); + vi = vcvtnq_s32_f32(v); + y[i].qs[16 * j + 12] = vgetq_lane_s32(vi, 0); + y[i].qs[16 * j + 13] = vgetq_lane_s32(vi, 1); + y[i].qs[16 * j + 14] = vgetq_lane_s32(vi, 2); + y[i].qs[16 * j + 15] = vgetq_lane_s32(vi, 3); + } + } +#else + // scalar + const int blck_size_interleave = 4; + float srcv[4][QK8_0]; + float id[4]; + + for (int i = 0; i < nb; i++) { + for (int row_iter = 0; row_iter < 4; row_iter++) { + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_0; j++) { + srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j]; + amax = MAX(amax, fabsf(srcv[row_iter][j])); + } + + const float d = amax / ((1 << 7) - 1); + id[row_iter] = d ? 
1.0f / d : 0.0f; + + y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d); + } + + for (int j = 0; j < QK8_0 * 4; j++) { + int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave; + int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave; + src_offset += (j % blck_size_interleave); + + float x0 = srcv[src_id][src_offset] * id[src_id]; + y[i].qs[j] = roundf(x0); + } + } +#endif +} + +void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(QK8_0 == 32); + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; + + block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy; + +#if defined(__ARM_NEON) + float32x4_t srcv[4][8]; + float id[4]; + + for (int i = 0; i < nb; i++) { + float32x4_t asrcv[8]; + float32x4_t amaxv[8]; + + for (int row_iter = 0; row_iter < 4; row_iter++) { + for (int j = 0; j < 8; j++) srcv[row_iter][j] = vld1q_f32(x + row_iter * k + i * 32 + 4 * j); + for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[row_iter][j]); + + for (int j = 0; j < 4; j++) amaxv[2 * j] = vmaxq_f32(asrcv[2 * j], asrcv[2 * j + 1]); + for (int j = 0; j < 2; j++) amaxv[4 * j] = vmaxq_f32(amaxv[4 * j], amaxv[4 * j + 2]); + for (int j = 0; j < 1; j++) amaxv[8 * j] = vmaxq_f32(amaxv[8 * j], amaxv[8 * j + 4]); + + const float amax = vmaxvq_f32(amaxv[0]); + + const float d = amax / ((1 << 7) - 1); + id[row_iter] = d ? 1.0f / d : 0.0f; + + y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d); + } + + for (int j = 0; j < 4; j++) { + float32x4_t v = vmulq_n_f32(srcv[0][2 * j], id[0]); + int32x4_t vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 3] = vgetq_lane_s32(vi, 3); + v = vmulq_n_f32(srcv[0][2 * j + 1], id[0]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 4] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 5] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 6] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 7] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[1][2 * j], id[1]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 8] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 9] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 10] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 11] = vgetq_lane_s32(vi, 3); + v = vmulq_n_f32(srcv[1][2 * j + 1], id[1]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 12] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 13] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 14] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 15] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[2][2 * j], id[2]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 16] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 17] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 18] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 19] = vgetq_lane_s32(vi, 3); + v = vmulq_n_f32(srcv[2][2 * j + 1], id[2]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 20] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 21] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 22] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 23] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[3][2 * j], id[3]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 24] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 25] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 26] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 27] = vgetq_lane_s32(vi, 3); + v = vmulq_n_f32(srcv[3][2 * j + 1], id[3]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 28] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 29] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 30] = 
vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 31] = vgetq_lane_s32(vi, 3); + } + } + +#else + // scalar + const int blck_size_interleave = 8; + float srcv[4][QK8_0]; + float id[4]; + + for (int i = 0; i < nb; i++) { + for (int row_iter = 0; row_iter < 4; row_iter++) { + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_0; j++) { + srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j]; + amax = MAX(amax, fabsf(srcv[row_iter][j])); + } + + const float d = amax / ((1 << 7) - 1); + id[row_iter] = d ? 1.0f / d : 0.0f; + + y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d); + } + + for (int j = 0; j < QK8_0 * 4; j++) { + int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave; + int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave; + src_offset += (j % blck_size_interleave); + + float x0 = srcv[src_id][src_offset] * id[src_id]; + y[i].qs[j] = roundf(x0); + } + } +#endif +} + +void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 4; + const int blocklen = 4; + + assert (n % qk == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + +#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) + const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx; + + for (int c = 0; c < nc; c += ncols_interleaved) { + const block_q8_0 * a_ptr = (const block_q8_0 *) vy; + float32x4_t acc = vdupq_n_f32(0); + for (int b = 0; b < nb; b++) { + int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs); + int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16); + int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32); + int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48); + float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d); + + int8x16_t a0 = vld1q_s8(a_ptr->qs); + int8x16_t a1 = vld1q_s8(a_ptr->qs + qk/2); + float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d); + + int32x4_t ret = vdupq_n_s32(0); + + ret = vdotq_laneq_s32(ret, b0 << 4, a0, 0); + ret = vdotq_laneq_s32(ret, b1 << 4, a0, 1); + ret = vdotq_laneq_s32(ret, b2 << 4, a0, 2); + ret = vdotq_laneq_s32(ret, b3 << 4, a0, 3); + + ret = vdotq_laneq_s32(ret, b0 & 0xf0U, a1, 0); + ret = vdotq_laneq_s32(ret, b1 & 0xf0U, a1, 1); + ret = vdotq_laneq_s32(ret, b2 & 0xf0U, a1, 2); + ret = vdotq_laneq_s32(ret, b3 & 0xf0U, a1, 3); + + acc = vfmaq_f32(acc, vcvtq_n_f32_s32(ret, 4), + vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd))); + a_ptr++; + b_ptr++; + } + vst1q_f32(s, acc); + s += ncols_interleaved; + } + return; +#endif // #if ! ((defined(_MSC_VER)) && ! 
defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) + float sumf[4]; + int sumi; + + const block_q8_0 * a_ptr = (const block_q8_0 *) vy; + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); + + for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; + } + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); + } + } + } + for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; + } +} + +void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 4; + const int blocklen = 8; + + assert (n % qk == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + +#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) + const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx; + + for (int c = 0; c < nc; c += ncols_interleaved) { + const block_q8_0 * a_ptr = (const block_q8_0 *) vy; + float32x4_t acc = vdupq_n_f32(0); + for (int b = 0; b < nb; b++) { + int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs); + int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16); + int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32); + int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48); + float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d); + + int8x16_t a0 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs); + int8x16_t a1 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 1); + int8x16_t a2 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 2); + int8x16_t a3 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 3); + float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d); + + int32x4_t ret0 = vdupq_n_s32(0); + int32x4_t ret1 = vdupq_n_s32(0); + + ret0 = vdotq_s32(ret0, b0 << 4, a0); + ret1 = vdotq_s32(ret1, b1 << 4, a0); + ret0 = vdotq_s32(ret0, b2 << 4, a1); + ret1 = vdotq_s32(ret1, b3 << 4, a1); + + ret0 = vdotq_s32(ret0, b0 & 0xf0U, a2); + ret1 = vdotq_s32(ret1, b1 & 0xf0U, a2); + ret0 = vdotq_s32(ret0, b2 & 0xf0U, a3); + ret1 = vdotq_s32(ret1, b3 & 0xf0U, a3); + + int32x4_t ret = vpaddq_s32(ret0, ret1); + + acc = vfmaq_f32(acc, vcvtq_n_f32_s32(ret, 4), + vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd))); + a_ptr++; + b_ptr++; + } + vst1q_f32(s, acc); + s += ncols_interleaved; + } + return; +#endif // #if ! ((defined(_MSC_VER)) && ! 
defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) + float sumf[4]; + int sumi; + + const block_q8_0 * a_ptr = (const block_q8_0 *) vy; + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); + + for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; + } + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); + } + } + } + for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; + } +} + +void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 8; + const int blocklen = 8; + + assert (n % qk == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + +#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) +#if defined(__ARM_FEATURE_SVE) + if (ggml_cpu_get_sve_cnt() == QK8_0) { + const void * b_ptr = vx; + const void * a_ptr = vy; + float * res_ptr = s; + + __asm__ __volatile__( + "ptrue p0.b\n" + "add %x[b_ptr], %x[b_ptr], #0x10\n" + "1:" // Column loop + "add x22, %x[a_ptr], #0x2\n" + "mov z31.b, #0x0\n" + "mov x21, %x[nb]\n" + "2:" // Block loop + "ld1b { z30.b }, p0/Z, [%x[b_ptr]]\n" + "ld1b { z29.b }, p0/Z, [%x[b_ptr], #1, MUL VL]\n" + "mov z28.s, #0x0\n" + "mov z27.s, #0x0\n" + "ld1rd { z26.d }, p0/Z, [x22]\n" + "ld1b { z25.b }, p0/Z, [%x[b_ptr], #2, MUL VL]\n" + "sub x20, x22, #0x2\n" + "sub x21, x21, #0x1\n" + "ld1b { z24.b }, p0/Z, [%x[b_ptr], #3, MUL VL]\n" + "ld1rd { z23.d }, p0/Z, [x22, #8]\n" + "lsl z22.b, z30.b, #0x4\n" + "lsl z16.b, z29.b, #0x4\n" + "and z30.b, z30.b, #0xf0\n" + "and z29.b, z29.b, #0xf0\n" + "ld1rd { z21.d }, p0/Z, [x22, #16]\n" + "ld1rd { z20.d }, p0/Z, [x22, #24]\n" + "lsl z19.b, z25.b, #0x4\n" + "and z25.b, z25.b, #0xf0\n" + "ld1rh { z17.h }, p0/Z, [x20]\n" + "ld1h { z18.s }, p0/Z, [%x[b_ptr], #-1, MUL VL]\n" + "sdot z28.s, z22.b, z26.b\n" + "sdot z27.s, z16.b, z26.b\n" + "lsl z16.b, z24.b, #0x4\n" + "add x22, x22, #0x22\n" + "and z24.b, z24.b, #0xf0\n" + "add %x[b_ptr], %x[b_ptr], #0x90\n" + "fcvt z17.s, p0/m, z17.h\n" + "fcvt z18.s, p0/m, z18.h\n" + "sdot z28.s, z19.b, z23.b\n" + "sdot z27.s, z16.b, z23.b\n" + "fmul z18.s, z18.s, z17.s\n" + "sdot z28.s, z30.b, z21.b\n" + "sdot z27.s, z29.b, z21.b\n" + "sdot z28.s, z25.b, z20.b\n" + "sdot z27.s, z24.b, z20.b\n" + "uzp1 z17.s, z28.s, z27.s\n" + "uzp2 z16.s, z28.s, z27.s\n" + "add z17.s, z17.s, z16.s\n" + "asr z17.s, z17.s, #0x4\n" + "scvtf z17.s, p0/m, z17.s\n" + "fmla z31.s, p0/M, z17.s, z18.s\n" + "cbnz x21, 2b\n" + "sub %x[nc], %x[nc], #0x8\n" + "st1w { z31.s }, p0, [%x[res_ptr]]\n" + "add %x[res_ptr], %x[res_ptr], #0x20\n" + "cbnz %x[nc], 1b\n" + : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc) + 
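+            // "+&r" marks read-write, early-clobber operands: the asm advances b_ptr and
+            // res_ptr and counts nc down in place, while a_ptr and nb below are read-only inputs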
: [a_ptr] "r" (a_ptr), [nb] "r" (nb) + : "memory", "p0", "x20", "x21", "x22", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); + return; + } +#endif // #if defined(__ARM_FEATURE_SVE) + +#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) + { + float sumf[8]; + int sumi; + + const block_q8_0 * a_ptr = (const block_q8_0 *) vy; + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb); + + for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; + } + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); + } + } + } + for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; + } + } +} + +void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 4; + const int blocklen = 4; + + assert (n % qk == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + +#if ! ((defined(_MSC_VER)) && ! 
defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) + const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl); + const block_q8_0 * a_ptr = (const block_q8_0 *) vy; + float * res_ptr = s; + + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb); + + float32x4_t sumf = vdupq_n_f32(0); + for (int l = 0; l < nb; l++) { + uint8x16_t b_0 = vld1q_u8(b_ptr[l].qs + 0); + uint8x16_t b_1 = vld1q_u8(b_ptr[l].qs + 16); + uint8x16_t b_2 = vld1q_u8(b_ptr[l].qs + 32); + uint8x16_t b_3 = vld1q_u8(b_ptr[l].qs + 48); + + int8x16_t b_0_hi = vqtbl1q_s8(kvalues, b_0 >> 4); + int8x16_t b_0_lo = vqtbl1q_s8(kvalues, b_0 & 0x0F); + int8x16_t b_1_hi = vqtbl1q_s8(kvalues, b_1 >> 4); + int8x16_t b_1_lo = vqtbl1q_s8(kvalues, b_1 & 0x0F); + int8x16_t b_2_hi = vqtbl1q_s8(kvalues, b_2 >> 4); + int8x16_t b_2_lo = vqtbl1q_s8(kvalues, b_2 & 0x0F); + int8x16_t b_3_hi = vqtbl1q_s8(kvalues, b_3 >> 4); + int8x16_t b_3_lo = vqtbl1q_s8(kvalues, b_3 & 0x0F); + + int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 0); + int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16); + + int32x4_t sumi = vdupq_n_s32(0); + sumi = vdotq_laneq_s32(sumi, b_0_lo, a_0, 0); + sumi = vdotq_laneq_s32(sumi, b_0_hi, a_1, 0); + sumi = vdotq_laneq_s32(sumi, b_1_lo, a_0, 1); + sumi = vdotq_laneq_s32(sumi, b_1_hi, a_1, 1); + sumi = vdotq_laneq_s32(sumi, b_2_lo, a_0, 2); + sumi = vdotq_laneq_s32(sumi, b_2_hi, a_1, 2); + sumi = vdotq_laneq_s32(sumi, b_3_lo, a_0, 3); + sumi = vdotq_laneq_s32(sumi, b_3_hi, a_1, 3); + + float32x4_t a_d = vcvt_f32_f16(vld1_dup_f16((const float16_t *)&a_ptr[l].d)); + float32x4_t b_d = vcvt_f32_f16(vld1_f16((const float16_t *)b_ptr[l].d)); + float32x4_t d = a_d * b_d; + + sumf = vmlaq_f32(sumf, d, vcvtq_f32_s32(sumi)); + } + + vst1q_f32(res_ptr + x * 4, sumf); + } + return; +#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) + { + float sumf[4]; + int sumi; + + const block_q8_0 * a_ptr = (const block_q8_0 *) vy; + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb); + + for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F]; + const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4]; + sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])); + } + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); + } + } + } + for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; + } + } +} + +void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 4; + const int blocklen = 4; + + assert (n % qk == 0); + assert (nr % 4 == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + +#if ! ((defined(_MSC_VER)) && ! 
defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) + const void * b_ptr = vx; + const void * a_ptr = vy; + float * res_ptr = s; + size_t res_stride = bs * sizeof(float); + + __asm__ __volatile__( + "mov x10, %x[nr]\n" + "mov x9, #0x88\n" + "cmp x10, #0x10\n" + "mul x9, %x[nb], x9\n" + "blt 4f\n" + "1:" // Row loop + "add x28, %x[b_ptr], #0x8\n" + "mov x27, %x[nc]\n" + "add x26, %x[res_ptr], %x[res_stride], LSL #4\n" + "2:" // Column loop + "add x25, %x[a_ptr], #0x8\n" + "movi v15.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "mov x24, %x[nb]\n" + "add x23, x25, x9\n" + "movi v18.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "add x22, x23, x9\n" + "movi v11.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "add x21, x22, x9\n" + "movi v23.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v7.16b, #0x0\n" + "movi v0.16b, #0x0\n" + "movi v4.16b, #0x0\n" + "movi v5.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v8.16b, #0x0\n" + "movi v1.16b, #0x0\n" + "3:" // Block loop + "ldr q3, [x28, #0x0]\n" + "ldr q31, [x25, #0x0]\n" + "movi v28.16b, #0x4\n" + "movi v10.4s, #0x0\n" + "ldr q22, [x28, #0x10]\n" + "ldr q6, [x25, #0x10]\n" + "movi v29.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "ldr q27, [x28, #0x20]\n" + "ldr q30, [x28, #0x30]\n" + "movi v20.4s, #0x0\n" + "movi v24.16b, #0xf0\n" + "ldr d2, [x25, #-0x8]\n" + "ldr d26, [x23, #-0x8]\n" + "sshl v12.16b, v3.16b, v28.16b\n" + "sub x20, x28, #0x8\n" + "ldr d17, [x20, #0x0]\n" + "and v3.16b, v3.16b, v24.16b\n" + "subs x24, x24, #0x1\n" + "add x28, x28, #0x48\n" + ".inst 0x4f9fe18a // sdot v10.4s, v12.16b, v31.4b[0]\n" + ".inst 0x4fbfe19d // sdot v29.4s, v12.16b, v31.4b[1]\n" + ".inst 0x4f9fe989 // sdot v9.4s, v12.16b, v31.4b[2]\n" + ".inst 0x4fbfe994 // sdot v20.4s, v12.16b, v31.4b[3]\n" + "sshl v31.16b, v22.16b, v28.16b\n" + "and v22.16b, v22.16b, v24.16b\n" + "fcvtl v17.4s, v17.4h\n" + "fcvtl v2.4s, v2.4h\n" + "fcvtl v26.4s, v26.4h\n" + ".inst 0x4f86e3ea // sdot v10.4s, v31.16b, v6.4b[0]\n" + ".inst 0x4fa6e3fd // sdot v29.4s, v31.16b, v6.4b[1]\n" + ".inst 0x4f86ebe9 // sdot v9.4s, v31.16b, v6.4b[2]\n" + ".inst 0x4fa6ebf4 // sdot v20.4s, v31.16b, v6.4b[3]\n" + "sshl v6.16b, v27.16b, v28.16b\n" + "sshl v28.16b, v30.16b, v28.16b\n" + "and v27.16b, v27.16b, v24.16b\n" + "and v30.16b, v30.16b, v24.16b\n" + "ldr q24, [x25, #0x20]\n" + ".inst 0x4f98e0ca // sdot v10.4s, v6.16b, v24.4b[0]\n" + ".inst 0x4fb8e0dd // sdot v29.4s, v6.16b, v24.4b[1]\n" + ".inst 0x4f98e8c9 // sdot v9.4s, v6.16b, v24.4b[2]\n" + ".inst 0x4fb8e8d4 // sdot v20.4s, v6.16b, v24.4b[3]\n" + "ldr q24, [x25, #0x30]\n" + ".inst 0x4f98e38a // sdot v10.4s, v28.16b, v24.4b[0]\n" + ".inst 0x4fb8e39d // sdot v29.4s, v28.16b, v24.4b[1]\n" + ".inst 0x4f98eb89 // sdot v9.4s, v28.16b, v24.4b[2]\n" + ".inst 0x4fb8eb94 // sdot v20.4s, v28.16b, v24.4b[3]\n" + "ldr q24, [x25, #0x40]\n" + ".inst 0x4f98e06a // sdot v10.4s, v3.16b, v24.4b[0]\n" + ".inst 0x4fb8e07d // sdot v29.4s, v3.16b, v24.4b[1]\n" + ".inst 0x4f98e869 // sdot v9.4s, v3.16b, v24.4b[2]\n" + ".inst 0x4fb8e874 // sdot v20.4s, v3.16b, v24.4b[3]\n" + "ldr q24, [x25, #0x50]\n" + ".inst 0x4f98e2ca // sdot v10.4s, v22.16b, v24.4b[0]\n" + ".inst 0x4fb8e2dd // sdot v29.4s, v22.16b, v24.4b[1]\n" + ".inst 0x4f98eac9 // sdot v9.4s, v22.16b, v24.4b[2]\n" + ".inst 0x4fb8ead4 // sdot v20.4s, v22.16b, v24.4b[3]\n" + "ldr q24, [x25, #0x60]\n" + ".inst 0x4f98e36a // sdot v10.4s, v27.16b, v24.4b[0]\n" + ".inst 0x4fb8e37d // sdot v29.4s, v27.16b, v24.4b[1]\n" + ".inst 0x4f98eb69 // sdot v9.4s, v27.16b, v24.4b[2]\n" + 
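+            // each 16-byte q8 load carries 4 bytes from each of the four interleaved rows
+            // (see ggml_quantize_mat_q8_0_4x4 above), so the lane-indexed sdots keep one
+            // accumulator per output row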
".inst 0x4fb8eb74 // sdot v20.4s, v27.16b, v24.4b[3]\n" + "ldr q24, [x25, #0x70]\n" + "add x25, x25, #0x88\n" + ".inst 0x4f98e3ca // sdot v10.4s, v30.16b, v24.4b[0]\n" + ".inst 0x4fb8e3dd // sdot v29.4s, v30.16b, v24.4b[1]\n" + ".inst 0x4f98ebc9 // sdot v9.4s, v30.16b, v24.4b[2]\n" + ".inst 0x4fb8ebd4 // sdot v20.4s, v30.16b, v24.4b[3]\n" + "fmul v24.4s, v17.4s, v2.s[0]\n" + "scvtf v10.4s, v10.4s, #0x4\n" + "scvtf v29.4s, v29.4s, #0x4\n" + "scvtf v9.4s, v9.4s, #0x4\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "fmla v15.4s, v10.4s, v24.4s\n" + "ldr q24, [x23, #0x0]\n" + "fmul v10.4s, v17.4s, v2.s[1]\n" + "fmla v19.4s, v29.4s, v10.4s\n" + "ldr q10, [x23, #0x10]\n" + "fmul v29.4s, v17.4s, v2.s[2]\n" + "fmul v2.4s, v17.4s, v2.s[3]\n" + "fmla v18.4s, v9.4s, v29.4s\n" + "movi v9.4s, #0x0\n" + "movi v29.4s, #0x0\n" + ".inst 0x4f98e189 // sdot v9.4s, v12.16b, v24.4b[0]\n" + ".inst 0x4fb8e19d // sdot v29.4s, v12.16b, v24.4b[1]\n" + "fmla v14.4s, v20.4s, v2.4s\n" + "movi v20.4s, #0x0\n" + "movi v2.4s, #0x0\n" + ".inst 0x4f98e994 // sdot v20.4s, v12.16b, v24.4b[2]\n" + ".inst 0x4fb8e982 // sdot v2.4s, v12.16b, v24.4b[3]\n" + "ldr q24, [x23, #0x20]\n" + ".inst 0x4f8ae3e9 // sdot v9.4s, v31.16b, v10.4b[0]\n" + ".inst 0x4faae3fd // sdot v29.4s, v31.16b, v10.4b[1]\n" + ".inst 0x4f8aebf4 // sdot v20.4s, v31.16b, v10.4b[2]\n" + ".inst 0x4faaebe2 // sdot v2.4s, v31.16b, v10.4b[3]\n" + "ldr q10, [x23, #0x30]\n" + ".inst 0x4f98e0c9 // sdot v9.4s, v6.16b, v24.4b[0]\n" + ".inst 0x4fb8e0dd // sdot v29.4s, v6.16b, v24.4b[1]\n" + ".inst 0x4f98e8d4 // sdot v20.4s, v6.16b, v24.4b[2]\n" + ".inst 0x4fb8e8c2 // sdot v2.4s, v6.16b, v24.4b[3]\n" + "ldr q24, [x23, #0x40]\n" + ".inst 0x4f8ae389 // sdot v9.4s, v28.16b, v10.4b[0]\n" + ".inst 0x4faae39d // sdot v29.4s, v28.16b, v10.4b[1]\n" + ".inst 0x4f8aeb94 // sdot v20.4s, v28.16b, v10.4b[2]\n" + ".inst 0x4faaeb82 // sdot v2.4s, v28.16b, v10.4b[3]\n" + "ldr q10, [x23, #0x50]\n" + ".inst 0x4f98e069 // sdot v9.4s, v3.16b, v24.4b[0]\n" + ".inst 0x4fb8e07d // sdot v29.4s, v3.16b, v24.4b[1]\n" + ".inst 0x4f98e874 // sdot v20.4s, v3.16b, v24.4b[2]\n" + ".inst 0x4fb8e862 // sdot v2.4s, v3.16b, v24.4b[3]\n" + "ldr q24, [x23, #0x60]\n" + ".inst 0x4f8ae2c9 // sdot v9.4s, v22.16b, v10.4b[0]\n" + ".inst 0x4faae2dd // sdot v29.4s, v22.16b, v10.4b[1]\n" + ".inst 0x4f8aead4 // sdot v20.4s, v22.16b, v10.4b[2]\n" + ".inst 0x4faaeac2 // sdot v2.4s, v22.16b, v10.4b[3]\n" + "ldr q10, [x23, #0x70]\n" + "add x23, x23, #0x88\n" + ".inst 0x4f98e369 // sdot v9.4s, v27.16b, v24.4b[0]\n" + ".inst 0x4fb8e37d // sdot v29.4s, v27.16b, v24.4b[1]\n" + ".inst 0x4f98eb74 // sdot v20.4s, v27.16b, v24.4b[2]\n" + ".inst 0x4fb8eb62 // sdot v2.4s, v27.16b, v24.4b[3]\n" + "ldr q24, [x22, #0x0]\n" + ".inst 0x4f8ae3c9 // sdot v9.4s, v30.16b, v10.4b[0]\n" + ".inst 0x4faae3dd // sdot v29.4s, v30.16b, v10.4b[1]\n" + ".inst 0x4f8aebd4 // sdot v20.4s, v30.16b, v10.4b[2]\n" + ".inst 0x4faaebc2 // sdot v2.4s, v30.16b, v10.4b[3]\n" + "fmul v10.4s, v17.4s, v26.s[0]\n" + "scvtf v9.4s, v9.4s, #0x4\n" + "scvtf v29.4s, v29.4s, #0x4\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "scvtf v2.4s, v2.4s, #0x4\n" + "fmla v11.4s, v9.4s, v10.4s\n" + "ldr q9, [x22, #0x10]\n" + "fmul v10.4s, v17.4s, v26.s[1]\n" + "fmla v13.4s, v29.4s, v10.4s\n" + "ldr d29, [x22, #-0x8]\n" + "fmul v10.4s, v17.4s, v26.s[2]\n" + "fmul v26.4s, v17.4s, v26.s[3]\n" + "fcvtl v29.4s, v29.4h\n" + "fmla v23.4s, v20.4s, v10.4s\n" + "movi v20.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "fmla v16.4s, v2.4s, v26.4s\n" + "movi v26.4s, #0x0\n" + "movi v2.4s, #0x0\n" + ".inst 0x4f98e194 // 
sdot v20.4s, v12.16b, v24.4b[0]\n" + ".inst 0x4fb8e18a // sdot v10.4s, v12.16b, v24.4b[1]\n" + ".inst 0x4f98e99a // sdot v26.4s, v12.16b, v24.4b[2]\n" + ".inst 0x4fb8e982 // sdot v2.4s, v12.16b, v24.4b[3]\n" + "ldr q24, [x22, #0x20]\n" + ".inst 0x4f89e3f4 // sdot v20.4s, v31.16b, v9.4b[0]\n" + ".inst 0x4fa9e3ea // sdot v10.4s, v31.16b, v9.4b[1]\n" + ".inst 0x4f89ebfa // sdot v26.4s, v31.16b, v9.4b[2]\n" + ".inst 0x4fa9ebe2 // sdot v2.4s, v31.16b, v9.4b[3]\n" + "ldr q9, [x22, #0x30]\n" + ".inst 0x4f98e0d4 // sdot v20.4s, v6.16b, v24.4b[0]\n" + ".inst 0x4fb8e0ca // sdot v10.4s, v6.16b, v24.4b[1]\n" + ".inst 0x4f98e8da // sdot v26.4s, v6.16b, v24.4b[2]\n" + ".inst 0x4fb8e8c2 // sdot v2.4s, v6.16b, v24.4b[3]\n" + "ldr q24, [x22, #0x40]\n" + ".inst 0x4f89e394 // sdot v20.4s, v28.16b, v9.4b[0]\n" + ".inst 0x4fa9e38a // sdot v10.4s, v28.16b, v9.4b[1]\n" + ".inst 0x4f89eb9a // sdot v26.4s, v28.16b, v9.4b[2]\n" + ".inst 0x4fa9eb82 // sdot v2.4s, v28.16b, v9.4b[3]\n" + "ldr q9, [x22, #0x50]\n" + ".inst 0x4f98e074 // sdot v20.4s, v3.16b, v24.4b[0]\n" + ".inst 0x4fb8e06a // sdot v10.4s, v3.16b, v24.4b[1]\n" + ".inst 0x4f98e87a // sdot v26.4s, v3.16b, v24.4b[2]\n" + ".inst 0x4fb8e862 // sdot v2.4s, v3.16b, v24.4b[3]\n" + "ldr q24, [x22, #0x60]\n" + ".inst 0x4f89e2d4 // sdot v20.4s, v22.16b, v9.4b[0]\n" + ".inst 0x4fa9e2ca // sdot v10.4s, v22.16b, v9.4b[1]\n" + ".inst 0x4f89eada // sdot v26.4s, v22.16b, v9.4b[2]\n" + ".inst 0x4fa9eac2 // sdot v2.4s, v22.16b, v9.4b[3]\n" + "ldr q9, [x22, #0x70]\n" + "add x22, x22, #0x88\n" + ".inst 0x4f98e374 // sdot v20.4s, v27.16b, v24.4b[0]\n" + ".inst 0x4fb8e36a // sdot v10.4s, v27.16b, v24.4b[1]\n" + ".inst 0x4f98eb7a // sdot v26.4s, v27.16b, v24.4b[2]\n" + ".inst 0x4fb8eb62 // sdot v2.4s, v27.16b, v24.4b[3]\n" + "ldr q24, [x21, #0x0]\n" + ".inst 0x4f89e3d4 // sdot v20.4s, v30.16b, v9.4b[0]\n" + ".inst 0x4fa9e3ca // sdot v10.4s, v30.16b, v9.4b[1]\n" + ".inst 0x4f89ebda // sdot v26.4s, v30.16b, v9.4b[2]\n" + ".inst 0x4fa9ebc2 // sdot v2.4s, v30.16b, v9.4b[3]\n" + "fmul v9.4s, v17.4s, v29.s[0]\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "scvtf v10.4s, v10.4s, #0x4\n" + "scvtf v26.4s, v26.4s, #0x4\n" + "scvtf v2.4s, v2.4s, #0x4\n" + "fmla v25.4s, v20.4s, v9.4s\n" + "ldr q9, [x21, #0x10]\n" + "fmul v20.4s, v17.4s, v29.s[1]\n" + "fmla v7.4s, v10.4s, v20.4s\n" + "ldr d20, [x21, #-0x8]\n" + "fmul v10.4s, v17.4s, v29.s[2]\n" + "fmul v29.4s, v17.4s, v29.s[3]\n" + "fcvtl v20.4s, v20.4h\n" + "fmla v0.4s, v26.4s, v10.4s\n" + "movi v26.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "fmla v4.4s, v2.4s, v29.4s\n" + "movi v2.4s, #0x0\n" + "movi v29.4s, #0x0\n" + ".inst 0x4f98e19a // sdot v26.4s, v12.16b, v24.4b[0]\n" + ".inst 0x4fb8e18a // sdot v10.4s, v12.16b, v24.4b[1]\n" + ".inst 0x4f98e982 // sdot v2.4s, v12.16b, v24.4b[2]\n" + ".inst 0x4fb8e99d // sdot v29.4s, v12.16b, v24.4b[3]\n" + "ldr q12, [x21, #0x20]\n" + "fmul v24.4s, v17.4s, v20.s[0]\n" + ".inst 0x4f89e3fa // sdot v26.4s, v31.16b, v9.4b[0]\n" + ".inst 0x4fa9e3ea // sdot v10.4s, v31.16b, v9.4b[1]\n" + ".inst 0x4f89ebe2 // sdot v2.4s, v31.16b, v9.4b[2]\n" + ".inst 0x4fa9ebfd // sdot v29.4s, v31.16b, v9.4b[3]\n" + "ldr q9, [x21, #0x30]\n" + "fmul v31.4s, v17.4s, v20.s[1]\n" + ".inst 0x4f8ce0da // sdot v26.4s, v6.16b, v12.4b[0]\n" + ".inst 0x4face0ca // sdot v10.4s, v6.16b, v12.4b[1]\n" + ".inst 0x4f8ce8c2 // sdot v2.4s, v6.16b, v12.4b[2]\n" + ".inst 0x4face8dd // sdot v29.4s, v6.16b, v12.4b[3]\n" + "ldr q12, [x21, #0x40]\n" + "fmul v6.4s, v17.4s, v20.s[2]\n" + "fmul v20.4s, v17.4s, v20.s[3]\n" + ".inst 0x4f89e39a // sdot v26.4s, 
v28.16b, v9.4b[0]\n" + ".inst 0x4fa9e38a // sdot v10.4s, v28.16b, v9.4b[1]\n" + ".inst 0x4f89eb82 // sdot v2.4s, v28.16b, v9.4b[2]\n" + ".inst 0x4fa9eb9d // sdot v29.4s, v28.16b, v9.4b[3]\n" + "ldr q9, [x21, #0x50]\n" + ".inst 0x4f8ce07a // sdot v26.4s, v3.16b, v12.4b[0]\n" + ".inst 0x4face06a // sdot v10.4s, v3.16b, v12.4b[1]\n" + ".inst 0x4f8ce862 // sdot v2.4s, v3.16b, v12.4b[2]\n" + ".inst 0x4face87d // sdot v29.4s, v3.16b, v12.4b[3]\n" + "ldr q12, [x21, #0x60]\n" + ".inst 0x4f89e2da // sdot v26.4s, v22.16b, v9.4b[0]\n" + ".inst 0x4fa9e2ca // sdot v10.4s, v22.16b, v9.4b[1]\n" + ".inst 0x4f89eac2 // sdot v2.4s, v22.16b, v9.4b[2]\n" + ".inst 0x4fa9eadd // sdot v29.4s, v22.16b, v9.4b[3]\n" + "ldr q17, [x21, #0x70]\n" + "add x21, x21, #0x88\n" + ".inst 0x4f8ce37a // sdot v26.4s, v27.16b, v12.4b[0]\n" + ".inst 0x4face36a // sdot v10.4s, v27.16b, v12.4b[1]\n" + ".inst 0x4f8ceb62 // sdot v2.4s, v27.16b, v12.4b[2]\n" + ".inst 0x4faceb7d // sdot v29.4s, v27.16b, v12.4b[3]\n" + ".inst 0x4f91e3da // sdot v26.4s, v30.16b, v17.4b[0]\n" + ".inst 0x4fb1e3ca // sdot v10.4s, v30.16b, v17.4b[1]\n" + ".inst 0x4f91ebc2 // sdot v2.4s, v30.16b, v17.4b[2]\n" + ".inst 0x4fb1ebdd // sdot v29.4s, v30.16b, v17.4b[3]\n" + "scvtf v26.4s, v26.4s, #0x4\n" + "scvtf v10.4s, v10.4s, #0x4\n" + "fmla v5.4s, v26.4s, v24.4s\n" + "scvtf v2.4s, v2.4s, #0x4\n" + "scvtf v29.4s, v29.4s, #0x4\n" + "fmla v21.4s, v10.4s, v31.4s\n" + "fmla v8.4s, v2.4s, v6.4s\n" + "fmla v1.4s, v29.4s, v20.4s\n" + "bgt 3b\n" + "mov x20, %x[res_ptr]\n" + "subs x27, x27, #0x4\n" + "add %x[res_ptr], %x[res_ptr], #0x10\n" + "str q15, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q19, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q18, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q14, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q11, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q13, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q23, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q16, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q25, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q7, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q0, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q4, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q5, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q21, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q8, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q1, [x20, #0x0]\n" + "bne 2b\n" + "mov x20, #0x4\n" + "sub x10, x10, #0x10\n" + "cmp x10, #0x10\n" + "mov %x[res_ptr], x26\n" + "madd %x[a_ptr], x20, x9, %x[a_ptr]\n" + "bge 1b\n" + "4:" // Row loop skip + "cbz x10, 9f\n" + "5:" // Row tail: Row loop + "add x24, %x[b_ptr], #0x8\n" + "mov x23, %x[nc]\n" + "add x22, %x[res_ptr], %x[res_stride], LSL #2\n" + "6:" // Row tail: Column loop + "movi v15.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "add x25, %x[a_ptr], #0x8\n" + "mov x21, %x[nb]\n" + "movi v18.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "7:" // Row tail: Block loop + "ldr q7, [x24, #0x0]\n" + "ldr q5, [x25, #0x0]\n" + "movi v9.16b, #0x4\n" + "movi v4.4s, #0x0\n" + "ldr q3, [x24, #0x10]\n" + "ldr q2, [x25, #0x10]\n" + "movi v1.4s, #0x0\n" + "movi v0.4s, #0x0\n" + "ldr q13, [x24, #0x20]\n" + "ldr q31, [x25, #0x20]\n" + "movi v30.4s, #0x0\n" + "movi v29.16b, #0xf0\n" + "ldr q28, [x24, #0x30]\n" + "ldr q27, [x25, #0x30]\n" + "sshl v20.16b, v7.16b, v9.16b\n" + "sub x20, x24, #0x8\n" + "ldr q26, [x25, #0x40]\n" + "ldr q25, [x25, 
#0x50]\n" + "sshl v17.16b, v3.16b, v9.16b\n" + "and v7.16b, v7.16b, v29.16b\n" + "ldr q24, [x25, #0x60]\n" + "ldr q16, [x25, #0x70]\n" + "sshl v22.16b, v13.16b, v9.16b\n" + "and v3.16b, v3.16b, v29.16b\n" + "ldr d21, [x20, #0x0]\n" + "ldr d12, [x25, #-0x8]\n" + ".inst 0x4f85e284 // sdot v4.4s, v20.16b, v5.4b[0]\n" + ".inst 0x4fa5e281 // sdot v1.4s, v20.16b, v5.4b[1]\n" + ".inst 0x4f85ea80 // sdot v0.4s, v20.16b, v5.4b[2]\n" + ".inst 0x4fa5ea9e // sdot v30.4s, v20.16b, v5.4b[3]\n" + "sshl v9.16b, v28.16b, v9.16b\n" + "subs x21, x21, #0x1\n" + "and v13.16b, v13.16b, v29.16b\n" + "and v28.16b, v28.16b, v29.16b\n" + "add x25, x25, #0x88\n" + "add x24, x24, #0x48\n" + "fcvtl v21.4s, v21.4h\n" + "fcvtl v12.4s, v12.4h\n" + ".inst 0x4f82e224 // sdot v4.4s, v17.16b, v2.4b[0]\n" + ".inst 0x4fa2e221 // sdot v1.4s, v17.16b, v2.4b[1]\n" + ".inst 0x4f82ea20 // sdot v0.4s, v17.16b, v2.4b[2]\n" + ".inst 0x4fa2ea3e // sdot v30.4s, v17.16b, v2.4b[3]\n" + "fmul v11.4s, v21.4s, v12.s[0]\n" + "fmul v23.4s, v21.4s, v12.s[1]\n" + "fmul v17.4s, v21.4s, v12.s[2]\n" + ".inst 0x4f9fe2c4 // sdot v4.4s, v22.16b, v31.4b[0]\n" + "fmul v6.4s, v21.4s, v12.s[3]\n" + ".inst 0x4fbfe2c1 // sdot v1.4s, v22.16b, v31.4b[1]\n" + ".inst 0x4f9feac0 // sdot v0.4s, v22.16b, v31.4b[2]\n" + ".inst 0x4fbfeade // sdot v30.4s, v22.16b, v31.4b[3]\n" + ".inst 0x4f9be124 // sdot v4.4s, v9.16b, v27.4b[0]\n" + ".inst 0x4fbbe121 // sdot v1.4s, v9.16b, v27.4b[1]\n" + ".inst 0x4f9be920 // sdot v0.4s, v9.16b, v27.4b[2]\n" + ".inst 0x4fbbe93e // sdot v30.4s, v9.16b, v27.4b[3]\n" + ".inst 0x4f9ae0e4 // sdot v4.4s, v7.16b, v26.4b[0]\n" + ".inst 0x4fbae0e1 // sdot v1.4s, v7.16b, v26.4b[1]\n" + ".inst 0x4f9ae8e0 // sdot v0.4s, v7.16b, v26.4b[2]\n" + ".inst 0x4fbae8fe // sdot v30.4s, v7.16b, v26.4b[3]\n" + ".inst 0x4f99e064 // sdot v4.4s, v3.16b, v25.4b[0]\n" + ".inst 0x4fb9e061 // sdot v1.4s, v3.16b, v25.4b[1]\n" + ".inst 0x4f99e860 // sdot v0.4s, v3.16b, v25.4b[2]\n" + ".inst 0x4fb9e87e // sdot v30.4s, v3.16b, v25.4b[3]\n" + ".inst 0x4f98e1a4 // sdot v4.4s, v13.16b, v24.4b[0]\n" + ".inst 0x4fb8e1a1 // sdot v1.4s, v13.16b, v24.4b[1]\n" + ".inst 0x4f98e9a0 // sdot v0.4s, v13.16b, v24.4b[2]\n" + ".inst 0x4fb8e9be // sdot v30.4s, v13.16b, v24.4b[3]\n" + ".inst 0x4f90e384 // sdot v4.4s, v28.16b, v16.4b[0]\n" + ".inst 0x4fb0e381 // sdot v1.4s, v28.16b, v16.4b[1]\n" + ".inst 0x4f90eb80 // sdot v0.4s, v28.16b, v16.4b[2]\n" + ".inst 0x4fb0eb9e // sdot v30.4s, v28.16b, v16.4b[3]\n" + "scvtf v4.4s, v4.4s, #0x4\n" + "scvtf v1.4s, v1.4s, #0x4\n" + "scvtf v0.4s, v0.4s, #0x4\n" + "fmla v15.4s, v4.4s, v11.4s\n" + "scvtf v30.4s, v30.4s, #0x4\n" + "fmla v19.4s, v1.4s, v23.4s\n" + "fmla v18.4s, v0.4s, v17.4s\n" + "fmla v14.4s, v30.4s, v6.4s\n" + "bgt 7b\n" + "mov x20, %x[res_ptr]\n" + "cmp x10, #0x1\n" + "str q15, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x10, #0x2\n" + "str q19, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x10, #0x3\n" + "str q18, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "str q14, [x20, #0x0]\n" + "8:" // Row tail: Accumulator store skip + "subs x23, x23, #0x4\n" + "add %x[res_ptr], %x[res_ptr], #0x10\n" + "bne 6b\n" + "subs x10, x10, #0x4\n" + "add %x[a_ptr], %x[a_ptr], x9\n" + "mov %x[res_ptr], x22\n" + "bgt 5b\n" + "9:" // Row tail: Row loop skip + : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr) + : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", 
"v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); + return; +#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) + { + float sumf[4][4]; + int sumi; + + for (int y = 0; y < nr / 4; y++) { + const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; + } + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; + } + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); + } + } + } + } + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) + s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; + } + } + } + } +} + +void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 4; + const int blocklen = 8; + + assert (n % qk == 0); + assert (nr % 4 == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + +#if ! ((defined(_MSC_VER)) && ! 
defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) + const void * b_ptr = vx; + const void * a_ptr = vy; + float * res_ptr = s; + size_t res_stride = bs * sizeof(float); + + __asm__ __volatile__( + "mov x10, %x[nr]\n" + "mov x9, #0x88\n" + "cmp x10, #0x10\n" + "mul x9, %x[nb], x9\n" + "blt 4f\n" + "1:" // Row loop + "add x28, %x[b_ptr], #0x8\n" + "mov x27, %x[nc]\n" + "add x26, %x[res_ptr], %x[res_stride], LSL #4\n" + "2:" // Column loop + "add x25, %x[a_ptr], #0x8\n" + "movi v2.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "mov x24, %x[nb]\n" + "add x23, x25, x9\n" + "movi v12.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "add x22, x23, x9\n" + "movi v11.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "add x21, x22, x9\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v5.16b, #0x0\n" + "movi v7.16b, #0x0\n" + "movi v4.16b, #0x0\n" + "movi v6.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "3:" // Block loop + "ldr q21, [x28, #0x0]\n" + "ldr q16, [x28, #0x10]\n" + "movi v1.16b, #0x4\n" + "movi v19.4s, #0x0\n" + "ldr q27, [x25, #0x0]\n" + "ldr q15, [x25, #0x10]\n" + "movi v26.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "ldr q29, [x28, #0x20]\n" + "ldr q3, [x28, #0x30]\n" + "movi v17.4s, #0x0\n" + "movi v0.16b, #0xf0\n" + "ldr d20, [x25, #-0x8]\n" + "ldr d9, [x23, #-0x8]\n" + "sshl v8.16b, v21.16b, v1.16b\n" + "sshl v31.16b, v16.16b, v1.16b\n" + "and v21.16b, v21.16b, v0.16b\n" + "and v16.16b, v16.16b, v0.16b\n" + "sub x20, x28, #0x8\n" + "subs x24, x24, #0x1\n" + "add x28, x28, #0x48\n" + ".inst 0x4e88a773 // smmla v19.4s, v27.16b, v8.16b\n" + ".inst 0x4e9fa77a // smmla v26.4s, v27.16b, v31.16b\n" + "ldr q27, [x25, #0x20]\n" + ".inst 0x4e88a5f2 // smmla v18.4s, v15.16b, v8.16b\n" + ".inst 0x4e9fa5f1 // smmla v17.4s, v15.16b, v31.16b\n" + "sshl v15.16b, v29.16b, v1.16b\n" + "sshl v1.16b, v3.16b, v1.16b\n" + "and v29.16b, v29.16b, v0.16b\n" + "and v3.16b, v3.16b, v0.16b\n" + "ldr q0, [x25, #0x30]\n" + "fcvtl v20.4s, v20.4h\n" + ".inst 0x4e8fa773 // smmla v19.4s, v27.16b, v15.16b\n" + "fcvtl v9.4s, v9.4h\n" + ".inst 0x4e81a77a // smmla v26.4s, v27.16b, v1.16b\n" + "ldr q27, [x25, #0x40]\n" + ".inst 0x4e8fa412 // smmla v18.4s, v0.16b, v15.16b\n" + ".inst 0x4e81a411 // smmla v17.4s, v0.16b, v1.16b\n" + "ldr q0, [x25, #0x50]\n" + ".inst 0x4e95a773 // smmla v19.4s, v27.16b, v21.16b\n" + ".inst 0x4e90a77a // smmla v26.4s, v27.16b, v16.16b\n" + "ldr q27, [x25, #0x60]\n" + ".inst 0x4e95a412 // smmla v18.4s, v0.16b, v21.16b\n" + ".inst 0x4e90a411 // smmla v17.4s, v0.16b, v16.16b\n" + "ldr q0, [x25, #0x70]\n" + "add x25, x25, #0x88\n" + ".inst 0x4e9da773 // smmla v19.4s, v27.16b, v29.16b\n" + ".inst 0x4e83a77a // smmla v26.4s, v27.16b, v3.16b\n" + "ldr d27, [x20, #0x0]\n" + ".inst 0x4e9da412 // smmla v18.4s, v0.16b, v29.16b\n" + ".inst 0x4e83a411 // smmla v17.4s, v0.16b, v3.16b\n" + "fcvtl v27.4s, v27.4h\n" + "uzp1 v0.2d, v19.2d, v26.2d\n" + "uzp2 v26.2d, v19.2d, v26.2d\n" + "fmul v19.4s, v27.4s, v20.s[0]\n" + "scvtf v0.4s, v0.4s, #0x4\n" + "scvtf v26.4s, v26.4s, #0x4\n" + "fmla v2.4s, v0.4s, v19.4s\n" + "ldr q19, [x23, #0x0]\n" + "uzp1 v0.2d, v18.2d, v17.2d\n" + "uzp2 v18.2d, v18.2d, v17.2d\n" + "fmul v17.4s, v27.4s, v20.s[1]\n" + "scvtf v0.4s, v0.4s, #0x4\n" + "scvtf v18.4s, v18.4s, #0x4\n" + "fmla v10.4s, v26.4s, v17.4s\n" + "ldr q17, [x23, #0x10]\n" + "fmul v26.4s, v27.4s, v20.s[2]\n" + "fmul v20.4s, v27.4s, v20.s[3]\n" + "fmla v12.4s, v0.4s, v26.4s\n" + "ldr d0, [x22, #-0x8]\n" + "ldr d26, 
[x21, #-0x8]\n" + "fcvtl v0.4s, v0.4h\n" + "fmla v28.4s, v18.4s, v20.4s\n" + "movi v20.4s, #0x0\n" + "movi v18.4s, #0x0\n" + ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n" + ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n" + "ldr q19, [x23, #0x20]\n" + "fcvtl v26.4s, v26.4h\n" + ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n" + ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n" + "ldr q19, [x23, #0x40]\n" + ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n" + ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n" + "ldr q19, [x23, #0x60]\n" + ".inst 0x4e9da674 // smmla v20.4s, v19.16b, v29.16b\n" + ".inst 0x4e83a672 // smmla v18.4s, v19.16b, v3.16b\n" + "uzp1 v19.2d, v20.2d, v18.2d\n" + "scvtf v19.4s, v19.4s, #0x4\n" + "uzp2 v20.2d, v20.2d, v18.2d\n" + "fmul v18.4s, v27.4s, v9.s[0]\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "fmla v11.4s, v19.4s, v18.4s\n" + "ldr q18, [x22, #0x0]\n" + "fmul v19.4s, v27.4s, v9.s[1]\n" + "fmla v13.4s, v20.4s, v19.4s\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + ".inst 0x4e88a633 // smmla v19.4s, v17.16b, v8.16b\n" + ".inst 0x4e9fa634 // smmla v20.4s, v17.16b, v31.16b\n" + "ldr q17, [x23, #0x30]\n" + ".inst 0x4e8fa633 // smmla v19.4s, v17.16b, v15.16b\n" + ".inst 0x4e81a634 // smmla v20.4s, v17.16b, v1.16b\n" + "ldr q17, [x23, #0x50]\n" + ".inst 0x4e95a633 // smmla v19.4s, v17.16b, v21.16b\n" + ".inst 0x4e90a634 // smmla v20.4s, v17.16b, v16.16b\n" + "ldr q17, [x23, #0x70]\n" + "add x23, x23, #0x88\n" + ".inst 0x4e9da633 // smmla v19.4s, v17.16b, v29.16b\n" + ".inst 0x4e83a634 // smmla v20.4s, v17.16b, v3.16b\n" + "uzp1 v17.2d, v19.2d, v20.2d\n" + "scvtf v17.4s, v17.4s, #0x4\n" + "uzp2 v20.2d, v19.2d, v20.2d\n" + "fmul v19.4s, v27.4s, v9.s[2]\n" + "fmul v9.4s, v27.4s, v9.s[3]\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "fmla v22.4s, v17.4s, v19.4s\n" + "ldr q17, [x22, #0x10]\n" + "movi v19.4s, #0x0\n" + ".inst 0x4e88a653 // smmla v19.4s, v18.16b, v8.16b\n" + "fmla v23.4s, v20.4s, v9.4s\n" + "movi v20.4s, #0x0\n" + "movi v9.4s, #0x0\n" + ".inst 0x4e9fa654 // smmla v20.4s, v18.16b, v31.16b\n" + "ldr q18, [x22, #0x20]\n" + ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n" + ".inst 0x4e8fa653 // smmla v19.4s, v18.16b, v15.16b\n" + ".inst 0x4e81a654 // smmla v20.4s, v18.16b, v1.16b\n" + "ldr q18, [x22, #0x40]\n" + ".inst 0x4e95a653 // smmla v19.4s, v18.16b, v21.16b\n" + ".inst 0x4e90a654 // smmla v20.4s, v18.16b, v16.16b\n" + "ldr q18, [x22, #0x60]\n" + ".inst 0x4e9da653 // smmla v19.4s, v18.16b, v29.16b\n" + ".inst 0x4e83a654 // smmla v20.4s, v18.16b, v3.16b\n" + "movi v18.4s, #0x0\n" + ".inst 0x4e9fa632 // smmla v18.4s, v17.16b, v31.16b\n" + "ldr q17, [x22, #0x30]\n" + ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n" + ".inst 0x4e81a632 // smmla v18.4s, v17.16b, v1.16b\n" + "ldr q17, [x22, #0x50]\n" + ".inst 0x4e95a629 // smmla v9.4s, v17.16b, v21.16b\n" + ".inst 0x4e90a632 // smmla v18.4s, v17.16b, v16.16b\n" + "ldr q17, [x22, #0x70]\n" + "add x22, x22, #0x88\n" + ".inst 0x4e9da629 // smmla v9.4s, v17.16b, v29.16b\n" + ".inst 0x4e83a632 // smmla v18.4s, v17.16b, v3.16b\n" + "uzp1 v17.2d, v19.2d, v20.2d\n" + "uzp2 v20.2d, v19.2d, v20.2d\n" + "fmul v19.4s, v27.4s, v0.s[0]\n" + "scvtf v17.4s, v17.4s, #0x4\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "fmla v25.4s, v17.4s, v19.4s\n" + "ldr q19, [x21, #0x0]\n" + "fmul v17.4s, v27.4s, v0.s[1]\n" + "fmla v5.4s, v20.4s, v17.4s\n" + "ldr q17, [x21, #0x10]\n" + "uzp1 v20.2d, v9.2d, v18.2d\n" + "uzp2 v9.2d, v9.2d, v18.2d\n" + "fmul v18.4s, v27.4s, v0.s[2]\n" + "fmul v0.4s, v27.4s, v0.s[3]\n" + "scvtf 
v20.4s, v20.4s, #0x4\n" + "scvtf v9.4s, v9.4s, #0x4\n" + "fmla v7.4s, v20.4s, v18.4s\n" + "movi v20.4s, #0x0\n" + "movi v18.4s, #0x0\n" + ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n" + ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n" + "ldr q19, [x21, #0x20]\n" + "fmla v4.4s, v9.4s, v0.4s\n" + "movi v9.4s, #0x0\n" + "movi v0.4s, #0x0\n" + ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n" + "fmul v8.4s, v27.4s, v26.s[0]\n" + ".inst 0x4e9fa620 // smmla v0.4s, v17.16b, v31.16b\n" + "ldr q17, [x21, #0x30]\n" + ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n" + "fmul v31.4s, v27.4s, v26.s[1]\n" + ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n" + "ldr q19, [x21, #0x40]\n" + ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n" + "fmul v15.4s, v27.4s, v26.s[2]\n" + "fmul v27.4s, v27.4s, v26.s[3]\n" + ".inst 0x4e81a620 // smmla v0.4s, v17.16b, v1.16b\n" + "ldr q1, [x21, #0x50]\n" + ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n" + ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n" + "ldr q26, [x21, #0x60]\n" + ".inst 0x4e95a429 // smmla v9.4s, v1.16b, v21.16b\n" + ".inst 0x4e90a420 // smmla v0.4s, v1.16b, v16.16b\n" + "ldr q21, [x21, #0x70]\n" + "add x21, x21, #0x88\n" + ".inst 0x4e9da754 // smmla v20.4s, v26.16b, v29.16b\n" + ".inst 0x4e83a752 // smmla v18.4s, v26.16b, v3.16b\n" + ".inst 0x4e9da6a9 // smmla v9.4s, v21.16b, v29.16b\n" + ".inst 0x4e83a6a0 // smmla v0.4s, v21.16b, v3.16b\n" + "uzp1 v29.2d, v20.2d, v18.2d\n" + "uzp2 v21.2d, v20.2d, v18.2d\n" + "scvtf v29.4s, v29.4s, #0x4\n" + "uzp1 v18.2d, v9.2d, v0.2d\n" + "uzp2 v16.2d, v9.2d, v0.2d\n" + "scvtf v21.4s, v21.4s, #0x4\n" + "fmla v6.4s, v29.4s, v8.4s\n" + "scvtf v18.4s, v18.4s, #0x4\n" + "scvtf v16.4s, v16.4s, #0x4\n" + "fmla v30.4s, v21.4s, v31.4s\n" + "fmla v24.4s, v18.4s, v15.4s\n" + "fmla v14.4s, v16.4s, v27.4s\n" + "bgt 3b\n" + "mov x20, %x[res_ptr]\n" + "subs x27, x27, #0x4\n" + "add %x[res_ptr], %x[res_ptr], #0x10\n" + "str q2, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q10, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q12, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q28, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q11, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q13, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q22, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q23, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q25, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q5, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q7, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q4, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q6, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q30, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q24, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q14, [x20, #0x0]\n" + "bne 2b\n" + "mov x20, #0x4\n" + "sub x10, x10, #0x10\n" + "cmp x10, #0x10\n" + "mov %x[res_ptr], x26\n" + "madd %x[a_ptr], x20, x9, %x[a_ptr]\n" + "bge 1b\n" + "4:" // Row loop skip + "cbz x10, 9f\n" + "5:" // Row tail: Row loop + "add x24, %x[b_ptr], #0x8\n" + "mov x23, %x[nc]\n" + "add x22, %x[res_ptr], %x[res_stride], LSL #2\n" + "6:" // Row tail: Column loop + "movi v2.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "add x25, %x[a_ptr], #0x8\n" + "mov x21, %x[nb]\n" + "movi v12.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "7:" // Row tail: Block loop + "ldr q6, [x24, #0x0]\n" + "ldr q5, [x24, #0x10]\n" + "movi v17.16b, #0x4\n" + "movi v8.4s, #0x0\n" 
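+        // (illustrative note) the two 4-bit weights per byte are expanded as
+        // lo = b << 4 (sshl by the 0x4 splat) and hi = b & 0xF0 (and with the
+        // 0xF0 splat), so both values come out pre-multiplied by 16; the later
+        // "scvtf ..., #0x4" converts the int32 accumulators with 4 fractional
+        // bits, i.e. divides by 16, cancelling that factor exactly as the
+        // ">> 4" does in the scalar fallback paths.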
+ "ldr q4, [x25, #0x0]\n" + "ldr q13, [x25, #0x10]\n" + "movi v27.4s, #0x0\n" + "movi v0.4s, #0x0\n" + "ldr q31, [x24, #0x20]\n" + "ldr q14, [x24, #0x30]\n" + "movi v29.4s, #0x0\n" + "movi v22.16b, #0xf0\n" + "ldr q11, [x25, #0x20]\n" + "ldr q23, [x25, #0x30]\n" + "sshl v21.16b, v6.16b, v17.16b\n" + "sshl v16.16b, v5.16b, v17.16b\n" + "ldr q20, [x25, #0x40]\n" + "ldr q26, [x25, #0x50]\n" + "and v6.16b, v6.16b, v22.16b\n" + "and v5.16b, v5.16b, v22.16b\n" + "ldr q25, [x25, #0x60]\n" + "ldr q3, [x25, #0x70]\n" + "sshl v19.16b, v31.16b, v17.16b\n" + "sshl v18.16b, v14.16b, v17.16b\n" + "ldr d17, [x25, #-0x8]\n" + ".inst 0x4e95a488 // smmla v8.4s, v4.16b, v21.16b\n" + ".inst 0x4e90a49b // smmla v27.4s, v4.16b, v16.16b\n" + "and v31.16b, v31.16b, v22.16b\n" + ".inst 0x4e95a5a0 // smmla v0.4s, v13.16b, v21.16b\n" + ".inst 0x4e90a5bd // smmla v29.4s, v13.16b, v16.16b\n" + "and v14.16b, v14.16b, v22.16b\n" + "sub x20, x24, #0x8\n" + "ldr d16, [x20, #0x0]\n" + "subs x21, x21, #0x1\n" + "add x25, x25, #0x88\n" + "fcvtl v17.4s, v17.4h\n" + "add x24, x24, #0x48\n" + ".inst 0x4e93a568 // smmla v8.4s, v11.16b, v19.16b\n" + ".inst 0x4e92a57b // smmla v27.4s, v11.16b, v18.16b\n" + ".inst 0x4e93a6e0 // smmla v0.4s, v23.16b, v19.16b\n" + ".inst 0x4e92a6fd // smmla v29.4s, v23.16b, v18.16b\n" + "fcvtl v16.4s, v16.4h\n" + ".inst 0x4e86a688 // smmla v8.4s, v20.16b, v6.16b\n" + ".inst 0x4e85a69b // smmla v27.4s, v20.16b, v5.16b\n" + "fmul v23.4s, v16.4s, v17.s[0]\n" + "fmul v21.4s, v16.4s, v17.s[1]\n" + "fmul v1.4s, v16.4s, v17.s[2]\n" + "fmul v20.4s, v16.4s, v17.s[3]\n" + ".inst 0x4e86a740 // smmla v0.4s, v26.16b, v6.16b\n" + ".inst 0x4e85a75d // smmla v29.4s, v26.16b, v5.16b\n" + ".inst 0x4e9fa728 // smmla v8.4s, v25.16b, v31.16b\n" + ".inst 0x4e8ea73b // smmla v27.4s, v25.16b, v14.16b\n" + ".inst 0x4e9fa460 // smmla v0.4s, v3.16b, v31.16b\n" + ".inst 0x4e8ea47d // smmla v29.4s, v3.16b, v14.16b\n" + "uzp1 v19.2d, v8.2d, v27.2d\n" + "uzp2 v18.2d, v8.2d, v27.2d\n" + "scvtf v19.4s, v19.4s, #0x4\n" + "uzp1 v17.2d, v0.2d, v29.2d\n" + "uzp2 v16.2d, v0.2d, v29.2d\n" + "scvtf v18.4s, v18.4s, #0x4\n" + "fmla v2.4s, v19.4s, v23.4s\n" + "scvtf v17.4s, v17.4s, #0x4\n" + "scvtf v16.4s, v16.4s, #0x4\n" + "fmla v10.4s, v18.4s, v21.4s\n" + "fmla v12.4s, v17.4s, v1.4s\n" + "fmla v28.4s, v16.4s, v20.4s\n" + "bgt 7b\n" + "mov x20, %x[res_ptr]\n" + "cmp x10, #0x1\n" + "str q2, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x10, #0x2\n" + "str q10, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x10, #0x3\n" + "str q12, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "str q28, [x20, #0x0]\n" + "8:" // Row tail: Accumulator store skip + "subs x23, x23, #0x4\n" + "add %x[res_ptr], %x[res_ptr], #0x10\n" + "bne 6b\n" + "subs x10, x10, #0x4\n" + "add %x[a_ptr], %x[a_ptr], x9\n" + "mov %x[res_ptr], x22\n" + "bgt 5b\n" + "9:" // Row tail: Row loop skip + : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr) + : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); + return; +#endif // #if ! ((defined(_MSC_VER)) && ! 
defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) + float sumf[4][4]; + int sumi; + + for (int y = 0; y < nr / 4; y++) { + const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; + } + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; + } + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); + } + } + } + } + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) + s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; + } + } + } +} + +void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 8; + const int blocklen = 8; + + assert (n % qk == 0); + assert (nr % 4 == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + +#if ! ((defined(_MSC_VER)) && ! 
defined(__clang__)) && defined(__aarch64__) +#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) + if (ggml_cpu_get_sve_cnt() == QK8_0) { + const void * b_ptr = vx; + const void * a_ptr = vy; + float * res_ptr = s; + size_t res_stride = bs * sizeof(float); + + __asm__ __volatile__( + "mov x20, #0x4\n" + "mov x13, %x[nr]\n" + "mov z28.s, #-0x4\n" + "mov x12, #0x88\n" + "ptrue p1.b\n" + "whilelt p0.s, XZR, x20\n" + "cmp x13, #0x10\n" + "mul x12, %x[nb], x12\n" + "blt 4f\n" + "1:" // Row loop + "add x11, %x[b_ptr], #0x10\n" + "mov x10, %x[nc]\n" + "add x9, %x[res_ptr], %x[res_stride], LSL #4\n" + "2:" // Column loop + "add x28, %x[a_ptr], #0x8\n" + "mov z24.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov x27, %x[nb]\n" + "add x26, x28, x12\n" + "mov z12.b, #0x0\n" + "mov z0.b, #0x0\n" + "add x25, x26, x12\n" + "mov z13.b, #0x0\n" + "mov z1.b, #0x0\n" + "add x24, x25, x12\n" + "mov z20.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z19.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z8.b, #0x0\n" + "mov z29.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z10.b, #0x0\n" + "3:" // Block loop + "ld1b { z30.b }, p1/Z, [x11]\n" + "ld1b { z21.b }, p1/Z, [x11, #1, MUL VL]\n" + "mov z18.s, #0x0\n" + "mov z7.s, #0x0\n" + "ld1rqb { z3.b }, p1/Z, [x28]\n" + "ld1rqb { z5.b }, p1/Z, [x28, #16]\n" + "mov z9.s, #0x0\n" + "mov z22.s, #0x0\n" + "ld1b { z4.b }, p1/Z, [x11, #2, MUL VL]\n" + "ld1b { z17.b }, p1/Z, [x11, #3, MUL VL]\n" + "sub x20, x11, #0x10\n" + "sub x23, x28, #0x8\n" + "lsl z31.b, z30.b, #0x4\n" + "lsl z6.b, z21.b, #0x4\n" + "ld1h { z23.s }, p1/Z, [x20]\n" + "sub x22, x26, #0x8\n" + "and z30.b, z30.b, #0xf0\n" + "and z21.b, z21.b, #0xf0\n" + "sub x21, x25, #0x8\n" + "sub x20, x24, #0x8\n" + "lsl z14.b, z4.b, #0x4\n" + "lsl z2.b, z17.b, #0x4\n" + "subs x27, x27, #0x1\n" + "add x11, x11, #0x90\n" + ".inst 0x451f9872 // smmla z18.s, z3.b, z31.b\n" + ".inst 0x45069867 // smmla z7.s, z3.b, z6.b\n" + "ld1rqb { z3.b }, p1/Z, [x28, #32]\n" + "and z4.b, z4.b, #0xf0\n" + ".inst 0x451f98a9 // smmla z9.s, z5.b, z31.b\n" + ".inst 0x450698b6 // smmla z22.s, z5.b, z6.b\n" + "ld1rqb { z5.b }, p1/Z, [x28, #48]\n" + "and z17.b, z17.b, #0xf0\n" + "fcvt z23.s, p1/m, z23.h\n" + ".inst 0x450e9872 // smmla z18.s, z3.b, z14.b\n" + ".inst 0x45029867 // smmla z7.s, z3.b, z2.b\n" + "ld1rqb { z3.b }, p1/Z, [x28, #64]\n" + ".inst 0x450e98a9 // smmla z9.s, z5.b, z14.b\n" + ".inst 0x450298b6 // smmla z22.s, z5.b, z2.b\n" + "ld1rqb { z5.b }, p1/Z, [x28, #80]\n" + "fscale z23.s, p1/m, z23.s, z28.s\n" + ".inst 0x451e9872 // smmla z18.s, z3.b, z30.b\n" + ".inst 0x45159867 // smmla z7.s, z3.b, z21.b\n" + "ld1rqb { z3.b }, p1/Z, [x28, #96]\n" + ".inst 0x451e98a9 // smmla z9.s, z5.b, z30.b\n" + ".inst 0x451598b6 // smmla z22.s, z5.b, z21.b\n" + "ld1rqb { z5.b }, p1/Z, [x28, #112]\n" + "add x28, x28, #0x88\n" + ".inst 0x45049872 // smmla z18.s, z3.b, z4.b\n" + ".inst 0x45119867 // smmla z7.s, z3.b, z17.b\n" + "ld1h { z3.s }, p0/Z, [x23]\n" + ".inst 0x450498a9 // smmla z9.s, z5.b, z4.b\n" + ".inst 0x451198b6 // smmla z22.s, z5.b, z17.b\n" + "fcvt z3.s, p1/m, z3.h\n" + "uzp1 z5.d, z18.d, z7.d\n" + "uzp2 z18.d, z18.d, z7.d\n" + "mov z3.q, z3.q[0]\n" + "uzp1 z7.d, z9.d, z22.d\n" + "uzp2 z22.d, z9.d, z22.d\n" + "fmul z9.s, z23.s, z3.s[0]\n" + "scvtf z5.s, p1/m, z5.s\n" + "scvtf z18.s, p1/m, z18.s\n" + "scvtf z7.s, p1/m, z7.s\n" + "scvtf z22.s, p1/m, z22.s\n" + "fmla z24.s, p1/M, z5.s, z9.s\n" + "ld1rqb { z5.b }, p1/Z, [x26]\n" + "fmul z9.s, z23.s, z3.s[1]\n" + "fmla z15.s, p1/M, z18.s, z9.s\n" + "ld1rqb { 
z18.b }, p1/Z, [x26, #16]\n" + "fmul z9.s, z23.s, z3.s[2]\n" + "fmul z3.s, z23.s, z3.s[3]\n" + "fmla z12.s, p1/M, z7.s, z9.s\n" + "mov z9.s, #0x0\n" + "ld1h { z7.s }, p0/Z, [x22]\n" + ".inst 0x451f98a9 // smmla z9.s, z5.b, z31.b\n" + "fmla z0.s, p1/M, z22.s, z3.s\n" + "mov z22.s, #0x0\n" + "ld1h { z3.s }, p0/Z, [x21]\n" + ".inst 0x450698b6 // smmla z22.s, z5.b, z6.b\n" + "ld1rqb { z5.b }, p1/Z, [x26, #32]\n" + "fcvt z7.s, p1/m, z7.h\n" + "fcvt z3.s, p1/m, z3.h\n" + ".inst 0x450e98a9 // smmla z9.s, z5.b, z14.b\n" + ".inst 0x450298b6 // smmla z22.s, z5.b, z2.b\n" + "ld1rqb { z5.b }, p1/Z, [x26, #64]\n" + "mov z7.q, z7.q[0]\n" + "mov z3.q, z3.q[0]\n" + ".inst 0x451e98a9 // smmla z9.s, z5.b, z30.b\n" + ".inst 0x451598b6 // smmla z22.s, z5.b, z21.b\n" + "ld1rqb { z5.b }, p1/Z, [x26, #96]\n" + ".inst 0x450498a9 // smmla z9.s, z5.b, z4.b\n" + ".inst 0x451198b6 // smmla z22.s, z5.b, z17.b\n" + "uzp1 z5.d, z9.d, z22.d\n" + "scvtf z5.s, p1/m, z5.s\n" + "uzp2 z22.d, z9.d, z22.d\n" + "fmul z9.s, z23.s, z7.s[0]\n" + "scvtf z22.s, p1/m, z22.s\n" + "fmla z13.s, p1/M, z5.s, z9.s\n" + "ld1rqb { z9.b }, p1/Z, [x25]\n" + "fmul z5.s, z23.s, z7.s[1]\n" + "fmla z1.s, p1/M, z22.s, z5.s\n" + "mov z5.s, #0x0\n" + "mov z22.s, #0x0\n" + ".inst 0x451f9a45 // smmla z5.s, z18.b, z31.b\n" + ".inst 0x45069a56 // smmla z22.s, z18.b, z6.b\n" + "ld1rqb { z18.b }, p1/Z, [x26, #48]\n" + ".inst 0x450e9a45 // smmla z5.s, z18.b, z14.b\n" + ".inst 0x45029a56 // smmla z22.s, z18.b, z2.b\n" + "ld1rqb { z18.b }, p1/Z, [x26, #80]\n" + ".inst 0x451e9a45 // smmla z5.s, z18.b, z30.b\n" + ".inst 0x45159a56 // smmla z22.s, z18.b, z21.b\n" + "ld1rqb { z18.b }, p1/Z, [x26, #112]\n" + "add x26, x26, #0x88\n" + ".inst 0x45049a45 // smmla z5.s, z18.b, z4.b\n" + ".inst 0x45119a56 // smmla z22.s, z18.b, z17.b\n" + "uzp1 z18.d, z5.d, z22.d\n" + "scvtf z18.s, p1/m, z18.s\n" + "uzp2 z22.d, z5.d, z22.d\n" + "fmul z5.s, z23.s, z7.s[2]\n" + "fmul z7.s, z23.s, z7.s[3]\n" + "scvtf z22.s, p1/m, z22.s\n" + "fmla z20.s, p1/M, z18.s, z5.s\n" + "ld1rqb { z18.b }, p1/Z, [x25, #16]\n" + "ld1h { z5.s }, p0/Z, [x20]\n" + "fcvt z5.s, p1/m, z5.h\n" + "fmla z25.s, p1/M, z22.s, z7.s\n" + "mov z22.s, #0x0\n" + "mov z7.s, #0x0\n" + ".inst 0x451f9936 // smmla z22.s, z9.b, z31.b\n" + ".inst 0x45069927 // smmla z7.s, z9.b, z6.b\n" + "ld1rqb { z9.b }, p1/Z, [x25, #32]\n" + "mov z5.q, z5.q[0]\n" + ".inst 0x450e9936 // smmla z22.s, z9.b, z14.b\n" + ".inst 0x45029927 // smmla z7.s, z9.b, z2.b\n" + "ld1rqb { z9.b }, p1/Z, [x25, #64]\n" + ".inst 0x451e9936 // smmla z22.s, z9.b, z30.b\n" + ".inst 0x45159927 // smmla z7.s, z9.b, z21.b\n" + "ld1rqb { z9.b }, p1/Z, [x25, #96]\n" + ".inst 0x45049936 // smmla z22.s, z9.b, z4.b\n" + ".inst 0x45119927 // smmla z7.s, z9.b, z17.b\n" + "uzp1 z9.d, z22.d, z7.d\n" + "scvtf z9.s, p1/m, z9.s\n" + "uzp2 z22.d, z22.d, z7.d\n" + "fmul z7.s, z23.s, z3.s[0]\n" + "scvtf z22.s, p1/m, z22.s\n" + "fmla z11.s, p1/M, z9.s, z7.s\n" + "ld1rqb { z9.b }, p1/Z, [x24]\n" + "fmul z7.s, z23.s, z3.s[1]\n" + "fmla z16.s, p1/M, z22.s, z7.s\n" + "mov z22.s, #0x0\n" + "mov z7.s, #0x0\n" + ".inst 0x451f9a56 // smmla z22.s, z18.b, z31.b\n" + ".inst 0x45069a47 // smmla z7.s, z18.b, z6.b\n" + "ld1rqb { z18.b }, p1/Z, [x25, #48]\n" + ".inst 0x450e9a56 // smmla z22.s, z18.b, z14.b\n" + ".inst 0x45029a47 // smmla z7.s, z18.b, z2.b\n" + "ld1rqb { z18.b }, p1/Z, [x25, #80]\n" + ".inst 0x451e9a56 // smmla z22.s, z18.b, z30.b\n" + ".inst 0x45159a47 // smmla z7.s, z18.b, z21.b\n" + "ld1rqb { z18.b }, p1/Z, [x25, #112]\n" + "add x25, x25, #0x88\n" + ".inst 0x45049a56 // smmla 
z22.s, z18.b, z4.b\n" + ".inst 0x45119a47 // smmla z7.s, z18.b, z17.b\n" + "uzp1 z18.d, z22.d, z7.d\n" + "scvtf z18.s, p1/m, z18.s\n" + "uzp2 z7.d, z22.d, z7.d\n" + "fmul z22.s, z23.s, z3.s[2]\n" + "fmul z3.s, z23.s, z3.s[3]\n" + "scvtf z7.s, p1/m, z7.s\n" + "fmla z19.s, p1/M, z18.s, z22.s\n" + "ld1rqb { z18.b }, p1/Z, [x24, #16]\n" + "fmul z22.s, z23.s, z5.s[0]\n" + "fmla z26.s, p1/M, z7.s, z3.s\n" + "mov z3.s, #0x0\n" + "mov z7.s, #0x0\n" + ".inst 0x451f9923 // smmla z3.s, z9.b, z31.b\n" + ".inst 0x45069927 // smmla z7.s, z9.b, z6.b\n" + "ld1rqb { z9.b }, p1/Z, [x24, #32]\n" + ".inst 0x450e9923 // smmla z3.s, z9.b, z14.b\n" + ".inst 0x45029927 // smmla z7.s, z9.b, z2.b\n" + "mov z9.s, #0x0\n" + ".inst 0x451f9a49 // smmla z9.s, z18.b, z31.b\n" + "mov z31.s, #0x0\n" + ".inst 0x45069a5f // smmla z31.s, z18.b, z6.b\n" + "ld1rqb { z6.b }, p1/Z, [x24, #48]\n" + "ld1rqb { z18.b }, p1/Z, [x24, #64]\n" + ".inst 0x450e98c9 // smmla z9.s, z6.b, z14.b\n" + "fmul z14.s, z23.s, z5.s[1]\n" + ".inst 0x450298df // smmla z31.s, z6.b, z2.b\n" + "ld1rqb { z6.b }, p1/Z, [x24, #80]\n" + "fmul z2.s, z23.s, z5.s[2]\n" + "fmul z23.s, z23.s, z5.s[3]\n" + ".inst 0x451e9a43 // smmla z3.s, z18.b, z30.b\n" + ".inst 0x45159a47 // smmla z7.s, z18.b, z21.b\n" + "ld1rqb { z5.b }, p1/Z, [x24, #96]\n" + ".inst 0x451e98c9 // smmla z9.s, z6.b, z30.b\n" + ".inst 0x451598df // smmla z31.s, z6.b, z21.b\n" + "ld1rqb { z18.b }, p1/Z, [x24, #112]\n" + "add x24, x24, #0x88\n" + ".inst 0x450498a3 // smmla z3.s, z5.b, z4.b\n" + ".inst 0x451198a7 // smmla z7.s, z5.b, z17.b\n" + ".inst 0x45049a49 // smmla z9.s, z18.b, z4.b\n" + ".inst 0x45119a5f // smmla z31.s, z18.b, z17.b\n" + "uzp1 z18.d, z3.d, z7.d\n" + "uzp2 z5.d, z3.d, z7.d\n" + "scvtf z18.s, p1/m, z18.s\n" + "uzp1 z6.d, z9.d, z31.d\n" + "uzp2 z9.d, z9.d, z31.d\n" + "scvtf z5.s, p1/m, z5.s\n" + "fmla z8.s, p1/M, z18.s, z22.s\n" + "scvtf z6.s, p1/m, z6.s\n" + "scvtf z9.s, p1/m, z9.s\n" + "fmla z29.s, p1/M, z5.s, z14.s\n" + "fmla z27.s, p1/M, z6.s, z2.s\n" + "fmla z10.s, p1/M, z9.s, z23.s\n" + "bgt 3b\n" + "mov x20, %x[res_ptr]\n" + "subs x10, x10, #0x8\n" + "add %x[res_ptr], %x[res_ptr], #0x20\n" + "st1w { z24.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z15.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z12.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z0.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z13.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z1.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z20.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z25.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z11.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z16.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z19.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z26.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z8.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z29.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z27.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z10.s }, p1, [x20]\n" + "bne 2b\n" + "mov x20, #0x4\n" + "sub x13, x13, #0x10\n" + "cmp x13, #0x10\n" + "mov %x[res_ptr], x9\n" + "madd %x[a_ptr], x20, x12, %x[a_ptr]\n" + "bge 1b\n" + "4:" // Row loop skip + "cbz x13, 9f\n" + "5:" // Row tail: Row loop + "add x25, %x[b_ptr], #0x10\n" + "mov x24, %x[nc]\n" + "add x23, %x[res_ptr], %x[res_stride], LSL #2\n" + "6:" // Row tail: Column loop + "mov 
z24.b, #0x0\n" + "mov z15.b, #0x0\n" + "add x28, %x[a_ptr], #0x8\n" + "mov x22, %x[nb]\n" + "mov z12.b, #0x0\n" + "mov z0.b, #0x0\n" + "7:" // Row tail: Block loop + "ld1b { z3.b }, p1/Z, [x25]\n" + "ld1b { z6.b }, p1/Z, [x25, #1, MUL VL]\n" + "mov z2.s, #0x0\n" + "mov z25.s, #0x0\n" + "ld1rqb { z26.b }, p1/Z, [x28]\n" + "ld1rqb { z21.b }, p1/Z, [x28, #16]\n" + "mov z27.s, #0x0\n" + "mov z19.s, #0x0\n" + "ld1b { z29.b }, p1/Z, [x25, #2, MUL VL]\n" + "ld1b { z16.b }, p1/Z, [x25, #3, MUL VL]\n" + "sub x21, x25, #0x10\n" + "sub x20, x28, #0x8\n" + "lsl z20.b, z3.b, #0x4\n" + "lsl z4.b, z6.b, #0x4\n" + "ld1rqb { z10.b }, p1/Z, [x28, #32]\n" + "ld1rqb { z23.b }, p1/Z, [x28, #48]\n" + "and z3.b, z3.b, #0xf0\n" + "and z6.b, z6.b, #0xf0\n" + "ld1rqb { z11.b }, p1/Z, [x28, #64]\n" + "ld1rqb { z7.b }, p1/Z, [x28, #80]\n" + "lsl z8.b, z29.b, #0x4\n" + "lsl z14.b, z16.b, #0x4\n" + "ld1rqb { z18.b }, p1/Z, [x28, #96]\n" + "ld1rqb { z30.b }, p1/Z, [x28, #112]\n" + ".inst 0x45149b42 // smmla z2.s, z26.b, z20.b\n" + ".inst 0x45049b59 // smmla z25.s, z26.b, z4.b\n" + "and z29.b, z29.b, #0xf0\n" + "ld1h { z17.s }, p1/Z, [x21]\n" + ".inst 0x45149abb // smmla z27.s, z21.b, z20.b\n" + ".inst 0x45049ab3 // smmla z19.s, z21.b, z4.b\n" + "and z16.b, z16.b, #0xf0\n" + "ld1h { z4.s }, p0/Z, [x20]\n" + "subs x22, x22, #0x1\n" + "add x28, x28, #0x88\n" + "fcvt z17.s, p1/m, z17.h\n" + "add x25, x25, #0x90\n" + ".inst 0x45089942 // smmla z2.s, z10.b, z8.b\n" + ".inst 0x450e9959 // smmla z25.s, z10.b, z14.b\n" + "fcvt z4.s, p1/m, z4.h\n" + ".inst 0x45089afb // smmla z27.s, z23.b, z8.b\n" + ".inst 0x450e9af3 // smmla z19.s, z23.b, z14.b\n" + "fscale z17.s, p1/m, z17.s, z28.s\n" + "mov z4.q, z4.q[0]\n" + ".inst 0x45039962 // smmla z2.s, z11.b, z3.b\n" + ".inst 0x45069979 // smmla z25.s, z11.b, z6.b\n" + "fmul z23.s, z17.s, z4.s[0]\n" + "fmul z9.s, z17.s, z4.s[1]\n" + "fmul z21.s, z17.s, z4.s[2]\n" + "fmul z4.s, z17.s, z4.s[3]\n" + ".inst 0x450398fb // smmla z27.s, z7.b, z3.b\n" + ".inst 0x450698f3 // smmla z19.s, z7.b, z6.b\n" + ".inst 0x451d9a42 // smmla z2.s, z18.b, z29.b\n" + ".inst 0x45109a59 // smmla z25.s, z18.b, z16.b\n" + ".inst 0x451d9bdb // smmla z27.s, z30.b, z29.b\n" + ".inst 0x45109bd3 // smmla z19.s, z30.b, z16.b\n" + "uzp1 z31.d, z2.d, z25.d\n" + "uzp2 z13.d, z2.d, z25.d\n" + "scvtf z31.s, p1/m, z31.s\n" + "uzp1 z17.d, z27.d, z19.d\n" + "uzp2 z18.d, z27.d, z19.d\n" + "scvtf z13.s, p1/m, z13.s\n" + "fmla z24.s, p1/M, z31.s, z23.s\n" + "scvtf z17.s, p1/m, z17.s\n" + "scvtf z18.s, p1/m, z18.s\n" + "fmla z15.s, p1/M, z13.s, z9.s\n" + "fmla z12.s, p1/M, z17.s, z21.s\n" + "fmla z0.s, p1/M, z18.s, z4.s\n" + "bgt 7b\n" + "mov x20, %x[res_ptr]\n" + "cmp x13, #0x1\n" + "st1w { z24.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x13, #0x2\n" + "st1w { z15.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x13, #0x3\n" + "st1w { z12.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "st1w { z0.s }, p1, [x20]\n" + "8:" // Row tail: Accumulator store skip + "subs x24, x24, #0x8\n" + "add %x[res_ptr], %x[res_ptr], #0x20\n" + "bne 6b\n" + "subs x13, x13, #0x4\n" + "add %x[a_ptr], %x[a_ptr], x12\n" + "mov %x[res_ptr], x23\n" + "bgt 5b\n" + "9:" // Row tail: Row loop skip + : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr) + : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc) + : "cc", "memory", "p0", "p1", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", 
"z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); + return; + } +#endif // #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) + +#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) + float sumf[4][8]; + int sumi; + + for (int y = 0; y < nr / 4; y++) { + const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb); + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; + } + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; + } + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); + } + } + } + } + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) + s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; + } + } + } +} + +void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 4; + const int blocklen = 4; + + assert (n % qk == 0); + assert (nr % 4 == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + +#if ! ((defined(_MSC_VER)) && ! 
defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) + const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl); + + for (int y = 0; y < nr / 4; y++) { + const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb); + + float32x4_t sumf[4]; + for (int m = 0; m < 4; m++) { + sumf[m] = vdupq_n_f32(0); + } + + for (int l = 0; l < nb; l++) { + float32x4_t a_d = vcvt_f32_f16(vld1_f16((const float16_t *)a_ptr[l].d)); + float32x4_t b_d = vcvt_f32_f16(vld1_f16((const float16_t *)b_ptr[l].d)); + + int32x4_t sumi_0 = vdupq_n_s32(0); + int32x4_t sumi_1 = vdupq_n_s32(0); + int32x4_t sumi_2 = vdupq_n_s32(0); + int32x4_t sumi_3 = vdupq_n_s32(0); + + for (int k = 0; k < 4; k++) { + int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 16 * k + 0); + int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16 * k + 64); + + uint8x16_t b = vld1q_u8(b_ptr[l].qs + 16 * k); + int8x16_t b_hi = vqtbl1q_s8(kvalues, b >> 4); + int8x16_t b_lo = vqtbl1q_s8(kvalues, b & 0xF); + + sumi_0 = vdotq_laneq_s32(sumi_0, b_lo, a_0, 0); + sumi_1 = vdotq_laneq_s32(sumi_1, b_lo, a_0, 1); + sumi_2 = vdotq_laneq_s32(sumi_2, b_lo, a_0, 2); + sumi_3 = vdotq_laneq_s32(sumi_3, b_lo, a_0, 3); + sumi_0 = vdotq_laneq_s32(sumi_0, b_hi, a_1, 0); + sumi_1 = vdotq_laneq_s32(sumi_1, b_hi, a_1, 1); + sumi_2 = vdotq_laneq_s32(sumi_2, b_hi, a_1, 2); + sumi_3 = vdotq_laneq_s32(sumi_3, b_hi, a_1, 3); + } + + sumf[0] = vmlaq_f32(sumf[0], vmulq_laneq_f32(b_d, a_d, 0), vcvtq_f32_s32(sumi_0)); + sumf[1] = vmlaq_f32(sumf[1], vmulq_laneq_f32(b_d, a_d, 1), vcvtq_f32_s32(sumi_1)); + sumf[2] = vmlaq_f32(sumf[2], vmulq_laneq_f32(b_d, a_d, 2), vcvtq_f32_s32(sumi_2)); + sumf[3] = vmlaq_f32(sumf[3], vmulq_laneq_f32(b_d, a_d, 3), vcvtq_f32_s32(sumi_3)); + } + + for (int m = 0; m < 4; m++) { + vst1q_f32(s + (y * 4 + m) * bs + x * 4, sumf[m]); + } + } + } + return; +#endif // #if ! ((defined(_MSC_VER)) && ! 
defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    {
+        float sumf[4][4];
+        int sumi;
+
+        for (int y = 0; y < nr / 4; y++) {
+            const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+            for (int x = 0; x < nc / ncols_interleaved; x++) {
+                const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
+                for (int m = 0; m < 4; m++) {
+                    for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
+                }
+                for (int l = 0; l < nb; l++) {
+                    for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                        for (int m = 0; m < 4; m++) {
+                            for (int j = 0; j < ncols_interleaved; j++) {
+                                sumi = 0;
+                                for (int i = 0; i < blocklen; ++i) {
+                                    const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
+                                    const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
+                                    sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
+                                             (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
+                                }
+                                sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
+                            }
+                        }
+                    }
+                }
+                for (int m = 0; m < 4; m++) {
+                    for (int j = 0; j < ncols_interleaved; j++)
+                        s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
+                }
+            }
+        }
+    }
+}
diff --git a/ggml/src/ggml-cpu/arch/loongarch/quants.c b/ggml/src/ggml-cpu/arch/loongarch/quants.c
new file mode 100644
index 0000000000000..9e33fb3228633
--- /dev/null
+++ b/ggml/src/ggml-cpu/arch/loongarch/quants.c
@@ -0,0 +1,2639 @@
+#define GGML_COMMON_IMPL_C
+#include "ggml-common.h"
+#include "ggml-quants.h"
+#include "ggml-impl.h"
+#include "ggml-cpu.h"
+#include "simd-mappings.h"
+
+#include "../../quants.h"
+#include "../../ggml-cpu-impl.h"
+
+#include <math.h>
+#include <string.h>
+#include <assert.h>
+#include <float.h>
+#include <stdlib.h> // for qsort
+#include <stdio.h>  // for GGML_ASSERT
+
+#define GROUP_MAX_EPS 1e-15f
+#define GROUP_MAX_EPS_IQ3_XXS 1e-8f
+#define GROUP_MAX_EPS_IQ2_S 1e-8f
+#define GROUP_MAX_EPS_IQ1_M 1e-7f
+#define GROUP_MAX_EPS_IQ1_S 1e-12f
+
+#define UNUSED GGML_UNUSED
+
+#if defined(__loongarch_sx)
+
+static __m128i lsx_packs_w(__m128i a, __m128i b) {
+    __m128i tmp, tmp1;
+    tmp = __lsx_vsat_w(a, 15);
+    tmp1 = __lsx_vsat_w(b, 15);
+    return __lsx_vpickev_h(tmp1, tmp);
+}
+
+static __m128i lsx_packs_h(__m128i a, __m128i b) {
+    __m128i tmp, tmp1;
+    tmp = __lsx_vsat_h(a, 7);
+    tmp1 = __lsx_vsat_h(b, 7);
+    return __lsx_vpickev_b(tmp1, tmp);
+}
+
+static __m128i lsx_packus_h(__m128i a, __m128i b) {
+    __m128i tmp, tmp1;
+    tmp = __lsx_vsat_hu(a, 7);
+    tmp1 = __lsx_vsat_hu(b, 7);
+    return __lsx_vpickev_b(tmp1, tmp);
+}
+
+static __m128i lsx_maddubs_h(__m128i a, __m128i b) {
+    __m128i tmp1, tmp2;
+    tmp1 = __lsx_vmulwev_h_b(a, b);
+    tmp2 = __lsx_vmulwod_h_b(a, b);
+    return __lsx_vsadd_h(tmp1, tmp2);
+}
+
+static __m128i lsx_madd_h(__m128i a, __m128i b) {
+    __m128i tmp1, tmp2;
+    tmp1 = __lsx_vmulwev_w_h(a, b);
+    tmp2 = __lsx_vmulwod_w_h(a, b);
+    return __lsx_vadd_w(tmp1, tmp2);
+}
+
+static __m128i lsx_set_w(int32_t a, int32_t b, int32_t c, int32_t d) {
+    v4i32 __ret = {d, c, b, a};
+    return (__m128i)__ret;
+}
+
+static __m128i lsx_shuffle_b(__m128i a, __m128i b) {
+    __m128i mask_f, zero, tmp0, tmp2, mask;
+    int f = 0x8f;
+    mask_f = __lsx_vreplgr2vr_b(f);
+    zero = __lsx_vldi(0);
+    tmp0 = __lsx_vand_v(b, mask_f); // get mask with low 4 bit and sign bits
+    tmp0 = __lsx_vori_b(tmp0, 0x10); // make each mask or with 0x10 prepare for positive
+    mask = __lsx_vsle_b(zero, tmp0); // if mask >= 0, set mask
+    tmp2 = __lsx_vand_v(tmp0, mask); // maskout the in2 < ones
+    return
__lsx_vshuf_b(a, zero, tmp2); +} + +static __m128i lsx_hadd_h(__m128i a, __m128i b) { + __m128i tmp1 = __lsx_vpickev_h(b, a); + __m128i tmp2 = __lsx_vpickod_h(b, a); + return __lsx_vadd_h(tmp1, tmp2); +} + +static __m128i lsx_hadd_w(__m128i a, __m128i b) { + __m128i tmp1 = __lsx_vpickev_w(b, a); + __m128i tmp2 = __lsx_vpickod_w(b, a); + return __lsx_vadd_w(tmp1, tmp2); +} + +static __m128 lsx_hadd_s(__m128 a, __m128 b) { + __m128 tmp1 = (__m128)__lsx_vpickev_w((__m128i)b, (__m128i)a); + __m128 tmp2 = (__m128)__lsx_vpickod_w((__m128i)b, (__m128i)a); + + return __lsx_vfadd_s(tmp1, tmp2); +} + +static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) { + __m128 res_0 =lsx_hadd_s(a, b); + __m128 res_1 =lsx_hadd_s(c, d); + __m128 res =lsx_hadd_s(res_0, res_1); + res =lsx_hadd_s(res, res); + res =lsx_hadd_s(res, res); + + return ((v4f32)res)[0]; +} +#endif + +#if defined(__loongarch_asx) + +#ifdef __clang__ +#define VREGS_PREFIX "$vr" +#define XREGS_PREFIX "$xr" +#else // GCC +#define VREGS_PREFIX "$f" +#define XREGS_PREFIX "$f" +#endif +#define __ALL_REGS "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31" +// Convert __m128i to __m256i +static inline __m256i ____m256i(__m128i in) { + __m256i out = __lasx_xvldi(0); + __asm__ volatile ( + ".irp i," __ALL_REGS "\n\t" + " .ifc %[out], " XREGS_PREFIX"\\i \n\t" + " .irp j," __ALL_REGS "\n\t" + " .ifc %[in], " VREGS_PREFIX "\\j \n\t" + " xvpermi.q $xr\\i, $xr\\j, 0x20 \n\t" + " .endif \n\t" + " .endr \n\t" + " .endif \n\t" + ".endr \n\t" + : [out] "+f" (out) : [in] "f" (in) + ); + return out; +} +// Convert two __m128i to __m256i +static inline __m256i lasx_set_q(__m128i inhi, __m128i inlo) { + __m256i out; + __asm__ volatile ( + ".irp i," __ALL_REGS "\n\t" + " .ifc %[hi], " VREGS_PREFIX "\\i \n\t" + " .irp j," __ALL_REGS "\n\t" + " .ifc %[lo], " VREGS_PREFIX "\\j \n\t" + " xvpermi.q $xr\\i, $xr\\j, 0x20 \n\t" + " .endif \n\t" + " .endr \n\t" + " .endif \n\t" + ".endr \n\t" + ".ifnc %[out], %[hi] \n\t" + ".irp i," __ALL_REGS "\n\t" + " .ifc %[out], " XREGS_PREFIX "\\i \n\t" + " .irp j," __ALL_REGS "\n\t" + " .ifc %[hi], " VREGS_PREFIX "\\j \n\t" + " xvori.b $xr\\i, $xr\\j, 0 \n\t" + " .endif \n\t" + " .endr \n\t" + " .endif \n\t" + ".endr \n\t" + ".endif \n\t" + : [out] "=f" (out), [hi] "+f" (inhi) + : [lo] "f" (inlo) + ); + return out; +} +// Convert __m256i low part to __m128i +static inline __m128i lasx_extracti128_lo(__m256i in) { + __m128i out; + __asm__ volatile ( + ".ifnc %[out], %[in] \n\t" + ".irp i," __ALL_REGS "\n\t" + " .ifc %[out], " VREGS_PREFIX "\\i \n\t" + " .irp j," __ALL_REGS "\n\t" + " .ifc %[in], " XREGS_PREFIX "\\j \n\t" + " vori.b $vr\\i, $vr\\j, 0 \n\t" + " .endif \n\t" + " .endr \n\t" + " .endif \n\t" + ".endr \n\t" + ".endif \n\t" + : [out] "=f" (out) : [in] "f" (in) + ); + return out; +} +// Convert __m256i high part to __m128i +static inline __m128i lasx_extracti128_hi(__m256i in) { + __m128i out; + __asm__ volatile ( + ".irp i," __ALL_REGS "\n\t" + " .ifc %[out], " VREGS_PREFIX "\\i \n\t" + " .irp j," __ALL_REGS "\n\t" + " .ifc %[in], " XREGS_PREFIX "\\j \n\t" + " xvpermi.q $xr\\i, $xr\\j, 0x11 \n\t" + " .endif \n\t" + " .endr \n\t" + " .endif \n\t" + ".endr \n\t" + : [out] "=f" (out) : [in] "f" (in) + ); + return out; +} + +static __m256i lasx_set_w(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) { + v8i32 __ret = {e0, e1, e2, e3, e4, e5, e6, e7}; + return (__m256i)__ret; +} + +static __m256i lasx_set_d(int64_t a, int64_t b, 
int64_t c, int64_t d) { + v4i64 __ret = {d, c, b, a}; + return (__m256i)__ret; +} + +static __m256i lasx_insertf128( __m128i x, __m128i y) { + return lasx_set_q(x, y); +} + +static __m256i lasx_shuffle_b(__m256i a, __m256i b) { + __m256i mask_f, zero, tmp0, tmp2, mask; + int f = 0x8f; + mask_f = __lasx_xvreplgr2vr_b(f); + zero = __lasx_xvldi(0); + tmp0 = __lasx_xvand_v(b, mask_f); // get mask with low 4 bit and sign bits + tmp0 = __lasx_xvori_b(tmp0, 0x10); // make each mask or with 0x10 prepare for positive + mask = __lasx_xvsle_b(zero, tmp0); // if mask >= 0, set mask + tmp2 = __lasx_xvand_v(tmp0, mask); // maskout the in2 < ones + return __lasx_xvshuf_b(a, zero, tmp2); +} + +static __m256i lasx_extu8_16(__m128i a) { + return __lasx_vext2xv_hu_bu(____m256i(a)); +} + +static __m256i lasx_ext8_16(__m128i a) { + return __lasx_vext2xv_h_b(____m256i(a)); +} + +static __m256i lasx_ext16_32(__m128i a) { + return __lasx_vext2xv_w_h(____m256i(a)); +} + +static __m128i lasx_extracti128( __m256i a, int pos) { + __m128i ret; + if( pos == 0) + { + ret = lasx_extracti128_lo(a); + } else { + ret = lasx_extracti128_hi(a); + } + return ret; +} + +static __m128 lasx_extractf128( __m256 a, int pos) { + __m128 ret; + if( pos == 0) + { + ret = (__m128)lasx_extracti128_lo((__m256i)a); + } else { + ret = (__m128)lasx_extracti128_hi((__m256i)a); + } + return ret; +} + +static __m256i lasx_maddubs_h(__m256i a, __m256i b) { + __m256i tmp1, tmp2; + tmp1 = __lasx_xvmulwev_h_b(a, b); + tmp2 = __lasx_xvmulwod_h_b(a, b); + return __lasx_xvsadd_h(tmp1, tmp2); +} + +static __m256i lasx_madd_h(__m256i a, __m256i b) { + __m256i tmp1, tmp2; + tmp1 = __lasx_xvmulwev_w_h(a, b); + tmp2 = __lasx_xvmulwod_w_h(a, b); + return __lasx_xvadd_w(tmp1, tmp2); +} + +static __m256i lasx_packs_w(__m256i a, __m256i b) { + __m256i tmp, tmp1; + tmp = __lasx_xvsat_w(a, 15); + tmp1 = __lasx_xvsat_w(b, 15); + return __lasx_xvpickev_h(tmp1, tmp); +} + +static __m256i lasx_packs_h(__m256i a, __m256i b) { + __m256i tmp, tmp1; + tmp = __lasx_xvsat_h(a, 7); + tmp1 = __lasx_xvsat_h(b, 7); + return __lasx_xvpickev_b(tmp1, tmp); +} + +static inline __m256i lasx_madd_h_b(__m256i a, __m256i b) { + __m256i tmp1, tmp2; + tmp1 = __lasx_xvmulwev_h_b(a, b); + tmp2 = __lasx_xvmulwod_h_b(a, b); + return __lasx_xvadd_h(tmp1, tmp2); +} + +static inline __m256i lasx_xvrepl128vei_h(__m256i a, const unsigned int b) { + switch (b) { + case 0: return __lasx_xvrepl128vei_h(a, 0); + case 1: return __lasx_xvrepl128vei_h(a, 1); + case 2: return __lasx_xvrepl128vei_h(a, 2); + case 3: return __lasx_xvrepl128vei_h(a, 3); + case 4: return __lasx_xvrepl128vei_h(a, 4); + case 5: return __lasx_xvrepl128vei_h(a, 5); + case 6: return __lasx_xvrepl128vei_h(a, 6); + case 7: return __lasx_xvrepl128vei_h(a, 7); + default: __builtin_unreachable(); + } +} + +static inline __m256i lasx_xvandi_b_bit(__m256i a, const unsigned int b) { + switch (b) { + case 0: return __lasx_xvandi_b(a, 1 << 0); + case 1: return __lasx_xvandi_b(a, 1 << 1); + case 2: return __lasx_xvandi_b(a, 1 << 2); + case 3: return __lasx_xvandi_b(a, 1 << 3); + case 4: return __lasx_xvandi_b(a, 1 << 4); + case 5: return __lasx_xvandi_b(a, 1 << 5); + case 6: return __lasx_xvandi_b(a, 1 << 6); + case 7: return __lasx_xvandi_b(a, 1 << 7); + default: __builtin_unreachable(); + } +} + +// multiply int8_t, add results pairwise twice +static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) { + // Get absolute values of x vectors + const __m128i ax = __lsx_vsigncov_b(x, x); + // Sign the values of the y vectors + 
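+    // (illustrative note) elementwise x*y == |x| * copysign(y, x), so feeding
+    // |x| and the sign-adjusted y into the maddubs-style helper mirrors the
+    // x86 _mm_maddubs_epi16 idiom, which treats its first operand as unsigned.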
const __m128i sy = __lsx_vsigncov_b(x, y); + // Perform multiplication and create 16-bit values + const __m128i dot = lsx_maddubs_h(ax, sy); + const __m128i ones = __lsx_vreplgr2vr_h(1); + return lsx_madd_h(ones, dot); +} + +// horizontally add 8 floats +static inline float hsum_float_8(const __m256 x) { + __m128 res = lasx_extractf128(x, 1); + res = __lsx_vfadd_s(res, lasx_extractf128(x, 0)); + res = __lsx_vfadd_s(res, (__m128)__lsx_vpickod_d((__m128i)res, (__m128i)res)); + res = __lsx_vfadd_s(res, (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w(res, 1), 0)); + return ((v4f32)res)[0]; +} + +// horizontally add 8 int32_t +static inline int hsum_i32_8(const __m256i a) { + + __m256i tmp1 = __lasx_xvpermi_q(a, a, 0x11); + __m256i tmp2 = __lasx_xvpermi_q(a, a, 0x00); + + __m128i tmp1_128 = lasx_extracti128_lo(tmp1); + __m128i tmp2_128 = lasx_extracti128_lo(tmp2); + + __m128i sum128 = __lsx_vadd_w(tmp1_128, tmp2_128); + + __m128i ev = __lsx_vpickev_w(sum128, sum128); + __m128i od = __lsx_vpickod_w(sum128, sum128); + __m128i sum64 = __lsx_vadd_w(ev, od); + + int sum64_1, sum64_2; + sum64_1 = __lsx_vpickve2gr_w(sum64, 0); + sum64_2 = __lsx_vpickve2gr_w(sum64, 1); + + return sum64_1 + sum64_2; +} + +// horizontally add 4 int32_t +static inline int hsum_i32_4(const __m128i a) { + __m128i ev = __lsx_vpickev_w(a, a); + __m128i od = __lsx_vpickod_w(a, a); + __m128i sum64 = __lsx_vadd_w(ev, od); + + int sum64_1, sum64_2; + sum64_1 = __lsx_vpickve2gr_w(sum64, 0); + sum64_2 = __lsx_vpickve2gr_w(sum64, 1); + + return sum64_1 + sum64_2; +} + +// spread 32 bits to 32 bytes { 0x00, 0xFF } +static inline __m256i bytes_from_bits_32(const uint8_t * x) { + + uint32_t x32; + memcpy(&x32, x, sizeof(uint32_t)); + const __m256i shuf_mask = lasx_set_d( + 0x0303030303030303, 0x0202020202020202, + 0x0101010101010101, 0x0000000000000000); + + __m256i bytes = lasx_shuffle_b(__lasx_xvreplgr2vr_w(x32), shuf_mask); + const __m256i bit_mask = __lasx_xvreplgr2vr_d(0x7fbfdfeff7fbfdfe); + bytes = __lasx_xvor_v(bytes, bit_mask); + return __lasx_xvseq_b(bytes, __lasx_xvreplgr2vr_d(-1)); +} + +// Unpack 32 4-bit fields into 32 bytes +// The output vector contains 32 bytes, each one in [ 0 .. 
15 ] interval +static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) { + const __m128i lo = __lsx_vld((const __m128i *)rsi, 0); + __m128i hi = __lsx_vsrli_h(lo, 4); + return __lasx_xvandi_b(lasx_insertf128(hi, lo), 0xf); +} + +// add int16_t pairwise and return as float vector +static inline __m256 sum_i16_pairs_float(const __m256i x) { + __m256i v = __lasx_xvpackod_h(x, x); + __m256i summed_pairs = __lasx_xvaddwev_w_h(x, v); + return __lasx_xvffint_s_w(summed_pairs); +} + +static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { + // Perform multiplication and create 16-bit values + const __m256i dot = lasx_maddubs_h(ax, sy); + return sum_i16_pairs_float(dot); +} + +// multiply int8_t, add results pairwise twice and return as float vector +static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { + const __m256i dot = lasx_madd_h_b(x, y); + return sum_i16_pairs_float(dot); +} + +static inline __m128i packNibbles( __m256i bytes ) { + // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh + const __m256i lowByte = __lasx_xvreplgr2vr_h(0xFF); + __m256i high = __lasx_xvandn_v(lowByte, bytes); + __m256i low = __lasx_xvand_v(lowByte, bytes); + high = __lasx_xvsrli_h(high, 4); + bytes = __lasx_xvor_v(low, high); + // Compress uint16_t lanes into bytes + __m128i *r0 = (__m128i *)&bytes; + __m256i tmp_h128 = __lasx_xvpermi_q(bytes, bytes, 0x11); + __m128i *r1 = (__m128i *)&tmp_h128; + + __m128i zero = __lsx_vldi(0); + __m128i tmp, tmp2, tmp3; + + tmp = __lsx_vmax_h(zero, *r0); + tmp2 = __lsx_vsat_hu(tmp, 7); + + tmp = __lsx_vmax_h(zero, *r1); + tmp3 = __lsx_vsat_hu(tmp, 7); + return __lsx_vpickev_b(tmp3, tmp2); +} +#endif //__loongarch_asx + +void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(QK8_0 == 32); + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; + + block_q8_0 * GGML_RESTRICT y = vy; + +#if defined(__loongarch_asx) + for (int i = 0; i < nb; i++) { + __m256 v0 = (__m256)__lasx_xvld( x , 0); + __m256 v1 = (__m256)__lasx_xvld( x , 32); + __m256 v2 = (__m256)__lasx_xvld( x , 64); + __m256 v3 = (__m256)__lasx_xvld( x , 96); + x += 32; + + // Compute max(abs(e)) for the block + const __m256 sign_bit = __lasx_xvreplfr2vr_s( -0.0f ); + __m256 max_abs = (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v0 ); + max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v1 ) ); + max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v2 ) ); + max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v3 ) ); + + __m128 max4 = __lsx_vfmax_s( lasx_extractf128( max_abs, 1 ), lasx_extractf128( max_abs , 0) ); + max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) ); + __m128 tmp = max4; + max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vinsgr2vr_w(tmp, __lsx_vpickve2gr_w( max4, 1 ), 0 )); + const float max_scalar = ((v4f32)max4)[0]; + + // Quantize these floats + const float d = max_scalar / 127.f; + y[i].d = GGML_CPU_FP32_TO_FP16(d); + const float id = ( max_scalar != 0.0f ) ? 
127.f / max_scalar : 0.0f; + const __m256 mul = (__m256)__lasx_xvreplfr2vr_s( id ); + + // Apply the multiplier + v0 = __lasx_xvfmul_s( v0, mul ); + v1 = __lasx_xvfmul_s( v1, mul ); + v2 = __lasx_xvfmul_s( v2, mul ); + v3 = __lasx_xvfmul_s( v3, mul ); + + // Round to nearest integer + __m256i i0 = __lasx_xvftintrne_w_s( v0 ); + __m256i i1 = __lasx_xvftintrne_w_s( v1 ); + __m256i i2 = __lasx_xvftintrne_w_s( v2 ); + __m256i i3 = __lasx_xvftintrne_w_s( v3 ); + + __m128i ni0 = lasx_extracti128( i0, 0 ); + __m128i ni1 = lasx_extracti128( i0, 1); + __m128i ni2 = lasx_extracti128( i1, 0); + __m128i ni3 = lasx_extracti128( i1, 1); + __m128i ni4 = lasx_extracti128( i2, 0); + __m128i ni5 = lasx_extracti128( i2, 1); + __m128i ni6 = lasx_extracti128( i3, 0); + __m128i ni7 = lasx_extracti128( i3, 1); + + // Convert int32 to int16 + ni0 = lsx_packs_w( ni0, ni1 ); + ni2 = lsx_packs_w( ni2, ni3 ); + ni4 = lsx_packs_w( ni4, ni5 ); + ni6 = lsx_packs_w( ni6, ni7 ); + // Convert int16 to int8 + ni0 = lsx_packs_h( ni0, ni2 ); + ni4 = lsx_packs_h( ni4, ni6 ); + + __lsx_vst(ni0, (__m128i *)(y[i].qs + 0), 0); + __lsx_vst(ni4, (__m128i *)(y[i].qs + 16), 0); + + } +#else + GGML_UNUSED(nb); + // scalar + quantize_row_q8_0_ref(x, y, k); +#endif +} + +void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(k % QK8_1 == 0); + const int nb = k / QK8_1; + + block_q8_1 * GGML_RESTRICT y = vy; + +#if defined(__loongarch_asx) + for (int i = 0; i < nb; i++) { + __m256 v0 = (__m256)__lasx_xvld( x , 0 ); + __m256 v1 = (__m256)__lasx_xvld( x , 32 ); + __m256 v2 = (__m256)__lasx_xvld( x , 64 ); + __m256 v3 = (__m256)__lasx_xvld( x , 96 ); + x += 32; + + // Compute max(abs(e)) for the block + const __m256 sign_bit = __lasx_xvreplfr2vr_s( -0.0f ); + __m256 max_abs = (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v0 ); + max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v1 ) ); + max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v2 ) ); + max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v3 ) ); + + __m128 max4 = __lsx_vfmax_s( lasx_extractf128( max_abs, 1 ), lasx_extractf128( max_abs, 0) ); + max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) ); + __m128 tmp = max4; + max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vextrins_w((__m128i)tmp, (__m128i)max4, 0x10 )); + const float max_scalar = ((v4f32)max4)[0]; + + // Quantize these floats + const float d = max_scalar / 127.f; + y[i].d = GGML_CPU_FP32_TO_FP16(d); + const float id = ( max_scalar != 0.0f ) ? 
127.f / max_scalar : 0.0f; + const __m256 mul = __lasx_xvreplfr2vr_s( id ); + + // Apply the multiplier + v0 = __lasx_xvfmul_s( v0, mul ); + v1 = __lasx_xvfmul_s( v1, mul ); + v2 = __lasx_xvfmul_s( v2, mul ); + v3 = __lasx_xvfmul_s( v3, mul ); + + // Round to nearest integer + __m256i i0 = __lasx_xvftintrne_w_s( v0 ); + __m256i i1 = __lasx_xvftintrne_w_s( v1 ); + __m256i i2 = __lasx_xvftintrne_w_s( v2 ); + __m256i i3 = __lasx_xvftintrne_w_s( v3 ); + + __m128i ni0 = lasx_extracti128(i0, 0); + __m128i ni1 = lasx_extracti128( i0, 1); + __m128i ni2 = lasx_extracti128( i1, 0); + __m128i ni3 = lasx_extracti128( i1, 1); + __m128i ni4 = lasx_extracti128( i2, 0 ); + __m128i ni5 = lasx_extracti128( i2, 1); + __m128i ni6 = lasx_extracti128( i3, 0); + __m128i ni7 = lasx_extracti128( i3, 1); + + // Compute the sum of the quants and set y[i].s + const __m128i s0 = __lsx_vadd_w(__lsx_vadd_w(ni0, ni1), __lsx_vadd_w(ni2, ni3)); + const __m128i s1 = __lsx_vadd_w(__lsx_vadd_w(ni4, ni5), __lsx_vadd_w(ni6, ni7)); + y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_4(__lsx_vadd_w(s0, s1))); + + // Convert int32 to int16 + ni0 = lsx_packs_w( ni0, ni1 ); + ni2 = lsx_packs_w( ni2, ni3 ); + ni4 = lsx_packs_w( ni4, ni5 ); + ni6 = lsx_packs_w( ni6, ni7 ); + // Convert int16 to int8 + ni0 = lsx_packs_h( ni0, ni2 ); + ni4 = lsx_packs_h( ni4, ni6 ); + + __lsx_vst(ni0, (__m128i *)(y[i].qs + 0), 0); + __lsx_vst(ni4, (__m128i *)(y[i].qs + 16), 0); + } +#else + GGML_UNUSED(nb); + // scalar + quantize_row_q8_1_ref(x, y, k); +#endif +} + + +//===================================== Dot products ================================= + +// +// Helper functions +// + +#if defined(__loongarch_asx) +// shuffles to pick the required scales in dot products +static inline __m256i get_scale_shuffle_q3k(int i) { + static const uint8_t k_shuffle[128] = { + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, + 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, + 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, + 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, + }; + return __lasx_xvld((const __m256i*)k_shuffle + i, 0); +} +static inline __m256i get_scale_shuffle_k4(int i) { + static const uint8_t k_shuffle[256] = { + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, + 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, + 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, + 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, + 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, + 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, + 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, + 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15 + }; + return __lasx_xvld((const __m256i*)k_shuffle + i, 0); +} +static inline __m128i get_scale_shuffle(int i) { + static const uint8_t k_shuffle[128] = { + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, + 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 
9, 9, 9, 9, 9, 9, + 10,10,10,10,10,10,10,10, 11,11,11,11,11,11,11,11, + 12,12,12,12,12,12,12,12, 13,13,13,13,13,13,13,13, + 14,14,14,14,14,14,14,14, 15,15,15,15,15,15,15,15 + }; + return __lsx_vld((const __m128i*)k_shuffle + i, 0); +} +#endif + +void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0; + +#if defined(__loongarch_asx) + // Initialize accumulator with zeros + __m256 acc = (__m256)__lasx_xvldi(0); + + // Main loop + for (; ib < nb; ++ib) { + /* Compute combined scale for the block */ + const __m256 d = __lasx_xvreplfr2vr_s( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) ); + + __m256i qx = bytes_from_nibbles_32(x[ib].qs); + + // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. + const __m256i off = __lasx_xvreplgr2vr_b( 8 ); + qx = __lasx_xvsub_b( qx, off ); + + __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0); + + const __m256 q = mul_sum_i8_pairs_float(qx, qy); + + /* Multiply q with scale and accumulate */ + acc = __lasx_xvfmadd_s( d, q, acc ); + } + + sumf = hsum_float_8(acc); + +#elif defined(__loongarch_sx) + // set constants + const __m128i low_mask = __lsx_vreplgr2vr_b(0xF); + const __m128i off = __lsx_vreplgr2vr_b(8); + + // Initialize accumulator with zeros + __m128 acc_0 = (__m128)__lsx_vldi(0); + __m128 acc_1 = (__m128)__lsx_vldi(0); + __m128 acc_2 = (__m128)__lsx_vldi(0); + __m128 acc_3 = (__m128)__lsx_vldi(0); + + for (; ib + 1 < nb; ib += 2) { + + // Compute combined scale for the block 0 and 1 + const __m128 d_0_1 = (__m128)__lsx_vreplgr2vr_w( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) ); + + const __m128i tmp_0_1 = __lsx_vld((const __m128i *)x[ib].qs, 0); + + __m128i bx_0 = __lsx_vand_v(low_mask, tmp_0_1); + __m128i by_0 = __lsx_vld((const __m128i *)y[ib].qs, 0); + bx_0 = __lsx_vsub_b(bx_0, off); + const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); + + __m128i bx_1 = __lsx_vand_v(low_mask, __lsx_vsrli_d(tmp_0_1, 4)); + __m128i by_1 = __lsx_vld((const __m128i *)(y[ib].qs + 16), 0); + bx_1 = __lsx_vsub_b(bx_1, off); + const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1); + + //_mm_prefetch(&x[ib] + 2 * sizeof(block_q4_0), _MM_HINT_T0); + //_mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0); + + // Compute combined scale for the block 2 and 3 + const __m128 d_2_3 = (__m128)__lsx_vreplgr2vr_w( GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) ); + + const __m128i tmp_2_3 = __lsx_vld((const __m128i *)x[ib + 1].qs, 0); + + __m128i bx_2 = __lsx_vand_v(low_mask, tmp_2_3); + __m128i by_2 = __lsx_vld((const __m128i *)y[ib + 1].qs, 0); + bx_2 = __lsx_vsub_b(bx_2, off); + const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2); + + __m128i bx_3 = __lsx_vand_v(low_mask, __lsx_vsrli_d(tmp_2_3, 4)); + __m128i by_3 = __lsx_vld((const __m128i *)(y[ib + 1].qs + 16), 0); + bx_3 = __lsx_vsub_b(bx_3, off); + const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3); + + // Convert int32_t to float + __m128 p0 = __lsx_vffint_s_w(i32_0); + __m128 p1 = __lsx_vffint_s_w(i32_1); + __m128 p2 = __lsx_vffint_s_w(i32_2); + __m128 p3 = __lsx_vffint_s_w(i32_3); + + // Apply the scale + __m128 p0_d = 
__lsx_vfmul_s( d_0_1, p0 );
+        __m128 p1_d = __lsx_vfmul_s( d_0_1, p1 );
+        __m128 p2_d = __lsx_vfmul_s( d_2_3, p2 );
+        __m128 p3_d = __lsx_vfmul_s( d_2_3, p3 );
+
+        // Accumulate
+        acc_0 = __lsx_vfadd_s(p0_d, acc_0);
+        acc_1 = __lsx_vfadd_s(p1_d, acc_1);
+        acc_2 = __lsx_vfadd_s(p2_d, acc_2);
+        acc_3 = __lsx_vfadd_s(p3_d, acc_3);
+    }
+
+    sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
+
+#endif
+    for (; ib < nb; ++ib) {
+        int sumi0 = 0;
+        int sumi1 = 0;
+
+        for (int j = 0; j < qk/2; ++j) {
+            const int v0 = (x[ib].qs[j] & 0x0F) - 8;
+            const int v1 = (x[ib].qs[j] >> 4) - 8;
+
+            sumi0 += (v0 * y[ib].qs[j]);
+            sumi1 += (v1 * y[ib].qs[j + qk/2]);
+        }
+
+        int sumi = sumi0 + sumi1;
+        sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
+    }
+
+    *s = sumf;
+}
+
+void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    const int qk = QK8_1;
+    const int nb = n / qk;
+
+    assert(n % qk == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q4_1 * GGML_RESTRICT x = vx;
+    const block_q8_1 * GGML_RESTRICT y = vy;
+
+    int ib = 0;
+    float sumf = 0;
+
+#if defined(__loongarch_asx)
+    // Initialize accumulator with zeros
+    __m256 acc = (__m256)__lasx_xvldi(0);
+
+    float summs = 0;
+
+    // Main loop
+    for (; ib < nb; ++ib) {
+        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
+        const float d1 = GGML_CPU_FP16_TO_FP32(y[ib].d);
+
+        summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s);
+
+        const __m256 d0v = __lasx_xvreplfr2vr_s( d0 );
+        const __m256 d1v = __lasx_xvreplfr2vr_s( d1 );
+
+        // Compute combined scales
+        const __m256 d0d1 = __lasx_xvfmul_s( d0v, d1v );
+
+        // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
+        const __m256i qx = bytes_from_nibbles_32(x[ib].qs);
+        const __m256i qy = __lasx_xvld( (const __m256i *)y[ib].qs, 0);
+
+        const __m256 xy = mul_sum_us8_pairs_float(qx, qy);
+
+        // Accumulate d0*d1*x*y
+        acc = __lasx_xvfmadd_s( d0d1, xy, acc );
+    }
+
+    sumf = hsum_float_8(acc) + summs;
+
+#endif
+    for (; ib < nb; ++ib) {
+        int sumi0 = 0;
+        int sumi1 = 0;
+
+        for (int j = 0; j < qk/2; ++j) {
+            const int v0 = (x[ib].qs[j] & 0x0F);
+            const int v1 = (x[ib].qs[j] >> 4);
+
+            sumi0 += (v0 * y[ib].qs[j]);
+            sumi1 += (v1 * y[ib].qs[j + qk/2]);
+        }
+
+        int sumi = sumi0 + sumi1;
+        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
+    }
+
+    *s = sumf;
+}
+
+void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+
+    int ib = 0;
+    float sumf = 0;
+
+    assert(n % qk == 0);
+    assert(qk == QK5_0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q5_0 * GGML_RESTRICT x = vx;
+    const block_q8_0 * GGML_RESTRICT y = vy;
+
+#if defined(__loongarch_asx)
+    // Initialize accumulator with zeros
+    __m256 acc = (__m256)__lasx_xvldi(0);
+
+    // Main loop
+    for (; ib < nb; ++ib) {
+        /* Compute combined scale for the block */
+        const __m256 d = __lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); //FIXME
+
+        __m256i qx = bytes_from_nibbles_32(x[ib].qs);
+        __m256i bxhi = bytes_from_bits_32(x[ib].qh);
+        bxhi = __lasx_xvandn_v(bxhi, __lasx_xvreplgr2vr_b((char)0xF0));
+        qx = __lasx_xvor_v(qx, bxhi);
+
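+        // (illustrative note) bytes_from_bits_32() spreads the 32 qh bits into
+        // 0x00/0xFF bytes; xvandn then leaves 0xF0 only where the fifth bit is
+        // clear, so the OR above produces (qs | 0xF0), i.e. qs - 16 as int8,
+        // when the bit is 0 and plain qs when it is 1: the same
+        // ((qs | (bit << 4)) - 16) offset the scalar tail below computes.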
+ __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0); + + const __m256 q = mul_sum_i8_pairs_float(qx, qy); + + /* Multiply q with scale and accumulate */ + acc = __lasx_xvfmadd_s(d, q, acc); + } + + sumf = hsum_float_8(acc); + +#endif + for (; ib < nb; ++ib) { + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); + + const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16); + const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16); + + sumi0 += (x0 * y[ib].qs[j]); + sumi1 += (x1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi; + } + + *s = sumf; +} + +void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_1; + const int nb = n / qk; + + int ib = 0; + float sumf = 0; + + assert(n % qk == 0); + assert(qk == QK5_1); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_1 * GGML_RESTRICT x = vx; + const block_q8_1 * GGML_RESTRICT y = vy; + +#if defined(__loongarch_asx) + // Initialize accumulator with zeros + __m256 acc = (__m256)__lasx_xvldi(0); + + float summs = 0.0f; + + // Main loop + for (; ib < nb; ++ib) { + const __m256 dx = __lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(x[ib].d)); + + summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s); + + __m256i qx = bytes_from_nibbles_32(x[ib].qs); + __m256i bxhi = bytes_from_bits_32(x[ib].qh); + bxhi = __lasx_xvand_v(bxhi, __lasx_xvreplgr2vr_b(0x10)); + qx = __lasx_xvor_v(qx, bxhi); + + const __m256 dy = __lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(y[ib].d)); + const __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0); + + const __m256 q = mul_sum_us8_pairs_float(qx, qy); + + acc = __lasx_xvfmadd_s(q, __lasx_xvfmul_s(dx, dy), acc); + } + + sumf = hsum_float_8(acc) + summs; + +#endif + for (; ib < nb; ++ib) { + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + + const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0; + const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1; + + sumi0 += (x0 * y[ib].qs[j]); + sumi1 += (x1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); + } + + *s = sumf; +} + +void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q8_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0; + +#if defined(__loongarch_asx) + // Initialize accumulator with zeros + __m256 acc = (__m256)__lasx_xvldi(0); + + // Main loop + for (; ib < nb; ++ib) { + // Compute combined scale for the block + const __m256 d = __lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); + __m256i qx = 
__lasx_xvld((const __m256i *)x[ib].qs, 0); + __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0); + + const __m256 q = mul_sum_i8_pairs_float(qx, qy); + + // Multiply q with scale and accumulate + acc = __lasx_xvfmadd_s( d, q, acc ); + } + + sumf = hsum_float_8(acc); + +#endif + for (; ib < nb; ++ib) { + int sumi = 0; + + for (int j = 0; j < qk; j++) { + sumi += x[ib].qs[j]*y[ib].qs[j]; + } + + sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)); + } + + *s = sumf; +} + +void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q2_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined __loongarch_asx + + __m256 acc = (__m256)__lasx_xvldi(0); + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + const __m128i mins_and_scales128 = __lsx_vld((const __m128i*)x[i].scales, 0); + const __m128i scales128 = __lsx_vandi_b(mins_and_scales128, 0xf); + const __m256i mins = lasx_ext8_16(__lsx_vsrli_b(mins_and_scales128, 4)); + const __m256i prod = lasx_madd_h(mins, __lasx_xvld((const __m256i*)y[i].bsums, 0)); + + acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(dmin), __lasx_xvffint_s_w(prod), acc); + + const v16i8 shuffle_mask = {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15}; + const __m256i scales_shuffled = lasx_ext8_16(__lsx_vshuf_b(scales128, scales128, (__m128i)shuffle_mask)); + + __m256i sumi = __lasx_xvldi(0); + + for (int j = 0; j < QK_K/128; ++j) { + + const __m256i q2bits = __lasx_xvld((const __m256i*)q2, 0); q2 += 32; + + const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + const __m256i q8_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + const __m256i q8_3 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + + const __m256i q2_0 = __lasx_xvandi_b(q2bits, 3); + const __m256i q2_1 = __lasx_xvandi_b(__lasx_xvsrli_b(q2bits, 2), 3); + const __m256i q2_2 = __lasx_xvandi_b(__lasx_xvsrli_b(q2bits, 4), 3); + const __m256i q2_3 = __lasx_xvsrli_b(q2bits, 6); + + __m256i p0 = lasx_madd_h_b(q2_0, q8_0); + __m256i p1 = lasx_madd_h_b(q2_1, q8_1); + __m256i p2 = lasx_madd_h_b(q2_2, q8_2); + __m256i p3 = lasx_madd_h_b(q2_3, q8_3); + + p0 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 0), p0); + p1 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 1), p1); + p2 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 2), p2); + p3 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 3), p3); + + p0 = __lasx_xvadd_w(p0, p1); + p2 = __lasx_xvadd_w(p2, p3); + + sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p0, p2)); + } + + acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), acc); + + } + + *s = hsum_float_8(acc); + +#else + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + + const uint8_t * q2 = x[i].qs; + const int8_t * q8 = y[i].qs; + const uint8_t * sc = x[i].scales; + + int summs = 0; + for (int j = 0; j < 16; ++j) { + summs += y[i].bsums[j] * (sc[j] >> 4); + } + + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + 
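+        // q2_K scalar path: sc[] packs a 4-bit scale in the low nibble and a
+        // 4-bit min in the high nibble for each of the 16 sub-blocks; summs has
+        // already paired the mins with the per-sub-block q8 sums (bsums), so the
+        // block contributes dall*isum - dmin*summs at the end.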
+ int isum = 0; + int is = 0; + int d; + for (int k = 0; k < QK_K/128; ++k) { + int shift = 0; + for (int j = 0; j < 4; ++j) { + d = sc[is++] & 0xF; + int isuml = 0; + for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3); + isum += d * isuml; + d = sc[is++] & 0xF; + isuml = 0; + for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3); + isum += d * isuml; + shift += 2; + q8 += 32; + } + q2 += 32; + } + sumf += dall * isum - dmin * summs; + } + *s = sumf; +#endif +} + +void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const uint32_t kmask1 = 0x03030303; + const uint32_t kmask2 = 0x0f0f0f0f; + + const block_q3_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined __loongarch_asx + + const __m128i m32 = __lsx_vreplgr2vr_b(32); + + __m256 acc = (__m256)__lasx_xvldi(0); + + uint32_t aux[3]; + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + // Set up scales + memcpy(aux, x[i].scales, 12); + __m128i scales128 = lsx_set_w( + ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4), + ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4), + (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4), + (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4)); + scales128 = __lsx_vsub_b(scales128, m32); + + const v16i8 shuffle_mask = {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15}; + const __m256i scales_shuffled = lasx_ext8_16(__lsx_vshuf_b(scales128, scales128, (__m128i)shuffle_mask)); + + // high bit + const __m256i hbits = __lasx_xvld((const __m256i*)x[i].hmask, 0); + + // integer accumulator + __m256i sumi = __lasx_xvldi(0); + + for (int j = 0; j < QK_K/128; ++j) { + // load low 2 bits + const __m256i q3bits = __lasx_xvld((const __m256i*)q3, 0); q3 += 32; + + // prepare low and high bits + const __m256i q3l_0 = __lasx_xvandi_b(q3bits, 3); + const __m256i q3l_1 = __lasx_xvandi_b(__lasx_xvsrli_b(q3bits, 2), 3); + const __m256i q3l_2 = __lasx_xvandi_b(__lasx_xvsrli_b(q3bits, 4), 3); + const __m256i q3l_3 = __lasx_xvsrli_b(q3bits, 6); + const __m256i q3h_0 = __lasx_xvslli_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 4 * j + 0), 0), 2); + const __m256i q3h_1 = __lasx_xvslli_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 4 * j + 1), 0), 2); + const __m256i q3h_2 = __lasx_xvslli_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 4 * j + 2), 0), 2); + const __m256i q3h_3 = __lasx_xvslli_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 4 * j + 3), 0), 2); + const __m256i q3_0 = __lasx_xvor_v(q3h_0, q3l_0); + const __m256i q3_1 = __lasx_xvor_v(q3h_1, q3l_1); + const __m256i q3_2 = __lasx_xvor_v(q3h_2, q3l_2); + const __m256i q3_3 = __lasx_xvor_v(q3h_3, q3l_3); + + // load Q8 quants + const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + const __m256i q8_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + const __m256i q8_3 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + + __m256i p16_0 = lasx_madd_h_b(q8_0, q3_0); + __m256i p16_1 = lasx_madd_h_b(q8_1, q3_1); + __m256i p16_2 = lasx_madd_h_b(q8_2, q3_2); + __m256i p16_3 = lasx_madd_h_b(q8_3, q3_3); + + // multiply with scales + p16_0 = 
lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 0), p16_0); + p16_1 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 1), p16_1); + p16_2 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 2), p16_2); + p16_3 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 3), p16_3); + + // accumulate + p16_0 = __lasx_xvadd_w(p16_0, p16_1); + p16_2 = __lasx_xvadd_w(p16_2, p16_3); + sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_2)); + } + // multiply with block scale and accumulate + acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), acc); + } + + *s = hsum_float_8(acc); + +#else + // scalar version + // This function is written like this so the compiler can manage to vectorize most of it + // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the + // manually vectorized version above. Every other version I tried would run at least 4 times slower. + // The ideal situation would be if we could just write the code once, and the compiler would + // automatically produce the best possible set of machine instructions, instead of us having to manually + // write vectorized versions for AVX, ARM_NEON, etc. + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + uint32_t auxs[4]; + const int8_t * scales = (const int8_t*)auxs; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + uint8_t m = 1; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 
0 : 4); + a += 32; m <<= 1; + q3 += 32; + } + a = aux8; + + memcpy(auxs, x[i].scales, 12); + uint32_t tmp = auxs[2]; + auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); + auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); + auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4); + auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); + for (int j = 0; j < QK_K/16; ++j) { + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; + +#endif + +} + +void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + uint32_t utmp[4]; + +#if defined __loongarch_asx + + __m256 acc = (__m256)__lasx_xvldi(0); + __m128 acc_m = (__m128)__lsx_vldi(0); + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + const __m128i mins_and_scales128 = lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0]); + const __m128i mins128 = __lsx_vexth_h_b(mins_and_scales128); + const __m128i scales128 = __lsx_vsllwil_h_b(mins_and_scales128, 0); + + const __m256i q8sums = __lasx_xvld((const __m256i*)y[i].bsums, 0); + const __m128i q8s = lsx_hadd_h(lasx_extracti128(q8sums, 0), lasx_extracti128(q8sums, 1)); + const __m128i prod = lsx_madd_h(mins128, q8s); + acc_m = __lsx_vfmadd_s(__lsx_vreplfr2vr_s(dmin), __lsx_vffint_s_w(prod), acc_m); + + const __m256i scales = lasx_insertf128(scales128, scales128); + + __m256i sumi = __lasx_xvldi(0); + + for (int j = 0; j < QK_K/64; ++j) { + + const __m256i scale_l = lasx_xvrepl128vei_h(scales, 2 * j + 0); + const __m256i scale_h = lasx_xvrepl128vei_h(scales, 2 * j + 1); + + const __m256i q4bits = __lasx_xvld((const __m256i*)q4, 0); q4 += 32; + const __m256i q4l = __lasx_xvandi_b(q4bits, 0xf); + const __m256i q4h = __lasx_xvsrli_b(q4bits, 4); + + const __m256i q8l = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + __m256i p16l = lasx_madd_h_b(q4l, q8l); + p16l = lasx_madd_h(scale_l, p16l); + + const __m256i q8h = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + __m256i p16h = lasx_madd_h_b(q4h, q8h); + p16h = lasx_madd_h(scale_h, p16h); + const __m256i sumj = __lasx_xvadd_w(p16l, p16h); + + sumi = __lasx_xvadd_w(sumi, sumj); + } + + __m256 vd = __lasx_xvreplfr2vr_s(d); + acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(sumi), acc); + + } + + acc_m = 
__lsx_vfadd_s(acc_m, (__m128)__lsx_vpermi_w((__m128i)acc_m, (__m128i)acc_m, 0xee)); + __m128i tmp1 = __lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w((__m128i)acc_m, 1), 0); + acc_m = __lsx_vfadd_s(acc_m, (__m128)tmp1); + + + *s = hsum_float_8(acc) + ((v4f32)acc_m)[0]; + +#else + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + for (int j = 0; j < QK_K/64; ++j) { + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); + a += 32; + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); + a += 32; q4 += 32; + } + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/32; ++j) { + int32_t scale = scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + uint32_t utmp[4]; + +#if defined __loongarch_asx + + __m256 acc = (__m256)__lasx_xvldi(0); + __m128 acc_m = (__m128)__lsx_vldi(0); + + for (int i = 0; i < nb; ++i) { + + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + const __m128i mins_and_scales128 = lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0]); + const __m128i mins128 = __lsx_vexth_h_b(mins_and_scales128); + const __m128i scales128 = 
__lsx_vsllwil_h_b(mins_and_scales128, 0); + + const __m256i q8sums = __lasx_xvld((const __m256i*)y[i].bsums, 0); + const __m128i q8s = lsx_hadd_h(lasx_extracti128(q8sums, 0), lasx_extracti128(q8sums, 1)); + const __m128i prod = lsx_madd_h(mins128, q8s); + acc_m = __lsx_vfmadd_s(__lsx_vreplfr2vr_s(dmin), __lsx_vffint_s_w(prod), acc_m); + + const __m256i scales = lasx_insertf128(scales128, scales128); + + const __m256i hbits = __lasx_xvld((const __m256i*)x[i].qh, 0); + + __m256i sumi = __lasx_xvldi(0); + + for (int j = 0; j < QK_K/64; ++j) { + + const __m256i scale_0 = lasx_xvrepl128vei_h(scales, 2 * j + 0); + const __m256i scale_1 = lasx_xvrepl128vei_h(scales, 2 * j + 1); + + const __m256i q5bits = __lasx_xvld((const __m256i*)q5, 0); q5 += 32; + + const __m256i q5l_0 = __lasx_xvandi_b(q5bits, 0xf); + const __m256i q5l_1 = __lasx_xvsrli_b(q5bits, 4); + const __m256i q5h_0 = __lasx_xvnori_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 2 * j + 0), 0), 0xef); + const __m256i q5h_1 = __lasx_xvnori_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 2 * j + 1), 0), 0xef); + const __m256i q5_0 = __lasx_xvor_v(q5l_0, q5h_0); + const __m256i q5_1 = __lasx_xvor_v(q5l_1, q5h_1); + + const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + + __m256i p16_0 = lasx_madd_h_b(q5_0, q8_0); + __m256i p16_1 = lasx_madd_h_b(q5_1, q8_1); + + p16_0 = lasx_madd_h(scale_0, p16_0); + p16_1 = lasx_madd_h(scale_1, p16_1); + + sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_1)); + + } + + __m256 vd = __lasx_xvreplfr2vr_s(d); + acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(sumi), acc); + + } + + acc_m = __lsx_vfadd_s(acc_m, (__m128)__lsx_vbsrl_v(acc_m, 8)); + acc_m = __lsx_vfadd_s(acc_m, (__m128)__lsx_vbsrl_v(acc_m, 4)); + + *s = hsum_float_8(acc) + ((v4f32)acc_m)[0]; + +#else + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + uint8_t m = 1; + for (int j = 0; j < QK_K/64; ++j) { + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); + for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); + for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 
16 : 0); + a += 32; m <<= 1; + q4 += 32; + } + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/32; ++j) { + int32_t scale = scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q6_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined __loongarch_asx + + const __m256i m32s = __lasx_xvreplgr2vr_b(32); + + __m256 acc = (__m256)__lasx_xvldi(0); + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + const __m128i scales128 = __lsx_vld((const __m128i*)x[i].scales, 0); + const v16i8 shuffle_mask = {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15}; + const __m256i scales_shuffled = lasx_ext8_16(__lsx_vshuf_b(scales128, scales128, (__m128i)shuffle_mask)); + + __m256i sumi = __lasx_xvldi(0); + + for (int j = 0; j < QK_K/128; ++j) { + + const __m256i q4bits1 = __lasx_xvld((const __m256i*)q4, 0); q4 += 32; + const __m256i q4bits2 = __lasx_xvld((const __m256i*)q4, 0); q4 += 32; + const __m256i q4bitsH = __lasx_xvld((const __m256i*)qh, 0); qh += 32; + + const __m256i q4h_0 = __lasx_xvslli_b(__lasx_xvandi_b(q4bitsH, 3), 4); + const __m256i q4h_1 = __lasx_xvslli_b(__lasx_xvandi_b(q4bitsH, 3 << 2), 2); + const __m256i q4h_2 = __lasx_xvandi_b(q4bitsH, 3 << 4); + const __m256i q4h_3 = __lasx_xvsrli_b(__lasx_xvandi_b(q4bitsH, 3 << 6), 2); + + const __m256i q4_0 = __lasx_xvor_v(__lasx_xvandi_b(q4bits1, 0xf), q4h_0); + const __m256i q4_1 = __lasx_xvor_v(__lasx_xvandi_b(q4bits2, 0xf), q4h_1); + const __m256i q4_2 = __lasx_xvor_v(__lasx_xvsrli_b(q4bits1, 4), q4h_2); + const __m256i q4_3 = __lasx_xvor_v(__lasx_xvsrli_b(q4bits2, 4), q4h_3); + + const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + const __m256i q8_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + const __m256i q8_3 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + + __m256i p16_0 = lasx_madd_h_b(__lasx_xvsub_b(q4_0, m32s), q8_0); + __m256i 
p16_1 = lasx_madd_h_b(__lasx_xvsub_b(q4_1, m32s), q8_1); + __m256i p16_2 = lasx_madd_h_b(__lasx_xvsub_b(q4_2, m32s), q8_2); + __m256i p16_3 = lasx_madd_h_b(__lasx_xvsub_b(q4_3, m32s), q8_3); + + p16_0 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 0), p16_0); + p16_1 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 1), p16_1); + p16_2 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 2), p16_2); + p16_3 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 3), p16_3); + + sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_1)); + sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_2, p16_3)); + } + + acc = __lasx_xvfmadd_s((__m256)__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), acc); + } + + *s = hsum_float_8(acc); + +#else + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) { + a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; + a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; + a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; + a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; + } + a += 128; + q4 += 64; + qh += 32; + } + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/16; ++j) { + int scale = x[i].scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +#if defined(__loongarch_asx) +static const int8_t keven_signs_q2xs[1024] = { + 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, + 1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1, + 1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, -1, + 1, 1, -1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, 1, + 1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, -1, + 1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, 1, + 1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, 1, + 1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, -1, + 1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, -1, + 1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, 1, + 1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, 1, + 1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, -1, + 1, 1, 1, 1, -1, 
-1, 1, 1, -1, 1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, 1, + 1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, -1, + 1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, -1, + 1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, 1, + 1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, -1, + 1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, 1, + 1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, 1, + 1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, -1, + 1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, 1, + 1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, -1, + 1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, -1, + 1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, 1, + 1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, 1, + 1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, -1, + 1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, -1, + 1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, 1, + 1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, -1, + 1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, 1, + 1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1, + 1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, +}; +#endif + +void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq2_xxs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__loongarch_asx) + + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + uint32_t aux32[4]; + const uint8_t * aux8 = (const uint8_t *)aux32; + + __m256 accumf = (__m256)__lasx_xvldi(0); + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + __m256i sumi1 = __lasx_xvldi(0); + __m256i sumi2 = __lasx_xvldi(0); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8; + + const __m256i q2_1 = lasx_set_d(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]); + const 
__m256i q2_2 = lasx_set_d(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]); + const __m256i s2_1 = lasx_set_d(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127], + signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]); + const __m256i s2_2 = lasx_set_d(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127], + signs64[(aux32[3] >> 7) & 127], signs64[(aux32[3] >> 0) & 127]); + const __m256i q8s_1 = __lasx_xvsigncov_b(s2_1, q8_1); + const __m256i q8s_2 = __lasx_xvsigncov_b(s2_2, q8_2); + const __m256i dot1 = lasx_maddubs_h(q2_1, q8s_1); + const __m256i dot2 = lasx_maddubs_h(q2_2, q8s_2); + const uint16_t ls1 = aux32[1] >> 28; + const uint16_t ls2 = aux32[3] >> 28; + const __m256i p1 = lasx_madd_h(dot1, __lasx_xvreplgr2vr_h(2*ls1+1)); + const __m256i p2 = lasx_madd_h(dot2, __lasx_xvreplgr2vr_h(2*ls2+1)); + sumi1 = __lasx_xvadd_w(sumi1, p1); + sumi2 = __lasx_xvadd_w(sumi2, p2); + } + + accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf); + } + + *s = 0.125f * hsum_float_8(accumf); + +#else + + uint32_t aux32[2]; + const uint8_t * aux8 = (const uint8_t *)aux32; + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + memcpy(aux32, q2, 2*sizeof(uint32_t)); + q2 += 4; + const uint32_t ls = 2*(aux32[1] >> 28) + 1; + int32_t sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]); + const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127]; + for (int j = 0; j < 8; ++j) { + sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? 
-1 : 1); + } + q8 += 8; + } + bsum += sumi * ls; + } + sumf += d * bsum; + } + *s = 0.125f * sumf; +#endif +} + +void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq2_xs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__loongarch_asx) + + const __m256i mone = __lasx_xvreplgr2vr_b(1); + static const char block_sign_shuffle_mask_1[32] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + }; + static const char block_sign_shuffle_mask_2[32] = { + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, + 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, + }; + static const uint8_t bit_selector_mask_bytes[32] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + }; + + const __m256i bit_selector_mask = __lasx_xvld((const __m256i*)bit_selector_mask_bytes, 0); + const __m256i block_sign_shuffle_1 = __lasx_xvld((const __m256i*)block_sign_shuffle_mask_1, 0); + const __m256i block_sign_shuffle_2 = __lasx_xvld((const __m256i*)block_sign_shuffle_mask_2, 0); + + static const uint8_t k_bit_helper[32] = { + 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, + 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, + }; + const __m256i bit_helper = __lasx_xvld((const __m256i*)k_bit_helper, 0); + const __m256i m511 = __lasx_xvreplgr2vr_h(511); + const __m128i m4 = __lsx_vreplgr2vr_b(0xf); + const __m128i m1 = __lsx_vreplgr2vr_b(1); + + uint64_t aux64; + + // somewhat hacky, but gives a significant boost in performance + __m256i aux_gindex; + const uint16_t * gindex = (const uint16_t *)&aux_gindex; + + __m256 accumf = (__m256)__lasx_xvldi(0); + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + memcpy(&aux64, x[i].scales, 8); + __m128i stmp = __lsx_vreplgr2vr_d(aux64); + stmp = __lsx_vilvl_b( __lsx_vand_v(__lsx_vsrli_h(stmp, 4), m4), __lsx_vand_v(stmp, m4)); + const __m128i scales = __lsx_vadd_b(__lsx_vslli_h(stmp, 1), m1); + + __m256i sumi1 = __lasx_xvldi(0); + __m256i sumi2 = __lasx_xvldi(0); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) { + + const __m256i q2_data = __lasx_xvld((const __m256i*)q2, 0); q2 += 16; + aux_gindex = __lasx_xvand_v(q2_data, m511); + + const __m256i partial_sign_bits = __lasx_xvsrli_h(q2_data, 9); + const __m256i partial_sign_bits_upper = __lasx_xvsrli_h(q2_data, 13); + const __m256i partial_sign_bits_for_counting = __lasx_xvxor_v(partial_sign_bits, partial_sign_bits_upper); + + const __m256i odd_bits = lasx_shuffle_b(bit_helper, partial_sign_bits_for_counting); + const __m256i full_sign_bits = __lasx_xvor_v(partial_sign_bits, odd_bits); + + const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + const __m256i 
q8_3 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + const __m256i q8_4 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + + const __m256i q2_1 = lasx_set_d(iq2xs_grid[gindex[ 3]], iq2xs_grid[gindex[ 2]], + iq2xs_grid[gindex[ 1]], iq2xs_grid[gindex[ 0]]); + const __m256i q2_2 = lasx_set_d(iq2xs_grid[gindex[ 7]], iq2xs_grid[gindex[ 6]], + iq2xs_grid[gindex[ 5]], iq2xs_grid[gindex[ 4]]); + const __m256i q2_3 = lasx_set_d(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]], + iq2xs_grid[gindex[ 9]], iq2xs_grid[gindex[ 8]]); + const __m256i q2_4 = lasx_set_d(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]], + iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]); + + const __m128i full_signs_l = lasx_extracti128(full_sign_bits, 0); + const __m128i full_signs_h = lasx_extracti128(full_sign_bits, 1); + const __m256i full_signs_1 = lasx_insertf128(full_signs_l, full_signs_l); + const __m256i full_signs_2 = lasx_insertf128(full_signs_h, full_signs_h); + + __m256i signs; + signs = lasx_shuffle_b(full_signs_1, block_sign_shuffle_1); + signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask); + const __m256i q8s_1 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_1); + + signs = lasx_shuffle_b(full_signs_1, block_sign_shuffle_2); + signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask); + const __m256i q8s_2 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_2); + + signs = lasx_shuffle_b(full_signs_2, block_sign_shuffle_1); + signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask); + const __m256i q8s_3 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_3); + + signs = lasx_shuffle_b(full_signs_2, block_sign_shuffle_2); + signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask); + const __m256i q8s_4 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_4); + + const __m256i dot1 = lasx_maddubs_h(q2_1, q8s_1); + const __m256i dot2 = lasx_maddubs_h(q2_2, q8s_2); + const __m256i dot3 = lasx_maddubs_h(q2_3, q8s_3); + const __m256i dot4 = lasx_maddubs_h(q2_4, q8s_4); + + const __m256i sc1 = lasx_ext8_16(lsx_shuffle_b(scales, get_scale_shuffle(ib32+0))); + const __m256i sc2 = lasx_ext8_16(lsx_shuffle_b(scales, get_scale_shuffle(ib32+1))); + const __m256i sc3 = lasx_ext8_16(lsx_shuffle_b(scales, get_scale_shuffle(ib32+2))); + const __m256i sc4 = lasx_ext8_16(lsx_shuffle_b(scales, get_scale_shuffle(ib32+3))); + + sumi1 = __lasx_xvadd_w(sumi1, lasx_madd_h(dot1, sc1)); + sumi2 = __lasx_xvadd_w(sumi2, lasx_madd_h(dot2, sc2)); + sumi1 = __lasx_xvadd_w(sumi1, lasx_madd_h(dot3, sc3)); + sumi2 = __lasx_xvadd_w(sumi2, lasx_madd_h(dot4, sc4)); + } + + accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf); + + } + + *s = 0.125f * hsum_float_8(accumf); + +#else + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const uint8_t * GGML_RESTRICT sc = x[i].scales; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1; + const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1; + int32_t sumi = 0; + for (int l = 0; l < 2; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); + const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; + for (int j = 0; j < 8; ++j) { + sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? 
-1 : 1); + } + q8 += 8; + } + bsum += sumi * ls1; + sumi = 0; + for (int l = 2; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); + const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; + for (int j = 0; j < 8; ++j) { + sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + bsum += sumi * ls2; + q2 += 4; + } + sumf += d * bsum; + } + *s = 0.125f * sumf; +#endif +} + +void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq2_s * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__loongarch_asx) + + static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 + }; + + static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + }; + + + const __m128i m4 = __lsx_vreplgr2vr_b(0xf); + const __m128i m1 = __lsx_vreplgr2vr_b(1); + + const __m256i mask1 = __lasx_xvld((const __m256i*)k_mask1, 0); + const __m256i mask2 = __lasx_xvld((const __m256i*)k_mask2, 0); + uint64_t aux64; + + __m256 accumf = (__m256)__lasx_xvldi(0); + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8); + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + __m128i tmp1; + memcpy(&aux64, x[i].scales, 8); + tmp1 = __lsx_vinsgr2vr_d(tmp1, aux64, 0); + tmp1 = __lsx_vinsgr2vr_d(tmp1, aux64 >> 4, 1); + const __m128i scales8 = __lsx_vadd_b(__lsx_vslli_h(__lsx_vand_v(tmp1, m4), 1), m1); + const __m256i scales16 = lasx_ext8_16(scales8); // 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15 + + __m256i sumi1 = __lasx_xvldi(0); + __m256i sumi2 = __lasx_xvldi(0); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + const __m256i q2_1 = lasx_set_d(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)], + iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)], + iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)], + iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]); + const __m256i q2_2 = lasx_set_d(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)], + iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)], + iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)], + iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]); + qs += 8; + + __m256i aux256 = __lasx_xvreplgr2vr_w(signs[0] | ((uint32_t) signs[1] << 16)); + aux256 = __lasx_xvand_v(lasx_shuffle_b(aux256,mask1), mask2); + const __m256i s2_1 = __lasx_xvseq_b(aux256, mask2); + const __m256i q8s_1 = __lasx_xvsub_b(__lasx_xvxor_v(s2_1, q8_1), s2_1); + + aux256 = __lasx_xvreplgr2vr_w(signs[2] | ((uint32_t) signs[3] << 16)); + aux256 = __lasx_xvand_v(lasx_shuffle_b(aux256,mask1), mask2); + const __m256i s2_2 = __lasx_xvseq_b(aux256, mask2); + const __m256i q8s_2 = __lasx_xvsub_b(__lasx_xvxor_v(s2_2, q8_2), s2_2); + + signs += 4; + + const 
__m256i dot1 = lasx_maddubs_h(q2_1, q8s_1); // blocks 2*ib32+0, 2*ib32+1 + const __m256i dot2 = lasx_maddubs_h(q2_2, q8s_2); // blocks 2*ib32+2, 2*ib32+3 + + const __m256i p1 = lasx_madd_h(dot1, lasx_shuffle_b(scales16, get_scale_shuffle_k4(ib32+0))); + const __m256i p2 = lasx_madd_h(dot2, lasx_shuffle_b(scales16, get_scale_shuffle_k4(ib32+1))); + sumi1 = __lasx_xvadd_w(sumi1, p1); + sumi2 = __lasx_xvadd_w(sumi2, p2); + } + + accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf); + } + + *s = 0.125f * hsum_float_8(accumf); + +#else + + float sumf = 0; + for (int i = 0; i < nb; i++) { + + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint8_t * qh = x[i].qh; + const uint8_t * signs = qs + QK_K/8; + + int bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf); + int ls2 = 1 + 2*(x[i].scales[ib32] >> 4); + int sumi1 = 0, sumi2 = 0; + for (int l = 0; l < 2; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300))); + for (int j = 0; j < 8; ++j) { + sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + for (int l = 2; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300))); + for (int j = 0; j < 8; ++j) { + sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + bsum += ls1 * sumi1 + ls2 * sumi2; + qs += 4; + signs += 4; + } + + sumf += d * bsum; + } + + *s = 0.125f * sumf; + +#endif + +} + +void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq3_xxs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__loongarch_asx) + + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + uint32_t aux32[2]; + + __m256 accumf = (__m256)__lasx_xvldi(0); + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + __m256i sumi1 = __lasx_xvldi(0); + __m256i sumi2 = __lasx_xvldi(0); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + const __m256i q2_1 = lasx_set_w(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]], + iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]); + q3 += 8; + const __m256i q2_2 = lasx_set_w(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]], + iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]); + q3 += 8; + memcpy(aux32, gas, 8); gas += 8; + + const __m256i s2_1 = lasx_set_d(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127], + signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]); + const __m256i s2_2 = lasx_set_d(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127], + signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]); + const __m256i q8s_1 = __lasx_xvsigncov_b(s2_1, q8_1); + 
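+            // Each 7-bit field of aux32[] selects one of 128 sign patterns from
+            // keven_signs_q2xs (the eighth sign is implied by the table's even
+            // parity), and xvsigncov flips the matching q8 bytes so the
+            // unsigned*signed maddubs below sees correctly signed operands.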
const __m256i q8s_2 = __lasx_xvsigncov_b(s2_2, q8_2); + const __m256i dot1 = lasx_maddubs_h(q2_1, q8s_1); + const __m256i dot2 = lasx_maddubs_h(q2_2, q8s_2); + const uint16_t ls1 = aux32[0] >> 28; + const uint16_t ls2 = aux32[1] >> 28; + + const __m256i p1 = lasx_madd_h(dot1, __lasx_xvreplgr2vr_h(2*ls1+1)); + const __m256i p2 = lasx_madd_h(dot2, __lasx_xvreplgr2vr_h(2*ls2+1)); + sumi1 = __lasx_xvadd_w(sumi1, p1); + sumi2 = __lasx_xvadd_w(sumi2, p2); + } + + accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf); + } + + *s = 0.25f * hsum_float_8(accumf); + +#else + + uint32_t aux32; + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t); + const uint32_t ls = 2*(aux32 >> 28) + 1; + int32_t sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]); + const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]); + const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127]; + for (int j = 0; j < 4; ++j) { + sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1); + sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1); + } + q8 += 8; + } + q3 += 8; + bsum += sumi * ls; + } + sumf += d * bsum; + } + *s = 0.25f * sumf; +#endif +} + +void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq3_s * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__loongarch_asx) + + static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 + }; + + static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + }; + + const __m256i mask1 = __lasx_xvld((const __m256i*)k_mask1, 0); + const __m256i mask2 = __lasx_xvld((const __m256i*)k_mask2, 0); + + __m256i idx_shift = lasx_set_w(1, 2, 3, 4, 5, 6, 7, 8); + const __m256i idx_mask = __lasx_xvreplgr2vr_w(256); + + typedef union { + __m256i vec[2]; + uint32_t index[16]; + } index_t; + + index_t idx; + + __m256 accumf = (__m256)__lasx_xvldi(0); + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + __m256i sumi1 = __lasx_xvldi(0); + __m256i sumi2 = __lasx_xvldi(0); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + const __m256i idx_l = lasx_extu8_16(__lsx_vld(qs, 0)); qs += 16; + idx.vec[0] = 
__lasx_xvreplgr2vr_w(qh[ib32+0]);
+            idx.vec[1] = __lasx_xvreplgr2vr_w(qh[ib32+1]);
+            idx.vec[0] = __lasx_xvand_v(__lasx_xvsll_w(idx.vec[0], idx_shift), idx_mask);
+            idx.vec[1] = __lasx_xvand_v(__lasx_xvsll_w(idx.vec[1], idx_shift), idx_mask);
+            idx.vec[0] = __lasx_xvor_v(idx.vec[0], lasx_ext16_32(lasx_extracti128(idx_l, 0)));
+            idx.vec[1] = __lasx_xvor_v(idx.vec[1], lasx_ext16_32(lasx_extracti128(idx_l, 1)));
+
+            // At least on my CPU (Ryzen 7950X), using _mm256_i32gather_epi32 is slower than _mm256_set_epi32. Strange.
+            //const __m256i q2_1 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[0], 4);
+            //const __m256i q2_2 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[1], 4);
+            const __m256i q2_1 = lasx_set_w(
+                    iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]],
+                    iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]
+            );
+            const __m256i q2_2 = lasx_set_w(
+                    iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]],
+                    iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[ 9]], iq3s_grid[idx.index[ 8]]
+            );
+
+            __m256i aux256 = __lasx_xvreplgr2vr_w(signs[0] | (signs[1] << 16));
+            aux256 = __lasx_xvand_v(lasx_shuffle_b(aux256,mask1), mask2);
+            const __m256i s2_1 = __lasx_xvseq_b(aux256, mask2);
+            const __m256i q8s_1 = __lasx_xvsub_b(__lasx_xvxor_v(s2_1, q8_1), s2_1);
+
+            aux256 = __lasx_xvreplgr2vr_w(signs[2] | (signs[3] << 16));
+            aux256 = __lasx_xvand_v(lasx_shuffle_b(aux256,mask1), mask2);
+            const __m256i s2_2 = __lasx_xvseq_b(aux256, mask2);
+            const __m256i q8s_2 = __lasx_xvsub_b(__lasx_xvxor_v(s2_2, q8_2), s2_2);
+
+            signs += 4;
+
+            const __m256i dot1 = lasx_maddubs_h(q2_1, q8s_1);
+            const __m256i dot2 = lasx_maddubs_h(q2_2, q8s_2);
+            const uint16_t ls1 = x[i].scales[ib32/2] & 0xf;
+            const uint16_t ls2 = x[i].scales[ib32/2] >> 4;
+            const __m256i p1 = lasx_madd_h(dot1, __lasx_xvreplgr2vr_h(2*ls1+1));
+            const __m256i p2 = lasx_madd_h(dot2, __lasx_xvreplgr2vr_h(2*ls2+1));
+            sumi1 = __lasx_xvadd_w(sumi1, p1);
+            sumi2 = __lasx_xvadd_w(sumi2, p2);
+        }
+
+        accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf);
+    }
+
+    *s = hsum_float_8(accumf);
+
+#else
+
+    float sumf = 0.f;
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint8_t * GGML_RESTRICT qs = x[i].qs;
+        const uint8_t * GGML_RESTRICT qh = x[i].qh;
+        const uint8_t * GGML_RESTRICT signs = x[i].signs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
+        int32_t bsum = 0;
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
+            const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1;
+            int32_t sumi = 0;
+            for (int l = 0; l < 4; ++l) {
+                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
+                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
+                for (int j = 0; j < 4; ++j) {
+                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
+                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ?
-1 : 1); + } + q8 += 8; + } + qs += 8; + signs += 4; + bsum += sumi * ls1; + sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256))); + const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256))); + for (int j = 0; j < 4; ++j) { + sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1); + sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1); + } + q8 += 8; + } + qs += 8; + signs += 4; + bsum += sumi * ls2; + } + sumf += d * bsum; + } + *s = sumf; +#endif +} + +#if defined(__loongarch_asx) +static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) { + const __m256i a = __lasx_xvmulwev_h_b(x, y); + const __m256i b = __lasx_xvmulwod_h_b(x, y); + return __lasx_xvadd_h(a, b); +} +#endif + +void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq1_s * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__loongarch_asx) + + __m256 accum = (__m256)__lasx_xvldi(0); + float accum1 = 0; + for (int i = 0; i < nb; ++i) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint16_t * qh = x[i].qh; + + __m256i sumi = __lasx_xvldi(0); + int sumi1 = 0; + for (int ib = 0; ib < QK_K/32; ib += 2) { + __m256i q1b_1 = __lasx_xvinsgr2vr_d(q1b_1, iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)], 0); + q1b_1 = __lasx_xvinsgr2vr_d(q1b_1, iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], 1); + q1b_1 = __lasx_xvinsgr2vr_d(q1b_1, iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)], 2); + q1b_1 = __lasx_xvinsgr2vr_d(q1b_1, iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], 3); + + __m256i q1b_2 = __lasx_xvinsgr2vr_d(q1b_2, iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)], 0); + q1b_2 = __lasx_xvinsgr2vr_d(q1b_2, iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], 1); + q1b_2 = __lasx_xvinsgr2vr_d(q1b_2, iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)], 2); + q1b_2 = __lasx_xvinsgr2vr_d(q1b_2, iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], 3); + + qs += 8; + const __m256i q8b_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + const __m256i q8b_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + + const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1); + const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2); + const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1; + const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1; + + __m256i tmp1, tmp5, tmp6; + tmp1 = __lasx_xvreplgr2vr_h(ls1); + tmp5 = __lasx_xvmulwev_w_h(dot1, tmp1); + tmp6 = __lasx_xvmulwod_w_h(dot1, tmp1); + const __m256i p1 = __lasx_xvadd_w(tmp5, tmp6); + + tmp1 = __lasx_xvreplgr2vr_h(ls2); + tmp5 = __lasx_xvmulwev_w_h(dot2, tmp1); + tmp6 = __lasx_xvmulwod_w_h(dot2, tmp1); + const __m256i p2 = __lasx_xvadd_w(tmp5, tmp6); + + sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p1, p2)); + sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1 + + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? 
-1 : 1) * ls2; + } + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + accum = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), accum); + accum1 += d * sumi1; + } + + *s = hsum_float_8(accum) + IQ1S_DELTA * accum1; + +#else + + float sumf = 0; + for (int i = 0; i < nb; i++) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint16_t * qh = x[i].qh; + + int sumi = 0, sumi1 = 0; + for (int ib = 0; ib < QK_K/32; ++ib) { + const int ls = 2*((qh[ib] >> 12) & 7) + 1; + const int delta = qh[ib] & 0x8000 ? -1 : 1; + int lsum = 0; + for (int l = 0; l < 4; ++l) { + const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8))); + for (int j = 0; j < 8; ++j) { + lsum += q8[j] * grid[j]; + } + q8 += 8; + } + sumi += ls * lsum; + sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]); + qs += 4; + } + + sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1); + } + + *s = sumf; + +#endif +} + +void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + assert(n % QK4_NL == 0); + static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same"); + + const block_iq4_nl * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + const int nb = n / QK4_NL; + + int ib = 0; + float sumf = 0; + +#if defined (__loongarch_asx) + + const __m128i values128 = __lsx_vld((const __m128i*)kvalues_iq4nl, 0); + const __m128i m4b = __lsx_vreplgr2vr_b(0x0f); + const __m256i mone = __lasx_xvreplgr2vr_h(1); + + __m256 accum1 = (__m256)__lasx_xvldi(0); + __m256 accum2 = (__m256)__lasx_xvldi(0); + for (; ib + 1 < nb; ib += 2) { + const __m128i q4bits_1 = __lsx_vld((const __m128i*)x[ib + 0].qs, 0); + const __m128i q4bits_2 = __lsx_vld((const __m128i*)x[ib + 1].qs, 0); + const __m256i q8b_1 = __lasx_xvld((const __m256i *)y[ib + 0].qs, 0); + const __m256i q8b_2 = __lasx_xvld((const __m256i *)y[ib + 1].qs, 0); + const __m256i q4b_1 = lasx_insertf128(lsx_shuffle_b(values128, __lsx_vand_v(__lsx_vsrli_h(q4bits_1, 4), m4b)), + lsx_shuffle_b(values128, __lsx_vand_v(q4bits_1, m4b))); + const __m256i q4b_2 = lasx_insertf128(lsx_shuffle_b(values128, __lsx_vand_v(__lsx_vsrli_h(q4bits_2, 4), m4b)), + lsx_shuffle_b(values128, __lsx_vand_v(q4bits_2, m4b))); + const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1); + const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2); + const __m256i p_1 = lasx_madd_h(p16_1, mone); + const __m256i p_2 = lasx_madd_h(p16_2, mone); + accum1 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*GGML_CPU_FP16_TO_FP32(x[ib + 0].d)), + __lasx_xvffint_s_w(p_1), accum1); + accum2 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*GGML_CPU_FP16_TO_FP32(x[ib + 1].d)), + __lasx_xvffint_s_w(p_2), accum2); + } + + sumf = hsum_float_8(__lasx_xvfadd_s(accum1, accum2)); + +#endif + for (; ib < nb; ++ib) { + const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d); + int sumi1 = 0, sumi2 = 0; + for (int j = 0; j < QK4_NL/2; ++j) { + sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf]; + sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4]; + } + sumf += d * (sumi1 + sumi2); + } + *s = sumf; +} + +void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t 
by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + assert(n % QK_K == 0); + + const block_iq4_xs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__loongarch_asx) + + const __m128i values128 = __lsx_vld((const __m128i*)kvalues_iq4nl, 0); + + __m256 accum = (__m256)__lasx_xvldi(0); + + for (int ibl = 0; ibl < nb; ++ibl) { + const uint8_t * qs = x[ibl].qs; + const int8_t * q8 = y[ibl].qs; + uint16_t sh = x[ibl].scales_h; + __m256i sumi1 = __lasx_xvldi(0); + __m256i sumi2 = __lasx_xvldi(0); + for (int ib = 0; ib < QK_K/32; ib += 2) { + const __m128i q4bits_1 = __lsx_vld((const __m128i*)qs, 0); qs += 16; + const __m128i q4bits_2 = __lsx_vld((const __m128i*)qs, 0); qs += 16; + const __m256i q8b_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + const __m256i q8b_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + const __m256i q4b_1 = lasx_insertf128(__lsx_vshuf_b(values128, values128, __lsx_vsrli_b(q4bits_1, 4)), + __lsx_vshuf_b(values128, values128, __lsx_vandi_b(q4bits_1, 0xf))); + const __m256i q4b_2 = lasx_insertf128(__lsx_vshuf_b(values128, values128, __lsx_vsrli_b(q4bits_2, 4)), + __lsx_vshuf_b(values128, values128, __lsx_vandi_b(q4bits_2, 0xf))); + const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1); + const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2); + const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32; + const int16_t ls2 = ((x[ibl].scales_l[ib/2] >> 4) | ((sh << 2) & 0x30)) - 32; + sh >>= 4; + const __m256i p_1 = lasx_madd_h(p16_1, __lasx_xvreplgr2vr_h(ls1)); + const __m256i p_2 = lasx_madd_h(p16_2, __lasx_xvreplgr2vr_h(ls2)); + sumi1 = __lasx_xvadd_w(p_1, sumi1); + sumi2 = __lasx_xvadd_w(p_2, sumi2); + } + accum = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(x[ibl].d)*y[ibl].d), + __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accum); + } + + *s = hsum_float_8(accum); + +#else + float sumf = 0; + for (int ibl = 0; ibl < nb; ++ibl) { + const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d; + uint16_t h = x[ibl].scales_h; + const uint8_t * qs = x[ibl].qs; + const int8_t * q8 = y[ibl].qs; + for (int ib = 0; ib < QK_K/32; ib += 2) { + const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30); + const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30); + h >>= 4; + const float d1 = d4d8*(ls1 - 32); + const float d2 = d4d8*(ls2 - 32); + int sumi1 = 0, sumi2 = 0; + for (int j = 0; j < 16; ++j) { + sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf]; + sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4]; + } + sumf += d1 * (sumi1 + sumi2); + qs += 16; + q8 += 32; + sumi1 = sumi2 = 0; + for (int j = 0; j < 16; ++j) { + sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf]; + sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4]; + } + sumf += d2 * (sumi1 + sumi2); + qs += 16; + q8 += 32; + } + } + *s = sumf; +#endif +} + diff --git a/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp b/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp new file mode 100644 index 0000000000000..fedd6430278c2 --- /dev/null +++ b/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp @@ -0,0 +1,82 @@ +#include "ggml-backend-impl.h" + +#if defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) + +#if defined(__linux__) +#include <sys/auxv.h> +#endif + +#include <string> + +struct powerpc_features { + std::string platform = ""; + int power_version = -1; + + bool has_vsx = false; + + powerpc_features() { +#if defined(__linux__) + unsigned long auxval = getauxval(AT_PLATFORM); + if (auxval) { 
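+ // getauxval(AT_PLATFORM) returns (as an unsigned long) a pointer to a
+ // platform name string such as "power9" or "power10"; the digits parsed
+ // out below select the POWER ISA level at runtime.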
+ platform = std::string(reinterpret_cast<const char *>(auxval)); + // TBD: Do systems exist that return this in uppercase? + if (platform.substr(0, 5) == "power") { + // Extract a numeric suffix, if one exists + int vpos = -1; + for (int i = platform.length() - 1; i >= 0; i--) { + if (std::isdigit(platform[i])) { + vpos = i; + } else { + break; + } + } + if (vpos > -1) { + power_version = std::stoi(platform.substr(vpos)); + } + } + } +#endif + if (power_version >= 9) { + has_vsx = true; + } + } +}; + +static int ggml_backend_cpu_powerpc_score() { + int score = 1; + powerpc_features pf; + +// Platform scores +#if defined(GGML_USE_POWER7) + if (pf.power_version < 7) { return 0; } + score += 1<<1; +#endif +#if defined(GGML_USE_POWER8) + if (pf.power_version < 8) { return 0; } + score += 1<<2; +#endif +#if defined(GGML_USE_POWER9) + if (pf.power_version < 9) { return 0; } + score += 1<<3; +#endif +#if defined(GGML_USE_POWER10) + if (pf.power_version < 10) { return 0; } + score += 1<<4; +#endif +#if defined(GGML_USE_POWER11) + if (pf.power_version < 11) { return 0; } + score += 1<<5; +#endif + +// Feature scores +#if defined(GGML_USE_VSX) + if (!pf.has_vsx) { return 0; } + score += 1<<6; +#endif + + return score; +} + +GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_powerpc_score) + +#endif // defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) diff --git a/ggml/src/ggml-cpu/arch/powerpc/quants.c b/ggml/src/ggml-cpu/arch/powerpc/quants.c new file mode 100644 index 0000000000000..053d5cbdc7bd8 --- /dev/null +++ b/ggml/src/ggml-cpu/arch/powerpc/quants.c @@ -0,0 +1,2732 @@ +#define GGML_COMMON_IMPL_C +#include "ggml-common.h" +#include "ggml-quants.h" +#include "ggml-impl.h" +#include "ggml-cpu.h" +#include "simd-mappings.h" + +#include "../../quants.h" +#include "../../ggml-cpu-impl.h" + +#include <math.h> +#include <string.h> +#include <assert.h> +#include <float.h> +#include <stdlib.h> // for qsort +#include <stdio.h> // for GGML_ASSERT + +#define GROUP_MAX_EPS 1e-15f +#define GROUP_MAX_EPS_IQ3_XXS 1e-8f +#define GROUP_MAX_EPS_IQ2_S 1e-8f +#define GROUP_MAX_EPS_IQ1_M 1e-7f +#define GROUP_MAX_EPS_IQ1_S 1e-12f + +#define UNUSED GGML_UNUSED + +#if defined(__POWER9_VECTOR__) +#define B1(c,s,n) 0x ## n ## c , 0x ## n ## s +#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s) +#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s) +#define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s) +#define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s) +#define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s) +#define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s) +#define B8(c,s ) B7(c,s, c), B7(c,s, s) + +// precomputed tables for expanding 8bits to 8 bytes: +static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4 +static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 +#endif + +void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(QK8_0 == 32); + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; + + block_q8_0 * GGML_RESTRICT y = vy; + +#if defined(__POWER9_VECTOR__) + for (int i = 0; i < nb; i++) { + vector float srcv [8]; + vector float asrcv[8]; + vector float amaxv[8]; + vector signed int vi[8]; + + for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]); + + for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]); + + const float amax = MAX(MAX(vec_extract(amaxv[0], 0), + 
vec_extract(amaxv[0], 1)), + MAX(vec_extract(amaxv[0], 2), + vec_extract(amaxv[0], 3))); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + const vector float vid = vec_splats(id); + + y[i].d = GGML_CPU_FP32_TO_FP16(d); + + for (int j = 0; j < 8; j++) { + const vector float v = vec_round(vec_mul(srcv[j], vid)); + vi[j] = vec_cts(v, 0); + } + vec_xst(vec_pack(vec_pack(vi[0], vi[1]), vec_pack(vi[2], vi[3])), 0, &y[i].qs[0]); + vec_xst(vec_pack(vec_pack(vi[4], vi[5]), vec_pack(vi[6], vi[7])), 16, &y[i].qs[0]); + } +#else + GGML_UNUSED(nb); + // scalar + quantize_row_q8_0_ref(x, y, k); +#endif +} + +void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(k % QK8_1 == 0); + const int nb = k / QK8_1; + + block_q8_1 * GGML_RESTRICT y = vy; + +#if defined(__POWER9_VECTOR__) + for (int i = 0; i < nb; i++) { + vector float srcv [8]; + vector float asrcv[8]; + vector float amaxv[8]; + vector signed int vi[8]; + + for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]); + + for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]); + + const float amax = MAX(MAX(vec_extract(amaxv[0], 0), + vec_extract(amaxv[0], 1)), + MAX(vec_extract(amaxv[0], 2), + vec_extract(amaxv[0], 3))); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + const vector float vid = vec_splats(id); + + y[i].d = GGML_CPU_FP32_TO_FP16(d); + + vector int accv = vec_splats(0); + + for (int j = 0; j < 8; j++) { + const vector float v = vec_round(vec_mul(srcv[j], vid)); + vi[j] = vec_cts(v, 0); + + accv = vec_add(accv, vi[j]); + } + vec_xst(vec_pack(vec_pack(vi[0], vi[1]), vec_pack(vi[2], vi[3])), 0, &y[i].qs[0]); + vec_xst(vec_pack(vec_pack(vi[4], vi[5]), vec_pack(vi[6], vi[7])), 16, &y[i].qs[0]); + + accv = vec_add(accv, vec_sld(accv, accv, 4)); + accv = vec_add(accv, vec_sld(accv, accv, 8)); + y[i].s = GGML_CPU_FP32_TO_FP16(d * vec_extract(accv, 0)); + } + +#else + GGML_UNUSED(nb); + // scalar + quantize_row_q8_1_ref(x, y, k); +#endif +} + + +//===================================== Dot products ================================= + +void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0; + +#if defined(__POWER9_VECTOR__) + const vector signed char lowMask = vec_splats((signed char)0xF); + const vector signed int v0 = vec_splats((int32_t)0); + const vector unsigned char v4 = vec_splats((unsigned char)0x4); + const vector signed char v8 = vec_splats((signed char)0x8); + + vector float vsumf0 = vec_splats(0.0f); + +#pragma GCC unroll 8 + for (; ib < nb; ++ib) { + __builtin_prefetch(x[ib].qs, 0, 1); + __builtin_prefetch(y[ib].qs, 0, 1); + + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d)); + vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d)); + vector float vd = vec_mul(vxd, vyd); + + vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs); + vector signed char q8y0 = vec_xl( 0, y[ib].qs); 
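+ // qxs holds one q4_0 block: 32 4-bit weights packed into 16 bytes. The
+ // low/high nibbles are split below and re-centered to [-8, 7] by
+ // subtracting 8, matching the scalar fallback at the end of this function.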
+ vector signed char q8y1 = vec_xl(16, y[ib].qs); + + vector signed char q4x0 = vec_and(qxs, lowMask); + vector signed char q4x1 = vec_sr(qxs, v4); + + q4x0 = vec_sub(q4x0, v8); + q4x1 = vec_sub(q4x1, v8); + + vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0)); + vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1)); + + vector signed int vsumi0 = v0; + + vsumi0 = vec_sum4s(qv0, vsumi0); + vsumi0 = vec_sum4s(qv1, vsumi0); + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + } + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + sumf = vec_extract(vsumf0, 0); + +#endif + for (; ib < nb; ++ib) { + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[ib].qs[j] & 0x0F) - 8; + const int v1 = (x[ib].qs[j] >> 4) - 8; + + sumi0 += (v0 * y[ib].qs[j]); + sumi1 += (v1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d); + } + + *s = sumf; +} + +void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_1; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_1 * GGML_RESTRICT x = vx; + const block_q8_1 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0; + +#if defined(__POWER9_VECTOR__) + const vector signed char lowMask = vec_splats((signed char)0xF); + const vector signed int v0 = vec_splats((int32_t)0); + const vector unsigned char v4 = vec_splats((unsigned char)0x4); + + vector float vsumf0 = vec_splats(0.0f); + +#pragma GCC unroll 4 + for (; ib < nb; ++ib) { + __builtin_prefetch(x[ib].qs, 0, 1); + __builtin_prefetch(y[ib].qs, 0, 1); + + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d)); + vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d)); + vector float vd = vec_mul(vxd, vyd); + + vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].m)); + vector float vys = {GGML_CPU_FP16_TO_FP32(y[ib].s), 0.0f, 0.0f, 0.0f}; + vsumf0 = vec_madd(vxmin, vys, vsumf0); + + vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs); + vector signed char q8y0 = vec_xl( 0, y[ib].qs); + vector signed char q8y1 = vec_xl(16, y[ib].qs); + + vector unsigned char q4x0 = (vector unsigned char)vec_and(qxs, lowMask); + vector unsigned char q4x1 = (vector unsigned char)vec_sr(qxs, v4); + + vector signed int vsumi0 = v0; + + vsumi0 = vec_msum(q8y0, q4x0, vsumi0); + vsumi0 = vec_msum(q8y1, q4x1, vsumi0); + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + } + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + sumf = vec_extract(vsumf0, 0); + +#endif + for (; ib < nb; ++ib) { + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[ib].qs[j] & 0x0F); + const int v1 = (x[ib].qs[j] >> 4); + + sumi0 += (v0 * y[ib].qs[j]); + sumi1 += (v1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); + } + + *s = sumf; +} + +void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = 
QK8_0; + const int nb = n / qk; + + int ib = 0; + float sumf = 0; + + assert(n % qk == 0); + assert(qk == QK5_0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + +#if defined(__POWER9_VECTOR__) + const vector signed char lowMask = vec_splats((signed char)0xF); + const vector unsigned char v4 = vec_splats((unsigned char)4); + + vector float vsumf0 = vec_splats(0.0f); + +#pragma GCC unroll 4 + for (; ib < nb; ++ib) { + __builtin_prefetch(x[ib].qs, 0, 1); + __builtin_prefetch(y[ib].qs, 0, 1); + + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d)); + vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d)); + vector float vd = vec_mul(vxd, vyd); + + vector signed long long aux64x2_0 = {(uint64_t)(table_b2b_1[x[ib].qh[0]]), (uint64_t)(table_b2b_1[x[ib].qh[1]])}; + vector signed long long aux64x2_1 = {(uint64_t)(table_b2b_1[x[ib].qh[2]]), (uint64_t)(table_b2b_1[x[ib].qh[3]])}; + + vector signed char qh0 = (vector signed char)aux64x2_0; + vector signed char qh1 = (vector signed char)aux64x2_1; + + vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs); + + vector signed char q5x0 = vec_sub(vec_and (qxs, lowMask), qh0); + vector signed char q5x1 = vec_sub(vec_sr(qxs, v4), qh1); + + vector signed char q8y0 = vec_xl( 0, y[ib].qs); + vector signed char q8y1 = vec_xl( 16, y[ib].qs); + + vector signed short qv0 = vec_add(vec_mule(q5x0, q8y0), vec_mulo(q5x0, q8y0)); + vector signed short qv1 = vec_add(vec_mule(q5x1, q8y1), vec_mulo(q5x1, q8y1)); + + qv0 = vec_add(qv0, qv1); + + vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0)); + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + } + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + sumf = vec_extract(vsumf0, 0); + +#endif + for (; ib < nb; ++ib) { + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); + + const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16); + const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16); + + sumi0 += (x0 * y[ib].qs[j]); + sumi1 += (x1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi; + } + + *s = sumf; +} + +void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_1; + const int nb = n / qk; + + int ib = 0; + float sumf = 0; + + assert(n % qk == 0); + assert(qk == QK5_1); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_1 * GGML_RESTRICT x = vx; + const block_q8_1 * GGML_RESTRICT y = vy; + +#if defined(__POWER9_VECTOR__) + const vector signed char lowMask = vec_splats((signed char)0xF); + const vector signed int v0 = vec_splats((int32_t)0); + const vector unsigned char v4 = vec_splats((unsigned char)0x4); + + vector float vsumf0 = vec_splats(0.0f); + +#pragma GCC unroll 4 + for (; ib < nb; ++ib) { + __builtin_prefetch(x[ib].qs, 0, 1); + __builtin_prefetch(y[ib].qs, 0, 1); + + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d)); + vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d)); + vector float vd 
= vec_mul(vxd, vyd); + + vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].m)); + vector float vys = {GGML_CPU_FP16_TO_FP32(y[ib].s), 0.f, 0.f, 0.f}; + vsumf0 = vec_madd(vxmin, vys, vsumf0); + + vector unsigned long long aux64x2_0 = {(uint64_t)(table_b2b_0[x[ib].qh[0]]), (uint64_t)(table_b2b_0[x[ib].qh[1]])}; + vector unsigned long long aux64x2_1 = {(uint64_t)(table_b2b_0[x[ib].qh[2]]), (uint64_t)(table_b2b_0[x[ib].qh[3]])}; + + vector signed char qh0 = (vector signed char)aux64x2_0; + vector signed char qh1 = (vector signed char)aux64x2_1; + + vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs); + + vector unsigned char q5x0 = (vector unsigned char)vec_or(vec_and(qxs, lowMask), qh0); + vector unsigned char q5x1 = (vector unsigned char)vec_or(vec_sr(qxs, v4), qh1); + + vector signed char q8y0 = vec_xl( 0, y[ib].qs); + vector signed char q8y1 = vec_xl( 16, y[ib].qs); + + vector signed int vsumi0 = v0; + + vsumi0 = vec_msum(q8y0, q5x0, vsumi0); + vsumi0 = vec_msum(q8y1, q5x1, vsumi0); + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + } + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + sumf = vec_extract(vsumf0, 0); + +#endif + for (; ib < nb; ++ib) { + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + + const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0; + const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1; + + sumi0 += (x0 * y[ib].qs[j]); + sumi1 += (x1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); + } + + *s = sumf; +} + +void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q8_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0; + +#if defined(__POWER9_VECTOR__) + const vector signed int v0 = vec_splats((int32_t)0); + vector float vsumf0 = vec_splats(0.0f); + +#pragma GCC unroll 8 + for (; ib < nb; ++ib) { + __builtin_prefetch(x[ib].qs, 0, 1); + __builtin_prefetch(y[ib].qs, 0, 1); + + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d)); + vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d)); + vector float vd = vec_mul(vxd, vyd); + + vector signed char q8x0 = vec_xl( 0, x[ib].qs); + vector signed char q8x1 = vec_xl(16, x[ib].qs); + vector signed char q8y0 = vec_xl( 0, y[ib].qs); + vector signed char q8y1 = vec_xl(16, y[ib].qs); + + vector signed short qv0 = vec_mule(q8x0, q8y0); + vector signed short qv1 = vec_mulo(q8x0, q8y0); + vector signed short qv2 = vec_mule(q8x1, q8y1); + vector signed short qv3 = vec_mulo(q8x1, q8y1); + + vector signed int vsumi0 = v0; + vector signed int vsumi1 = v0; + + vsumi0 = vec_sum4s(qv0, vsumi0); + vsumi1 = vec_sum4s(qv1, vsumi1); + vsumi0 = vec_sum4s(qv2, vsumi0); + vsumi1 = vec_sum4s(qv3, vsumi1); + + vsumi0 = vec_add(vsumi0, vsumi1); + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + } + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, 
vsumf0, 8)); + + sumf = vec_extract(vsumf0, 0); + +#endif + for (; ib < nb; ++ib) { + int sumi = 0; + + for (int j = 0; j < qk; j++) { + sumi += x[ib].qs[j]*y[ib].qs[j]; + } + + sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)); + } + + *s = sumf; +} + +void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q2_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__POWER9_VECTOR__) + const vector signed char lowMask = vec_splats((signed char)0x3); + const vector signed char lowScaleMask = vec_splats((signed char)0xF); + const vector int v0 = vec_splats((int32_t)0); + const vector unsigned char v2 = vec_splats((unsigned char)0x2); + const vector unsigned char v6 = vec_splats((unsigned char)0x6); + const vector unsigned char v4 = vec_splats((unsigned char)0x4); + + vector float vsumf0 = vec_splats(0.0f); + vector float vsumf1 = vec_splats(0.0f); + vector float vsumf2 = vec_splats(0.0f); + vector float vsumf3 = vec_splats(0.0f); + + for (int i = 0; i < nb; ++i) { + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); + vector float vyd = vec_splats(y[i].d); + vector float vd = vec_mul(vxd, vyd); + + vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].dmin)); + vector float vdmin = vec_mul(vxmin, vyd); + + vector signed short q8ysums0 = vec_xl( 0, y[i].bsums); + vector signed short q8ysums1 = vec_xl(16, y[i].bsums); + + vector signed char q2xmins = (vector signed char)vec_xl( 0, x[i].scales); + vector signed char vscales = vec_and(q2xmins, lowScaleMask); + + q2xmins = vec_sr(q2xmins, v4); + vector signed short q2xmins0 = vec_unpackh(q2xmins); + vector signed short q2xmins1 = vec_unpackl(q2xmins); + + vector signed int prod0 = vec_mule(q2xmins0, q8ysums0); + vector signed int prod1 = vec_mulo(q2xmins0, q8ysums0); + vector signed int prod2 = vec_mule(q2xmins1, q8ysums1); + vector signed int prod3 = vec_mulo(q2xmins1, q8ysums1); + + vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0); + vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1); + vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2); + vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3); + + vector signed int vsumi0 = v0; + vector signed int vsumi1 = v0; + vector signed int vsumi2 = v0; + vector signed int vsumi3 = v0; + vector signed int vsumi4 = v0; + vector signed int vsumi5 = v0; + vector signed int vsumi6 = v0; + vector signed int vsumi7 = v0; + + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + for (int j = 0; j < QK_K/128; ++j) { + __builtin_prefetch(q2, 0, 1); + __builtin_prefetch(q8, 0, 1); + + vector signed char qxs0 = (vector signed char)vec_xl( 0, q2); + vector signed char qxs1 = (vector signed char)vec_xl(16, q2); + q2 += 32; + + vector unsigned char q2x00 = (vector unsigned char)vec_and(qxs0, lowMask); + vector unsigned char q2x01 = (vector unsigned char)vec_and(vec_sr(qxs0, v2), lowMask); + vector unsigned char q2x02 = (vector unsigned char)vec_and(vec_sr(qxs0, v4), lowMask); + vector unsigned char q2x03 = (vector unsigned char)vec_and(vec_sr(qxs0, v6), lowMask); + vector unsigned char q2x10 = (vector unsigned char)vec_and(qxs1, lowMask); + vector unsigned char q2x11 = (vector unsigned char)vec_and(vec_sr(qxs1, v2), lowMask); + vector unsigned char q2x12 = (vector unsigned 
char)vec_and(vec_sr(qxs1, v4), lowMask); + vector unsigned char q2x13 = (vector unsigned char)vec_and(vec_sr(qxs1, v6), lowMask); + + vector signed char q8y00 = vec_xl( 0, q8); + vector signed char q8y10 = vec_xl( 16, q8); + vector signed char q8y01 = vec_xl( 32, q8); + vector signed char q8y11 = vec_xl( 48, q8); + vector signed char q8y02 = vec_xl( 64, q8); + vector signed char q8y12 = vec_xl( 80, q8); + vector signed char q8y03 = vec_xl( 96, q8); + vector signed char q8y13 = vec_xl(112, q8); + q8 += 128; + + vector signed int qv0 = vec_msum(q8y00, q2x00, v0); + vector signed int qv1 = vec_msum(q8y01, q2x01, v0); + vector signed int qv2 = vec_msum(q8y02, q2x02, v0); + vector signed int qv3 = vec_msum(q8y03, q2x03, v0); + vector signed int qv4 = vec_msum(q8y10, q2x10, v0); + vector signed int qv5 = vec_msum(q8y11, q2x11, v0); + vector signed int qv6 = vec_msum(q8y12, q2x12, v0); + vector signed int qv7 = vec_msum(q8y13, q2x13, v0); + + vector signed short vscales_07 = vec_unpackh(vscales); + vector signed int vscales_03 = vec_unpackh(vscales_07); + vector signed int vscales_47 = vec_unpackl(vscales_07); + vector signed int vs0 = vec_splat(vscales_03, 0); + vector signed int vs1 = vec_splat(vscales_03, 1); + vector signed int vs2 = vec_splat(vscales_03, 2); + vector signed int vs3 = vec_splat(vscales_03, 3); + vector signed int vs4 = vec_splat(vscales_47, 0); + vector signed int vs5 = vec_splat(vscales_47, 1); + vector signed int vs6 = vec_splat(vscales_47, 2); + vector signed int vs7 = vec_splat(vscales_47, 3); + vscales = vec_sld(vscales, vscales, 8); + + vsumi0 = vec_add(vec_mul(qv0, vs0), vsumi0); + vsumi1 = vec_add(vec_mul(qv1, vs2), vsumi1); + vsumi2 = vec_add(vec_mul(qv2, vs4), vsumi2); + vsumi3 = vec_add(vec_mul(qv3, vs6), vsumi3); + vsumi4 = vec_add(vec_mul(qv4, vs1), vsumi4); + vsumi5 = vec_add(vec_mul(qv5, vs3), vsumi5); + vsumi6 = vec_add(vec_mul(qv6, vs5), vsumi6); + vsumi7 = vec_add(vec_mul(qv7, vs7), vsumi7); + } + + vsumi0 = vec_add(vsumi0, vsumi4); + vsumi1 = vec_add(vsumi1, vsumi5); + vsumi2 = vec_add(vsumi2, vsumi6); + vsumi3 = vec_add(vsumi3, vsumi7); + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); + vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); + vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); + } + + vsumf0 = vec_add(vsumf0, vsumf2); + vsumf1 = vec_add(vsumf1, vsumf3); + + vsumf0 = vec_add(vsumf0, vsumf1); + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + *s = vec_extract(vsumf0, 0); + +#else + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + + const uint8_t * q2 = x[i].qs; + const int8_t * q8 = y[i].qs; + const uint8_t * sc = x[i].scales; + + int summs = 0; + for (int j = 0; j < 16; ++j) { + summs += y[i].bsums[j] * (sc[j] >> 4); + } + + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + int isum = 0; + int is = 0; + int d; + for (int k = 0; k < QK_K/128; ++k) { + int shift = 0; + for (int j = 0; j < 4; ++j) { + d = sc[is++] & 0xF; + int isuml = 0; + for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3); + isum += d * isuml; + d = sc[is++] & 0xF; + isuml = 0; + for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3); + isum += d * isuml; + shift += 2; + q8 += 32; + } + q2 += 32; + } + sumf += dall * isum - dmin * summs; + } + *s = sumf; +#endif +} + +void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const 
void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const uint32_t kmask1 = 0x03030303; + const uint32_t kmask2 = 0x0f0f0f0f; + + const block_q3_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__POWER9_VECTOR__) + const vector signed char lowMask = vec_splats((signed char)0x3); + const vector signed char lowMask1 = vec_splats((int8_t)0xf); + const vector signed char lowMask2 = vec_splats((int8_t)0x30); + const vector int v0 = vec_splats((int32_t)0); + const vector signed char v1 = vec_splats((signed char)0x1); + const vector unsigned char v2 = vec_splats((unsigned char)0x2); + const vector unsigned char v3 = vec_splats((unsigned char)0x3); + const vector unsigned char v4 = vec_splats((unsigned char)0x4); + const vector unsigned char v6 = vec_splats((unsigned char)0x6); + const vector signed char off = vec_splats((signed char)0x20); + + vector float vsumf0 = vec_splats(0.0f); + vector float vsumf1 = vec_splats(0.0f); + vector float vsumf2 = vec_splats(0.0f); + vector float vsumf3 = vec_splats(0.0f); + + for (int i = 0; i < nb; ++i) { + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); + vector float vyd = vec_splats(y[i].d); + vector float vd = vec_mul(vxd, vyd); + + UNUSED(kmask1); + UNUSED(kmask2); + + vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8); + vector signed char u1 = vec_and(u0, lowMask1); + vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4); + vector signed char u3 = (vector signed char)vec_mergeh((vector signed int)u2, (vector signed int)vec_sr(u2, v2)); + vector signed char u30 = vec_sl(vec_and(u3, lowMask), v4); + vector signed char u31 = vec_and(u3, lowMask2); + + u1 = vec_or(u1, u30); + u2 = vec_or(vec_sr(u0, v4), u31); + + vector signed char vscales = (vector signed char)vec_mergeh((vector signed long long)u1, (vector signed long long)u2); + vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].hmask); + vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].hmask); + + vscales = vec_sub(vscales, off); + + vector signed int vsumi0 = v0; + vector signed int vsumi1 = v0; + vector signed int vsumi2 = v0; + vector signed int vsumi3 = v0; + vector signed int vsumi4 = v0; + vector signed int vsumi5 = v0; + vector signed int vsumi6 = v0; + vector signed int vsumi7 = v0; + + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + for (int j = 0; j < QK_K/128; ++j) { + __builtin_prefetch(q3, 0, 1); + __builtin_prefetch(q8, 0, 1); + + vector signed char qxs0 = (vector signed char)vec_xl( 0, q3); + vector signed char qxs1 = (vector signed char)vec_xl(16, q3); + q3 += 32; + + //the low 2 bits + vector signed char qxs00 = vec_and(qxs0, lowMask); + vector signed char qxs01 = vec_and(vec_sr(qxs0, v2), lowMask); + vector signed char qxs02 = vec_and(vec_sr(qxs0, v4), lowMask); + vector signed char qxs03 = vec_and(vec_sr(qxs0, v6), lowMask); + vector signed char qxs10 = vec_and(qxs1, lowMask); + vector signed char qxs11 = vec_and(vec_sr(qxs1, v2), lowMask); + vector signed char qxs12 = vec_and(vec_sr(qxs1, v4), lowMask); + vector signed char qxs13 = vec_and(vec_sr(qxs1, v6), lowMask); + + //the 3rd bit + vector signed char qxh00 = vec_sl(vec_andc(v1, qxhs0), v2); + vector signed char qxh01 = vec_sl(vec_andc(v1, vec_sr(qxhs0, (vector unsigned char)v1)), v2); + vector signed 
char qxh02 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v2)), v2); + vector signed char qxh03 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v3)), v2); + vector signed char qxh10 = vec_sl(vec_andc(v1, qxhs1), v2); + vector signed char qxh11 = vec_sl(vec_andc(v1, vec_sr(qxhs1, (vector unsigned char)v1)), v2); + vector signed char qxh12 = vec_sl(vec_andc(v1, vec_sr(qxhs1, v2)), v2); + vector signed char qxh13 = vec_sl(vec_andc(v1, vec_sr(qxhs1, v3)), v2); + qxhs0 = vec_sr(qxhs0, v4); + qxhs1 = vec_sr(qxhs1, v4); + + vector signed char q3x00 = vec_sub(qxs00, qxh00); + vector signed char q3x01 = vec_sub(qxs01, qxh01); + vector signed char q3x02 = vec_sub(qxs02, qxh02); + vector signed char q3x03 = vec_sub(qxs03, qxh03); + vector signed char q3x10 = vec_sub(qxs10, qxh10); + vector signed char q3x11 = vec_sub(qxs11, qxh11); + vector signed char q3x12 = vec_sub(qxs12, qxh12); + vector signed char q3x13 = vec_sub(qxs13, qxh13); + + vector signed char q8y00 = vec_xl( 0, q8); + vector signed char q8y10 = vec_xl( 16, q8); + vector signed char q8y01 = vec_xl( 32, q8); + vector signed char q8y11 = vec_xl( 48, q8); + vector signed char q8y02 = vec_xl( 64, q8); + vector signed char q8y12 = vec_xl( 80, q8); + vector signed char q8y03 = vec_xl( 96, q8); + vector signed char q8y13 = vec_xl(112, q8); + q8 += 128; + + vector signed short vscales_h = vec_unpackh(vscales); + vector signed short vs0 = vec_splat(vscales_h, 0); + vector signed short vs1 = vec_splat(vscales_h, 1); + vector signed short vs2 = vec_splat(vscales_h, 2); + vector signed short vs3 = vec_splat(vscales_h, 3); + vector signed short vs4 = vec_splat(vscales_h, 4); + vector signed short vs5 = vec_splat(vscales_h, 5); + vector signed short vs6 = vec_splat(vscales_h, 6); + vector signed short vs7 = vec_splat(vscales_h, 7); + vscales = vec_sld(vscales, vscales, 8); + + vector signed short qv00 = vec_add(vec_mule(q3x00, q8y00), vec_mulo(q3x00, q8y00)); + vector signed short qv01 = vec_add(vec_mule(q3x01, q8y01), vec_mulo(q3x01, q8y01)); + vector signed short qv02 = vec_add(vec_mule(q3x02, q8y02), vec_mulo(q3x02, q8y02)); + vector signed short qv03 = vec_add(vec_mule(q3x03, q8y03), vec_mulo(q3x03, q8y03)); + vector signed short qv10 = vec_add(vec_mule(q3x10, q8y10), vec_mulo(q3x10, q8y10)); + vector signed short qv11 = vec_add(vec_mule(q3x11, q8y11), vec_mulo(q3x11, q8y11)); + vector signed short qv12 = vec_add(vec_mule(q3x12, q8y12), vec_mulo(q3x12, q8y12)); + vector signed short qv13 = vec_add(vec_mule(q3x13, q8y13), vec_mulo(q3x13, q8y13)); + + vsumi0 = vec_msum(qv00, vs0, vsumi0); + vsumi1 = vec_msum(qv01, vs2, vsumi1); + vsumi2 = vec_msum(qv02, vs4, vsumi2); + vsumi3 = vec_msum(qv03, vs6, vsumi3); + vsumi4 = vec_msum(qv10, vs1, vsumi4); + vsumi5 = vec_msum(qv11, vs3, vsumi5); + vsumi6 = vec_msum(qv12, vs5, vsumi6); + vsumi7 = vec_msum(qv13, vs7, vsumi7); + } + + vsumi0 = vec_add(vsumi0, vsumi4); + vsumi1 = vec_add(vsumi1, vsumi5); + vsumi2 = vec_add(vsumi2, vsumi6); + vsumi3 = vec_add(vsumi3, vsumi7); + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); + vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); + vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); + } + + vsumf0 = vec_add(vsumf0, vsumf2); + vsumf1 = vec_add(vsumf1, vsumf3); + + vsumf0 = vec_add(vsumf0, vsumf1); + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + *s = vec_extract(vsumf0, 0); + +#else + // scalar version + // This function is written like this so the compiler can manage 
to vectorize most of it + // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the + // manually vectorized version above. Every other version I tried would run at least 4 times slower. + // The ideal situation would be if we could just write the code once, and the compiler would + // automatically produce the best possible set of machine instructions, instead of us having to manually + // write vectorized versions for AVX, ARM_NEON, etc. + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + uint32_t auxs[4]; + const int8_t * scales = (const int8_t*)auxs; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + uint8_t m = 1; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + q3 += 32; + } + a = aux8; + + memcpy(auxs, x[i].scales, 12); + uint32_t tmp = auxs[2]; + auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); + auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); + auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4); + auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); + for (int j = 0; j < QK_K/16; ++j) { + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; + +#endif + +} + +void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + uint32_t utmp[4]; + +#if defined(__POWER9_VECTOR__) + const vector signed char lowMask = vec_splats((signed char)0xF); + const vector signed char lowMask1 = vec_splats((int8_t)0x3f); + const vector signed char lowMask2 = vec_splats((int8_t)0x30); + const vector int v0 = vec_splats((int32_t)0); + const vector unsigned char v2 = vec_splats((uint8_t)2); + const vector unsigned char v4 = vec_splats((unsigned char)0x4); + + vector float vsumf0 = vec_splats(0.0f); + vector float vsumf1 = vec_splats(0.0f); + vector float vsumf2 = vec_splats(0.0f); + vector float vsumf3 = vec_splats(0.0f); + + for (int i = 0; i < 
nb; ++i) { + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); + vector float vyd = vec_splats(y[i].d); + vector float vd = vec_mul(vxd, vyd); + + vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].dmin)); + vector float vdmin = vec_mul(vxmin, vyd); + + vector signed short q8ysums0 = vec_xl( 0, y[i].bsums); + vector signed short q8ysums1 = vec_xl(16, y[i].bsums); + + UNUSED(kmask1); + UNUSED(kmask2); + UNUSED(kmask3); + UNUSED(utmp); + + vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8); + vector signed char u1 = vec_and(vec_sr(u0, v2), lowMask2); + vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4); + vector signed char u3 = vec_sr(u2, v4); + + vector signed char u30 = u1; + vector signed char u31 = (vector signed char)vec_mergeh((vector signed int)vec_and(u2, lowMask), (vector signed int)u3); + + u1 = vec_and(u0, lowMask1); + u2 = vec_or(u30, u31); + + vector signed char utmps = (vector signed char)vec_mergeh((vector signed int)u1, (vector signed int)u2); + + vector signed short vscales = vec_unpackh(utmps); + vector signed short q4xmins = vec_unpackl(utmps); + vector signed short q4xmins0 = vec_mergeh(q4xmins, q4xmins); + vector signed short q4xmins1 = vec_mergel(q4xmins, q4xmins); + + vector signed int prod0 = vec_mule(q4xmins0, q8ysums0); + vector signed int prod1 = vec_mule(q4xmins1, q8ysums1); + vector signed int prod2 = vec_mulo(q4xmins0, q8ysums0); + vector signed int prod3 = vec_mulo(q4xmins1, q8ysums1); + + vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0); + vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1); + vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2); + vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3); + + vector signed int vsumi0 = v0; + vector signed int vsumi1 = v0; + vector signed int vsumi2 = v0; + vector signed int vsumi3 = v0; + + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + for (int j = 0; j < QK_K/64; j+=2) { + __builtin_prefetch(q4, 0, 1); + __builtin_prefetch(q8, 0, 1); + + vector signed char qxs0 = (vector signed char)vec_xl( 0, q4); + vector signed char qxs1 = (vector signed char)vec_xl(16, q4); + vector signed char qxs2 = (vector signed char)vec_xl(32, q4); + vector signed char qxs3 = (vector signed char)vec_xl(48, q4); + q4 += 64; + + vector unsigned char q4x00 = (vector unsigned char)vec_and(qxs0, lowMask); + vector unsigned char q4x01 = (vector unsigned char)vec_sr(qxs0, v4); + vector unsigned char q4x10 = (vector unsigned char)vec_and(qxs1, lowMask); + vector unsigned char q4x11 = (vector unsigned char)vec_sr(qxs1, v4); + vector unsigned char q4x20 = (vector unsigned char)vec_and(qxs2, lowMask); + vector unsigned char q4x21 = (vector unsigned char)vec_sr(qxs2, v4); + vector unsigned char q4x30 = (vector unsigned char)vec_and(qxs3, lowMask); + vector unsigned char q4x31 = (vector unsigned char)vec_sr(qxs3, v4); + + vector signed char q8y00 = vec_xl( 0, q8); + vector signed char q8y10 = vec_xl( 16, q8); + vector signed char q8y01 = vec_xl( 32, q8); + vector signed char q8y11 = vec_xl( 48, q8); + vector signed char q8y20 = vec_xl( 64, q8); + vector signed char q8y30 = vec_xl( 80, q8); + vector signed char q8y21 = vec_xl( 96, q8); + vector signed char q8y31 = vec_xl(112, q8); + q8 += 128; + + vector signed int qv00 = vec_msum(q8y00, q4x00, v0); + vector signed int qv01 = vec_msum(q8y01, q4x01, v0); + vector signed int qv10 = vec_msum(q8y10, q4x10, v0); + vector signed int qv11 = vec_msum(q8y11, q4x11, v0); + vector signed int qv20 
= vec_msum(q8y20, q4x20, v0); + vector signed int qv21 = vec_msum(q8y21, q4x21, v0); + vector signed int qv30 = vec_msum(q8y30, q4x30, v0); + vector signed int qv31 = vec_msum(q8y31, q4x31, v0); + + vector signed int vscales_h = vec_unpackh(vscales); + vector signed int vs0 = vec_splat(vscales_h, 0); + vector signed int vs1 = vec_splat(vscales_h, 1); + vector signed int vs2 = vec_splat(vscales_h, 2); + vector signed int vs3 = vec_splat(vscales_h, 3); + vscales = vec_sld(vscales, vscales, 8); + + vsumi0 = vec_add(vec_mul(qv00, vs0), vsumi0); + vsumi1 = vec_add(vec_mul(qv01, vs1), vsumi1); + vsumi2 = vec_add(vec_mul(qv20, vs2), vsumi2); + vsumi3 = vec_add(vec_mul(qv21, vs3), vsumi3); + + vsumi0 = vec_add(vec_mul(qv10, vs0), vsumi0); + vsumi1 = vec_add(vec_mul(qv11, vs1), vsumi1); + vsumi2 = vec_add(vec_mul(qv30, vs2), vsumi2); + vsumi3 = vec_add(vec_mul(qv31, vs3), vsumi3); + } + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); + vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); + vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); + } + + vsumf0 = vec_add(vsumf0, vsumf2); + vsumf1 = vec_add(vsumf1, vsumf3); + + vsumf0 = vec_add(vsumf0, vsumf1); + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + *s = vec_extract(vsumf0, 0); + +#else + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + for (int j = 0; j < QK_K/64; ++j) { + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); + a += 32; + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); + a += 32; q4 += 32; + } + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/32; ++j) { + int32_t scale = scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const 
block_q5_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + uint32_t utmp[4]; + +#if defined(__POWER9_VECTOR__) + const vector signed char lowMask = vec_splats((signed char)0xF); + const vector signed char lowMask1 = vec_splats((int8_t)0x3f); + const vector signed char lowMask2 = vec_splats((int8_t)0x30); + const vector int v0 = vec_splats((int32_t)0); + const vector unsigned char v1 = vec_splats((unsigned char)0x1); + const vector unsigned char v2 = vec_splats((unsigned char)0x2); + const vector unsigned char v3 = vec_splats((unsigned char)0x3); + const vector unsigned char v4 = vec_splats((unsigned char)0x4); + + vector float vsumf0 = vec_splats(0.0f); + vector float vsumf1 = vec_splats(0.0f); + vector float vsumf2 = vec_splats(0.0f); + vector float vsumf3 = vec_splats(0.0f); + + for (int i = 0; i < nb; ++i) { + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); + vector float vyd = vec_splats(y[i].d); + vector float vd = vec_mul(vxd, vyd); + + vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].dmin)); + vector float vdmin = vec_mul(vxmin, vyd); + + UNUSED(kmask1); + UNUSED(kmask2); + UNUSED(kmask3); + UNUSED(utmp); + + vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8); + vector signed char u1 = vec_and(vec_sr(u0, v2), lowMask2); + vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4); + vector signed char u3 = vec_sr(u2, v4); + + vector signed char u30 = u1; + vector signed char u31 = (vector signed char)vec_mergeh((vector signed int)vec_and(u2, lowMask), (vector signed int)u3); + + u1 = vec_and(u0, lowMask1); + u2 = vec_or(u30, u31); + + vector signed char utmps = (vector signed char)vec_mergeh((vector signed int)u1, (vector signed int)u2); + + vector signed short q8ysums0 = vec_xl( 0, y[i].bsums); + vector signed short q8ysums1 = vec_xl(16, y[i].bsums); + + vector signed short vscales = vec_unpackh(utmps); + + vector signed short q5xmins = vec_unpackl(utmps); + vector signed short q5xmins0 = vec_mergeh(q5xmins, q5xmins); + vector signed short q5xmins1 = vec_mergel(q5xmins, q5xmins); + + vector signed int prod0 = vec_mule(q5xmins0, q8ysums0); + vector signed int prod1 = vec_mule(q5xmins1, q8ysums1); + vector signed int prod2 = vec_mulo(q5xmins0, q8ysums0); + vector signed int prod3 = vec_mulo(q5xmins1, q8ysums1); + + vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0); + vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1); + vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2); + vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3); + + vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].qh); + vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].qh); + + vector signed int vsumi0 = v0; + vector signed int vsumi1 = v0; + vector signed int vsumi2 = v0; + vector signed int vsumi3 = v0; + + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + for (int j = 0; j < QK_K/64; ++j) { + __builtin_prefetch(q5, 0, 1); + __builtin_prefetch(q8, 0, 1); + + vector signed char qxs0 = (vector signed char)vec_xl( 0, q5); + vector signed char qxs1 = (vector signed char)vec_xl(16, q5); + q5 += 32; + + vector signed char qxs00 = vec_and(qxs0, lowMask); + vector signed char qxs01 = vec_sr(qxs0, v4); + vector signed char qxs10 = vec_and(qxs1, lowMask); + vector signed char qxs11 = vec_sr(qxs1, v4); + + 
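// The fifth bit of each q5_K weight is stored separately in x[i].qh; the
+ // masks and shifts below move it into bit position 4 so it can be OR-ed
+ // onto the low nibbles extracted above.
+ 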
vector signed char q5h00 = vec_sl(vec_and((vector signed char)v1, qxhs0), v4); + vector signed char q5h01 = vec_sl(vec_and((vector signed char)v2, qxhs0), v3); + vector signed char q5h10 = vec_sl(vec_and((vector signed char)v1, qxhs1), v4); + vector signed char q5h11 = vec_sl(vec_and((vector signed char)v2, qxhs1), v3); + qxhs0 = vec_sr(qxhs0, v2); + qxhs1 = vec_sr(qxhs1, v2); + + vector unsigned char q5x00 = (vector unsigned char)vec_or(q5h00, qxs00); + vector unsigned char q5x01 = (vector unsigned char)vec_or(q5h01, qxs01); + vector unsigned char q5x10 = (vector unsigned char)vec_or(q5h10, qxs10); + vector unsigned char q5x11 = (vector unsigned char)vec_or(q5h11, qxs11); + + vector signed char q8y00 = vec_xl( 0, q8); + vector signed char q8y10 = vec_xl(16, q8); + vector signed char q8y01 = vec_xl(32, q8); + vector signed char q8y11 = vec_xl(48, q8); + q8 += 64; + + vector signed int qv00 = vec_msum(q8y00, q5x00, v0); + vector signed int qv01 = vec_msum(q8y01, q5x01, v0); + vector signed int qv10 = vec_msum(q8y10, q5x10, v0); + vector signed int qv11 = vec_msum(q8y11, q5x11, v0); + + vector signed int vscales_h = vec_unpackh(vscales); + vector signed int vs0 = vec_splat(vscales_h, 0); + vector signed int vs1 = vec_splat(vscales_h, 1); + vscales = vec_sld(vscales, vscales, 12); + + vsumi0 = vec_add(vec_mul(qv00, vs0), vsumi0); + vsumi1 = vec_add(vec_mul(qv10, vs0), vsumi1); + vsumi2 = vec_add(vec_mul(qv01, vs1), vsumi2); + vsumi3 = vec_add(vec_mul(qv11, vs1), vsumi3); + } + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); + vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); + vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); + } + + vsumf0 = vec_add(vsumf0, vsumf2); + vsumf1 = vec_add(vsumf1, vsumf3); + + vsumf0 = vec_add(vsumf0, vsumf1); + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + *s = vec_extract(vsumf0, 0); + +#else + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + uint8_t m = 1; + for (int j = 0; j < QK_K/64; ++j) { + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); + for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); + for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 
16 : 0); + a += 32; m <<= 1; + q4 += 32; + } + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/32; ++j) { + int32_t scale = scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q6_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__POWER9_VECTOR__) + const vector signed char lowMask = vec_splats((signed char)0xF); + const vector int v0 = vec_splats((int32_t)0); + const vector unsigned char v2 = vec_splats((unsigned char)0x2); + const vector unsigned char v3 = vec_splats((unsigned char)0x3); + const vector unsigned char v4 = vec_splats((unsigned char)0x4); + const vector unsigned char v6 = vec_splats((unsigned char)0x6); + const vector signed char off = vec_splats((signed char)0x20); + + vector float vsumf0 = vec_splats(0.0f); + vector float vsumf1 = vec_splats(0.0f); + vector float vsumf2 = vec_splats(0.0f); + vector float vsumf3 = vec_splats(0.0f); + + for (int i = 0; i < nb; ++i) { + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); + vector float vyd = vec_splats(y[i].d); + vector float vd = vec_mul(vxd, vyd); + + vector signed int vsumi0 = v0; + vector signed int vsumi1 = v0; + vector signed int vsumi2 = v0; + vector signed int vsumi3 = v0; + vector signed int vsumi4 = v0; + vector signed int vsumi5 = v0; + vector signed int vsumi6 = v0; + vector signed int vsumi7 = v0; + + const uint8_t * GGML_RESTRICT q6 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT qs = x[i].scales; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + for (int j = 0; j < QK_K/128; ++j) { + __builtin_prefetch(q6, 0, 0); + __builtin_prefetch(qh, 0, 0); + __builtin_prefetch(q8, 0, 0); + + vector signed char qxs0 = (vector signed char)vec_xl( 0, q6); + vector signed char qxs1 = (vector signed char)vec_xl(16, q6); + vector signed char qxs2 = (vector signed char)vec_xl(32, q6); + vector signed char qxs3 = (vector signed char)vec_xl(48, q6); + q6 += 64; + + vector signed char qxs00 = vec_and(qxs0, lowMask); + vector signed char qxs01 = vec_sr(qxs0, v4); + vector signed char qxs10 = vec_and(qxs1, lowMask); + vector signed char 
qxs11 = vec_sr(qxs1, v4); + vector signed char qxs20 = vec_and(qxs2, lowMask); + vector signed char qxs21 = vec_sr(qxs2, v4); + vector signed char qxs30 = vec_and(qxs3, lowMask); + vector signed char qxs31 = vec_sr(qxs3, v4); + + vector signed char qxhs0 = (vector signed char)vec_xl( 0, qh); + vector signed char qxhs1 = (vector signed char)vec_xl(16, qh); + qh += 32; + + vector signed char qxh00 = vec_sl(vec_and((vector signed char)v3, qxhs0), v4); + vector signed char qxh01 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v4)), v4); + vector signed char qxh10 = vec_sl(vec_and((vector signed char)v3, qxhs1), v4); + vector signed char qxh11 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v4)), v4); + vector signed char qxh20 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v2)), v4); + vector signed char qxh21 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v6)), v4); + vector signed char qxh30 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v2)), v4); + vector signed char qxh31 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v6)), v4); + + vector signed char q6x00 = vec_sub(vec_or(qxh00, qxs00), off); + vector signed char q6x01 = vec_sub(vec_or(qxh01, qxs01), off); + vector signed char q6x10 = vec_sub(vec_or(qxh10, qxs10), off); + vector signed char q6x11 = vec_sub(vec_or(qxh11, qxs11), off); + vector signed char q6x20 = vec_sub(vec_or(qxh20, qxs20), off); + vector signed char q6x21 = vec_sub(vec_or(qxh21, qxs21), off); + vector signed char q6x30 = vec_sub(vec_or(qxh30, qxs30), off); + vector signed char q6x31 = vec_sub(vec_or(qxh31, qxs31), off); + + vector signed char q8y00 = vec_xl( 0, q8); + vector signed char q8y10 = vec_xl( 16, q8); + vector signed char q8y20 = vec_xl( 32, q8); + vector signed char q8y30 = vec_xl( 48, q8); + vector signed char q8y01 = vec_xl( 64, q8); + vector signed char q8y11 = vec_xl( 80, q8); + vector signed char q8y21 = vec_xl( 96, q8); + vector signed char q8y31 = vec_xl(112, q8); + q8 += 128; + + vector signed short qv00 = vec_add(vec_mule(q6x00, q8y00), vec_mulo(q6x00, q8y00)); + vector signed short qv10 = vec_add(vec_mule(q6x10, q8y10), vec_mulo(q6x10, q8y10)); + vector signed short qv20 = vec_add(vec_mule(q6x20, q8y20), vec_mulo(q6x20, q8y20)); + vector signed short qv30 = vec_add(vec_mule(q6x30, q8y30), vec_mulo(q6x30, q8y30)); + vector signed short qv01 = vec_add(vec_mule(q6x01, q8y01), vec_mulo(q6x01, q8y01)); + vector signed short qv11 = vec_add(vec_mule(q6x11, q8y11), vec_mulo(q6x11, q8y11)); + vector signed short qv21 = vec_add(vec_mule(q6x21, q8y21), vec_mulo(q6x21, q8y21)); + vector signed short qv31 = vec_add(vec_mule(q6x31, q8y31), vec_mulo(q6x31, q8y31)); + + vector signed short vscales = vec_unpackh(vec_xl_len(qs, 8)); + qs += 8; + + vector signed short vs0 = vec_splat(vscales, 0); + vector signed short vs1 = vec_splat(vscales, 1); + vector signed short vs2 = vec_splat(vscales, 2); + vector signed short vs3 = vec_splat(vscales, 3); + vector signed short vs4 = vec_splat(vscales, 4); + vector signed short vs5 = vec_splat(vscales, 5); + vector signed short vs6 = vec_splat(vscales, 6); + vector signed short vs7 = vec_splat(vscales, 7); + + vsumi0 = vec_msum(qv00, vs0, vsumi0); + vsumi1 = vec_msum(qv01, vs4, vsumi1); + vsumi2 = vec_msum(qv10, vs1, vsumi2); + vsumi3 = vec_msum(qv11, vs5, vsumi3); + vsumi4 = vec_msum(qv20, vs2, vsumi4); + vsumi5 = vec_msum(qv21, vs6, vsumi5); + vsumi6 = vec_msum(qv30, vs3, vsumi6); + vsumi7 = vec_msum(qv31, vs7, vsumi7); + } + + vsumi0 = vec_add(vsumi0, vsumi4); + vsumi1 = 
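+        /* fold the eight integer accumulators pairwise down to four before the
+           int->float conversion; vec_ctf + vec_madd below then applies d once. */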
vec_add(vsumi1, vsumi5); + vsumi2 = vec_add(vsumi2, vsumi6); + vsumi3 = vec_add(vsumi3, vsumi7); + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); + vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); + vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); + } + + vsumf0 = vec_add(vsumf0, vsumf2); + vsumf1 = vec_add(vsumf1, vsumf3); + + vsumf0 = vec_add(vsumf0, vsumf1); + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + *s = vec_extract(vsumf0, 0); + +#else + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) { + a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; + a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; + a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; + a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; + } + a += 128; + q4 += 64; + qh += 32; + } + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/16; ++j) { + int scale = x[i].scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +#if defined (__POWER9_VECTOR__) +static const int8_t keven_signs_q2xs[1024] = { + 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, + 1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1, + 1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, -1, + 1, 1, -1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, 1, + 1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, -1, + 1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, 1, + 1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, 1, + 1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, -1, + 1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, -1, + 1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, 1, + 1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, 1, + 1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, -1, + 1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, 1, + 1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, -1, 
-1, -1, 1, -1, -1, 1, -1, + 1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, -1, + 1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, 1, + 1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, -1, + 1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, 1, + 1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, 1, + 1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, -1, + 1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, 1, + 1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, -1, + 1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, -1, + 1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, 1, + 1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, 1, + 1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, -1, + 1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, -1, + 1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, 1, + 1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, -1, + 1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, 1, + 1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1, + 1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, +}; +#endif + +void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq2_xxs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__POWER9_VECTOR__) + const vector int v0 = vec_splats((int32_t)0); + vector float vsumf0 = vec_splats(0.0f); + vector float vsumf1 = vec_splats(0.0f); + vector float vsumf2 = vec_splats(0.0f); + vector float vsumf3 = vec_splats(0.0f); + + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + for (int i = 0; i < nb; ++i) { + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); + vector float vyd = vec_splats(y[i].d); + vector float vd = vec_mul(vxd, vyd); + + vector signed int vsumi0 = v0; + vector signed int vsumi1 = v0; + vector signed int vsumi2 = v0; + vector signed int vsumi3 = v0; + + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + for (int j = 0; j < QK_K/32; j += 2) { + __builtin_prefetch(q2, 0, 1); + __builtin_prefetch(q8, 0, 1); + + uint32_t aux32[4]; + const uint8_t * aux8 = (const uint8_t *)aux32; + + memcpy(aux32, q2, 4*sizeof(uint32_t)); + q2 += 8; + + vector signed long long aux64x2_0 = {*(const int64_t *)(iq2xxs_grid + 
aux8[ 0]), *(const int64_t *)(iq2xxs_grid + aux8[ 1])}; + vector signed long long aux64x2_1 = {*(const int64_t *)(iq2xxs_grid + aux8[ 2]), *(const int64_t *)(iq2xxs_grid + aux8[ 3])}; + vector signed long long aux64x2_2 = {*(const int64_t *)(iq2xxs_grid + aux8[ 8]), *(const int64_t *)(iq2xxs_grid + aux8[ 9])}; + vector signed long long aux64x2_3 = {*(const int64_t *)(iq2xxs_grid + aux8[10]), *(const int64_t *)(iq2xxs_grid + aux8[11])}; + + vector signed long long vsigns0 = {*(const int64_t *)(signs64 + ((aux32[1] >> 0) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 7) & 127))}; + vector signed long long vsigns1 = {*(const int64_t *)(signs64 + ((aux32[1] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 21) & 127))}; + vector signed long long vsigns2 = {*(const int64_t *)(signs64 + ((aux32[3] >> 0) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 7) & 127))}; + vector signed long long vsigns3 = {*(const int64_t *)(signs64 + ((aux32[3] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 21) & 127))}; + + vector signed char q2x0 = (vector signed char)vec_mul((vector signed char)vsigns0, (vector signed char)aux64x2_0); + vector signed char q2x1 = (vector signed char)vec_mul((vector signed char)vsigns1, (vector signed char)aux64x2_1); + vector signed char q2x2 = (vector signed char)vec_mul((vector signed char)vsigns2, (vector signed char)aux64x2_2); + vector signed char q2x3 = (vector signed char)vec_mul((vector signed char)vsigns3, (vector signed char)aux64x2_3); + + vector signed char q8y0 = vec_xl( 0, q8); + vector signed char q8y1 = vec_xl(16, q8); + vector signed char q8y2 = vec_xl(32, q8); + vector signed char q8y3 = vec_xl(48, q8); + q8 += 64; + + vector signed short qv0 = vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0)); + vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1)); + vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2)); + vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3)); + + const uint16_t ls0 = aux32[1] >> 28; + const uint16_t ls1 = aux32[3] >> 28; + + vector signed short vscales01 = vec_splats((int16_t)(2*ls0+1)); + vector signed short vscales23 = vec_splats((int16_t)(2*ls1+1)); + + vsumi0 = vec_msum(qv0, vscales01, vsumi0); + vsumi1 = vec_msum(qv1, vscales01, vsumi1); + vsumi2 = vec_msum(qv2, vscales23, vsumi2); + vsumi3 = vec_msum(qv3, vscales23, vsumi3); + } + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); + vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); + vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); + } + + vsumf0 = vec_add(vsumf0, vsumf2); + vsumf1 = vec_add(vsumf1, vsumf3); + + vsumf0 = vec_add(vsumf0, vsumf1); + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + *s = 0.125f * vec_extract(vsumf0, 0); + +#else + + uint32_t aux32[2]; + const uint8_t * aux8 = (const uint8_t *)aux32; + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + memcpy(aux32, q2, 2*sizeof(uint32_t)); + q2 += 4; + const uint32_t ls = 2*(aux32[1] >> 28) + 1; + int32_t sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]); + const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127]; + for (int 
j = 0; j < 8; ++j) { + sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + bsum += sumi * ls; + } + sumf += d * bsum; + } + *s = 0.125f * sumf; +#endif +} + +void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq2_xs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__POWER9_VECTOR__) + const vector int v0 = vec_splats((int32_t)0); + vector float vsumf0 = vec_splats(0.0f); + vector float vsumf1 = vec_splats(0.0f); + vector float vsumf2 = vec_splats(0.0f); + vector float vsumf3 = vec_splats(0.0f); + + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + for (int i = 0; i < nb; ++i) { + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); + vector float vyd = vec_splats(y[i].d); + vector float vd = vec_mul(vxd, vyd); + + vector signed int vsumi0 = v0; + vector signed int vsumi1 = v0; + vector signed int vsumi2 = v0; + vector signed int vsumi3 = v0; + + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const uint8_t * GGML_RESTRICT sc = x[i].scales; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + for (int j = 0; j < QK_K/64; ++j) { + __builtin_prefetch(q2, 0, 1); + __builtin_prefetch(q8, 0, 1); + + vector signed long long aux64x2_0 = {*(const int64_t *)(iq2xs_grid + (q2[0] & 511)), *(const int64_t *)(iq2xs_grid + (q2[1] & 511))}; + vector signed long long aux64x2_1 = {*(const int64_t *)(iq2xs_grid + (q2[2] & 511)), *(const int64_t *)(iq2xs_grid + (q2[3] & 511))}; + vector signed long long aux64x2_2 = {*(const int64_t *)(iq2xs_grid + (q2[4] & 511)), *(const int64_t *)(iq2xs_grid + (q2[5] & 511))}; + vector signed long long aux64x2_3 = {*(const int64_t *)(iq2xs_grid + (q2[6] & 511)), *(const int64_t *)(iq2xs_grid + (q2[7] & 511))}; + + vector signed long long vsigns0 = {*(const int64_t *)(signs64 + ((q2[0] >> 9))), *(const int64_t *)(signs64 + ((q2[1] >> 9)))}; + vector signed long long vsigns1 = {*(const int64_t *)(signs64 + ((q2[2] >> 9))), *(const int64_t *)(signs64 + ((q2[3] >> 9)))}; + vector signed long long vsigns2 = {*(const int64_t *)(signs64 + ((q2[4] >> 9))), *(const int64_t *)(signs64 + ((q2[5] >> 9)))}; + vector signed long long vsigns3 = {*(const int64_t *)(signs64 + ((q2[6] >> 9))), *(const int64_t *)(signs64 + ((q2[7] >> 9)))}; + q2 += 8; + + vector signed char q2x0 = (vector signed char)vec_mul((vector signed char)vsigns0, (vector signed char)aux64x2_0); + vector signed char q2x1 = (vector signed char)vec_mul((vector signed char)vsigns1, (vector signed char)aux64x2_1); + vector signed char q2x2 = (vector signed char)vec_mul((vector signed char)vsigns2, (vector signed char)aux64x2_2); + vector signed char q2x3 = (vector signed char)vec_mul((vector signed char)vsigns3, (vector signed char)aux64x2_3); + + vector signed char q8y0 = vec_xl( 0, q8); + vector signed char q8y1 = vec_xl(16, q8); + vector signed char q8y2 = vec_xl(32, q8); + vector signed char q8y3 = vec_xl(48, q8); + q8 += 64; + + vector signed short qv0 = vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0)); + vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1)); + vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2)); + vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3)); + + const uint16_t ls0 = 
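+            /* each scale byte packs two 4-bit sub-block scales; the effective
+               multiplier is 2*ls + 1, and the common 1/8 step is restored by the
+               final 0.125f factor on the result. */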
(uint16_t)(sc[0] & 0xf); + const uint16_t ls1 = (uint16_t)(sc[0] >> 4); + const uint16_t ls2 = (uint16_t)(sc[1] & 0xf); + const uint16_t ls3 = (uint16_t)(sc[1] >> 4); + sc += 2; + + vector signed short vscales0 = vec_splats((int16_t)(2*ls0+1)); + vector signed short vscales1 = vec_splats((int16_t)(2*ls1+1)); + vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1)); + vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1)); + + vsumi0 = vec_msum(qv0, vscales0, vsumi0); + vsumi1 = vec_msum(qv1, vscales1, vsumi1); + vsumi2 = vec_msum(qv2, vscales2, vsumi2); + vsumi3 = vec_msum(qv3, vscales3, vsumi3); + } + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); + vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); + vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); + } + + vsumf0 = vec_add(vsumf0, vsumf2); + vsumf1 = vec_add(vsumf1, vsumf3); + + vsumf0 = vec_add(vsumf0, vsumf1); + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + *s = 0.125f * vec_extract(vsumf0, 0); + +#else + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const uint8_t * GGML_RESTRICT sc = x[i].scales; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1; + const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1; + int32_t sumi = 0; + for (int l = 0; l < 2; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); + const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; + for (int j = 0; j < 8; ++j) { + sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + bsum += sumi * ls1; + sumi = 0; + for (int l = 2; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); + const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; + for (int j = 0; j < 8; ++j) { + sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? 
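+                /* ksigns_iq2xs expands a 7-bit sign code into an 8-bit mask (the
+                   8th sign comes from the code's parity); kmask_iq2xs[j] == 1 << j
+                   tests the sign of lane j. */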
-1 : 1); + } + q8 += 8; + } + bsum += sumi * ls2; + q2 += 4; + } + sumf += d * bsum; + } + *s = 0.125f * sumf; +#endif +} + +void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq2_s * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__POWER9_VECTOR__) + static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 + }; + + static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,}; + + const vector int v0 = vec_splats((int32_t)0); + + vector float vsumf0 = vec_splats(0.0f); + vector float vsumf1 = vec_splats(0.0f); + vector float vsumf2 = vec_splats(0.0f); + vector float vsumf3 = vec_splats(0.0f); + + const vector unsigned char mask0 = vec_xl( 0, k_mask1); + const vector unsigned char mask1 = vec_xl(16, k_mask1); + const vector signed char mask2 = (vector signed char)vec_xl( 0, k_mask2); + + for (int i = 0; i < nb; ++i) { + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); + vector float vyd = vec_splats(y[i].d); + vector float vd = vec_mul(vxd, vyd); + + vector signed int vsumi0 = v0; + vector signed int vsumi1 = v0; + vector signed int vsumi2 = v0; + vector signed int vsumi3 = v0; + + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8); + const uint8_t * GGML_RESTRICT sc = x[i].scales; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + for (int j = 0; j < QK_K/32; j += 2) { + __builtin_prefetch(q2, 0, 1); + __builtin_prefetch(q8, 0, 1); + + vector signed long long aux64x2_0 = {*(const int64_t *)(iq2s_grid + (q2[0] | ((qh[0] << 8) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[1] | ((qh[0] << 6) & 0x300)))}; + vector signed long long aux64x2_1 = {*(const int64_t *)(iq2s_grid + (q2[2] | ((qh[0] << 4) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[3] | ((qh[0] << 2) & 0x300)))}; + vector signed long long aux64x2_2 = {*(const int64_t *)(iq2s_grid + (q2[4] | ((qh[1] << 8) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[5] | ((qh[1] << 6) & 0x300)))}; + vector signed long long aux64x2_3 = {*(const int64_t *)(iq2s_grid + (q2[6] | ((qh[1] << 4) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[7] | ((qh[1] << 2) & 0x300)))}; + q2 += 8; + qh += 2; + + vector signed char vsigns01 = (vector signed char)vec_splats(*(const uint32_t *)&signs[0]); + vector signed char vsigns23 = (vector signed char)vec_splats(*(const uint32_t *)&signs[2]); + signs += 4; + + vector signed char vsigns0 = vec_perm(vsigns01, vsigns01, mask0); + vector signed char vsigns1 = vec_perm(vsigns01, vsigns01, mask1); + vector signed char vsigns2 = vec_perm(vsigns23, vsigns23, mask0); + vector signed char vsigns3 = vec_perm(vsigns23, vsigns23, mask1); + + vsigns0 = (vector signed char)vec_cmpeq(vec_and(vsigns0, mask2), mask2); + vsigns1 = (vector signed char)vec_cmpeq(vec_and(vsigns1, mask2), mask2); + vsigns2 = (vector signed char)vec_cmpeq(vec_and(vsigns2, mask2), mask2); + vsigns3 = (vector signed char)vec_cmpeq(vec_and(vsigns3, mask2), mask2); + + vector signed char q2x0 = 
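+            /* conditional negation: each vsigns lane is 0 or -1 (all ones) after
+               vec_cmpeq, so (x ^ s) - s negates exactly the lanes where s == -1
+               (bitwise NOT plus one) and leaves the others untouched. */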
vec_sub(vec_xor(vsigns0, (vector signed char)aux64x2_0), vsigns0); + vector signed char q2x1 = vec_sub(vec_xor(vsigns1, (vector signed char)aux64x2_1), vsigns1); + vector signed char q2x2 = vec_sub(vec_xor(vsigns2, (vector signed char)aux64x2_2), vsigns2); + vector signed char q2x3 = vec_sub(vec_xor(vsigns3, (vector signed char)aux64x2_3), vsigns3); + + vector signed char q8y0 = vec_xl( 0, q8); + vector signed char q8y1 = vec_xl(16, q8); + vector signed char q8y2 = vec_xl(32, q8); + vector signed char q8y3 = vec_xl(48, q8); + q8 += 64; + + vector signed short qv0 = vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0)); + vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1)); + vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2)); + vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3)); + + const uint16_t ls0 = (uint16_t)(sc[0] & 0xf); + const uint16_t ls1 = (uint16_t)(sc[0] >> 4); + const uint16_t ls2 = (uint16_t)(sc[1] & 0xf); + const uint16_t ls3 = (uint16_t)(sc[1] >> 4); + sc += 2; + + vector signed short vscales0 = vec_splats((int16_t)(2*ls0+1)); + vector signed short vscales1 = vec_splats((int16_t)(2*ls1+1)); + vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1)); + vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1)); + + vsumi0 = vec_msum(qv0, vscales0, vsumi0); + vsumi1 = vec_msum(qv1, vscales1, vsumi1); + vsumi2 = vec_msum(qv2, vscales2, vsumi2); + vsumi3 = vec_msum(qv3, vscales3, vsumi3); + } + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); + vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); + vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); + } + + vsumf0 = vec_add(vsumf0, vsumf2); + vsumf1 = vec_add(vsumf1, vsumf3); + + vsumf0 = vec_add(vsumf0, vsumf1); + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + *s = 0.125f * vec_extract(vsumf0, 0); + +#else + + float sumf = 0; + for (int i = 0; i < nb; i++) { + + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint8_t * qh = x[i].qh; + const uint8_t * signs = qs + QK_K/8; + + int bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf); + int ls2 = 1 + 2*(x[i].scales[ib32] >> 4); + int sumi1 = 0, sumi2 = 0; + for (int l = 0; l < 2; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300))); + for (int j = 0; j < 8; ++j) { + sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + for (int l = 2; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300))); + for (int j = 0; j < 8; ++j) { + sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? 
-1 : 1); + } + q8 += 8; + } + bsum += ls1 * sumi1 + ls2 * sumi2; + qs += 4; + signs += 4; + } + + sumf += d * bsum; + } + + *s = 0.125f * sumf; + +#endif + +} + +void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq3_xxs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__POWER9_VECTOR__) + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + const vector int v0 = vec_splats((int32_t)0); + + vector float vsumf0 = vec_splats(0.0f); + vector float vsumf1 = vec_splats(0.0f); + vector float vsumf2 = vec_splats(0.0f); + vector float vsumf3 = vec_splats(0.0f); + + for (int i = 0; i < nb; ++i) { + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); + vector float vyd = vec_splats(y[i].d); + vector float vd = vec_mul(vxd, vyd); + + vector signed int vsumi0 = v0; + vector signed int vsumi1 = v0; + vector signed int vsumi2 = v0; + vector signed int vsumi3 = v0; + + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint32_t * GGML_RESTRICT signs = (const uint32_t *)(x[i].qs + QK_K/4); + const int8_t * GGML_RESTRICT q8 = y[i].qs; + +#pragma GCC unroll 1 + for (int j = 0; j < QK_K/32; j += 2) { + __builtin_prefetch(q3, 0, 1); + __builtin_prefetch(q8, 0, 1); + + vector unsigned int aux32x4_0 = {iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]}; + vector unsigned int aux32x4_1 = {iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]}; + vector unsigned int aux32x4_2 = {iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]}; + vector unsigned int aux32x4_3 = {iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]}; + q3 += 16; + + vector unsigned long long aux64x2_0 = {(uint64_t)(signs64[(signs[0] >> 0) & 127]), (uint64_t)(signs64[(signs[0] >> 7) & 127])}; + vector unsigned long long aux64x2_1 = {(uint64_t)(signs64[(signs[0] >> 14) & 127]), (uint64_t)(signs64[(signs[0] >> 21) & 127])}; + vector unsigned long long aux64x2_2 = {(uint64_t)(signs64[(signs[1] >> 0) & 127]), (uint64_t)(signs64[(signs[1] >> 7) & 127])}; + vector unsigned long long aux64x2_3 = {(uint64_t)(signs64[(signs[1] >> 14) & 127]), (uint64_t)(signs64[(signs[1] >> 21) & 127])}; + + vector signed char q3x0 = vec_mul((vector signed char)aux64x2_0, (vector signed char)aux32x4_0); + vector signed char q3x1 = vec_mul((vector signed char)aux64x2_1, (vector signed char)aux32x4_1); + vector signed char q3x2 = vec_mul((vector signed char)aux64x2_2, (vector signed char)aux32x4_2); + vector signed char q3x3 = vec_mul((vector signed char)aux64x2_3, (vector signed char)aux32x4_3); + + vector signed char q8y0 = vec_xl( 0, q8); + vector signed char q8y1 = vec_xl(16, q8); + vector signed char q8y2 = vec_xl(32, q8); + vector signed char q8y3 = vec_xl(48, q8); + q8 += 64; + + vector signed short qv0 = vec_add(vec_mule(q3x0, q8y0), vec_mulo(q3x0, q8y0)); + vector signed short qv1 = vec_add(vec_mule(q3x1, q8y1), vec_mulo(q3x1, q8y1)); + vector signed short qv2 = vec_add(vec_mule(q3x2, q8y2), vec_mulo(q3x2, q8y2)); + vector signed short qv3 = vec_add(vec_mule(q3x3, q8y3), vec_mulo(q3x3, q8y3)); + + const uint16_t ls0 = (uint16_t)(signs[0] >> 28); + const uint16_t ls1 = (uint16_t)(signs[1] >> 28); + signs += 2; + + vector 
signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1)); + vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1)); + + vsumi0 = vec_msum(qv0, vscales01, vsumi0); + vsumi1 = vec_msum(qv1, vscales01, vsumi1); + vsumi2 = vec_msum(qv2, vscales23, vsumi2); + vsumi3 = vec_msum(qv3, vscales23, vsumi3); + } + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); + vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); + vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); + } + + vsumf0 = vec_add(vsumf0, vsumf2); + vsumf1 = vec_add(vsumf1, vsumf3); + + vsumf0 = vec_add(vsumf0, vsumf1); + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + *s = 0.25f * vec_extract(vsumf0, 0); + +#else + + uint32_t aux32; + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t); + const uint32_t ls = 2*(aux32 >> 28) + 1; + int32_t sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]); + const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]); + const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127]; + for (int j = 0; j < 4; ++j) { + sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1); + sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1); + } + q8 += 8; + } + q3 += 8; + bsum += sumi * ls; + } + sumf += d * bsum; + } + *s = 0.25f * sumf; +#endif +} + +void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq3_s * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__POWER9_VECTOR__) + static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 + }; + + static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,}; + + const vector int v0 = vec_splats((int32_t)0); + + vector float vsumf0 = vec_splats(0.0f); + vector float vsumf1 = vec_splats(0.0f); + vector float vsumf2 = vec_splats(0.0f); + vector float vsumf3 = vec_splats(0.0f); + + const vector unsigned char mask0 = vec_xl( 0, k_mask1); + const vector unsigned char mask1 = vec_xl(16, k_mask1); + const vector signed char mask2 = (vector signed char)vec_xl( 0, k_mask2); + + for (int i = 0; i < nb; ++i) { + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); + vector float vyd = vec_splats(y[i].d); + vector float vd = vec_mul(vxd, vyd); + + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].signs); + const uint8_t * GGML_RESTRICT sc = x[i].scales; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + vector signed int vsumi0 = v0; + vector signed int vsumi1 = v0; 
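+        // Four 32-bit integer accumulators run across the superblock; they are
+        // converted and merged into the float sums once per block via
+        // vec_ctf/vec_madd, keeping the inner loop free of float work.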
+ vector signed int vsumi2 = v0; + vector signed int vsumi3 = v0; + + for (int j = 0; j < QK_K/32; j += 2) { + __builtin_prefetch(q3, 0, 1); + __builtin_prefetch(q8, 0, 1); + + vector unsigned int aux32x4_0 = {iq3s_grid[q3[ 0] | ((qh[0] << 8) & 256)], iq3s_grid[q3[ 1] | ((qh[0] << 7) & 256)], + iq3s_grid[q3[ 2] | ((qh[0] << 6) & 256)], iq3s_grid[q3[ 3] | ((qh[0] << 5) & 256)]}; + vector unsigned int aux32x4_1 = {iq3s_grid[q3[ 4] | ((qh[0] << 4) & 256)], iq3s_grid[q3[ 5] | ((qh[0] << 3) & 256)], + iq3s_grid[q3[ 6] | ((qh[0] << 2) & 256)], iq3s_grid[q3[ 7] | ((qh[0] << 1) & 256)]}; + vector unsigned int aux32x4_2 = {iq3s_grid[q3[ 8] | ((qh[1] << 8) & 256)], iq3s_grid[q3[ 9] | ((qh[1] << 7) & 256)], + iq3s_grid[q3[10] | ((qh[1] << 6) & 256)], iq3s_grid[q3[11] | ((qh[1] << 5) & 256)]}; + vector unsigned int aux32x4_3 = {iq3s_grid[q3[12] | ((qh[1] << 4) & 256)], iq3s_grid[q3[13] | ((qh[1] << 3) & 256)], + iq3s_grid[q3[14] | ((qh[1] << 2) & 256)], iq3s_grid[q3[15] | ((qh[1] << 1) & 256)]}; + q3 += 16; + qh += 2; + + vector signed char vsigns01 = (vector signed char)vec_splats(*(const uint32_t *)&signs[0]); + vector signed char vsigns02 = (vector signed char)vec_splats(*(const uint32_t *)&signs[2]); + signs += 4; + + vector signed char vsigns0 = vec_perm(vsigns01, vsigns01, mask0); + vector signed char vsigns1 = vec_perm(vsigns01, vsigns01, mask1); + vector signed char vsigns2 = vec_perm(vsigns02, vsigns02, mask0); + vector signed char vsigns3 = vec_perm(vsigns02, vsigns02, mask1); + + vsigns0 = (vector signed char)vec_cmpeq(vec_and(vsigns0, mask2), mask2); + vsigns1 = (vector signed char)vec_cmpeq(vec_and(vsigns1, mask2), mask2); + vsigns2 = (vector signed char)vec_cmpeq(vec_and(vsigns2, mask2), mask2); + vsigns3 = (vector signed char)vec_cmpeq(vec_and(vsigns3, mask2), mask2); + + vector signed char q3x0 = vec_sub(vec_xor(vsigns0, (vector signed char)aux32x4_0), vsigns0); + vector signed char q3x1 = vec_sub(vec_xor(vsigns1, (vector signed char)aux32x4_1), vsigns1); + vector signed char q3x2 = vec_sub(vec_xor(vsigns2, (vector signed char)aux32x4_2), vsigns2); + vector signed char q3x3 = vec_sub(vec_xor(vsigns3, (vector signed char)aux32x4_3), vsigns3); + + vector signed char q8y0 = vec_xl( 0, q8); + vector signed char q8y1 = vec_xl(16, q8); + vector signed char q8y2 = vec_xl(32, q8); + vector signed char q8y3 = vec_xl(48, q8); + q8 += 64; + + vector signed short qv0 = vec_add(vec_mule(q3x0, q8y0), vec_mulo(q3x0, q8y0)); + vector signed short qv1 = vec_add(vec_mule(q3x1, q8y1), vec_mulo(q3x1, q8y1)); + vector signed short qv2 = vec_add(vec_mule(q3x2, q8y2), vec_mulo(q3x2, q8y2)); + vector signed short qv3 = vec_add(vec_mule(q3x3, q8y3), vec_mulo(q3x3, q8y3)); + + const uint16_t ls0 = (uint16_t)(sc[0] & 0xf); + const uint16_t ls1 = (uint16_t)(sc[0] >> 4); + sc ++; + + vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1)); + vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1)); + + vsumi0 = vec_msum(qv0, vscales01, vsumi0); + vsumi1 = vec_msum(qv1, vscales01, vsumi1); + vsumi2 = vec_msum(qv2, vscales23, vsumi2); + vsumi3 = vec_msum(qv3, vscales23, vsumi3); + } + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); + vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); + vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); + } + + vsumf0 = vec_add(vsumf0, vsumf2); + vsumf1 = vec_add(vsumf1, vsumf3); + + vsumf0 = vec_add(vsumf0, vsumf1); + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 
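+    /* horizontal reduction: rotating the vector by 4 and then 8 bytes and
+       adding leaves the sum of all four float lanes in element 0, ready for
+       vec_extract. */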
4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + *s = vec_extract(vsumf0, 0); + +#else + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint8_t * GGML_RESTRICT signs = x[i].signs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1; + const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1; + int32_t sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256))); + const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256))); + for (int j = 0; j < 4; ++j) { + sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1); + sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1); + } + q8 += 8; + } + qs += 8; + signs += 4; + bsum += sumi * ls1; + sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256))); + const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256))); + for (int j = 0; j < 4; ++j) { + sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1); + sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1); + } + q8 += 8; + } + qs += 8; + signs += 4; + bsum += sumi * ls2; + } + sumf += d * bsum; + } + *s = sumf; +#endif +} + +void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq1_s * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__POWER9_VECTOR__) + const vector unsigned char v0 = vec_splats((unsigned char)0x0); + const vector unsigned short vsign = vec_splats((unsigned short)0x8000); + + vector float vsumf0 = vec_splats(0.0f); + vector float vsumf1 = vec_splats(0.0f); + vector float vsumf2 = vec_splats(0.0f); + vector float vsumf3 = vec_splats(0.0f); + + for (int i = 0; i < nb; ++i) { + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); + vector float vyd = vec_splats(y[i].d); + vector float vd = vec_mul(vxd, vyd); + + vector signed int vsumi0 = vec_splats((int32_t)0); + vector signed int vsumi1 = vec_splats((int32_t)0); + vector signed int vsumi2 = vec_splats((int32_t)0); + vector signed int vsumi3 = vec_splats((int32_t)0); + vector signed int vsumi8 = vec_splats((int32_t)0); + + const uint8_t * GGML_RESTRICT q1 = x[i].qs; + const uint16_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + const int16_t * GGML_RESTRICT qs = y[i].bsums; + + for (int j = 0; j < QK_K/32; j += 2) { + __builtin_prefetch(q1, 0, 1); + __builtin_prefetch(qh, 0, 1); + __builtin_prefetch(q8, 0, 1); + + vector signed long long aux64x2_0 = {*(const int64_t *)(iq1s_grid + (q1[0] | ((qh[0] << 8) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[1] | ((qh[0] << 5) & 0x700)))}; + vector signed long long aux64x2_1 = {*(const int64_t *)(iq1s_grid + (q1[2] | ((qh[0] << 2) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[3] | ((qh[0] >> 1) & 0x700)))}; + vector signed long long aux64x2_2 = {*(const int64_t 
*)(iq1s_grid + (q1[4] | ((qh[1] << 8) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[5] | ((qh[1] << 5) & 0x700)))}; + vector signed long long aux64x2_3 = {*(const int64_t *)(iq1s_grid + (q1[6] | ((qh[1] << 2) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[7] | ((qh[1] >> 1) & 0x700)))}; + q1 += 8; + + vector signed char q1x0 = (vector signed char)aux64x2_0; + vector signed char q1x1 = (vector signed char)aux64x2_1; + vector signed char q1x2 = (vector signed char)aux64x2_2; + vector signed char q1x3 = (vector signed char)aux64x2_3; + + vector signed char q8y0 = vec_xl( 0, q8); + vector signed char q8y1 = vec_xl(16, q8); + vector signed char q8y2 = vec_xl(32, q8); + vector signed char q8y3 = vec_xl(48, q8); + q8 += 64; + + vector signed short qv0 = vec_add(vec_mule(q1x0, q8y0), vec_mulo(q1x0, q8y0)); + vector signed short qv1 = vec_add(vec_mule(q1x1, q8y1), vec_mulo(q1x1, q8y1)); + vector signed short qv2 = vec_add(vec_mule(q1x2, q8y2), vec_mulo(q1x2, q8y2)); + vector signed short qv3 = vec_add(vec_mule(q1x3, q8y3), vec_mulo(q1x3, q8y3)); + + const uint16_t ls0 = (uint16_t)((qh[0] >> 12) & 7); + const uint16_t ls1 = (uint16_t)((qh[1] >> 12) & 7); + + vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1)); + vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1)); + vector signed short vscales = vec_sld(vscales23, vscales01, 8); + + vsumi0 = vec_msum(qv0, vscales01, vsumi0); + vsumi1 = vec_msum(qv1, vscales01, vsumi1); + vsumi2 = vec_msum(qv2, vscales23, vsumi2); + vsumi3 = vec_msum(qv3, vscales23, vsumi3); + + vector signed short q8ysums = vec_xl_len(qs, 8); + qs += 4; + q8ysums = vec_mergeh(q8ysums, (vector signed short)v0); + + vector signed short qxh = (vector signed short)vec_sld(vec_splats(qh[1]), vec_splats(qh[0]), 8); + qh += 2; + vector __bool short vsel = vec_cmpge(qxh, (vector signed short)v0); + + vector signed short q8ysum = vec_sel((vector signed short)vec_xor((vector unsigned short)q8ysums, vsign), q8ysums, vsel); + + vsumi8 = vec_add(vec_mule(q8ysum, vscales), vsumi8); + } + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); + vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); + vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); + + vsumf0 = vec_madd(vec_ctf(vsumi8, 0), vec_mul(vd, vec_splats(IQ1S_DELTA)), vsumf0); + } + + vsumf0 = vec_add(vsumf0, vsumf2); + vsumf1 = vec_add(vsumf1, vsumf3); + + vsumf0 = vec_add(vsumf0, vsumf1); + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + *s = vec_extract(vsumf0, 0); + +#else + + float sumf = 0; + for (int i = 0; i < nb; i++) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint16_t * qh = x[i].qh; + + int sumi = 0, sumi1 = 0; + for (int ib = 0; ib < QK_K/32; ++ib) { + const int ls = 2*((qh[ib] >> 12) & 7) + 1; + const int delta = qh[ib] & 0x8000 ? 
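+            /* bit 15 of qh picks the sign of the per-block delta; its contribution
+               is added through y's precomputed bsums as the IQ1S_DELTA * sumi1
+               term instead of per-weight work. */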
-1 : 1; + int lsum = 0; + for (int l = 0; l < 4; ++l) { + const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8))); + for (int j = 0; j < 8; ++j) { + lsum += q8[j] * grid[j]; + } + q8 += 8; + } + sumi += ls * lsum; + sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]); + qs += 4; + } + + sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1); + } + + *s = sumf; + +#endif +} + +void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + assert(n % QK4_NL == 0); + static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same"); + + const block_iq4_nl * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + const int nb = n / QK4_NL; + + int ib = 0; + float sumf = 0; + +#if defined(__POWER9_VECTOR__) + const vector signed char lowMask = vec_splats((signed char)0xF); + const vector signed int v0 = vec_splats((int32_t)0); + const vector unsigned char v4 = vec_splats((unsigned char)0x4); + + vector float vsumf0 = vec_splats(0.0f); + vector float vsumf1 = vec_splats(0.0f); + + const vector signed char values = vec_xl( 0, kvalues_iq4nl); + +#pragma GCC unroll 4 + for (; ib < nb; ++ib) { + __builtin_prefetch(x[ib].qs, 0, 1); + __builtin_prefetch(y[ib].qs, 0, 1); + + + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d)); + vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d)); + vector float vd = vec_mul(vxd, vyd); + + vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs); + vector signed char q4x0 = vec_and(qxs, lowMask); + vector signed char q4x1 = vec_sr(qxs, v4); + + q4x0 = vec_perm(values, values, (vector unsigned char)q4x0); + q4x1 = vec_perm(values, values, (vector unsigned char)q4x1); + + vector signed char q8y0 = vec_xl( 0, y[ib].qs); + vector signed char q8y1 = vec_xl(16, y[ib].qs); + + vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0)); + vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1)); + + vector signed int vsumi0 = v0; + vector signed int vsumi1 = v0; + + vsumi0 = vec_sum4s(qv0, vsumi0); + vsumi1 = vec_sum4s(qv1, vsumi1); + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); + } + + vsumf0 = vec_add(vsumf0, vsumf1); + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + sumf = vec_extract(vsumf0, 0); + +#endif + for (; ib < nb; ++ib) { + const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d); + int sumi1 = 0, sumi2 = 0; + for (int j = 0; j < QK4_NL/2; ++j) { + sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf]; + sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4]; + } + sumf += d * (sumi1 + sumi2); + } + *s = sumf; +} + +void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + assert(n % QK_K == 0); + + const block_iq4_xs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__POWER9_VECTOR__) + const vector signed char lowMask = vec_splats((signed char)0xF); + const vector int v0 = vec_splats((int32_t)0); + const vector unsigned 
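+    /* kvalues_iq4nl is the 16-entry non-linear codebook; loading it into a
+       vector register just below lets vec_perm act as a full 16-way table
+       lookup per nibble. */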
char v4 = vec_splats((unsigned char)0x4); + + vector float vsumf0 = vec_splats(0.0f); + vector float vsumf1 = vec_splats(0.0f); + vector float vsumf2 = vec_splats(0.0f); + vector float vsumf3 = vec_splats(0.0f); + + const vector signed char values = vec_xl( 0, kvalues_iq4nl); + + for (int ibl = 0; ibl < nb; ++ibl) { + + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ibl].d)); + vector float vyd = vec_splats(y[ibl].d); + vector float vd = vec_mul(vxd, vyd); + + vector signed int vsumi0 = v0; + vector signed int vsumi1 = v0; + vector signed int vsumi2 = v0; + vector signed int vsumi3 = v0; + + uint16_t h = x[ibl].scales_h; + + const uint8_t * GGML_RESTRICT q4 = x[ibl].qs; + const uint8_t * GGML_RESTRICT sc = x[ibl].scales_l; + const int8_t * GGML_RESTRICT q8 = y[ibl].qs; + + for (int ib = 0; ib < QK_K/64; ib ++ ) { + __builtin_prefetch(q4, 0, 1); + __builtin_prefetch(q8, 0, 1); + + vector signed char qxs0 = (vector signed char)vec_xl( 0, q4); + vector signed char qxs1 = (vector signed char)vec_xl(16, q4); + q4 += 32; + + vector signed char q4x00 = (vector signed char)vec_and(qxs0, lowMask); + vector signed char q4x01 = (vector signed char)vec_sr(qxs0, v4); + vector signed char q4x10 = (vector signed char)vec_and(qxs1, lowMask); + vector signed char q4x11 = (vector signed char)vec_sr(qxs1, v4); + + q4x00 = vec_perm(values, values, (vector unsigned char)q4x00); + q4x01 = vec_perm(values, values, (vector unsigned char)q4x01); + q4x10 = vec_perm(values, values, (vector unsigned char)q4x10); + q4x11 = vec_perm(values, values, (vector unsigned char)q4x11); + + vector signed char q8y0 = vec_xl( 0, q8); + vector signed char q8y1 = vec_xl(16, q8); + vector signed char q8y2 = vec_xl(32, q8); + vector signed char q8y3 = vec_xl(48, q8); + q8 += 64; + + vector signed short qv0 = vec_add(vec_mule(q4x00, q8y0), vec_mulo(q4x00, q8y0)); + vector signed short qv1 = vec_add(vec_mule(q4x01, q8y1), vec_mulo(q4x01, q8y1)); + vector signed short qv2 = vec_add(vec_mule(q4x10, q8y2), vec_mulo(q4x10, q8y2)); + vector signed short qv3 = vec_add(vec_mule(q4x11, q8y3), vec_mulo(q4x11, q8y3)); + + const uint16_t ls0 = (uint16_t)(((sc[0] & 0xf) | ((h << 4) & 0x30)) - 32); + const uint16_t ls1 = (uint16_t)(((sc[0] >> 4) | ((h << 2) & 0x30)) - 32); + h >>= 4; + sc ++; + + vector signed short vscales01 = vec_splats((int16_t)ls0); + vector signed short vscales23 = vec_splats((int16_t)ls1); + + vsumi0 = vec_msum(qv0, vscales01, vsumi0); + vsumi1 = vec_msum(qv1, vscales01, vsumi1); + vsumi2 = vec_msum(qv2, vscales23, vsumi2); + vsumi3 = vec_msum(qv3, vscales23, vsumi3); + } + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); + vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); + vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); + } + + vsumf0 = vec_add(vsumf0, vsumf2); + vsumf1 = vec_add(vsumf1, vsumf3); + + vsumf0 = vec_add(vsumf0, vsumf1); + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + *s = vec_extract(vsumf0, 0); + +#else + float sumf = 0; + for (int ibl = 0; ibl < nb; ++ibl) { + const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d; + uint16_t h = x[ibl].scales_h; + const uint8_t * qs = x[ibl].qs; + const int8_t * q8 = y[ibl].qs; + for (int ib = 0; ib < QK_K/32; ib += 2) { + const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30); + const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30); + h >>= 4; + const float d1 = d4d8*(ls1 - 32); + const float d2 = 
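+            /* iq4_xs block scales are 6 bits: a low nibble from scales_l plus two
+               bits shifted out of scales_h, biased by 32 so scales can be negative. */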
d4d8*(ls2 - 32); + int sumi1 = 0, sumi2 = 0; + for (int j = 0; j < 16; ++j) { + sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf]; + sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4]; + } + sumf += d1 * (sumi1 + sumi2); + qs += 16; + q8 += 32; + sumi1 = sumi2 = 0; + for (int j = 0; j < 16; ++j) { + sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf]; + sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4]; + } + sumf += d2 * (sumi1 + sumi2); + qs += 16; + q8 += 32; + } + } + *s = sumf; +#endif +} + diff --git a/ggml/src/ggml-cpu/arch/riscv/quants.c b/ggml/src/ggml-cpu/arch/riscv/quants.c new file mode 100644 index 0000000000000..8b64d8adc48f4 --- /dev/null +++ b/ggml/src/ggml-cpu/arch/riscv/quants.c @@ -0,0 +1,2069 @@ +#define GGML_COMMON_IMPL_C +#include "ggml-common.h" +#include "ggml-quants.h" +#include "ggml-impl.h" +#include "ggml-cpu.h" +#include "simd-mappings.h" + +#include "../../quants.h" +#include "../../ggml-cpu-impl.h" + +#include <math.h> +#include <string.h> +#include <assert.h> +#include <float.h> +#include <stdlib.h> // for qsort +#include <stdio.h> // for GGML_ASSERT + +#define GROUP_MAX_EPS 1e-15f +#define GROUP_MAX_EPS_IQ3_XXS 1e-8f +#define GROUP_MAX_EPS_IQ2_S 1e-8f +#define GROUP_MAX_EPS_IQ1_M 1e-7f +#define GROUP_MAX_EPS_IQ1_S 1e-12f + +#define UNUSED GGML_UNUSED + +void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(QK8_0 == 32); + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; + + block_q8_0 * GGML_RESTRICT y = vy; + +#if defined(__riscv_v) + + size_t vl = QK8_0; + + for (int i = 0; i < nb; i++) { + // load elements + vfloat32m8_t v_x = __riscv_vle32_v_f32m8(x+i*QK8_0, vl); + + vfloat32m8_t vfabs = __riscv_vfabs_v_f32m8(v_x, vl); + vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m8_f32m1(vfabs, tmp, vl); + float amax = __riscv_vfmv_f_s_f32m1_f32(vmax); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = GGML_CPU_FP32_TO_FP16(d); + + vfloat32m8_t x0 = __riscv_vfmul_vf_f32m8(v_x, id, vl); + + // convert to integer + vint16m4_t vi = __riscv_vfncvt_x_f_w_i16m4(x0, vl); + vint8m2_t vs = __riscv_vncvt_x_x_w_i8m2(vi, vl); + + // store result + __riscv_vse8_v_i8m2(y[i].qs , vs, vl); + } +#else + GGML_UNUSED(nb); + // scalar + quantize_row_q8_0_ref(x, y, k); +#endif +} + +void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(k % QK8_1 == 0); + const int nb = k / QK8_1; + + block_q8_1 * GGML_RESTRICT y = vy; + +#if defined(__riscv_v) + + size_t vl = QK8_1; + + for (int i = 0; i < nb; i++) { + // load elements + vfloat32m8_t v_x = __riscv_vle32_v_f32m8(x+i*QK8_1, vl); + + vfloat32m8_t vfabs = __riscv_vfabs_v_f32m8(v_x, vl); + vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0, vl); + vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m8_f32m1(vfabs, tmp, vl); + float amax = __riscv_vfmv_f_s_f32m1_f32(vmax); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 
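+        /* d = amax/127 maps the block's largest magnitude onto the int8 range;
+           id is its guarded reciprocal (0 for an all-zero block, avoiding a
+           divide by zero in the multiply below). */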
1.0f/d : 0.0f; + + y[i].d = GGML_CPU_FP32_TO_FP16(d); + + vfloat32m8_t x0 = __riscv_vfmul_vf_f32m8(v_x, id, vl); + + // convert to integer + vint16m4_t vi = __riscv_vfncvt_x_f_w_i16m4(x0, vl); + vint8m2_t vs = __riscv_vncvt_x_x_w_i8m2(vi, vl); + + // store result + __riscv_vse8_v_i8m2(y[i].qs , vs, vl); + + // compute sum for y[i].s + vint16m1_t tmp2 = __riscv_vmv_v_x_i16m1(0, vl); + vint16m1_t vwrs = __riscv_vwredsum_vs_i8m2_i16m1(vs, tmp2, vl); + + // set y[i].s + int sum = __riscv_vmv_x_s_i16m1_i16(vwrs); + y[i].s = GGML_CPU_FP32_TO_FP16(sum*d); + } + +#else + GGML_UNUSED(nb); + // scalar + quantize_row_q8_1_ref(x, y, k); +#endif +} + +//===================================== Dot products ================================= + +void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0; + +#if defined(__riscv_v) + size_t vl = qk / 2; + + for (; ib < nb; ++ib) { + // load elements + vuint8m1_t tx = __riscv_vle8_v_u8m1(x[ib].qs, vl); + + vint8m1_t y0 = __riscv_vle8_v_i8m1(y[ib].qs, vl); + vint8m1_t y1 = __riscv_vle8_v_i8m1(y[ib].qs+16, vl); + + // mask and store lower part of x, and then upper part + vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl); + vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl); + + vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a); + vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l); + + // subtract offset + vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 8, vl); + vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 8, vl); + + vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl); + vint16m2_t vec_mul2 = __riscv_vwmacc_vv_i16m2(vec_mul1, v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d); + } + +#endif + for (; ib < nb; ++ib) { + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[ib].qs[j] & 0x0F) - 8; + const int v1 = (x[ib].qs[j] >> 4) - 8; + + sumi0 += (v0 * y[ib].qs[j]); + sumi1 += (v1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d); + } + + *s = sumf; +} + +void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_1; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_1 * GGML_RESTRICT x = vx; + const block_q8_1 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0; + +#if defined(__riscv_v) + size_t vl = qk / 2; + + for (; ib < nb; ++ib) { + // load elements + vuint8m1_t tx = __riscv_vle8_v_u8m1(x[ib].qs, vl); + + vint8m1_t y0 = __riscv_vle8_v_i8m1(y[ib].qs, vl); + vint8m1_t y1 = __riscv_vle8_v_i8m1(y[ib].qs+16, vl); + + // mask and store lower part of x, and then upper part + vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl); + vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl); + + vint8m1_t v0 = 
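+        /* unlike q4_0 there is no -8 offset here: q4_1 nibbles stay unsigned and
+           the block minimum is folded in afterwards through the m * s term. */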
__riscv_vreinterpret_v_u8m1_i8m1(x_a); + vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l); + + vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl); + vint16m2_t vec_mul2 = __riscv_vwmacc_vv_i16m2(vec_mul1, v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); + } + +#endif + for (; ib < nb; ++ib) { + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[ib].qs[j] & 0x0F); + const int v1 = (x[ib].qs[j] >> 4); + + sumi0 += (v0 * y[ib].qs[j]); + sumi1 += (v1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); + } + + *s = sumf; +} + +void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + int ib = 0; + float sumf = 0; + + assert(n % qk == 0); + assert(qk == QK5_0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + +#if defined(__riscv_v) + size_t vl; + size_t vlenb = __riscv_vlenb(); + + for (; ib < nb; ++ib) { + vl = qk / 2; + vuint8m1_t v0 = __riscv_vle8_v_u8m1(x[ib].qs, vl); + vint8m1_t v0l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(v0, 0x0F, vl)); + vint8m1_t v0h = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(v0, 4, vl)); + vint8m2_t v0c; + if (vlenb == 16) { + v0c = __riscv_vcreate_v_i8m1_i8m2(v0l, v0h); + } else { + v0l = __riscv_vslideup_vx_i8m1(v0l, v0h, 16, 32); + v0c = __riscv_vlmul_ext_v_i8m1_i8m2(v0l); + } + + vl = qk; + vbool4_t qh = __riscv_vlm_v_b4(x[ib].qh, vl); + qh = __riscv_vmnand_mm_b4(qh, qh, vl); + vint8m2_t v0f = __riscv_vsub_vx_i8m2_mu(qh, v0c, v0c, 0x10, vl); + vint8m2_t v1 = __riscv_vle8_v_i8m2(y[ib].qs, vl); + vint16m4_t mul = __riscv_vwmul_vv_i16m4(v0f, v1, vl); + vint32m1_t zero = __riscv_vmv_v_x_i32m1(0, vl); + vint32m1_t sum = __riscv_vwredsum_vs_i16m4_i32m1(mul, zero, vl); + int32_t sumi = __riscv_vmv_x_s_i32m1_i32(sum); + + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi; + } + +#endif + for (; ib < nb; ++ib) { + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); + + const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16); + const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16); + + sumi0 += (x0 * y[ib].qs[j]); + sumi1 += (x1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi; + } + + *s = sumf; +} + +void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_1; + const int nb = n / qk; + + int ib = 0; + float sumf = 0; + + assert(n % qk == 0); + assert(qk == QK5_1); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + 
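+    // q5_1 mirrors the q5_0 kernel above, except the qh bit ORs in +16
+    // where it is set instead of subtracting 16 where it is clear, and the
+    // block minimum enters through the m * y[ib].s term as in q4_1.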
UNUSED(bs); + + const block_q5_1 * GGML_RESTRICT x = vx; + const block_q8_1 * GGML_RESTRICT y = vy; + +#if defined(__riscv_v) + size_t vl; + size_t vlenb = __riscv_vlenb(); + + for (; ib < nb; ++ib) { + vl = qk / 2; + vuint8m1_t v0 = __riscv_vle8_v_u8m1(x[ib].qs, vl); + vint8m1_t v0l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(v0, 0x0F, vl)); + vint8m1_t v0h = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(v0, 4, vl)); + vint8m2_t v0c; + if (vlenb == 16) { + v0c = __riscv_vcreate_v_i8m1_i8m2(v0l, v0h); + } else { + v0l = __riscv_vslideup_vx_i8m1(v0l, v0h, 16, 32); + v0c = __riscv_vlmul_ext_v_i8m1_i8m2(v0l); + } + + vl = qk; + vbool4_t qh = __riscv_vlm_v_b4(x[ib].qh, vl); + vint8m2_t v0f = __riscv_vor_vx_i8m2_mu(qh, v0c, v0c, 0x10, vl); + vint8m2_t v1 = __riscv_vle8_v_i8m2(y[ib].qs, vl); + vint16m4_t mul = __riscv_vwmul_vv_i16m4(v0f, v1, vl); + vint32m1_t zero = __riscv_vmv_v_x_i32m1(0, vl); + vint32m1_t sum = __riscv_vwredsum_vs_i16m4_i32m1(mul, zero, vl); + int32_t sumi = __riscv_vmv_x_s_i32m1_i32(sum); + + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); + } + +#endif + for (; ib < nb; ++ib) { + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + + const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0; + const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1; + + sumi0 += (x0 * y[ib].qs[j]); + sumi1 += (x1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); + } + + *s = sumf; +} + +void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q8_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0; + +#if defined(__riscv_v) + size_t vl = qk; + + for (; ib < nb; ++ib) { + // load elements + vint8m2_t bx_0 = __riscv_vle8_v_i8m2(x[ib].qs, vl); + vint8m2_t by_0 = __riscv_vle8_v_i8m2(y[ib].qs, vl); + + vint16m4_t vw_mul = __riscv_vwmul_vv_i16m4(bx_0, by_0, vl); + + vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl); + vint32m1_t v_sum = __riscv_vwredsum_vs_i16m4_i32m1(vw_mul, v_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum); + + sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)); + } + +#endif + for (; ib < nb; ++ib) { + int sumi = 0; + + for (int j = 0; j < qk; j++) { + sumi += x[ib].qs[j]*y[ib].qs[j]; + } + + sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)); + } + + *s = sumf; +} + +void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q2_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined __riscv_xtheadvector + + float sumf = 0; + uint8_t atmp[16]; + + for (int i = 0; i < nb; ++i) { + const uint8_t * q2 = x[i].qs; + const int8_t * q8 = 
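+        // q2_K: 16 sub-blocks of 16 values; each scales[] byte packs a
+        // 4-bit scale (low nibble) and a 4-bit min (high nibble). The asm
+        // below dots the mins against the q8 block sums (bsums) to form
+        // the dmin correction before the 2-bit quants are multiplied in.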
y[i].qs; + const uint8_t * sc = x[i].scales; + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + uint8_t *patmp = atmp; + int vsums; + int tmp; + __asm__ __volatile__( + "th.vsetvli zero, %[vl16], e8, m1\n\t" + "th.vmv.v.x v8, zero\n\t" + "th.vlb.v v1, (%[sc])\n\t" + "th.vand.vi v0, v1, 0xF\n\t" + "th.vsrl.vi v1, v1, 4\n\t" + "th.vsb.v v0, (%[scale])\n\t" + "th.vwaddu.vx v16, v1, zero\n\t" + "th.vsetvli zero, %[vl16], e16, m2\n\t" + "th.vlh.v v2, (%[bsums])\n\t" + "th.vwmul.vv v4, v16, v2\n\t" + "th.vsetvli zero, %[vl16], e32, m4\n\t" + "th.vredsum.vs v8, v4, v8\n\t" + "th.vmv.x.s %[vsums], v8" + : [tmp] "=&r" (tmp), [vsums] "=&r" (vsums) + : [sc] "r" (sc), [scale] "r" (atmp), [bsums] "r" (y[i].bsums) + , [vl16] "r" (16) + : "memory" + , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" + , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); + sumf += dmin * vsums; + int isum = 0; + + for (int j = 0; j < QK_K/128; ++j) { + __asm__ __volatile__( + "th.vsetvli zero, %[vl32], e8, m2\n\t" + "th.vlb.v v0, (%[q2])\n\t" + "th.vsrl.vi v2, v0, 2\n\t" + "th.vsrl.vi v4, v0, 4\n\t" + "th.vsrl.vi v6, v0, 6\n\t" + "th.vand.vi v0, v0, 0x3\n\t" + "th.vand.vi v2, v2, 0x3\n\t" + "th.vand.vi v4, v4, 0x3\n\t" + "th.vsetvli zero, %[vl128], e8, m8\n\t" + "th.vlb.v v8, (%[q8])\n\t" + "th.vsetvli zero, %[vl64], e8, m4\n\t" + "th.vwmul.vv v16, v0, v8\n\t" + "th.vwmul.vv v24, v4, v12\n\t" + "th.vsetvli zero, %[vl16], e16, m2\n\t" + "th.vmv.v.x v0, zero\n\t" + "th.vwredsum.vs v10, v16, v0\n\t" + "th.vwredsum.vs v9, v18, v0\n\t" + "th.vwredsum.vs v8, v20, v0\n\t" + "th.vwredsum.vs v7, v22, v0\n\t" + "th.vwredsum.vs v11, v24, v0\n\t" + "th.vwredsum.vs v12, v26, v0\n\t" + "th.vwredsum.vs v13, v28, v0\n\t" + "th.vwredsum.vs v14, v30, v0\n\t" + "li %[tmp], 4\n\t" + "th.vsetvli zero, %[tmp], e32, m1\n\t" + "th.vslideup.vi v10, v9, 1\n\t" + "th.vslideup.vi v8, v7, 1\n\t" + "th.vslideup.vi v11, v12, 1\n\t" + "th.vslideup.vi v13, v14, 1\n\t" + "th.vslideup.vi v10, v8, 2\n\t" + "th.vslideup.vi v11, v13, 2\n\t" + "li %[tmp], 8\n\t" + "th.vsetvli zero, %[tmp], e32, m2\n\t" + "th.vlbu.v v12, (%[scale])\n\t" + "th.vmul.vv v10, v10, v12\n\t" + "th.vredsum.vs v0, v10, v0\n\t" + "th.vmv.x.s %[tmp], v0\n\t" + "add %[isum], %[isum], %[tmp]" + : [tmp] "=&r" (tmp), [isum] "+&r" (isum) + : [q2] "r" (q2), [scale] "r" (patmp), [q8] "r" (q8) + , [vl16] "r" (16), [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128) + : "memory" + , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" + , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); + q2 += 32; q8 += 128; patmp += 8; + } + + sumf += dall * isum; + } + + *s = sumf; + +#elif defined __riscv_v + + float sumf = 0; + uint8_t atmp[16]; + + const int vector_length = __riscv_vlenb() * 8; + uint8_t temp_01[32] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; + + switch (vector_length) { + case 256: + for (int i = 0; i < nb; ++i) { + const uint8_t * q2 = x[i].qs; + const int8_t * q8 = y[i].qs; + const uint8_t * sc = x[i].scales; + + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + size_t vl = 16; + + vuint8m1_t scales = __riscv_vle8_v_u8m1(sc, vl); + 
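+            // low nibbles -> per-sub-block scales (aux); high nibbles ->
+            // mins, widened and dotted with bsums in a single reduction.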
vuint8m1_t aux = __riscv_vand_vx_u8m1(scales, 0x0F, vl); + + vint16m1_t q8sums = __riscv_vle16_v_i16m1(y[i].bsums, vl); + + vuint8mf2_t scales_2 = __riscv_vle8_v_u8mf2(sc, vl); + vuint8mf2_t mins8 = __riscv_vsrl_vx_u8mf2(scales_2, 0x4, vl); + vint16m1_t mins = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(mins8, vl)); + vint32m2_t prod = __riscv_vwmul_vv_i32m2(q8sums, mins, vl); + vint32m1_t vsums = __riscv_vredsum_vs_i32m2_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl); + + sumf += dmin * __riscv_vmv_x_s_i32m1_i32(vsums); + + vl = 32; + + vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); + vuint8m1_t v_b = __riscv_vle8_v_u8m1(temp_01, vl); + + uint8_t is = 0; + int isum = 0; + + for (int j = 0; j < QK_K / 128; ++j) { + // load Q2 + vuint8m1_t q2_x = __riscv_vle8_v_u8m1(q2, vl); + + vuint8m1_t q2_0 = __riscv_vand_vx_u8m1(q2_x, 0x03, vl); + vuint8m1_t q2_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x2, vl), 0x03, vl); + vuint8m1_t q2_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x4, vl), 0x03, vl); + vuint8m1_t q2_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x6, vl), 0x03, vl); + + // duplicate scale elements for product + vuint8m1_t sc0 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 0 + is, vl), vl); + vuint8m1_t sc1 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 2 + is, vl), vl); + vuint8m1_t sc2 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 4 + is, vl), vl); + vuint8m1_t sc3 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 6 + is, vl), vl); + + vint16m2_t p0 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_0, sc0, vl)); + vint16m2_t p1 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_1, sc1, vl)); + vint16m2_t p2 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_2, sc2, vl)); + vint16m2_t p3 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_3, sc3, vl)); + + // load Q8 + vint8m1_t q8_0 = __riscv_vle8_v_i8m1(q8, vl); + vint8m1_t q8_1 = __riscv_vle8_v_i8m1(q8 + 32, vl); + vint8m1_t q8_2 = __riscv_vle8_v_i8m1(q8 + 64, vl); + vint8m1_t q8_3 = __riscv_vle8_v_i8m1(q8 + 96, vl); + + vint32m4_t s0 = __riscv_vwmul_vv_i32m4(p0, __riscv_vwcvt_x_x_v_i16m2(q8_0, vl), vl); + vint32m4_t s1 = __riscv_vwmul_vv_i32m4(p1, __riscv_vwcvt_x_x_v_i16m2(q8_1, vl), vl); + vint32m4_t s2 = __riscv_vwmul_vv_i32m4(p2, __riscv_vwcvt_x_x_v_i16m2(q8_2, vl), vl); + vint32m4_t s3 = __riscv_vwmul_vv_i32m4(p3, __riscv_vwcvt_x_x_v_i16m2(q8_3, vl), vl); + + vint32m1_t isum0 = __riscv_vredsum_vs_i32m4_i32m1(__riscv_vadd_vv_i32m4(s0, s1, vl), vzero, vl); + vint32m1_t isum1 = __riscv_vredsum_vs_i32m4_i32m1(__riscv_vadd_vv_i32m4(s2, s3, vl), isum0, vl); + + isum += __riscv_vmv_x_s_i32m1_i32(isum1); + + q2 += 32; + q8 += 128; + is = 8; + } + + sumf += dall * isum; + } + break; + case 128: + for (int i = 0; i < nb; ++i) { + const uint8_t * q2 = x[i].qs; + const int8_t * q8 = y[i].qs; + const uint8_t * sc = x[i].scales; + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + uint8_t *patmp = atmp; + int vsums; + int tmp; + __asm__ __volatile__( + "vsetivli zero, 16, e8, m1\n\t" + "vmv.v.x v8, zero\n\t" + "vle8.v v1, (%[sc])\n\t" + "vand.vi v0, v1, 0xF\n\t" + "vsrl.vi v1, v1, 4\n\t" + "vse8.v v0, (%[scale])\n\t" + "vsetivli zero, 16, e16, m2\n\t" + "vle16.v v2, (%[bsums])\n\t" + "vzext.vf2 v0, v1\n\t" + "vwmul.vv v4, v0, v2\n\t" + "vsetivli zero, 16, e32, m4\n\t" + "vredsum.vs v8, v4, v8\n\t" + "vmv.x.s %[vsums], v8" + : [tmp] "=&r" 
(tmp), [vsums] "=&r" (vsums) + : [sc] "r" (sc), [scale] "r" (atmp), [bsums] "r" (y[i].bsums) + : "memory" + , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" + , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); + sumf += dmin * vsums; + int isum = 0; + + for (int j = 0; j < QK_K/128; ++j) { + __asm__ __volatile__( + "vsetvli zero, %[vl32], e8, m2\n\t" + "vle8.v v0, (%[q2])\n\t" + "vsrl.vi v2, v0, 2\n\t" + "vsrl.vi v4, v0, 4\n\t" + "vsrl.vi v6, v0, 6\n\t" + "vand.vi v0, v0, 0x3\n\t" + "vand.vi v2, v2, 0x3\n\t" + "vand.vi v4, v4, 0x3\n\t" + "vsetvli zero, %[vl128], e8, m8\n\t" + "vle8.v v8, (%[q8])\n\t" + "vsetvli zero, %[vl64], e8, m4\n\t" + "vwmul.vv v16, v0, v8\n\t" + "vwmul.vv v24, v4, v12\n\t" + "vsetivli zero, 16, e16, m2\n\t" + "vmv.v.x v0, zero\n\t" + "vwredsum.vs v10, v16, v0\n\t" + "vwredsum.vs v9, v18, v0\n\t" + "vwredsum.vs v8, v20, v0\n\t" + "vwredsum.vs v7, v22, v0\n\t" + "vwredsum.vs v11, v24, v0\n\t" + "vwredsum.vs v12, v26, v0\n\t" + "vwredsum.vs v13, v28, v0\n\t" + "vwredsum.vs v14, v30, v0\n\t" + "vsetivli zero, 4, e32, m1\n\t" + "vslideup.vi v10, v9, 1\n\t" + "vslideup.vi v8, v7, 1\n\t" + "vslideup.vi v11, v12, 1\n\t" + "vslideup.vi v13, v14, 1\n\t" + "vslideup.vi v10, v8, 2\n\t" + "vslideup.vi v11, v13, 2\n\t" + "vsetivli zero, 8, e32, m2\n\t" + "vle8.v v15, (%[scale])\n\t" + "vzext.vf4 v12, v15\n\t" + "vmul.vv v10, v10, v12\n\t" + "vredsum.vs v0, v10, v0\n\t" + "vmv.x.s %[tmp], v0\n\t" + "add %[isum], %[isum], %[tmp]" + : [tmp] "=&r" (tmp), [isum] "+&r" (isum) + : [q2] "r" (q2), [scale] "r" (patmp), [q8] "r" (q8) + , [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128) + : "memory" + , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" + , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); + q2 += 32; q8 += 128; patmp += 8; + } + + sumf += dall * isum; + } + break; + default: + assert(false && "Unsupported vector length"); + break; + } + + *s = sumf; + +#else + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + + const uint8_t * q2 = x[i].qs; + const int8_t * q8 = y[i].qs; + const uint8_t * sc = x[i].scales; + + int summs = 0; + for (int j = 0; j < 16; ++j) { + summs += y[i].bsums[j] * (sc[j] >> 4); + } + + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + int isum = 0; + int is = 0; + int d; + for (int k = 0; k < QK_K/128; ++k) { + int shift = 0; + for (int j = 0; j < 4; ++j) { + d = sc[is++] & 0xF; + int isuml = 0; + for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3); + isum += d * isuml; + d = sc[is++] & 0xF; + isuml = 0; + for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3); + isum += d * isuml; + shift += 2; + q8 += 32; + } + q2 += 32; + } + sumf += dall * isum - dmin * summs; + } + *s = sumf; +#endif +} + +void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const uint32_t kmask1 = 0x03030303; + const uint32_t kmask2 = 0x0f0f0f0f; + + const block_q3_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined __riscv_xtheadvector + + uint32_t 
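+    // q3_K: 2-bit quants plus a separate high-bit mask (hmask), with 16
+    // 6-bit signed scales packed into 12 bytes; the kmask1/kmask2 shuffle
+    // below unpacks them and applies the common -32 offset.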
utmp[4]; + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + const uint8_t * restrict q3 = x[i].qs; + const uint8_t * restrict qh = x[i].hmask; + const int8_t * restrict q8 = y[i].qs; + + int8_t * scale = (int8_t *)utmp; + int tmp; + __asm__ __volatile__( + "li %[tmp], 12\n\t" + "th.vsetvli zero, %[tmp], e8, m1\n\t" + "th.vlb.v v0, (%[s6b])\n\t" + "th.vmv.v.v v2, v0\n\t" + "li %[tmp], 2\n\t" + "th.vsetvli zero, %[tmp], e64, m1\n\t" + "th.vmv.v.x v9, %[sh]\n\t"\ + "th.vslidedown.vi v1, v0, 1\n\t" + "th.vslide1up.vx v8, v9, zero\n\t" // {0, 0, 4, 4} + "th.vslideup.vi v0, v2, 1\n\t" // {aux[0], aux[1], aux[0], aux[1]} + "li %[tmp], 4\n\t" + "th.vsetvli zero, %[tmp], e32, m1\n\t" + "th.vid.v v9\n\t" + "th.vmv.x.s %[tmp], v1\n\t" + "th.vsll.vi v9, v9, 1\n\t" // {0, 2, 4, 6} + "th.vmv.v.x v1, %[tmp]\n\t" // {aux[2], aux[2], aux[2], aux[2]} + "th.vsrl.vv v4, v1, v9\n\t" + "th.vsrl.vv v2, v0, v8\n\t" + "th.vand.vx v5, v4, %[kmask1]\n\t" + "th.vand.vx v3, v2, %[kmask2]\n\t" + "th.vsll.vi v6, v5, 4\n\t" + "th.vor.vv v7, v6, v3\n\t" + "li %[tmp], 16\n\t" + "th.vsetvli zero, %[tmp], e8, m1\n\t" + "th.vsub.vx v0, v7, %[c]\n\t" + "th.vsb.v v0, (%[scale])" + : [tmp] "=&r" (tmp) + : [sh] "r" (0x0000000400000004), [s6b] "r" (x[i].scales), [c] "r" (32) + , [scale] "r" (scale), [kmask1] "r" (kmask1), [kmask2] "r" (kmask2) + : "memory" + , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" + , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); + + uint8_t m = 1; + int isum = 0; + for (int j = 0; j < QK_K; j += 128) { + __asm__ __volatile__( + // fixme: use v0p7 mask layout directly + "th.vsetvli zero, %[vl32], e8, m2\n\t" + "th.vlb.v v8, (%[q3])\n\t" + "th.vsrl.vi v10, v8, 2\n\t" + "th.vsrl.vi v12, v8, 4\n\t" + "th.vsrl.vi v14, v8, 6\n\t" + "th.vand.vi v8, v8, 3\n\t" + "th.vand.vi v10, v10, 3\n\t" + "th.vand.vi v12, v12, 3\n\t" + "th.vlb.v v2, (%[qh])\n\t" + "th.vand.vx v4, v2, %[m]\n\t" + "slli %[m], %[m], 1\n\t" + "th.vmseq.vx v0, v4, zero\n\t" + "th.vadd.vi v8, v8, -4, v0.t\n\t" + "th.vand.vx v4, v2, %[m]\n\t" + "slli %[m], %[m], 1\n\t" + "th.vmseq.vx v0, v4, zero\n\t" + "th.vadd.vi v10, v10, -4, v0.t\n\t" + "th.vand.vx v4, v2, %[m]\n\t" + "slli %[m], %[m], 1\n\t" + "th.vmseq.vx v0, v4, zero\n\t" + "th.vadd.vi v12, v12, -4, v0.t\n\t" + "th.vand.vx v4, v2, %[m]\n\t" + "slli %[m], %[m], 1\n\t" + "th.vmseq.vx v0, v4, zero\n\t" + "th.vadd.vi v14, v14, -4, v0.t\n\t" + "th.vsetvli zero, %[vl128], e8, m8\n\t" + "th.vlb.v v0, (%[q8])\n\t" + "th.vsetvli zero, %[vl64], e8, m4\n\t" + "th.vwmul.vv v16, v0, v8\n\t" + "th.vwmul.vv v24, v4, v12\n\t" + "li %[tmp], 16\n\t" + "th.vsetvli zero, %[tmp], e16, m2\n\t" + "th.vmv.v.x v0, zero\n\t" + "th.vwredsum.vs v10, v16, v0\n\t" + "th.vwredsum.vs v9, v18, v0\n\t" + "th.vwredsum.vs v8, v20, v0\n\t" + "th.vwredsum.vs v7, v22, v0\n\t" + "th.vwredsum.vs v11, v24, v0\n\t" + "th.vwredsum.vs v12, v26, v0\n\t" + "th.vwredsum.vs v13, v28, v0\n\t" + "th.vwredsum.vs v14, v30, v0\n\t" + "li %[tmp], 4\n\t" + "th.vsetvli zero, %[tmp], e32, m1\n\t" + "th.vslideup.vi v10, v9, 1\n\t" + "th.vslideup.vi v8, v7, 1\n\t" + "th.vslideup.vi v11, v12, 1\n\t" + "th.vslideup.vi v13, v14, 1\n\t" + "th.vslideup.vi v10, v8, 2\n\t" + "th.vslideup.vi v11, v13, 2\n\t" + "li %[tmp], 8\n\t" + "th.vsetvli zero, %[tmp], e32, m2\n\t" + "th.vlb.v v12, (%[scale])\n\t" + "th.vmul.vv v10, v10, v12\n\t" + "th.vredsum.vs v0, v10, v0\n\t" + "th.vmv.x.s %[tmp], v0\n\t" + "add %[isum], %[isum], 
%[tmp]" + : [tmp] "=&r" (tmp), [m] "+&r" (m), [isum] "+&r" (isum) + : [vl128] "r" (128), [vl64] "r" (64), [vl32] "r" (32) + , [q3] "r" (q3), [qh] "r" (qh), [scale] "r" (scale), [q8] "r" (q8) + : "memory" + , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" + , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); + q3 += 32; q8 += 128; scale += 8; + } + + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + sumf += d * isum; + } + + *s = sumf; + +#elif defined __riscv_v + + uint32_t utmp[4]; + float sumf = 0; + uint32_t aux[3]; + const int vector_length = __riscv_vlenb() * 8; + + switch (vector_length) { + case 256: + for (int i = 0; i < nb; ++i) { + + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + memcpy(aux, x[i].scales, 12); + utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); + utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4); + utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4); + utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4); + + int8_t * scale = (int8_t *)utmp; + for (int j = 0; j < 16; ++j) scale[j] -= 32; + + + size_t vl = 32; + uint8_t m = 1; + + vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); + vuint8m1_t vqh = __riscv_vle8_v_u8m1(qh, vl); + + int sum_t = 0; + + for (int j = 0; j < QK_K; j += 128) { + + vl = 32; + + // load Q3 + vuint8m1_t q3_x = __riscv_vle8_v_u8m1(q3, vl); + + vint8m1_t q3_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q3_x, 0x03, vl)); + vint8m1_t q3_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x2, vl), 0x03 , vl)); + vint8m1_t q3_2 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x4, vl), 0x03 , vl)); + vint8m1_t q3_3 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x6, vl), 0x03 , vl)); + + // compute mask for subtraction + vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl); + vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl); + vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_mu(vmask_0, q3_0, q3_0, 0x4, vl); + m <<= 1; + + vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl); + vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl); + vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_mu(vmask_1, q3_1, q3_1, 0x4, vl); + m <<= 1; + + vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl); + vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl); + vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_mu(vmask_2, q3_2, q3_2, 0x4, vl); + m <<= 1; + + vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl); + vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl); + vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_mu(vmask_3, q3_3, q3_3, 0x4, vl); + m <<= 1; + + // load Q8 and take product with Q3 + vint16m2_t a0 = __riscv_vwmul_vv_i16m2(q3_m0, __riscv_vle8_v_i8m1(q8, vl), vl); + vint16m2_t a1 = __riscv_vwmul_vv_i16m2(q3_m1, __riscv_vle8_v_i8m1(q8+32, vl), vl); + vint16m2_t a2 = __riscv_vwmul_vv_i16m2(q3_m2, __riscv_vle8_v_i8m1(q8+64, vl), vl); + vint16m2_t a3 = __riscv_vwmul_vv_i16m2(q3_m3, __riscv_vle8_v_i8m1(q8+96, vl), vl); + + vl = 16; + + // retrieve lane to multiply with scale + vint32m2_t aux0_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 0), (scale[0]), vl); + vint32m2_t aux0_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 1), (scale[1]), vl); + vint32m2_t aux1_0 = 
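+                // a0..a3 each hold 32 widened products; every 16-lane half
+                // is weighted by its own 6-bit scale, so scale[0..7] cover
+                // the 128 values of this iteration.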
__riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 0), (scale[2]), vl); + vint32m2_t aux1_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 1), (scale[3]), vl); + vint32m2_t aux2_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 0), (scale[4]), vl); + vint32m2_t aux2_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 1), (scale[5]), vl); + vint32m2_t aux3_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 0), (scale[6]), vl); + vint32m2_t aux3_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 1), (scale[7]), vl); + + vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux0_0, aux0_1, vl), vzero, vl); + vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux1_0, aux1_1, vl), isum0, vl); + vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux2_0, aux2_1, vl), isum1, vl); + vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux3_0, aux3_1, vl), isum2, vl); + + sum_t += __riscv_vmv_x_s_i32m1_i32(isum3); + + q3 += 32; q8 += 128; scale += 8; + + } + + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + + sumf += d*sum_t; + + } + break; + case 128: + for (int i = 0; i < nb; ++i) { + const uint8_t * restrict q3 = x[i].qs; + const uint8_t * restrict qh = x[i].hmask; + const int8_t * restrict q8 = y[i].qs; + + int8_t * scale = (int8_t *)utmp; + int tmp; + __asm__ __volatile__( + "vsetivli zero, 12, e8, m1\n\t" + "vle8.v v0, (%[s6b])\n\t" + "vmv1r.v v2, v0\n\t" + "vsetivli zero, 2, e64, m1\n\t" + "vmv.v.x v9, %[sh]\n\t"\ + "vslidedown.vi v1, v0, 1\n\t" + "vslide1up.vx v8, v9, zero\n\t" // {0, 0, 4, 4} + "vslideup.vi v0, v2, 1\n\t" // {aux[0], aux[1], aux[0], aux[1]} + "vsetivli zero, 4, e32, m1\n\t" + "vid.v v9\n\t" + "vmv.x.s %[tmp], v1\n\t" + "vsll.vi v9, v9, 1\n\t" // {0, 2, 4, 6} + "vmv.v.x v1, %[tmp]\n\t" // {aux[2], aux[2], aux[2], aux[2]} + "vsrl.vv v4, v1, v9\n\t" + "vsrl.vv v2, v0, v8\n\t" + "vand.vx v5, v4, %[kmask1]\n\t" + "vand.vx v3, v2, %[kmask2]\n\t" + "vsll.vi v6, v5, 4\n\t" + "vor.vv v7, v6, v3\n\t" + "vsetivli zero, 16, e8, m1\n\t" + "vsub.vx v0, v7, %[c]\n\t" + "vse8.v v0, (%[scale])" + : [tmp] "=&r" (tmp) + : [sh] "r" (0x0000000400000004), [s6b] "r" (x[i].scales), [c] "r" (32) + , [scale] "r" (scale), [kmask1] "r" (kmask1), [kmask2] "r" (kmask2) + : "memory" + , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" + , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); + + uint8_t m = 1; + int isum = 0; + for (int j = 0; j < QK_K; j += 128) { + __asm__ __volatile__( + "vsetvli zero, %[vl32], e8, m2, ta, mu\n\t" + "vle8.v v8, (%[q3])\n\t" + "vsrl.vi v10, v8, 2\n\t" + "vsrl.vi v12, v8, 4\n\t" + "vsrl.vi v14, v8, 6\n\t" + "vand.vi v8, v8, 3\n\t" + "vand.vi v10, v10, 3\n\t" + "vand.vi v12, v12, 3\n\t" + "vle8.v v2, (%[qh])\n\t" + "vand.vx v4, v2, %[m]\n\t" + "slli %[m], %[m], 1\n\t" + "vmseq.vx v0, v4, zero\n\t" + "vadd.vi v8, v8, -4, v0.t\n\t" + "vand.vx v4, v2, %[m]\n\t" + "slli %[m], %[m], 1\n\t" + "vmseq.vx v0, v4, zero\n\t" + "vadd.vi v10, v10, -4, v0.t\n\t" + "vand.vx v4, v2, %[m]\n\t" + "slli %[m], %[m], 1\n\t" + "vmseq.vx v0, v4, zero\n\t" + "vadd.vi v12, v12, -4, v0.t\n\t" + "vand.vx v4, v2, %[m]\n\t" + "slli %[m], %[m], 1\n\t" + "vmseq.vx v0, v4, zero\n\t" + "vadd.vi v14, v14, -4, v0.t\n\t" + "vsetvli zero, %[vl128], e8, m8\n\t" + "vle8.v v0, (%[q8])\n\t" + "vsetvli zero, %[vl64], e8, m4\n\t" + "vwmul.vv v16, v0, v8\n\t" + 
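+                // the two widening byte multiplies fill v16..v31 with
+                // 16-bit products; the vwredsum chain below collapses them
+                // into eight per-sub-block sums weighted by the scales.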
"vwmul.vv v24, v4, v12\n\t" + "vsetivli zero, 16, e16, m2\n\t" + "vmv.v.x v0, zero\n\t" + "vwredsum.vs v10, v16, v0\n\t" + "vwredsum.vs v9, v18, v0\n\t" + "vwredsum.vs v8, v20, v0\n\t" + "vwredsum.vs v7, v22, v0\n\t" + "vwredsum.vs v11, v24, v0\n\t" + "vwredsum.vs v12, v26, v0\n\t" + "vwredsum.vs v13, v28, v0\n\t" + "vwredsum.vs v14, v30, v0\n\t" + "vsetivli zero, 4, e32, m1\n\t" + "vslideup.vi v10, v9, 1\n\t" + "vslideup.vi v8, v7, 1\n\t" + "vslideup.vi v11, v12, 1\n\t" + "vslideup.vi v13, v14, 1\n\t" + "vslideup.vi v10, v8, 2\n\t" + "vslideup.vi v11, v13, 2\n\t" + "vsetivli zero, 8, e32, m2\n\t" + "vle8.v v15, (%[scale])\n\t" + "vsext.vf4 v12, v15\n\t" + "vmul.vv v10, v10, v12\n\t" + "vredsum.vs v0, v10, v0\n\t" + "vmv.x.s %[tmp], v0\n\t" + "add %[isum], %[isum], %[tmp]" + : [tmp] "=&r" (tmp), [m] "+&r" (m), [isum] "+&r" (isum) + : [vl128] "r" (128), [vl64] "r" (64), [vl32] "r" (32) + , [q3] "r" (q3), [qh] "r" (qh), [scale] "r" (scale), [q8] "r" (q8) + : "memory" + , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" + , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); + q3 += 32; q8 += 128; scale += 8; + } + + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + sumf += d * isum; + } + break; + default: + assert(false && "Unsupported vector length"); + break; + } + + *s = sumf; + +#else + // scalar version + // This function is written like this so the compiler can manage to vectorize most of it + // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the + // manually vectorized version above. Every other version I tried would run at least 4 times slower. + // The ideal situation would be if we could just write the code once, and the compiler would + // automatically produce the best possible set of machine instructions, instead of us having to manually + // write vectorized versions for AVX, ARM_NEON, etc. + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + uint32_t auxs[4]; + const int8_t * scales = (const int8_t*)auxs; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + uint8_t m = 1; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 
0 : 4); + a += 32; m <<= 1; + q3 += 32; + } + a = aux8; + + memcpy(auxs, x[i].scales, 12); + uint32_t tmp = auxs[2]; + auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); + auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); + auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4); + auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); + for (int j = 0; j < QK_K/16; ++j) { + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; + +#endif + +} + +void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + uint32_t utmp[4]; + +#if defined __riscv_xtheadvector + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + int tmp, tmp2, sumi; + __asm__ __volatile__( + "li %[t1], 12\n\t" + "th.vsetvli zero, %[t1], e8, m1\n\t" + "th.vlb.v v1, (%[s6b])\n\t" // {aux[0], aux[1], aux[2]} + "li %[t1], 4\n\t" + "th.vsetvli zero, %[t1], e32, m1\n\t" + "th.vslidedown.vi v2, v1, 2\n\t" + "th.vmv.v.v v3, v2\n\t" + "th.vslideup.vi v2, v3, 1\n\t" // {aux[2], aux[2]} + "li %[t1], 2\n\t" + "th.vsetvli zero, %[t1], e32, m1\n\t" + "th.vmv.v.i v4, 4\n\t" + "th.vand.vx v8, v1, %[kmask1]\n\t" + "th.vslide1up.vx v5, v4, zero\n\t" // {0, 4} + "th.vsrl.vi v6, v1, 6\n\t" + "th.vsrl.vv v7, v2, v5\n\t" + "th.vand.vx v0, v6, %[kmask3]\n\t" + "th.vand.vx v2, v7, %[kmask2]\n\t" + "th.vsll.vi v6, v0, 4\n\t" + "li %[t2], 8\n\t" + "addi %[t1], %[utmp], 4\n\t" + "th.vor.vv v1, v6, v2\n\t" + "th.vssw.v v8, (%[utmp]), %[t2]\n\t" + "th.vssw.v v1, (%[t1]), %[t2]\n\t" + "th.vsetvli zero, zero, e32, m2\n\t" // vl == 8 + "th.vlw.v v2, (%[bsums])\n\t" + "th.vsetvli zero, %[t2], e16, m1\n\t" + "th.vnsrl.vi v0, v2, 0\n\t" + "th.vnsrl.vi v1, v2, 16\n\t" + "th.vadd.vv v2, v0, v1\n\t" + "th.vlbu.v v4, (%[mins])\n\t" + "th.vwmul.vv v6, v4, v2\n\t" + "th.vmv.v.x v0, zero\n\t" + "th.vsetvli zero, %[t2], e32, m2\n\t" + "th.vredsum.vs v0, v6, v0\n\t" + "th.vmv.x.s %[sumi], v0" + : [t1] "=&r" (tmp), [t2] "=&r" (tmp2), [sumi] "=&r" (sumi) + : [bsums] "r" (y[i].bsums), [mins] "r" (mins), [utmp] "r" (utmp) + , [s6b] "r" (x[i].scales), [kmask1] "r" (kmask1) + , [kmask2] "r" (kmask2), [kmask3] "r" (kmask3) + : "memory" + , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" + , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); + sumf -= dmin * sumi; + + const uint8_t * restrict q4 = x[i].qs; + const 
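+        // the asm above unpacked the eight (scale, min) pairs from the
+        // packed 12-byte layout and folded -dmin * sum(bsums * mins) into
+        // sumf; what follows is the 4-bit dot with one scale per 32 values.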
int8_t * restrict q8 = y[i].qs; + + sumi = 0; + const uint8_t * scale = scales; + + for (int j = 0; j < QK_K/128; ++j) { + int vl128 = 128, vl64 = 64, vl32 = 32; + __asm__ __volatile__( + "th.vsetvli zero, %[vl128], e8, m8\n\t" + "th.vlb.v v8, (%[q8])\n\t" + "th.vsetvli zero, %[vl64], e8, m4\n\t" + "th.vlb.v v0, (%[q4])\n\t" + "th.vsrl.vi v4, v0, 4\n\t" + "th.vand.vi v0, v0, 0xF\n\t" + "th.vsetvli zero, %[vl32], e8, m2\n\t" + "th.vwmul.vv v28, v6, v14\n\t" + "th.vwmul.vv v20, v4, v10\n\t" + "th.vwmul.vv v24, v2, v12\n\t" + "th.vwmul.vv v16, v0, v8\n\t" + "li %[tmp], 4\n\t" + "th.vsetvli zero, %[tmp], e32, m1\n\t" + "th.vlbu.v v1, (%[scale])\n\t" + "th.vmv.v.x v0, zero\n\t" + "th.vsetvli zero, %[vl32], e16, m4\n\t" + "th.vwredsum.vs v6, v24, v0\n\t" + "th.vwredsum.vs v7, v28, v0\n\t" + "th.vwredsum.vs v4, v16, v0\n\t" + "th.vwredsum.vs v5, v20, v0\n\t" + "th.vsetvli zero, %[tmp], e32, m1\n\t" + "th.vslideup.vi v6, v7, 1\n\t" + "th.vslideup.vi v4, v5, 1\n\t" + "th.vslideup.vi v4, v6, 2\n\t" + "th.vmul.vv v8, v4, v1\n\t" + "th.vredsum.vs v0, v8, v0\n\t" + "th.vmv.x.s %[tmp], v0\n\t" + "add %[sumi], %[sumi], %[tmp]" + : [tmp] "=&r" (tmp), [sumi] "+&r" (sumi) + : [vl128] "r" (vl128), [vl64] "r" (vl64), [vl32] "r" (vl32) + , [q4] "r" (q4), [q8] "r" (q8), [scale] "r" (scale) + : "memory" + , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" + , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); + + q4 += 64; q8 += 128; scale += 4; + } + + sumf += d * sumi; + + } + + *s = sumf; + +#elif defined __riscv_v + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + float sumf = 0; + const int vector_length = __riscv_vlenb() * 8; + + switch (vector_length) { + case 256: + for (int i = 0; i < nb; ++i) { + + size_t vl = 8; + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl); + vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl); + vint16mf2_t q8sums = __riscv_vadd_vv_i16mf2(q8sums_0, q8sums_1, vl); + + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + vuint8mf4_t mins8 = __riscv_vle8_v_u8mf4(mins, vl); + vint16mf2_t v_mins = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vzext_vf2_u16mf2(mins8, vl)); + vint32m1_t prod = __riscv_vwmul_vv_i32m1(q8sums, v_mins, vl); + + vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl); + sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi); + + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + vl = 32; + + int32_t sum_1 = 0; + int32_t sum_2 = 0; + + vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1); + + for (int j = 0; j < QK_K/64; ++j) { + // load Q4 + vuint8m1_t q4_x = __riscv_vle8_v_u8m1(q4, vl); + + // load Q8 and multiply it with lower Q4 nibble + vint8m1_t q8_0 = __riscv_vle8_v_i8m1(q8, vl); + vint8m1_t q4_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q4_x, 0x0F, vl)); + vint16m2_t qv_0 = __riscv_vwmul_vv_i16m2(q4_0, q8_0, vl); + vint16m1_t vs_0 = __riscv_vredsum_vs_i16m2_i16m1(qv_0, vzero, vl); + + sum_1 += __riscv_vmv_x_s_i16m1_i16(vs_0) * scales[2*j+0]; + 
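+                // each 32-byte chunk yields two consecutive 32-value
+                // sub-blocks: low nibbles use scales[2*j+0], high nibbles
+                // scales[2*j+1].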
+ // load Q8 and multiply it with upper Q4 nibble + vint8m1_t q8_1 = __riscv_vle8_v_i8m1(q8+32, vl); + vint8m1_t q4_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q4_x, 0x04, vl)); + vint16m2_t qv_1 = __riscv_vwmul_vv_i16m2(q4_1, q8_1, vl); + vint16m1_t vs_1 = __riscv_vredsum_vs_i16m2_i16m1(qv_1, vzero, vl); + + sum_2 += __riscv_vmv_x_s_i16m1_i16(vs_1) * scales[2*j+1]; + + q4 += 32; q8 += 64; + + } + + sumf += d*(sum_1 + sum_2); + + } + break; + case 128: + for (int i = 0; i < nb; ++i) { + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + int tmp, tmp2, sumi; + __asm__ __volatile__( + "vsetivli zero, 12, e8, m1\n\t" + "vle8.v v1, (%[s6b])\n\t" // {aux[0], aux[1], aux[2]} + "vsetivli zero, 4, e32, m1\n\t" + "vslidedown.vi v2, v1, 2\n\t" + "vmv1r.v v3, v2\n\t" + "vslideup.vi v2, v3, 1\n\t" // {aux[2], aux[2]} + "vsetivli zero, 2, e32, m1\n\t" + "vmv.v.i v4, 4\n\t" + "vand.vx v8, v1, %[kmask1]\n\t" + "vslide1up.vx v5, v4, zero\n\t" // {0, 4} + "vsrl.vi v6, v1, 6\n\t" + "vsrl.vv v7, v2, v5\n\t" + "vand.vx v0, v6, %[kmask3]\n\t" + "vand.vx v2, v7, %[kmask2]\n\t" + "vsll.vi v6, v0, 4\n\t" + "li %[t2], 8\n\t" + "addi %[t1], %[utmp], 4\n\t" + "vor.vv v1, v6, v2\n\t" + "vsse32.v v8, (%[utmp]), %[t2]\n\t" + "vsse32.v v1, (%[t1]), %[t2]\n\t" + "vsetivli zero, 8, e16, m1\n\t" + "vle32.v v2, (%[bsums])\n\t" + "vnsrl.wi v0, v2, 0\n\t" + "vnsrl.wi v1, v2, 16\n\t" + "vadd.vv v2, v0, v1\n\t" + "vle8.v v3, (%[mins])\n\t" + "vzext.vf2 v4, v3\n\t" + "vwmul.vv v6, v4, v2\n\t" + "vmv.v.x v0, zero\n\t" + "vsetivli zero, 8, e32, m2\n\t" + "vredsum.vs v0, v6, v0\n\t" + "vmv.x.s %[sumi], v0" + : [t1] "=&r" (tmp), [t2] "=&r" (tmp2), [sumi] "=&r" (sumi) + : [bsums] "r" (y[i].bsums), [mins] "r" (mins), [utmp] "r" (utmp) + , [s6b] "r" (x[i].scales), [kmask1] "r" (kmask1) + , [kmask2] "r" (kmask2), [kmask3] "r" (kmask3) + : "memory" + , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" + , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); + sumf -= dmin * sumi; + + const uint8_t * restrict q4 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + sumi = 0; + const uint8_t * scale = scales; + + for (int j = 0; j < QK_K/128; ++j) { + int vl128 = 128, vl64 = 64, vl32 = 32; + __asm__ __volatile__( + "vsetvli zero, %[vl128], e8, m8\n\t" + "vle8.v v8, (%[q8])\n\t" + "vsetvli zero, %[vl64], e8, m4\n\t" + "vle8.v v0, (%[q4])\n\t" + "vsrl.vi v4, v0, 4\n\t" + "vand.vi v0, v0, 0xF\n\t" + "vsetvli zero, %[vl32], e8, m2\n\t" + "vwmul.vv v28, v6, v14\n\t" + "vwmul.vv v20, v4, v10\n\t" + "vwmul.vv v24, v2, v12\n\t" + "vwmul.vv v16, v0, v8\n\t" + "vsetivli zero, 4, e32, m1\n\t" + "vle8.v v2, (%[scale])\n\t" + "vmv.v.x v0, zero\n\t" + "vzext.vf4 v1, v2\n\t" + "vsetvli zero, %[vl32], e16, m4\n\t" + "vwredsum.vs v6, v24, v0\n\t" + "vwredsum.vs v7, v28, v0\n\t" + "vwredsum.vs v4, v16, v0\n\t" + "vwredsum.vs v5, v20, v0\n\t" + "vsetivli zero, 4, e32, m1\n\t" + "vslideup.vi v6, v7, 1\n\t" + "vslideup.vi v4, v5, 1\n\t" + "vslideup.vi v4, v6, 2\n\t" + "vmul.vv v8, v4, v1\n\t" + "vredsum.vs v0, v8, v0\n\t" + "vmv.x.s %[tmp], v0\n\t" + "add %[sumi], %[sumi], %[tmp]" + : [tmp] "=&r" (tmp), [sumi] "+&r" (sumi) + : [vl128] "r" (vl128), [vl64] "r" (vl64), [vl32] "r" (vl32) + , [q4] "r" (q4), [q8] "r" (q8), [scale] "r" (scale) + : "memory" + , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" 
+ , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" + , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); + + q4 += 64; q8 += 128; scale += 4; + } + + sumf += d * sumi; + } + break; + default: + assert(false && "Unsupported vector length"); + break; + } + + *s = sumf; + +#else + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + for (int j = 0; j < QK_K/64; ++j) { + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); + a += 32; + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); + a += 32; q4 += 32; + } + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/32; ++j) { + int32_t scale = scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + uint32_t utmp[4]; + +#if defined __riscv_v + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + float sumf = 0; + float sums = 0.0; + + size_t vl; + + for (int i = 0; i < nb; ++i) { + + vl = 8; + + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; + + vint16m1_t q8sums_0 = __riscv_vlse16_v_i16m1(y[i].bsums, 4, vl); + vint16m1_t q8sums_1 = __riscv_vlse16_v_i16m1(y[i].bsums+1, 4, vl); + vint16m1_t q8sums = __riscv_vadd_vv_i16m1(q8sums_0, q8sums_1, vl); + + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const 
uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + vuint8mf2_t mins8 = __riscv_vle8_v_u8mf2(mins, vl); + vint16m1_t v_mins = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(mins8, vl)); + vint32m2_t prod = __riscv_vwmul_vv_i32m2(q8sums, v_mins, vl); + + vint32m1_t sumi = __riscv_vredsum_vs_i32m2_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl); + sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi); + + vl = 32; + int32_t aux32 = 0; + int is = 0; + + uint8_t m = 1; + vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); + vuint8m2_t vqh = __riscv_vle8_v_u8m2(hm, vl); + + for (int j = 0; j < QK_K/64; ++j) { + // load Q5 and Q8 + vuint8m2_t q5_x = __riscv_vle8_v_u8m2(q5, vl); + vint8m2_t q8_y1 = __riscv_vle8_v_i8m2(q8, vl); + vint8m2_t q8_y2 = __riscv_vle8_v_i8m2(q8+32, vl); + + // compute mask for addition + vint8m2_t q5_a = __riscv_vreinterpret_v_u8m2_i8m2(__riscv_vand_vx_u8m2(q5_x, 0x0F, vl)); + vuint8m2_t qh_m1 = __riscv_vand_vx_u8m2(vqh, m, vl); + vbool4_t vmask_1 = __riscv_vmsne_vx_u8m2_b4(qh_m1, 0, vl); + vint8m2_t q5_m1 = __riscv_vadd_vx_i8m2_mu(vmask_1, q5_a, q5_a, 16, vl); + m <<= 1; + + vint8m2_t q5_l = __riscv_vreinterpret_v_u8m2_i8m2(__riscv_vsrl_vx_u8m2(q5_x, 0x04, vl)); + vuint8m2_t qh_m2 = __riscv_vand_vx_u8m2(vqh, m, vl); + vbool4_t vmask_2 = __riscv_vmsne_vx_u8m2_b4(qh_m2, 0, vl); + vint8m2_t q5_m2 = __riscv_vadd_vx_i8m2_mu(vmask_2, q5_l, q5_l, 16, vl); + m <<= 1; + + vint16m4_t v0 = __riscv_vwmul_vv_i16m4(q5_m1, q8_y1, vl); + vint16m4_t v1 = __riscv_vwmul_vv_i16m4(q5_m2, q8_y2, vl); + + vint32m8_t vs1 = __riscv_vwmul_vx_i32m8(v0, scales[is++], vl); + vint32m8_t vs2 = __riscv_vwmul_vx_i32m8(v1, scales[is++], vl); + + vint32m1_t vacc1 = __riscv_vredsum_vs_i32m8_i32m1(vs1, vzero, vl); + vint32m1_t vacc2 = __riscv_vredsum_vs_i32m8_i32m1(vs2, vacc1, vl); + + aux32 += __riscv_vmv_x_s_i32m1_i32(vacc2); + q5 += 32; q8 += 64; + + } + + sums += aux32 * d; + + } + + *s = sumf+sums; + +#else + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + uint8_t m = 1; + for (int j = 0; j < QK_K/64; ++j) { + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); + for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); + for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 
16 : 0); + a += 32; m <<= 1; + q4 += 32; + } + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/32; ++j) { + int32_t scale = scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q6_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined __riscv_xtheadvector + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + + const uint8_t * restrict q6 = x[i].ql; + const uint8_t * restrict qh = x[i].qh; + const int8_t * restrict q8 = y[i].qs; + + const int8_t * restrict scale = x[i].scales; + + int sum_t = 0; + int t0; + + for (int j = 0; j < QK_K/128; ++j) { + __asm__ __volatile__( + "th.vsetvli zero, %[vl32], e8, m2\n\t" // vl == 32 + "th.vlb.v v4, (%[qh])\n\t" + "th.vsll.vi v0, v4, 4\n\t" + "th.vsll.vi v2, v4, 2\n\t" + "th.vsrl.vi v6, v4, 2\n\t" + "th.vsetvli zero, %[vl64], e8, m4\n\t" // vl == 64 + "th.vlb.v v8, (%[q6])\n\t" + "th.vsrl.vi v12, v8, 4\n\t" + "th.vand.vi v8, v8, 0xF\n\t" + "th.vsetvli zero, %[vl128], e8, m8\n\t" // vl == 128 + "th.vand.vx v0, v0, %[mask]\n\t" + "th.vor.vv v8, v8, v0\n\t" + "th.vlb.v v0, (%[q8])\n\t" + "th.vsub.vx v8, v8, %[vl32]\n\t" + "th.vsetvli zero, %[vl64], e8, m4\n\t" // vl == 64 + "th.vwmul.vv v16, v0, v8\n\t" + "th.vwmul.vv v24, v4, v12\n\t" + "li %[t0], 16\n\t" + "th.vsetvli zero, %[t0], e16, m2\n\t" // vl == 16 + "th.vmv.v.x v0, zero\n\t" + "th.vwredsum.vs v10, v16, v0\n\t" + "th.vwredsum.vs v9, v18, v0\n\t" + "th.vwredsum.vs v8, v20, v0\n\t" + "th.vwredsum.vs v7, v22, v0\n\t" + "th.vwredsum.vs v11, v24, v0\n\t" + "th.vwredsum.vs v12, v26, v0\n\t" + "th.vwredsum.vs v13, v28, v0\n\t" + "th.vwredsum.vs v14, v30, v0\n\t" + "li %[t0], 4\n\t" + "th.vsetvli zero, %[t0], e32, m1\n\t" // vl == 4 + "th.vslideup.vi v10, v9, 1\n\t" + "th.vslideup.vi v8, v7, 1\n\t" + "th.vslideup.vi v11, v12, 1\n\t" + "th.vslideup.vi v13, v14, 1\n\t" + "th.vslideup.vi v10, v8, 2\n\t" + "th.vslideup.vi v11, v13, 2\n\t" + "li %[t0], 8\n\t" + "th.vsetvli zero, %[t0], e32, m2\n\t" // vl == 8 + "th.vlb.v v4, (%[scale])\n\t" + "th.vmul.vv v2, v4, v10\n\t" + "th.vredsum.vs v0, v2, v0\n\t" + 
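+                // %[vl32] does double duty in this template: it is the
+                // vector length for the 32-byte loads and the constant 32
+                // that recenters the 6-bit values to the signed -32..31.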
"th.vmv.x.s %[t0], v0\n\t" + "add %[sumi], %[sumi], %[t0]" + : [sumi] "+&r" (sum_t), [t0] "=&r" (t0) + : [qh] "r" (qh), [q6] "r" (q6), [q8] "r" (q8), [scale] "r" (scale) + , [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128) + , [mask] "r" (0x30) + : "memory" + , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" + , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); + q6 += 64; qh += 32; q8 += 128; scale += 8; + } + + sumf += d * sum_t; + + } + + *s = sumf; + +#elif defined __riscv_v + + float sumf = 0; + const int vector_length = __riscv_vlenb() * 8; + + switch (vector_length) { + case 256: + for (int i = 0; i < nb; ++i) { + + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + + const uint8_t * GGML_RESTRICT q6 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + const int8_t * GGML_RESTRICT scale = x[i].scales; + + size_t vl; + + vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); + + int sum_t = 0; + int is = 0; + + for (int j = 0; j < QK_K/128; ++j) { + + vl = 32; + + // load qh + vuint8m1_t qh_x = __riscv_vle8_v_u8m1(qh, vl); + + // load Q6 + vuint8m1_t q6_0 = __riscv_vle8_v_u8m1(q6, vl); + vuint8m1_t q6_1 = __riscv_vle8_v_u8m1(q6+32, vl); + + vuint8m1_t q6a_0 = __riscv_vand_vx_u8m1(q6_0, 0x0F, vl); + vuint8m1_t q6a_1 = __riscv_vand_vx_u8m1(q6_1, 0x0F, vl); + vuint8m1_t q6s_0 = __riscv_vsrl_vx_u8m1(q6_0, 0x04, vl); + vuint8m1_t q6s_1 = __riscv_vsrl_vx_u8m1(q6_1, 0x04, vl); + + vuint8m1_t qh_0 = __riscv_vand_vx_u8m1(qh_x, 0x03, vl); + vuint8m1_t qh_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x2, vl), 0x03 , vl); + vuint8m1_t qh_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x4, vl), 0x03 , vl); + vuint8m1_t qh_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x6, vl), 0x03 , vl); + + vuint8m1_t qhi_0 = __riscv_vor_vv_u8m1(q6a_0, __riscv_vsll_vx_u8m1(qh_0, 0x04, vl), vl); + vuint8m1_t qhi_1 = __riscv_vor_vv_u8m1(q6a_1, __riscv_vsll_vx_u8m1(qh_1, 0x04, vl), vl); + vuint8m1_t qhi_2 = __riscv_vor_vv_u8m1(q6s_0, __riscv_vsll_vx_u8m1(qh_2, 0x04, vl), vl); + vuint8m1_t qhi_3 = __riscv_vor_vv_u8m1(q6s_1, __riscv_vsll_vx_u8m1(qh_3, 0x04, vl), vl); + + vint8m1_t a_0 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_0), 32, vl); + vint8m1_t a_1 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_1), 32, vl); + vint8m1_t a_2 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_2), 32, vl); + vint8m1_t a_3 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_3), 32, vl); + + // load Q8 and take product + vint16m2_t va_q_0 = __riscv_vwmul_vv_i16m2(a_0, __riscv_vle8_v_i8m1(q8, vl), vl); + vint16m2_t va_q_1 = __riscv_vwmul_vv_i16m2(a_1, __riscv_vle8_v_i8m1(q8+32, vl), vl); + vint16m2_t va_q_2 = __riscv_vwmul_vv_i16m2(a_2, __riscv_vle8_v_i8m1(q8+64, vl), vl); + vint16m2_t va_q_3 = __riscv_vwmul_vv_i16m2(a_3, __riscv_vle8_v_i8m1(q8+96, vl), vl); + + vl = 16; + + vint32m2_t vaux_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 0), scale[is+0], vl); + vint32m2_t vaux_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 1), scale[is+1], vl); + vint32m2_t vaux_2 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 0), scale[is+2], vl); + vint32m2_t vaux_3 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 1), scale[is+3], vl); + vint32m2_t vaux_4 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 0), scale[is+4], vl); + vint32m2_t 
vaux_5 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 1), scale[is+5], vl); + vint32m2_t vaux_6 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 0), scale[is+6], vl); + vint32m2_t vaux_7 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 1), scale[is+7], vl); + + vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_0, vaux_1, vl), vzero, vl); + vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_2, vaux_3, vl), isum0, vl); + vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_4, vaux_5, vl), isum1, vl); + vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_6, vaux_7, vl), isum2, vl); + + sum_t += __riscv_vmv_x_s_i32m1_i32(isum3); + + q6 += 64; qh += 32; q8 += 128; is=8; + + } + + sumf += d * sum_t; + + } + break; + case 128: + for (int i = 0; i < nb; ++i) { + + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + + const uint8_t * restrict q6 = x[i].ql; + const uint8_t * restrict qh = x[i].qh; + const int8_t * restrict q8 = y[i].qs; + + const int8_t * restrict scale = x[i].scales; + + int sum_t = 0; + int t0; + + for (int j = 0; j < QK_K/128; ++j) { + __asm__ __volatile__( + "vsetvli zero, %[vl32], e8, m2\n\t" + "vle8.v v4, (%[qh])\n\t" + "vsll.vi v0, v4, 4\n\t" + "vsll.vi v2, v4, 2\n\t" + "vsrl.vi v6, v4, 2\n\t" + "vsetvli zero, %[vl64], e8, m4\n\t" + "vle8.v v8, (%[q6])\n\t" + "vsrl.vi v12, v8, 4\n\t" + "vand.vi v8, v8, 0xF\n\t" + "vsetvli zero, %[vl128], e8, m8\n\t" + "vand.vx v0, v0, %[mask]\n\t" + "vor.vv v8, v8, v0\n\t" + "vle8.v v0, (%[q8])\n\t" + "vsub.vx v8, v8, %[vl32]\n\t" + "vsetvli zero, %[vl64], e8, m4\n\t" + "vwmul.vv v16, v0, v8\n\t" + "vwmul.vv v24, v4, v12\n\t" + "vsetivli zero, 16, e16, m2\n\t" + "vmv.v.x v0, zero\n\t" + "vwredsum.vs v10, v16, v0\n\t" + "vwredsum.vs v9, v18, v0\n\t" + "vwredsum.vs v8, v20, v0\n\t" + "vwredsum.vs v7, v22, v0\n\t" + "vwredsum.vs v11, v24, v0\n\t" + "vwredsum.vs v12, v26, v0\n\t" + "vwredsum.vs v13, v28, v0\n\t" + "vwredsum.vs v14, v30, v0\n\t" + "vsetivli zero, 4, e32, m1\n\t" + "vslideup.vi v10, v9, 1\n\t" + "vslideup.vi v8, v7, 1\n\t" + "vslideup.vi v11, v12, 1\n\t" + "vslideup.vi v13, v14, 1\n\t" + "vslideup.vi v10, v8, 2\n\t" + "vslideup.vi v11, v13, 2\n\t" + "vsetivli zero, 8, e32, m2\n\t" + "vle8.v v2, (%[scale])\n\t" + "vsext.vf4 v4, v2\n\t" + "vmul.vv v2, v4, v10\n\t" + "vredsum.vs v0, v2, v0\n\t" + "vmv.x.s %[t0], v0\n\t" + "add %[sumi], %[sumi], %[t0]" + : [sumi] "+&r" (sum_t), [t0] "=&r" (t0) + : [qh] "r" (qh), [q6] "r" (q6), [q8] "r" (q8), [scale] "r" (scale) + , [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128) + , [mask] "r" (0x30) + : "memory" + , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" + , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); + q6 += 64; qh += 32; q8 += 128; scale += 8; + } + + sumf += d * sum_t; + + } + break; + default: + assert(false && "Unsupported vector length"); + break; + } + + *s = sumf; + +#else + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; 
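+            // reconstruct each 6-bit value: low 4 bits from ql, two high
+            // bits from qh, minus the common 32 offset; e.g. ql nibble 15
+            // with qh bits 3 gives (15 | 48) - 32 = 31.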
++l) {
+                a[l +  0] = (int8_t)((q4[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
+                a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
+                a[l + 64] = (int8_t)((q4[l +  0] >>  4) | (((qh[l] >> 4) & 3) << 4)) - 32;
+                a[l + 96] = (int8_t)((q4[l + 32] >>  4) | (((qh[l] >> 6) & 3) << 4)) - 32;
+            }
+            a  += 128;
+            q4 += 64;
+            qh += 32;
+        }
+        a = aux8;
+        int is = 0;
+        for (int j = 0; j < QK_K/16; ++j) {
+            int scale = x[i].scales[is++];
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
+            q8 += 8; a += 8;
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
+            q8 += 8; a += 8;
+        }
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
+        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
+    }
+    for (int l = 0; l < 8; ++l) sumf += sums[l];
+    *s = sumf;
+#endif
+}
+
diff --git a/ggml/src/ggml-cpu/arch/riscv/repack.cpp b/ggml/src/ggml-cpu/arch/riscv/repack.cpp
new file mode 100644
index 0000000000000..45c91a694820a
--- /dev/null
+++ b/ggml/src/ggml-cpu/arch/riscv/repack.cpp
@@ -0,0 +1,397 @@
+#define GGML_COMMON_IMPL_CPP
+#define GGML_COMMON_DECL_CPP
+#include "ggml-common.h"
+#include "ggml-backend-impl.h"
+
+#include "ggml-impl.h"
+#include "ggml-cpu.h"
+#include "ggml-cpu-impl.h"
+#include "simd-mappings.h"
+#include "traits.h"
+
+#include <cmath>
+#include <cstring>
+#include <cassert>
+#include <cstdlib> // for qsort
+#include <cstdio>  // for GGML_ASSERT
+
+#define GGML_CPU_CLANG_WORKAROUND
+#include "../../repack.h"
+
+#if defined(__GNUC__)
+#pragma GCC diagnostic ignored "-Woverlength-strings"
+#endif
+
+#define UNUSED GGML_UNUSED
+
+void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 8;
+    const int blocklen = 8;
+
+    assert (n % qk == 0);
+    assert (nc % ncols_interleaved == 0);
+
+    UNUSED(s);
+    UNUSED(bs);
+    UNUSED(vx);
+    UNUSED(vy);
+    UNUSED(nr);
+    UNUSED(nc);
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+#if defined __riscv_v
+    if (__riscv_vlenb() >= QK4_0) {
+        const size_t vl = QK4_0;
+
+        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
+
+            vfloat32m1_t sumf = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
+            for (int l = 0; l < nb; l++) {
+                const int64_t a0 = *(const int64_t *)&a_ptr[l].qs[0];
+                const int64_t a1 = *(const int64_t *)&a_ptr[l].qs[8];
+                const int64_t a2 = *(const int64_t *)&a_ptr[l].qs[16];
+                const int64_t a3 = *(const int64_t *)&a_ptr[l].qs[24];
+                __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment constraints
+                const vint8m2_t lhs_0_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a0, vl / 4));
+                const vint8m2_t lhs_1_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a1, vl / 4));
+                const vint8m2_t lhs_2_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a2, vl / 4));
+                const vint8m2_t lhs_3_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a3, vl / 4));
+
+                const vint8m4_t rhs_raw_vec = __riscv_vle8_v_i8m4((const int8_t *)b_ptr[l].qs, vl * 4);
+                const vint8m4_t rhs_vec_lo = __riscv_vsra_vx_i8m4(__riscv_vsll_vx_i8m4(rhs_raw_vec, 4, vl * 4), 4, vl * 4);
+                const vint8m4_t rhs_vec_hi = __riscv_vsra_vx_i8m4(rhs_raw_vec, 4, vl * 4);
+                const vint8m2_t rhs_vec_lo_0 =
__riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 0); + const vint8m2_t rhs_vec_lo_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 1); + const vint8m2_t rhs_vec_hi_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 0); + const vint8m2_t rhs_vec_hi_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 1); + + const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2); + const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2); + const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2); + const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2); + + const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_hi_m)); + const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl); + const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl); + const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl); + const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2); + const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2); + const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2); + const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2); + const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4); + const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4)); + const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4)); + const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4); + const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4); + + // vector version needs Zvfhmin extension + const float a_scale = GGML_CPU_FP16_TO_FP32(a_ptr[l].d); + const float b_scales[8] = { + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[0]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[1]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[2]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[3]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[4]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[5]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[6]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[7]) + }; + const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4); + const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scale, vl / 4); + sumf = __riscv_vfmacc_vv_f32m1(sumf, tmp1, b_scales_vec, vl / 4); + } + __riscv_vse32_v_f32m1(s + x * ncols_interleaved, sumf, vl / 4); + } + return; + } + +#endif + { + float sumf[8]; + int sumi; + + const block_q8_0 * a_ptr = (const block_q8_0 *) vy; + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb); + + for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; + } + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); + } + } + } + for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; + } + } +} + +void 
ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 8; + const int blocklen = 8; + + assert (n % qk == 0); + assert (nr % 4 == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + +#if defined __riscv_v + if (__riscv_vlenb() >= QK4_0) { + const size_t vl = QK4_0; + + for (int y = 0; y < nr / 4; y++) { + const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb); + vfloat32m1_t sumf0 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4); + vfloat32m1_t sumf1 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4); + vfloat32m1_t sumf2 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4); + vfloat32m1_t sumf3 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4); + for (int l = 0; l < nb; l++) { + const vint8m4_t rhs_raw_vec = __riscv_vle8_v_i8m4((const int8_t *)b_ptr[l].qs, vl * 4); + const vint8m4_t rhs_vec_lo = __riscv_vsra_vx_i8m4(__riscv_vsll_vx_i8m4(rhs_raw_vec, 4, vl * 4), 4, vl * 4); + const vint8m4_t rhs_vec_hi = __riscv_vsra_vx_i8m4(rhs_raw_vec, 4, vl * 4); + const vint8m2_t rhs_vec_lo_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 0); + const vint8m2_t rhs_vec_lo_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 1); + const vint8m2_t rhs_vec_hi_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 0); + const vint8m2_t rhs_vec_hi_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 1); + + // vector version needs Zvfhmin extension + const float a_scales[4] = { + GGML_CPU_FP16_TO_FP32(a_ptr[l].d[0]), + GGML_CPU_FP16_TO_FP32(a_ptr[l].d[1]), + GGML_CPU_FP16_TO_FP32(a_ptr[l].d[2]), + GGML_CPU_FP16_TO_FP32(a_ptr[l].d[3]) + }; + const float b_scales[8] = { + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[0]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[1]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[2]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[3]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[4]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[5]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[6]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[7]) + }; + const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4); + + const int64_t A0 = *(const int64_t *)&a_ptr[l].qs[0]; + const int64_t A4 = *(const int64_t *)&a_ptr[l].qs[32]; + const int64_t A8 = *(const int64_t *)&a_ptr[l].qs[64]; + const int64_t Ac = *(const int64_t *)&a_ptr[l].qs[96]; + __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment + vint16m4_t sumi_l0; + { + const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A0, vl / 4)); + const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A4, vl / 4)); + const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A8, vl / 4)); + const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ac, vl / 4)); + const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2); + const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2); + const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2); + const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2); + + sumi_l0 = sumi_hi_m; + } + + { + const vuint32m4_t sumi_i32 = 
__riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l0)); + const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl); + const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl); + const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl); + const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2); + const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2); + const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2); + const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2); + const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4); + const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4)); + const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4)); + const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4); + const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4); + + const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[0], vl / 4); + sumf0 = __riscv_vfmacc_vv_f32m1(sumf0, tmp1, b_scales_vec, vl / 4); + } + + const int64_t A1 = *(const int64_t *)&a_ptr[l].qs[8]; + const int64_t A5 = *(const int64_t *)&a_ptr[l].qs[40]; + const int64_t A9 = *(const int64_t *)&a_ptr[l].qs[72]; + const int64_t Ad = *(const int64_t *)&a_ptr[l].qs[104]; + __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment + vint16m4_t sumi_l1; + { + const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A1, vl / 4)); + const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A5, vl / 4)); + const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A9, vl / 4)); + const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ad, vl / 4)); + const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2); + const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2); + const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2); + const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2); + + sumi_l1 = sumi_hi_m; + } + + { + const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l1)); + const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl); + const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl); + const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl); + const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2); + const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2); + const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2); + const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2); + const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4); + const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4)); + const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4)); + const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4); + const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4); 
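+                        // facc holds the eight per-column dot products for this activation
+                        // row, converted to f32; the multiply and FMA below fold in the
+                        // row's activation scale and the eight per-column weight scales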
+ + const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[1], vl / 4); + sumf1 = __riscv_vfmacc_vv_f32m1(sumf1, tmp1, b_scales_vec, vl / 4); + } + + const int64_t A2 = *(const int64_t *)&a_ptr[l].qs[16]; + const int64_t A6 = *(const int64_t *)&a_ptr[l].qs[48]; + const int64_t Aa = *(const int64_t *)&a_ptr[l].qs[80]; + const int64_t Ae = *(const int64_t *)&a_ptr[l].qs[112]; + __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment + vint16m4_t sumi_l2; + { + const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A2, vl / 4)); + const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A6, vl / 4)); + const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Aa, vl / 4)); + const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ae, vl / 4)); + const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2); + const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2); + const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2); + const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2); + + sumi_l2 = sumi_hi_m; + } + + { + const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l2)); + const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl); + const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl); + const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl); + const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2); + const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2); + const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2); + const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2); + const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4); + const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4)); + const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4)); + const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4); + const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4); + + const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[2], vl / 4); + sumf2 = __riscv_vfmacc_vv_f32m1(sumf2, tmp1, b_scales_vec, vl / 4); + } + + const int64_t A3 = *(const int64_t *)&a_ptr[l].qs[24]; + const int64_t A7 = *(const int64_t *)&a_ptr[l].qs[56]; + const int64_t Ab = *(const int64_t *)&a_ptr[l].qs[88]; + const int64_t Af = *(const int64_t *)&a_ptr[l].qs[120]; + __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment + vint16m4_t sumi_l3; + { + const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A3, vl / 4)); + const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A7, vl / 4)); + const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ab, vl / 4)); + const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Af, vl / 4)); + const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2); + const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2); + const 
vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
+                        const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
+
+                        sumi_l3 = sumi_hi_m;
+                    }
+
+                    {
+                        const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l3));
+                        const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
+                        const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
+                        const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
+                        const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
+                        const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
+                        const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
+                        const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
+                        const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
+                        const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
+                        const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
+                        const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
+                        const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
+
+                        const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[3], vl / 4);
+                        sumf3 = __riscv_vfmacc_vv_f32m1(sumf3, tmp1, b_scales_vec, vl / 4);
+                    }
+                }
+                __riscv_vse32_v_f32m1(&s[(y * 4 + 0) * bs + x * ncols_interleaved], sumf0, vl / 4);
+                __riscv_vse32_v_f32m1(&s[(y * 4 + 1) * bs + x * ncols_interleaved], sumf1, vl / 4);
+                __riscv_vse32_v_f32m1(&s[(y * 4 + 2) * bs + x * ncols_interleaved], sumf2, vl / 4);
+                __riscv_vse32_v_f32m1(&s[(y * 4 + 3) * bs + x * ncols_interleaved], sumf3, vl / 4);
+            }
+        }
+
+        return;
+    }
+
+#endif // __riscv_v
+    float sumf[4][8];
+    int sumi;
+
+    for (int y = 0; y < nr / 4; y++) {
+        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
+            }
+            for (int l = 0; l < nb; l++) {
+                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                    for (int m = 0; m < 4; m++) {
+                        for (int j = 0; j < ncols_interleaved; j++) {
+                            sumi = 0;
+                            for (int i = 0; i < blocklen; ++i) {
+                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
+                                const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
+                                sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
+                                         (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
+                            }
+                            sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
+                        }
+                    }
+                }
+            }
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++)
+                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
+            }
+        }
+    }
+}
diff --git a/ggml/src/ggml-cpu/arch/s390/quants.c b/ggml/src/ggml-cpu/arch/s390/quants.c
new file mode 100644
index 0000000000000..a840219a4fc08
--- /dev/null
+++ b/ggml/src/ggml-cpu/arch/s390/quants.c
@@ -0,0 +1,1300 @@
+#define GGML_COMMON_IMPL_C
+#include "ggml-common.h"
+#include "ggml-quants.h"
+#include "ggml-impl.h"
+#include "ggml-cpu.h"
+#include "simd-mappings.h"
+
+#include "../../quants.h"
+#include "../../ggml-cpu-impl.h"
+
+#include <math.h>
+#include <string.h>
+#include <assert.h>
+#include <float.h>
+#include <stdlib.h> // for qsort
+#include <stdio.h>  // for GGML_ASSERT
+
+#define GROUP_MAX_EPS 1e-15f
+#define GROUP_MAX_EPS_IQ3_XXS 1e-8f
+#define GROUP_MAX_EPS_IQ2_S 1e-8f
+#define GROUP_MAX_EPS_IQ1_M 1e-7f
+#define GROUP_MAX_EPS_IQ1_S 1e-12f
+
+#define UNUSED GGML_UNUSED
+
+void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
+    assert(QK8_0 == 32);
+    assert(k % QK8_0 == 0);
+    const int nb = k / QK8_0;
+
+    block_q8_0 * GGML_RESTRICT y = vy;
+
+#if defined(__VXE__) || defined(__VXE2__)
+    for (int i = 0; i < nb; i++) {
+        __vector float srcv [8];
+        __vector float asrcv[8];
+        __vector float amaxv[8];
+
+        for (int j = 0; j < 8; j++) srcv[j]  = vec_xl(0, x + i*32 + 4*j);
+        for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
+        for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]);
+        for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]);
+        for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]);
+
+        const float amax = MAX(MAX(vec_extract(amaxv[0], 0),
+                                   vec_extract(amaxv[0], 1)),
+                               MAX(vec_extract(amaxv[0], 2),
+                                   vec_extract(amaxv[0], 3)));
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ?
1.0f / d : 0.0f; + + y[i].d = GGML_CPU_FP32_TO_FP16(d); + + for (int j = 0; j < 8; j++) { + const __vector float v = vec_mul(srcv[j], vec_splats(id)); + const __vector int32_t vi = vec_signed(v); + + y[i].qs[4*j + 0] = vec_extract(vi, 0); + y[i].qs[4*j + 1] = vec_extract(vi, 1); + y[i].qs[4*j + 2] = vec_extract(vi, 2); + y[i].qs[4*j + 3] = vec_extract(vi, 3); + } + } +#else + GGML_UNUSED(nb); + // scalar + quantize_row_q8_0_ref(x, y, k); +#endif +} + +void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(k % QK8_1 == 0); + const int nb = k / QK8_1; + + block_q8_1 * GGML_RESTRICT y = vy; + +#if defined(__VXE__) || defined(__VXE2__) + for (int i = 0; i < nb; i++) { + __vector float srcv [8]; + __vector float asrcv[8]; + __vector float amaxv[8]; + + for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]); + for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]); + + const float amax = MAX(MAX(vec_extract(amaxv[0], 0), + vec_extract(amaxv[0], 1)), + MAX(vec_extract(amaxv[0], 2), + vec_extract(amaxv[0], 3))); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f / d : 0.0f; + + y[i].d = GGML_CPU_FP32_TO_FP16(d); + + __vector int32_t acc = vec_splats(0); + + for (int j = 0; j < 8; j++) { + const __vector float v = vec_mul(srcv[j], vec_splats(id)); + const __vector int32_t vi = vec_signed(v); + + y[i].qs[4*j + 0] = vec_extract(vi, 0); + y[i].qs[4*j + 1] = vec_extract(vi, 1); + y[i].qs[4*j + 2] = vec_extract(vi, 2); + y[i].qs[4*j + 3] = vec_extract(vi, 3); + + acc = vec_add(acc, vi); + } + + y[i].s = GGML_CPU_FP32_TO_FP16(d * (acc[0] + acc[1] + acc[2] + acc[3])); + } +#else + GGML_UNUSED(nb); + // scalar + quantize_row_q8_1_ref(x, y, k); +#endif +} + + +//===================================== Dot products ================================= + +void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0; + +#if defined(__VXE__) || defined(__VXE2__) + __vector float acc = vec_splats(0.0f); + + const __vector uint8_t v_m = vec_splats((const uint8_t)0x0F); + const __vector int8_t v_s = vec_splats( (const int8_t)0x08); + + for (; ib < nb; ++ib) { + const __vector uint8_t v_x = vec_xl(0, x[ib].qs); + const __vector int8_t v_xl = (const __vector int8_t)(v_x & v_m); + const __vector int8_t v_xh = (const __vector int8_t)(v_x >> 4); + + const __vector int8_t v_xls = vec_sub(v_xl, v_s); + const __vector int8_t v_xhs = vec_sub(v_xh, v_s); + + const __vector int8_t v_yl = vec_xl(0 , y[ib].qs); + const __vector int8_t v_yh = vec_xl(QK8_0/2, y[ib].qs); + + const __vector int16_t v_xylso = vec_mulo(v_xls, v_yl); + const __vector int16_t v_xylse = vec_mule(v_xls, v_yl); + const __vector int16_t v_xyhso = vec_mulo(v_xhs, v_yh); + const __vector int16_t v_xyhse = vec_mule(v_xhs, v_yh); + + __vector int16_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_); + + const __vector float v_xy = vec_float(vec_unpackh(v_xy_)); + 
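+        // v_xy holds four widened partial sums as f32; multiplying by the
+        // product of the two fp16 block scales dequantizes them before they
+        // are accumulated into acc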
const __vector float v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); + + acc = vec_madd(v_xy, v_d, acc); + } + + sumf = acc[0] + acc[1] + acc[2] + acc[3]; + +#endif + for (; ib < nb; ++ib) { + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[ib].qs[j] & 0x0F) - 8; + const int v1 = (x[ib].qs[j] >> 4) - 8; + + sumi0 += (v0 * y[ib].qs[j]); + sumi1 += (v1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d); + } + + *s = sumf; +} + +void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_1; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_1 * GGML_RESTRICT x = vx; + const block_q8_1 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0; + +#if defined(__VXE__) || defined(__VXE2__) + float summs = 0; + float32x4_t acc = vec_splats(0.0f); + + const uint8x16_t v_m = vec_splat_u8(0x0F); + +#pragma GCC unroll 4 + for (; ib < nb; ++ib) { + __builtin_prefetch(x[ib].qs, 0, 1); + __builtin_prefetch(y[ib].qs, 0, 1); + + summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s); + + const uint8x16_t v_x = vec_xl(0, x[ib].qs); + const int8x16_t v_xl = (const int8x16_t)(v_x & v_m); + const int8x16_t v_xh = (const int8x16_t)(v_x >> 4); + + const int8x16_t v_yl = vec_xl(0 , y[ib].qs); + const int8x16_t v_yh = vec_xl(QK8_1/2, y[ib].qs); + + const int32x4_t v_xy_ = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh); + const float32x4_t v_xy = vec_float(v_xy_); + + const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); + + acc = vec_madd(v_xy, v_d, acc); + } + + sumf = acc[0] + acc[1] + acc[2] + acc[3] + summs; + +#endif + for (; ib < nb; ++ib) { + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[ib].qs[j] & 0x0F); + const int v1 = (x[ib].qs[j] >> 4); + + sumi0 += (v0 * y[ib].qs[j]); + sumi1 += (v1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); + } + + *s = sumf; +} + +void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q8_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0; + +#if defined(__VXE__) || defined(__VXE2__) + __vector float acc = vec_splats(0.0f); + +#pragma GCC unroll 8 + for (; ib < nb; ++ib) { + __builtin_prefetch(x[ib].qs, 0, 1); + __builtin_prefetch(y[ib].qs, 0, 1); + + const int8x16_t v_xl = vec_xl(0 , x[ib].qs); + const int8x16_t v_xh = vec_xl(QK8_0/2, x[ib].qs); + const int8x16_t v_yl = vec_xl(0 , y[ib].qs); + const int8x16_t v_yh = vec_xl(QK8_0/2, y[ib].qs); + + const int32x4_t v_xy_ = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh); + const float32x4_t v_xy = vec_float(v_xy_); + const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); + + acc = vec_madd(v_xy, 
v_d, acc);
+    }
+
+    sumf = acc[0] + acc[1] + acc[2] + acc[3];
+
+#endif
+    for (; ib < nb; ++ib) {
+        int sumi = 0;
+
+        for (int j = 0; j < qk; j++) {
+            sumi += x[ib].qs[j]*y[ib].qs[j];
+        }
+
+        sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
+    }
+
+    *s = sumf;
+}
+
+void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const uint32_t kmask1 = 0x03030303;
+    const uint32_t kmask2 = 0x0f0f0f0f;
+
+    const block_q3_K * GGML_RESTRICT x = vx;
+    const block_q8_K * GGML_RESTRICT y = vy;
+
+    const int nb = n / QK_K;
+
+#if defined(__VXE__) || defined(__VXE2__)
+    uint32_t aux[3];
+    uint32_t utmp[4];
+
+    const int32x4_t v_z = vec_splat_s32(0);
+    const uint8x16_t v_3m = vec_splat_u8(0x03);
+
+    const uint8x16_t v_0c = vec_splat_u8(1);
+    const uint8x16_t v_1c = vec_sl(v_0c, 1);
+    const uint8x16_t v_2c = vec_sl(v_0c, 2);
+    const uint8x16_t v_3c = vec_sl(v_0c, 3);
+
+    uint8x16_t q3h[4];
+    uint8x16_t q3b[2];
+    int8x16_t q3bytes[4];
+    int8x16_t q8bytes[8];
+    uint8x16_t qhbits[2];
+
+    float sum = 0;
+
+    for (int i = 0; i < nb; ++i) {
+        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+
+        const uint8_t * restrict x0l = x[i].qs;
+        const uint8_t * restrict x0h = x[i].hmask;
+        const int8_t  * restrict y0  = y[i].qs;
+
+        qhbits[0] = vec_xl(0 , x0h);
+        qhbits[1] = vec_xl(16, x0h);
+
+        int32_t isum = 0;
+
+        memcpy(aux, x[i].scales, 12);
+        utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
+        utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
+        utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
+        utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
+
+        int8_t * scale = (int8_t *)utmp;
+        for (int j = 0; j < 16; ++j) scale[j] -= 32;
+
+        for (int j = 0; j < QK_K/128; ++j) {
+            int32x4_t isum0, isum1, isum2, isum3;
+
+            q3b[0] = vec_xl(0 , x0l);
+            q3b[1] = vec_xl(16, x0l);
+            x0l += 32;
+
+            q8bytes[0] = vec_xl(0  , y0);
+            q8bytes[1] = vec_xl(16 , y0);
+            q8bytes[2] = vec_xl(32 , y0);
+            q8bytes[3] = vec_xl(48 , y0);
+            q8bytes[4] = vec_xl(64 , y0);
+            q8bytes[5] = vec_xl(80 , y0);
+            q8bytes[6] = vec_xl(96 , y0);
+            q8bytes[7] = vec_xl(112, y0);
+            y0 += 128;
+
+            q3h[0] = vec_sl(vec_andc(v_0c, qhbits[0]), 2);
+            q3h[1] = vec_sl(vec_andc(v_0c, qhbits[1]), 2);
+            q3h[2] = vec_sl(vec_andc(v_1c, qhbits[0]), 1);
+            q3h[3] = vec_sl(vec_andc(v_1c, qhbits[1]), 1);
+
+            q3bytes[0] = vec_sub((int8x16_t)vec_and(q3b[0], v_3m), (int8x16_t)q3h[0]);
+            q3bytes[1] = vec_sub((int8x16_t)vec_and(q3b[1], v_3m), (int8x16_t)q3h[1]);
+            q3bytes[2] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 2), v_3m), (int8x16_t)q3h[2]);
+            q3bytes[3] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 2), v_3m), (int8x16_t)q3h[3]);
+
+            isum0 = ggml_vec_dot(v_z, q3bytes[0], q8bytes[0]);
+            isum1 = ggml_vec_dot(v_z, q3bytes[1], q8bytes[1]);
+            isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[2]);
+            isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[3]);
+
+            isum += (isum0[0] + isum0[1] + isum0[2] + isum0[3]) * scale[0];
+            isum += (isum1[0] + isum1[1] + isum1[2] + isum1[3]) * scale[1];
+            isum += (isum2[0] + isum2[1] + isum2[2] + isum2[3]) * scale[2];
+            isum += (isum3[0] + isum3[1] + isum3[2] + isum3[3]) * scale[3];
+
+            scale += 4;
+
+            q3h[0] = vec_andc(v_2c, qhbits[0]);
+            q3h[1] = vec_andc(v_2c, qhbits[1]);
+            q3h[2] = vec_sr(vec_andc(v_3c, qhbits[0]), 1);
+            q3h[3] = vec_sr(vec_andc(v_3c,
qhbits[1]), 1); + + q3bytes[0] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 4), v_3m), (int8x16_t)q3h[0]); + q3bytes[1] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 4), v_3m), (int8x16_t)q3h[1]); + q3bytes[2] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 6), v_3m), (int8x16_t)q3h[2]); + q3bytes[3] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 6), v_3m), (int8x16_t)q3h[3]); + + isum0 = ggml_vec_dot(v_z, q3bytes[0], q8bytes[4]); + isum1 = ggml_vec_dot(v_z, q3bytes[1], q8bytes[5]); + isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[6]); + isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[7]); + + isum += (isum0[0] + isum0[1] + isum0[2] + isum0[3]) * scale[0]; + isum += (isum1[0] + isum1[1] + isum1[2] + isum1[3]) * scale[1]; + isum += (isum2[0] + isum2[1] + isum2[2] + isum2[3]) * scale[2]; + isum += (isum3[0] + isum3[1] + isum3[2] + isum3[3]) * scale[3]; + + scale += 4; + + if (j == 0) { + qhbits[0] = vec_sr(qhbits[0], 4); + qhbits[1] = vec_sr(qhbits[1], 4); + } + } + + sum += d * isum; + } + + *s = sum; + +#else + // scalar version + // This function is written like this so the compiler can manage to vectorize most of it + // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the + // manually vectorized version above. Every other version I tried would run at least 4 times slower. + // The ideal situation would be if we could just write the code once, and the compiler would + // automatically produce the best possible set of machine instructions, instead of us having to manually + // write vectorized versions for AVX, ARM_NEON, etc. + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + uint32_t auxs[4]; + const int8_t * scales = (const int8_t*)auxs; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + uint8_t m = 1; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 
0 : 4); + a += 32; m <<= 1; + q3 += 32; + } + a = aux8; + + memcpy(auxs, x[i].scales, 12); + uint32_t tmp = auxs[2]; + auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); + auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); + auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4); + auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); + for (int j = 0; j < QK_K/16; ++j) { + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; + +#endif + +} + +void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + uint32_t utmp[4]; + +#if defined(__VXE__) || defined(__VXE2__) + const uint8x16_t v_lm = vec_splat_u8(0x0F); + const int32x4_t v_z = vec_splat_s32(0); + + uint8x16_t v_x[2]; + int8x16_t v_xl[2]; + int8x16_t v_y[2]; + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums); + const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums); + const int16x8_t v_ysums = vec_padd_s16(v_ysumsl, v_ysumsh); + + memcpy(utmp, x[i].scales, 12); + + uint32x4_t v_mins8 = { 0 }; + v_mins8 = vec_insert(utmp[1] & kmask1, v_mins8, 0); + v_mins8 = vec_insert(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), v_mins8, 1); + + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[0] &= kmask1; + + const int16x8_t v_minsh = (int16x8_t)vec_unpackh((uint8x16_t)v_mins8); + + const int32x4_t v_minso = vec_mulo(v_ysums, v_minsh); + const int32x4_t v_minse = vec_mule(v_ysums, v_minsh); + const int32x4_t v_mins = v_minso + v_minse; + sumf -= dmin * (v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3]); + + const uint8_t * scales = (const uint8_t *)utmp; + const uint8_t * GGML_RESTRICT x0 = x[i].qs; + const int8_t * GGML_RESTRICT y0 = y[i].qs; + + int32_t sumi1 = 0; + int32_t sumi2 = 0; + + for (int j = 0; j < QK_K/64; ++j) { + v_x[0] = vec_xl(0 , x0); + v_x[1] = vec_xl(16, x0); + x0 += 32; + + v_y[0] = vec_xl(0 , y0); + v_y[1] = vec_xl(16, y0); + y0 += 32; + + v_xl[0] = (int8x16_t)vec_and(v_x[0], v_lm); + v_xl[1] = (int8x16_t)vec_and(v_x[1], v_lm); + + const int32x4_t p1 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]); + sumi1 += (p1[0] + p1[1] + p1[2] + p1[3]) * scales[2*j+0]; + + v_y[0] = vec_xl(0 , y0); + v_y[1] = vec_xl(16, y0); + y0 += 32; + + v_xl[0] = (int8x16_t)vec_sr(v_x[0], 4); + v_xl[1] = (int8x16_t)vec_sr(v_x[1], 4); + + const int32x4_t p2 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]); + sumi2 += (p2[0] + p2[1] + p2[2] + p2[3]) * scales[2*j+1]; + } + + sumf += d * (sumi1 + 
sumi2); + } + + *s = sumf; + +#else + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + for (int j = 0; j < QK_K/64; ++j) { + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); + a += 32; + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); + a += 32; q4 += 32; + } + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/32; ++j) { + int32_t scale = scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + uint32_t utmp[4]; + +#if defined(__VXE__) || defined(__VXE2__) + const uint8x16_t v_lm = vec_splat_u8(0x0F); + const uint8x16_t v_1m = vec_splat_u8(0x01); + const uint8x16_t v_2m = vec_splat_u8(0x02); + + const int32x4_t v_z = vec_splat_s32(0); + + const uchar8x16_t v_minsm = { + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF + }; + + int8x16_t q5b[4]; + uint8x16_t q5h[4]; + + uint8x16_t v_xl[2]; + uint8x16_t v_xh[2]; + int8x16_t v_y[4]; + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums); + const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums); + const int16x8_t v_ysums = vec_padd_s16(v_ysumsl, v_ysumsh); + + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + 
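+        // x[i].scales packs eight 6-bit scales and eight 6-bit mins into 12 bytes;
+        // the kmask shuffle above leaves the scales in utmp[0..1] and the mins in
+        // utmp[2..3], so the permute below can gather all eight mins as one vector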
+ const uint8x16_t v_mins16 = vec_xl(0, (const uint8_t *)utmp); + const uint8x16_t v_mins8 = vec_perm(v_mins16, v_mins16, v_minsm); + const int16x8_t v_minsh = (int16x8_t)vec_unpackh(v_mins8); + + const int32x4_t v_minsho = vec_mulo(v_ysums, v_minsh); + const int32x4_t v_minshe = vec_mule(v_ysums, v_minsh); + const int32x4_t v_mins = vec_add(v_minsho, v_minshe); + const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3]; + + const uint8_t * scales = (const uint8_t *)utmp; + const uint8_t * GGML_RESTRICT x0l = x[i].qs; + const uint8_t * GGML_RESTRICT x0h = x[i].qh; + const int8_t * GGML_RESTRICT y0 = y[i].qs; + + v_xh[0] = vec_xl(0 , x0h); + v_xh[1] = vec_xl(16, x0h); + + int32_t sumi = 0; + for (int j = 0; j < QK_K/64; ++j) { + v_xl[0] = vec_xl(0 , x0l); + v_xl[1] = vec_xl(16, x0l); + x0l += 32; + + v_y[0] = vec_xl(0 , y0); + v_y[1] = vec_xl(16, y0); + v_y[2] = vec_xl(32, y0); + v_y[3] = vec_xl(48, y0); + y0 += 64; + + q5h[0] = vec_sl(vec_and(v_1m, v_xh[0]), 4); + q5h[1] = vec_sl(vec_and(v_1m, v_xh[1]), 4); + q5h[2] = vec_sl(vec_and(v_2m, v_xh[0]), 3); + q5h[3] = vec_sl(vec_and(v_2m, v_xh[1]), 3); + v_xh[0] = vec_sr(v_xh[0], 2); + v_xh[1] = vec_sr(v_xh[1], 2); + + q5b[0] = (int8x16_t)vec_or(vec_and(v_xl[0], v_lm), q5h[0]); + q5b[1] = (int8x16_t)vec_or(vec_and(v_xl[1], v_lm), q5h[1]); + q5b[2] = (int8x16_t)vec_or(vec_sr(v_xl[0], 4), q5h[2]); + q5b[3] = (int8x16_t)vec_or(vec_sr(v_xl[1], 4), q5h[3]); + + int32x4_t sumi0 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[0], v_y[0]), q5b[1], v_y[1]); + int32x4_t sumi1 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[2], v_y[2]), q5b[3], v_y[3]); + + sumi += (sumi0[0] + sumi0[1] + sumi0[2] + sumi0[3]) * *scales++; + sumi += (sumi1[0] + sumi1[1] + sumi1[2] + sumi1[3]) * *scales++; + } + + sumf += d * sumi - dmin * mins; + } + + *s = sumf; + +#else + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + uint8_t m = 1; + for (int j = 0; j < QK_K/64; ++j) { + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); + for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); + for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 
16 : 0); + a += 32; m <<= 1; + q4 += 32; + } + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/32; ++j) { + int32_t scale = scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q6_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__VXE__) || defined(__VXE2__) + float sum = 0; + + // Lower 4-bit and upper 2-bit masks + const uint8x16_t v_lm = vec_splat_u8(0x0F); + const uint8x16_t v_um = vec_splat_u8(0x03); + + const int32x4_t v_z = vec_splat_s32(0); + + int8x16_t q6b[4]; + uint8x16_t q6h[4]; + + uint8x16_t v_xl[4]; + uint8x16_t v_xh[2]; + int8x16_t v_y[4]; + + for (int i = 0; i < nb; ++i) { + const float d_all = GGML_CPU_FP16_TO_FP32(x[i].d); + + const uint8_t * GGML_RESTRICT x0l = x[i].ql; + const uint8_t * GGML_RESTRICT x0h = x[i].qh; + const int8_t * GGML_RESTRICT y0 = y[i].qs; + + const int8_t * GGML_RESTRICT scale = x[i].scales; + + const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums); + const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums); + + const int8x16_t v_scale = vec_xl(0, scale); + const int16x8_t v_scalel = vec_unpackh(v_scale); + const int16x8_t v_scaleh = vec_unpackl(v_scale); + + const int32x4_t v_minslo = vec_mulo(v_ysumsl, v_scalel); + const int32x4_t v_minsle = vec_mule(v_ysumsl, v_scalel); + const int32x4_t v_minsho = vec_mulo(v_ysumsh, v_scaleh); + const int32x4_t v_minshe = vec_mule(v_ysumsh, v_scaleh); + const int32x4_t v_mins = v_minslo + v_minsle + v_minsho + v_minshe; + + const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3]; + + int32_t isum = 0; + for (int j = 0; j < QK_K/128; ++j) { + // Load model upper 2 bits + v_xh[0] = vec_xl(0 , x0h); + v_xh[1] = vec_xl(16, x0h); + x0h += 32; + + // Load model lower 4 bits + v_xl[0] = vec_xl(0 , x0l); + v_xl[1] = vec_xl(16, x0l); + v_xl[2] = vec_xl(32, x0l); + v_xl[3] = vec_xl(48, x0l); + x0l += 64; + + // Load activation quants + v_y[0] = vec_xl(0 , y0); + v_y[1] = vec_xl(16, y0); + v_y[2] = vec_xl(32, y0); + v_y[3] = vec_xl(48, y0); + y0 += 64; + + q6h[0] = vec_sl(vec_and(v_um, v_xh[0]), 4); + q6h[1] = vec_sl(vec_and(v_um, v_xh[1]), 4); + uint8x16_t shifted = 
vec_sr(v_xh[0], 2); + q6h[2] = vec_sl(vec_and(v_um, shifted), 4); + shifted = vec_sr(v_xh[1], 2); + q6h[3] = vec_sl(vec_and(v_um, shifted), 4); + + q6b[0] = (int8x16_t)(vec_or(vec_and(v_xl[0], v_lm), q6h[0])); + q6b[1] = (int8x16_t)(vec_or(vec_and(v_xl[1], v_lm), q6h[1])); + q6b[2] = (int8x16_t)(vec_or(vec_and(v_xl[2], v_lm), q6h[2])); + q6b[3] = (int8x16_t)(vec_or(vec_and(v_xl[3], v_lm), q6h[3])); + + int32x4_t summs0 = ggml_vec_dot(v_z, q6b[0], v_y[0]); + int32x4_t summs1 = ggml_vec_dot(v_z, q6b[1], v_y[1]); + int32x4_t summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]); + int32x4_t summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]); + + isum += (summs0[0] + summs0[1] + summs0[2] + summs0[3]) * scale[0] + + (summs1[0] + summs1[1] + summs1[2] + summs1[3]) * scale[1] + + (summs2[0] + summs2[1] + summs2[2] + summs2[3]) * scale[2] + + (summs3[0] + summs3[1] + summs3[2] + summs3[3]) * scale[3]; + + scale += 4; + + + // Load activation quants + v_y[0] = vec_xl(0 , y0); + v_y[1] = vec_xl(16, y0); + v_y[2] = vec_xl(32, y0); + v_y[3] = vec_xl(48, y0); + y0 += 64; + + shifted = vec_sr(v_xh[0], 4); + q6h[0] = vec_sl(vec_and(v_um, shifted), 4); + shifted = vec_sr(v_xh[1], 4); + q6h[1] = vec_sl(vec_and(v_um, shifted), 4); + shifted = vec_sr(v_xh[0], 6); + q6h[2] = vec_sl(vec_and(v_um, shifted), 4); + shifted = vec_sr(v_xh[1], 6); + q6h[3] = vec_sl(vec_and(v_um, shifted), 4); + + q6b[0] = (int8x16_t)(vec_or(vec_sr(v_xl[0], 4), q6h[0])); + q6b[1] = (int8x16_t)(vec_or(vec_sr(v_xl[1], 4), q6h[1])); + q6b[2] = (int8x16_t)(vec_or(vec_sr(v_xl[2], 4), q6h[2])); + q6b[3] = (int8x16_t)(vec_or(vec_sr(v_xl[3], 4), q6h[3])); + + summs0 = ggml_vec_dot(v_z, q6b[0], v_y[0]); + summs1 = ggml_vec_dot(v_z, q6b[1], v_y[1]); + summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]); + summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]); + + isum += (summs0[0] + summs0[1] + summs0[2] + summs0[3]) * scale[0] + + (summs1[0] + summs1[1] + summs1[2] + summs1[3]) * scale[1] + + (summs2[0] + summs2[1] + summs2[2] + summs2[3]) * scale[2] + + (summs3[0] + summs3[1] + summs3[2] + summs3[3]) * scale[3]; + + scale += 4; + } + + sum += d_all * y[i].d * (isum - 32 * mins); + } + + *s = sum; + +#else + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) { + a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; + a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; + a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; + a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; + } + a += 128; + q4 += 64; + qh += 32; + } + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/16; ++j) { + int scale = x[i].scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +// #if defined(__VXE__) || 
defined(__VXE2__) +// static const int8_t keven_signs_q2xs[1024] = { +// 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, +// 1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1, +// 1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, -1, +// 1, 1, -1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, 1, +// 1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, -1, +// 1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, 1, +// 1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, 1, +// 1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, -1, +// 1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, -1, +// 1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, 1, +// 1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, 1, +// 1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, -1, +// 1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, 1, +// 1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, -1, +// 1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, -1, +// 1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, 1, +// 1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, -1, +// 1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, 1, +// 1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, 1, +// 1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, -1, +// 1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, 1, +// 1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, -1, +// 1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, -1, +// 1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, 1, +// 1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, 1, +// 1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, -1, +// 1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, -1, +// 1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, 1, +// 1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, -1, +// 1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, 1, +// 1, 1, 1, -1, -1, 
-1, -1, 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1, +// 1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, +// }; +// #endif + +// void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +// assert(n % QK_K == 0); +// assert(nrc == 1); +// UNUSED(nrc); +// UNUSED(bx); +// UNUSED(by); +// UNUSED(bs); + +// const block_iq2_xxs * GGML_RESTRICT x = vx; +// const block_q8_K * GGML_RESTRICT y = vy; + +// const int nb = n / QK_K; + +// #if defined(__VXE__) || defined(__VXE2__) +// const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + +// uint32_t aux32[4]; +// const uint8_t * aux8 = (const uint8_t *)aux32; + +// float sumf = 0; + +// for (int i = 0; i < nb; ++i) { +// const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; +// const uint16_t * GGML_RESTRICT q2 = x[i].qs; +// const int8_t * GGML_RESTRICT q8 = y[i].qs; + +// float sumf1 = 0, sumf2 = 0; + +// for (int ib32 = 0; ib32 < QK_K/32; ib += 2) { +// int8x16_t q8b0 = vec_xl( 0, q8); +// int8x16_t qb81 = vec_xl(16, q8); +// int8x16_t q8b2 = vec_xl(32, q8); +// int8x16_t q8b3 = vec_xl(48, q8); +// q8 += 64; + +// memcpy(aux32, q2, 4 * sizeof(uint32_t)); +// q2 += 8; + +// int8x16_t q2u0 = { *(const int64_t *)(iq2xxs_grid + aux8[ 0]), *(const int64_t *)(iq2xxs_grid + aux8[ 1]) }; +// int8x16_t q2u1 = { *(const int64_t *)(iq2xxs_grid + aux8[ 2]), *(const int64_t *)(iq2xxs_grid + aux8[ 3]) }; +// int8x16_t q2u2 = { *(const int64_t *)(iq2xxs_grid + aux8[ 8]), *(const int64_t *)(iq2xxs_grid + aux8[ 9]) }; +// int8x16_t q2u3 = { *(const int64_t *)(iq2xxs_grid + aux8[10]), *(const int64_t *)(iq2xxs_grid + aux8[11]) }; + +// int8x16_t q2s0 = { *(const int64_t *)(signs64 + ((aux32[1] >> 0) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 7) & 127)) }; +// int8x16_t q2s1 = { *(const int64_t *)(signs64 + ((aux32[1] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 21) & 127)) }; +// int8x16_t q2s2 = { *(const int64_t *)(signs64 + ((aux32[3] >> 0) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 7) & 127)) }; +// int8x16_t q2s3 = { *(const int64_t *)(signs64 + ((aux32[3] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 21) & 127)) }; + +// q2u0 = vec_mul(q2u0, q2s0); +// q2u1 = vec_mul(q2u1, q2s1); +// q2u2 = vec_mul(q2u2, q2s2); +// q2u3 = vec_mul(q2u3, q2s3); + +// const int32x4_t p1 = ggml_vec_dot(ggml_vec_dot(vec_splat_s32(0), q2u0, q8b0), q2u1, q8b1); +// const int32x4_t p2 = ggml_vec_dot(ggml_vec_dot(vec_splat_s32(0), q2u2, q8b2), q2u3, q8b3); + +// sumf1 += (p1[0] + p1[1] + p1[2] + p1[3]) * (0.5f + (aux32[1] >> 28)); +// sumf2 += (p2[0] + p2[1] + p2[2] + p2[3]) * (0.5f + (aux32[3] >> 28)); +// } + +// sumf += d * (sumf1 + sumf2); +// } + +// *s = 0.25f * sumf; + +// #else + +// uint32_t aux32[2]; +// const uint8_t * aux8 = (const uint8_t *)aux32; + +// float sumf = 0.f; +// for (int i = 0; i < nb; ++i) { +// const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; +// const uint16_t * GGML_RESTRICT q2 = x[i].qs; +// const int8_t * GGML_RESTRICT q8 = y[i].qs; +// int32_t bsum = 0; +// for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { +// memcpy(aux32, q2, 2*sizeof(uint32_t)); +// q2 += 4; +// const uint32_t ls = 2*(aux32[1] >> 28) + 1; +// int32_t sumi = 0; +// for (int l = 0; l < 4; ++l) { +// const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]); +// const uint8_t signs = 
ksigns_iq2xs[(aux32[1] >> 7*l) & 127]; +// for (int j = 0; j < 8; ++j) { +// sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1); +// } +// q8 += 8; +// } +// bsum += sumi * ls; +// } +// sumf += d * bsum; +// } +// *s = 0.125f * sumf; +// #endif +// } + +void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + assert(n % QK4_NL == 0); + static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same"); + + const block_iq4_nl * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + const int nb = n / QK4_NL; + + int ib = 0; + float sumf = 0; + +#if defined(__VXE__) || defined(__VXE2__) + const int8x16_t v_k = vec_xl(0, kvalues_iq4nl); + const uint8x16_t v_m = vec_splat_u8(0x0F); + + for (; ib < nb; ++ib) { + const block_iq4_nl * GGML_RESTRICT x0 = &x[ib]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib]; + + const uint8x16_t v_x = vec_xl(0, x0->qs); + int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m); + int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4); + + v_xl = vec_perm(v_k, v_k, (uchar8x16_t)v_xl); + v_xh = vec_perm(v_k, v_k, (uchar8x16_t)v_xh); + + const int8x16_t v_yl = vec_xl(0 , y0->qs); + const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs); + const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh); + + sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * (v_xy[0] + v_xy[1] + v_xy[2] + v_xy[3]); + } + +#endif + for (; ib < nb; ++ib) { + const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d); + int sumi1 = 0, sumi2 = 0; + for (int j = 0; j < QK4_NL/2; ++j) { + sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf]; + sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4]; + } + sumf += d * (sumi1 + sumi2); + } + *s = sumf; +} + +void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + assert(n % QK_K == 0); + + const block_iq4_xs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__VXE__) || defined(__VXE2__) + const int8x16_t v_k = vec_xl(0, kvalues_iq4nl); + const uint8x16_t v_m = vec_splat_u8(0x0F); + + float sumf = 0; + + for (int ibl = 0; ibl < nb; ++ibl) { + const uint8_t * GGML_RESTRICT q4 = x[ibl].qs; + const int8_t * GGML_RESTRICT q8 = y[ibl].qs; + + uint16_t h = x[ibl].scales_h; + + int sumi1 = 0, sumi2 = 0; + for (int ib = 0; ib < QK_K/64; ++ib) { + const uint8x16_t v_x0 = vec_xl(0 , q4); + const uint8x16_t v_x1 = vec_xl(QK4_NL/2, q4); + q4 += 32; + + int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m); + int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4); + int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m); + int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4); + + v_x0l = vec_perm(v_k, v_k, (uchar8x16_t)v_x0l); + v_x0h = vec_perm(v_k, v_k, (uchar8x16_t)v_x0h); + v_x1l = vec_perm(v_k, v_k, (uchar8x16_t)v_x1l); + v_x1h = vec_perm(v_k, v_k, (uchar8x16_t)v_x1h); + + const int8x16_t v_y0 = vec_xl( 0, q8); + const int8x16_t v_y1 = vec_xl(16, q8); + const int8x16_t v_y2 = vec_xl(32, q8); + const int8x16_t v_y3 = vec_xl(48, q8); + q8 += 64; + + int32x4_t vsumi0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0l, v_y0), v_x0h, v_y1); + int32x4_t vsumi1 = 
ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1l, v_y2), v_x1h, v_y3);
+
+            int ls1 = ((x[ibl].scales_l[ib] & 0xF) | ((h << 4) & 0x30)) - 32;
+            int ls2 = ((x[ibl].scales_l[ib] >> 4) | ((h << 2) & 0x30)) - 32;
+
+            h >>= 4;
+
+            sumi1 += (vsumi0[0] + vsumi0[1] + vsumi0[2] + vsumi0[3]) * ls1;
+            sumi2 += (vsumi1[0] + vsumi1[1] + vsumi1[2] + vsumi1[3]) * ls2;
+        }
+
+        sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
+    }
+
+    *s = sumf;
+
+#else
+    float sumf = 0;
+    for (int ibl = 0; ibl < nb; ++ibl) {
+        const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
+        uint16_t h = x[ibl].scales_h;
+        const uint8_t * qs = x[ibl].qs;
+        const int8_t * q8 = y[ibl].qs;
+        for (int ib = 0; ib < QK_K/32; ib += 2) {
+            const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
+            const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
+            h >>= 4;
+            const float d1 = d4d8*(ls1 - 32);
+            const float d2 = d4d8*(ls2 - 32);
+            int sumi1 = 0, sumi2 = 0;
+            for (int j = 0; j < 16; ++j) {
+                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
+                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
+            }
+            sumf += d1 * (sumi1 + sumi2);
+            qs += 16;
+            q8 += 32;
+            sumi1 = sumi2 = 0;
+            for (int j = 0; j < 16; ++j) {
+                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
+                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
+            }
+            sumf += d2 * (sumi1 + sumi2);
+            qs += 16;
+            q8 += 32;
+        }
+    }
+    *s = sumf;
+#endif
+}
+
diff --git a/ggml/src/ggml-cpu/arch/wasm/quants.c b/ggml/src/ggml-cpu/arch/wasm/quants.c
new file mode 100644
index 0000000000000..b0904d8a3ab5e
--- /dev/null
+++ b/ggml/src/ggml-cpu/arch/wasm/quants.c
@@ -0,0 +1,1481 @@
+#define GGML_COMMON_IMPL_C
+#include "ggml-common.h"
+#include "ggml-quants.h"
+#include "ggml-impl.h"
+#include "ggml-cpu.h"
+#include "simd-mappings.h"
+
+#include "../../quants.h"
+#include "../../ggml-cpu-impl.h"
+
+#include <math.h>
+#include <string.h>
+#include <assert.h>
+#include <float.h>
+#include <stdlib.h> // for qsort
+#include <stdio.h>  // for GGML_ASSERT
+
+#define GROUP_MAX_EPS 1e-15f
+#define GROUP_MAX_EPS_IQ3_XXS 1e-8f
+#define GROUP_MAX_EPS_IQ2_S 1e-8f
+#define GROUP_MAX_EPS_IQ1_M 1e-7f
+#define GROUP_MAX_EPS_IQ1_S 1e-12f
+
+#define UNUSED GGML_UNUSED
+
+#if defined(__wasm_simd128__)
+#define B1(c,s,n) 0x ## n ## c , 0x ## n ## s
+#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
+#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
+#define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s)
+#define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s)
+#define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s)
+#define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s)
+#define B8(c,s ) B7(c,s, c), B7(c,s, s)
+
+// precomputed tables for expanding 8bits to 8 bytes:
+static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4
+static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4
+#endif
+
+void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
+    assert(QK8_0 == 32);
+    assert(k % QK8_0 == 0);
+    const int nb = k / QK8_0;
+
+    block_q8_0 * GGML_RESTRICT y = vy;
+
+#if defined __wasm_simd128__
+    for (int i = 0; i < nb; i++) {
+        v128_t srcv [8];
+        v128_t asrcv[8];
+        v128_t amaxv[8];
+
+        for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j);
+        for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
+
+        for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
+        for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
+        for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
+
+        const float amax =
MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0), + wasm_f32x4_extract_lane(amaxv[0], 1)), + MAX(wasm_f32x4_extract_lane(amaxv[0], 2), + wasm_f32x4_extract_lane(amaxv[0], 3))); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = GGML_CPU_FP32_TO_FP16(d); + + for (int j = 0; j < 8; j++) { + const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id)); + const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v); + + y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0); + y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1); + y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2); + y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3); + } + } +#else + GGML_UNUSED(nb); + // scalar + quantize_row_q8_0_ref(x, y, k); +#endif +} + +void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(k % QK8_1 == 0); + const int nb = k / QK8_1; + + block_q8_1 * GGML_RESTRICT y = vy; +#if defined __wasm_simd128__ + for (int i = 0; i < nb; i++) { + v128_t srcv [8]; + v128_t asrcv[8]; + v128_t amaxv[8]; + + for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]); + + for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]); + + const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0), + wasm_f32x4_extract_lane(amaxv[0], 1)), + MAX(wasm_f32x4_extract_lane(amaxv[0], 2), + wasm_f32x4_extract_lane(amaxv[0], 3))); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = GGML_CPU_FP32_TO_FP16(d); + + v128_t accv = wasm_i32x4_splat(0); + + for (int j = 0; j < 8; j++) { + const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id)); + const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v); + + y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0); + y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1); + y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2); + y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3); + + accv = wasm_i32x4_add(accv, vi); + } + + y[i].s = GGML_CPU_FP32_TO_FP16( + d * (wasm_i32x4_extract_lane(accv, 0) + + wasm_i32x4_extract_lane(accv, 1) + + wasm_i32x4_extract_lane(accv, 2) + + wasm_i32x4_extract_lane(accv, 3))); + } +#else + GGML_UNUSED(nb); + // scalar + quantize_row_q8_1_ref(x, y, k); +#endif +} + +//===================================== Q8_K ============================================== + +void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { +#ifdef __wasm_simd128__ + assert(k % QK_K == 0); + const int64_t nb = k / QK_K; + block_q8_K * GGML_RESTRICT yc = y; // Cast to proper type + + for (int i = 0; i < nb; i++) { + const float * x_block = x + i * QK_K; + + v128_t min_vec = wasm_v128_load(x_block); + v128_t max_vec = min_vec; + + for (int j = 4; j < QK_K; j += 4) { + v128_t x_vec = wasm_v128_load(x_block + j); + max_vec = wasm_f32x4_pmax(max_vec, x_vec); + min_vec = wasm_f32x4_pmin(min_vec, x_vec); + } + max_vec = wasm_f32x4_pmax(max_vec, wasm_i32x4_shuffle(max_vec, max_vec, 2, 3, 0, 1)); + max_vec = wasm_f32x4_pmax(max_vec, wasm_i32x4_shuffle(max_vec, max_vec, 1, 0, 3, 2)); + min_vec = wasm_f32x4_pmin(min_vec, wasm_i32x4_shuffle(min_vec, min_vec, 2, 3, 0, 1)); + min_vec = wasm_f32x4_pmin(min_vec, wasm_i32x4_shuffle(min_vec, min_vec, 1, 0, 3, 2)); + float max = wasm_f32x4_extract_lane(max_vec, 0); 
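+        // NOTE: the pmax/pmin + shuffle pairs above are horizontal reductions:
+        // swapping the 64-bit halves and then the 32-bit pairs leaves the
+        // block-wide maximum (minimum) in every lane, so lane 0 can be
+        // extracted as the scalar result.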
+ float min = wasm_f32x4_extract_lane(min_vec, 0); + float amax = -min > max ? min : max; + + if (amax == 0.0f) { + yc[i].d = 0.0f; + const v128_t zero = wasm_i8x16_splat(0); + for (int j = 0; j < QK_K; j += 16) { + wasm_v128_store(yc[i].qs + j, zero); + } + continue; + } + + const float iscale = -127.0f / amax; + const v128_t scale_vec = wasm_f32x4_splat(iscale); + + // Process 16 elements per iteration + for (int j = 0, jb = 0; j < QK_K; j += 16, jb++) { + // Load and quantize 16 floats + v128_t x0 = wasm_v128_load(x_block + j); + v128_t x1 = wasm_v128_load(x_block + j + 4); + v128_t x2 = wasm_v128_load(x_block + j + 8); + v128_t x3 = wasm_v128_load(x_block + j + 12); + + v128_t q0 = wasm_f32x4_nearest(wasm_f32x4_mul(x0, scale_vec)); + v128_t q1 = wasm_f32x4_nearest(wasm_f32x4_mul(x1, scale_vec)); + v128_t q2 = wasm_f32x4_nearest(wasm_f32x4_mul(x2, scale_vec)); + v128_t q3 = wasm_f32x4_nearest(wasm_f32x4_mul(x3, scale_vec)); + + // Convert to i32 with saturation + v128_t i0 = wasm_i32x4_trunc_sat_f32x4(q0); + v128_t i1 = wasm_i32x4_trunc_sat_f32x4(q1); + v128_t i2 = wasm_i32x4_trunc_sat_f32x4(q2); + v128_t i3 = wasm_i32x4_trunc_sat_f32x4(q3); + + // Pack into 16 i8 values + v128_t i8 = wasm_i8x16_narrow_i16x8( + wasm_i16x8_narrow_i32x4(i0, i1), + wasm_i16x8_narrow_i32x4(i2, i3) + ); + wasm_v128_store(yc[i].qs + j, i8); + + // Calculate bsums using SIMD + v128_t sum16 = wasm_i16x8_add( + wasm_i16x8_extend_low_i8x16(i8), + wasm_i16x8_extend_high_i8x16(i8) + ); + v128_t sum32 = wasm_i32x4_add( + wasm_i32x4_extend_low_i16x8(sum16), + wasm_i32x4_extend_high_i16x8(sum16) + ); + sum32 = wasm_i32x4_add(sum32, wasm_i32x4_shuffle(sum32, sum32, 2, 3, 0, 1)); + sum32 = wasm_i32x4_add(sum32, wasm_i32x4_shuffle(sum32, sum32, 1, 0, 3, 2)); + yc[i].bsums[jb] = wasm_i32x4_extract_lane(sum32, 0); + } + + yc[i].d = 1.0f / iscale; + } +#else + quantize_row_q8_K_ref(x, y, k); +#endif +} + + +//===================================== Dot products ================================= + +void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0; + +#if defined __wasm_simd128__ + v128_t sumv = wasm_f32x4_splat(0.0f); + + const v128_t m4b = wasm_i8x16_splat(0x0F); + const v128_t s8b = wasm_i8x16_splat(0x8); + + for (; ib + 1 < nb; ib += 2) { + const block_q4_0 * GGML_RESTRICT x0 = &x[ib]; + const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; + + // Load and process x0 + v128_t v0_0 = wasm_v128_load(x0->qs); + v128_t v0_0l = wasm_v128_and(v0_0, m4b); + v128_t v0_0h = wasm_u8x16_shr(v0_0, 4); + v128_t v0_0ls = wasm_i8x16_sub(v0_0l, s8b); + v128_t v0_0hs = wasm_i8x16_sub(v0_0h, s8b); + + // Load y0 vectors + v128_t y0_l = wasm_v128_load(y0->qs); + v128_t y0_h = wasm_v128_load(y0->qs + 16); + + // Extend to i16x8 and compute dot products + v128_t dx0l = wasm_i16x8_extend_low_i8x16(v0_0ls); + v128_t dx0h = wasm_i16x8_extend_high_i8x16(v0_0ls); + v128_t dx0hl = wasm_i16x8_extend_low_i8x16(v0_0hs); + v128_t dx0hh = wasm_i16x8_extend_high_i8x16(v0_0hs); + + v128_t dy0ll = wasm_i16x8_extend_low_i8x16(y0_l); + v128_t dy0lh = 
wasm_i16x8_extend_high_i8x16(y0_l); + v128_t dy0hl = wasm_i16x8_extend_low_i8x16(y0_h); + v128_t dy0hh = wasm_i16x8_extend_high_i8x16(y0_h); + + v128_t dp0 = wasm_i32x4_add( + wasm_i32x4_add( + wasm_i32x4_dot_i16x8(dx0l, dy0ll), + wasm_i32x4_dot_i16x8(dx0h, dy0lh) + ), + wasm_i32x4_add( + wasm_i32x4_dot_i16x8(dx0hl, dy0hl), + wasm_i32x4_dot_i16x8(dx0hh, dy0hh) + ) + ); + + // Load and process x1 + v128_t v0_1 = wasm_v128_load(x1->qs); + v128_t v0_1l = wasm_v128_and(v0_1, m4b); + v128_t v0_1h = wasm_u8x16_shr(v0_1, 4); + v128_t v0_1ls = wasm_i8x16_sub(v0_1l, s8b); + v128_t v0_1hs = wasm_i8x16_sub(v0_1h, s8b); + + // Load y1 vectors + v128_t y1_l = wasm_v128_load(y1->qs); + v128_t y1_h = wasm_v128_load(y1->qs + 16); + + // Extend to i16x8 and compute dot products + v128_t dx1l = wasm_i16x8_extend_low_i8x16(v0_1ls); + v128_t dx1h = wasm_i16x8_extend_high_i8x16(v0_1ls); + v128_t dx1hl = wasm_i16x8_extend_low_i8x16(v0_1hs); + v128_t dx1hh = wasm_i16x8_extend_high_i8x16(v0_1hs); + + v128_t dy1ll = wasm_i16x8_extend_low_i8x16(y1_l); + v128_t dy1lh = wasm_i16x8_extend_high_i8x16(y1_l); + v128_t dy1hl = wasm_i16x8_extend_low_i8x16(y1_h); + v128_t dy1hh = wasm_i16x8_extend_high_i8x16(y1_h); + + v128_t dp1 = wasm_i32x4_add( + wasm_i32x4_add( + wasm_i32x4_dot_i16x8(dx1l, dy1ll), + wasm_i32x4_dot_i16x8(dx1h, dy1lh) + ), + wasm_i32x4_add( + wasm_i32x4_dot_i16x8(dx1hl, dy1hl), + wasm_i32x4_dot_i16x8(dx1hh, dy1hh) + ) + ); + + // Accumulate results with scaling + float scale0 = GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d); + float scale1 = GGML_CPU_FP16_TO_FP32(x1->d) * GGML_CPU_FP16_TO_FP32(y1->d); + + sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(dp0), wasm_f32x4_splat(scale0))); + sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(dp1), wasm_f32x4_splat(scale1))); + } + + sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + + wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3); + +#endif + for (; ib < nb; ++ib) { + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[ib].qs[j] & 0x0F) - 8; + const int v1 = (x[ib].qs[j] >> 4) - 8; + + sumi0 += (v0 * y[ib].qs[j]); + sumi1 += (v1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d); + } + + *s = sumf; +} + +void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + int ib = 0; + float sumf = 0; + + assert(n % qk == 0); + assert(qk == QK5_0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + +#if defined __wasm_simd128__ + v128_t sumv = wasm_f32x4_splat(0.0f); + + uint32_t qh_; + uint64_t tmp[4]; + + // TODO: check if unrolling this is better + for (; ib < nb; ++ib) { + const block_q5_0 * GGML_RESTRICT x0 = &x[ib]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib]; + + const v128_t m4b = wasm_i8x16_splat(0x0F); + + // extract the 5th bit + memcpy(&qh_, x0->qh, sizeof(qh_)); + + tmp[0] = table_b2b_1[(qh_ >> 0) & 0xFF]; + tmp[1] = table_b2b_1[(qh_ >> 8) & 0xFF]; + tmp[2] = table_b2b_1[(qh_ >> 16) & 0xFF]; + tmp[3] = table_b2b_1[(qh_ >> 24) ]; + + const v128_t qhl = wasm_v128_load(tmp + 0); + const v128_t qhh = wasm_v128_load(tmp + 2); + + const v128_t v0 = wasm_v128_load(x0->qs); + 
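+        // NOTE: x0->qs packs two 4-bit quants per byte; the 5th (high) bit of
+        // each of the 32 weights comes from the 32-bit qh field. table_b2b_1
+        // expanded qh above into one byte per element that is 0x10 exactly
+        // when the high bit is clear, so the subtraction below maps the
+        // unpacked nibbles onto the signed range [-16, 15].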
+ // 4-bit -> 8-bit + const v128_t v0l = wasm_v128_and (v0, m4b); + const v128_t v0h = wasm_u8x16_shr(v0, 4); + + // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero) + const v128_t v0lf = wasm_i8x16_sub(v0l, qhl); + const v128_t v0hf = wasm_i8x16_sub(v0h, qhh); + + // load y + const v128_t v1l = wasm_v128_load(y0->qs); + const v128_t v1h = wasm_v128_load(y0->qs + 16); + + // int8x16 -> int16x8 + const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf); + const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf); + const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf); + const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf); + + const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l); + const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l); + const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); + const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); + + // dot product + sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4( + wasm_i32x4_add( + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll), + wasm_i32x4_dot_i16x8(v0lfh, v1lh)), + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), + wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), + wasm_f32x4_splat(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d)))); + } + + sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + + wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3); + +#endif + for (; ib < nb; ++ib) { + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); + + const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16); + const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16); + + sumi0 += (x0 * y[ib].qs[j]); + sumi1 += (x1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi; + } + + *s = sumf; +} + +void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_1; + const int nb = n / qk; + + int ib = 0; + float sumf = 0; + + assert(n % qk == 0); + assert(qk == QK5_1); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_1 * GGML_RESTRICT x = vx; + const block_q8_1 * GGML_RESTRICT y = vy; + +#if defined __wasm_simd128__ + v128_t sumv = wasm_f32x4_splat(0.0f); + + float summs = 0.0f; + + uint32_t qh_; + uint64_t tmp[4]; + + // TODO: check if unrolling this is better + for (; ib < nb; ++ib) { + const block_q5_1 * GGML_RESTRICT x0 = &x[ib]; + const block_q8_1 * GGML_RESTRICT y0 = &y[ib]; + + summs += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s); + + const v128_t m4b = wasm_i8x16_splat(0x0F); + + // extract the 5th bit + memcpy(&qh_, x0->qh, sizeof(qh_)); + + tmp[0] = table_b2b_0[(qh_ >> 0) & 0xFF]; + tmp[1] = table_b2b_0[(qh_ >> 8) & 0xFF]; + tmp[2] = table_b2b_0[(qh_ >> 16) & 0xFF]; + tmp[3] = table_b2b_0[(qh_ >> 24) ]; + + const v128_t qhl = wasm_v128_load(tmp + 0); + const v128_t qhh = wasm_v128_load(tmp + 2); + + const v128_t v0 = wasm_v128_load(x0->qs); + + // 4-bit -> 8-bit + const v128_t v0l = wasm_v128_and (v0, m4b); + const v128_t v0h = wasm_u8x16_shr(v0, 4); + + // add high bit + const v128_t v0lf = wasm_v128_or(v0l, qhl); + const v128_t v0hf = wasm_v128_or(v0h, qhh); + + // load y + const 
v128_t v1l = wasm_v128_load(y0->qs); + const v128_t v1h = wasm_v128_load(y0->qs + 16); + + // int8x16 -> int16x8 + const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf); + const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf); + const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf); + const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf); + + const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l); + const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l); + const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); + const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); + + // dot product + sumv = wasm_f32x4_add(sumv, + wasm_f32x4_mul(wasm_f32x4_convert_i32x4(wasm_i32x4_add( + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll), + wasm_i32x4_dot_i16x8(v0lfh, v1lh)), + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), + wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), + wasm_f32x4_splat(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d)))); + } + + sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + + wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs; + +#endif + for (; ib < nb; ++ib) { + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + + const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0; + const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1; + + sumi0 += (x0 * y[ib].qs[j]); + sumi1 += (x1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); + } + + *s = sumf; +} + +void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q8_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0; + +#if defined __wasm_simd128__ + v128_t sumv = wasm_f32x4_splat(0.0f); + + for (; ib < nb; ++ib) { + const block_q8_0 * GGML_RESTRICT x0 = &x[ib]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib]; + + const v128_t x0_0 = wasm_v128_load(x0->qs); + const v128_t x0_1 = wasm_v128_load(x0->qs + 16); + const v128_t y0_0 = wasm_v128_load(y0->qs); + const v128_t y0_1 = wasm_v128_load(y0->qs + 16); + + // Extend 8-bit to 16-bit + const v128_t x0_0l = wasm_i16x8_extend_low_i8x16(x0_0); + const v128_t x0_0h = wasm_i16x8_extend_high_i8x16(x0_0); + const v128_t x0_1l = wasm_i16x8_extend_low_i8x16(x0_1); + const v128_t x0_1h = wasm_i16x8_extend_high_i8x16(x0_1); + + const v128_t y0_0l = wasm_i16x8_extend_low_i8x16(y0_0); + const v128_t y0_0h = wasm_i16x8_extend_high_i8x16(y0_0); + const v128_t y0_1l = wasm_i16x8_extend_low_i8x16(y0_1); + const v128_t y0_1h = wasm_i16x8_extend_high_i8x16(y0_1); + + // Compute dot products + const v128_t dx0_0 = wasm_i32x4_dot_i16x8(x0_0l, y0_0l); + const v128_t dx0_1 = wasm_i32x4_dot_i16x8(x0_0h, y0_0h); + const v128_t dx1_0 = wasm_i32x4_dot_i16x8(x0_1l, y0_1l); + const v128_t dx1_1 = wasm_i32x4_dot_i16x8(x0_1h, y0_1h); + + // Sum all dot products + const v128_t sum_dots = wasm_i32x4_add(wasm_i32x4_add(dx0_0, dx0_1), wasm_i32x4_add(dx1_0, dx1_1)); + + // Convert to float and accumulate + const float scale = 
GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d); + sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(sum_dots), wasm_f32x4_splat(scale))); + } + + sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + + wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3); + +#endif + for (; ib < nb; ++ib) { + int sumi = 0; + + for (int j = 0; j < qk; j++) { + sumi += x[ib].qs[j]*y[ib].qs[j]; + } + + sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)); + } + + *s = sumf; +} + +void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q2_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined __wasm_simd128__ + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + const uint8_t * q2 = x[i].qs; + const int8_t * q8 = y[i].qs; + const uint8_t * sc = x[i].scales; + + // Vectorized summs calculation + v128_t summs_vec = wasm_i32x4_splat(0); + { + v128_t sc_vec = wasm_v128_load(sc); + v128_t sc_upper = wasm_u8x16_shr(sc_vec, 4); + + v128_t sc_low = wasm_u16x8_extend_low_u8x16(sc_upper); + v128_t sc_high = wasm_u16x8_extend_high_u8x16(sc_upper); + + v128_t bsums1 = wasm_v128_load(&y[i].bsums[0]); + v128_t bsums2 = wasm_v128_load(&y[i].bsums[8]); + + summs_vec = wasm_i32x4_add( + wasm_i32x4_add(wasm_i32x4_dot_i16x8(sc_low, bsums1), + wasm_i32x4_dot_i16x8(sc_high, bsums2)), + summs_vec + ); + + summs_vec = wasm_i32x4_add(summs_vec, wasm_i32x4_shuffle(summs_vec, summs_vec, 2, 3, 0, 1)); + summs_vec = wasm_i32x4_add(summs_vec, wasm_i32x4_shuffle(summs_vec, summs_vec, 1, 0, 3, 2)); + } + int32_t summs = wasm_i32x4_extract_lane(summs_vec, 0); + + // Vectorized isum calculation + int32_t isum = 0; + const uint8_t * sc_ptr = sc; + const int k_iters = QK_K/128; + + for (int k = 0; k < k_iters; ++k) { + v128_t isum_vec = wasm_i32x4_splat(0); + int shift = 0; + + for (int j = 0; j < 4; ++j) { + const int d0 = (sc_ptr[0] & 0xF); + const int d1 = (sc_ptr[1] & 0xF); + sc_ptr += 2; + + // Process first 16 elements + v128_t q2_0 = wasm_v128_load(q2); + v128_t q8_0 = wasm_v128_load(q8); + v128_t q2_shift_0 = wasm_u8x16_shr(q2_0, shift); + v128_t q2_bits_0 = wasm_v128_and(q2_shift_0, wasm_i8x16_splat(0x03)); + + // Process next 16 elements + v128_t q2_1 = wasm_v128_load(q2 + 16); + v128_t q8_1 = wasm_v128_load(q8 + 16); + v128_t q2_shift_1 = wasm_u8x16_shr(q2_1, shift); + v128_t q2_bits_1 = wasm_v128_and(q2_shift_1, wasm_i8x16_splat(0x03)); + + // Calculate dot products + v128_t p0 = wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_low_i8x16(q8_0), + wasm_i16x8_extend_low_i8x16(q2_bits_0) + ); + v128_t p1 = wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_high_i8x16(q8_0), + wasm_i16x8_extend_high_i8x16(q2_bits_0) + ); + v128_t p2 = wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_low_i8x16(q8_1), + wasm_i16x8_extend_low_i8x16(q2_bits_1) + ); + v128_t p3 = wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_high_i8x16(q8_1), + wasm_i16x8_extend_high_i8x16(q2_bits_1) + ); + + // Accumulate scaled results + v128_t scaled = wasm_i32x4_add( + wasm_i32x4_mul(wasm_i32x4_add(p0, p1), wasm_i32x4_splat(d0)), + wasm_i32x4_mul(wasm_i32x4_add(p2, p3), wasm_i32x4_splat(d1)) + ); + + isum_vec = wasm_i32x4_add(isum_vec, scaled); + q8 += 32; + shift += 2; + } + q2 += 32; + + // Horizontal sum of isum_vec + isum_vec = 
wasm_i32x4_add(isum_vec, wasm_i32x4_shuffle(isum_vec, isum_vec, 2, 3, 0, 1)); + isum_vec = wasm_i32x4_add(isum_vec, wasm_i32x4_shuffle(isum_vec, isum_vec, 1, 0, 3, 2)); + isum += wasm_i32x4_extract_lane(isum_vec, 0); + } + + const float dall = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf += dall * isum - dmin * summs; + } + + *s = sumf; + +#else + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + + const uint8_t * q2 = x[i].qs; + const int8_t * q8 = y[i].qs; + const uint8_t * sc = x[i].scales; + + int summs = 0; + for (int j = 0; j < 16; ++j) { + summs += y[i].bsums[j] * (sc[j] >> 4); + } + + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + int isum = 0; + int is = 0; + int d; + for (int k = 0; k < QK_K/128; ++k) { + int shift = 0; + for (int j = 0; j < 4; ++j) { + d = sc[is++] & 0xF; + int isuml = 0; + for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3); + isum += d * isuml; + d = sc[is++] & 0xF; + isuml = 0; + for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3); + isum += d * isuml; + shift += 2; + q8 += 32; + } + q2 += 32; + } + sumf += dall * isum - dmin * summs; + } + *s = sumf; +#endif +} + +void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const uint32_t kmask1 = 0x03030303; + const uint32_t kmask2 = 0x0f0f0f0f; + + const block_q3_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined __wasm_simd128__ + int8_t aux8[QK_K]; + float sums[8] = {0}; + uint32_t auxs[4]; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + // Process blocks with SIMD + int8_t * a = aux8; + uint8_t m = 1; + for (int j = 0; j < QK_K; j += 128) { + for (int shift = 0; shift <= 6; shift += 2) { + v128_t v_m = wasm_i8x16_splat(m); + for (int l = 0; l < 32; l += 16) { + v128_t v_q3 = wasm_v128_load(q3 + l); + v128_t v_shift = wasm_i8x16_shr(v_q3, shift); + v128_t v_low2 = wasm_v128_and(v_shift, wasm_i8x16_splat(0x03)); + + v128_t v_hm = wasm_v128_load(hm + l); + v128_t v_mask = wasm_v128_and(v_hm, v_m); + v_mask = wasm_i8x16_ne(v_mask, wasm_i8x16_splat(0)); + + v_low2 = wasm_i8x16_sub(v_low2, wasm_v128_and(wasm_i8x16_splat(4), wasm_v128_not(v_mask))); + wasm_v128_store(a + l, v_low2); + } + a += 32; + m <<= 1; + } + q3 += 32; + } + + // Extract scales + memcpy(auxs, x[i].scales, 12); + uint32_t tmp = auxs[2]; + auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); + auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); + auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4); + auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); + const int8_t * scales = (const int8_t *)auxs; + + // SIMD dot product with register accumulators + v128_t v_acc0 = wasm_i32x4_splat(0); + v128_t v_acc1 = wasm_i32x4_splat(0); + a = aux8; + for (int j = 0; j < QK_K/16; ++j) { + const v128_t v_scale = wasm_i16x8_splat(scales[j] - 32); + + // Process 16 elements per iteration + for (int k = 0; k < 2; ++k) { + const v128_t v_q8 = wasm_i16x8_load8x8(q8); + const v128_t v_a = wasm_i16x8_load8x8(a); + + 
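+                // NOTE: aux8 holds fully decoded q3 weights in [-4, 3], so the
+                // i16 multiplies below cannot overflow: |q8*a| <= 127*4 = 508
+                // and |q8*a*(scale-32)| <= 508*32 = 16256, within int16_t.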
v128_t v_prod = wasm_i16x8_mul(v_q8, v_a); + v_prod = wasm_i16x8_mul(v_prod, v_scale); + + v_acc0 = wasm_i32x4_add(v_acc0, wasm_i32x4_extend_low_i16x8(v_prod)); + v_acc1 = wasm_i32x4_add(v_acc1, wasm_i32x4_extend_high_i16x8(v_prod)); + + q8 += 8; + a += 8; + } + } + + // Accumulate results + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const v128_t v_d = wasm_f32x4_splat(d); + v128_t v_sum = wasm_f32x4_add( + wasm_f32x4_mul(wasm_f32x4_convert_i32x4(v_acc0), v_d), + wasm_f32x4_mul(wasm_f32x4_convert_i32x4(v_acc1), v_d) + ); + + // Accumulate into sums vector + wasm_v128_store(sums, wasm_f32x4_add(wasm_v128_load(sums), v_sum)); + } + + // Horizontal sum + v128_t v_sum = wasm_f32x4_add(wasm_v128_load(sums), wasm_v128_load(sums + 4)); + sumf = wasm_f32x4_extract_lane(v_sum, 0) + + wasm_f32x4_extract_lane(v_sum, 1) + + wasm_f32x4_extract_lane(v_sum, 2) + + wasm_f32x4_extract_lane(v_sum, 3); + + *s = sumf; + +#else + // scalar version + // This function is written like this so the compiler can manage to vectorize most of it + // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the + // manually vectorized version above. Every other version I tried would run at least 4 times slower. + // The ideal situation would be if we could just write the code once, and the compiler would + // automatically produce the best possible set of machine instructions, instead of us having to manually + // write vectorized versions for AVX, ARM_NEON, etc. + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + uint32_t auxs[4]; + const int8_t * scales = (const int8_t*)auxs; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + uint8_t m = 1; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 
0 : 4); + a += 32; m <<= 1; + q3 += 32; + } + a = aux8; + + memcpy(auxs, x[i].scales, 12); + uint32_t tmp = auxs[2]; + auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); + auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); + auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4); + auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); + for (int j = 0; j < QK_K/16; ++j) { + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; + +#endif + +} + +void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + uint32_t utmp[4]; + +#if defined __wasm_simd128__ + const uint8_t * scales = (const uint8_t*)&utmp[0]; + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); // Corrected sign + + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + // Process scales and mins + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + // Sum mins * q8sums + int32_t sumi = 0; + const int16_t * GGML_RESTRICT q8sums = y[i].bsums; + const uint8_t * m = (const uint8_t *)&utmp[2]; + for (int j = 0; j < 16; j += 2) { + sumi += (q8sums[j] + q8sums[j+1]) * m[j/2]; + } + sumf -= dmin * sumi; + + int32_t sumi1 = 0; + int32_t sumi2 = 0; + + for (int j = 0; j < QK_K/64; ++j) { + // Load 64 4-bit weights (32 bytes) + const v128_t q4x0 = wasm_v128_load(q4); + const v128_t q4x1 = wasm_v128_load(q4 + 16); + q4 += 32; + + // Split into low/high nibbles + const v128_t q4l0 = wasm_v128_and(q4x0, wasm_i8x16_splat(0x0F)); + const v128_t q4h0 = wasm_u8x16_shr(q4x0, 4); + const v128_t q4l1 = wasm_v128_and(q4x1, wasm_i8x16_splat(0x0F)); + const v128_t q4h1 = wasm_u8x16_shr(q4x1, 4); + + // Load 64 8-bit values (64 bytes) + const v128_t q8x0 = wasm_v128_load(q8); + const v128_t q8x1 = wasm_v128_load(q8 + 16); + const v128_t q8x2 = wasm_v128_load(q8 + 32); + const v128_t q8x3 = wasm_v128_load(q8 + 48); + q8 += 64; + + // Low nibble products + v128_t vacc1 = wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_low_i8x16(q4l0), + wasm_i16x8_extend_low_i8x16(q8x0) + ); + vacc1 = wasm_i32x4_add(vacc1, wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_high_i8x16(q4l0), + wasm_i16x8_extend_high_i8x16(q8x0) + )); + vacc1 = wasm_i32x4_add(vacc1, wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_low_i8x16(q4l1), + wasm_i16x8_extend_low_i8x16(q8x1) + )); + vacc1 = wasm_i32x4_add(vacc1, wasm_i32x4_dot_i16x8( + 
wasm_i16x8_extend_high_i8x16(q4l1), + wasm_i16x8_extend_high_i8x16(q8x1) + )); + + // High nibble products + v128_t vacc2 = wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_low_i8x16(q4h0), + wasm_i16x8_extend_low_i8x16(q8x2) + ); + vacc2 = wasm_i32x4_add(vacc2, wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_high_i8x16(q4h0), + wasm_i16x8_extend_high_i8x16(q8x2) + )); + vacc2 = wasm_i32x4_add(vacc2, wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_low_i8x16(q4h1), + wasm_i16x8_extend_low_i8x16(q8x3) + )); + vacc2 = wasm_i32x4_add(vacc2, wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_high_i8x16(q4h1), + wasm_i16x8_extend_high_i8x16(q8x3) + )); + + // Accumulate scaled results + int32_t vacc1_sum = wasm_i32x4_extract_lane(vacc1, 0) + wasm_i32x4_extract_lane(vacc1, 1) + + wasm_i32x4_extract_lane(vacc1, 2) + wasm_i32x4_extract_lane(vacc1, 3); + sumi1 += vacc1_sum * scales[2*j]; + + int32_t vacc2_sum = wasm_i32x4_extract_lane(vacc2, 0) + wasm_i32x4_extract_lane(vacc2, 1) + + wasm_i32x4_extract_lane(vacc2, 2) + wasm_i32x4_extract_lane(vacc2, 3); + sumi2 += vacc2_sum * scales[2*j+1]; + } + + sumf += d * (sumi1 + sumi2); + } + + *s = sumf; + +#else + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + for (int j = 0; j < QK_K/64; ++j) { + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); + a += 32; + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); + a += 32; q4 += 32; + } + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/32; ++j) { + int32_t scale = scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + 
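+    // Scale layout (inferred from the kmask1/2/3 unpacking below): the 12-byte
+    // scales field packs eight 6-bit scales and eight 6-bit mins; bytes 0..7
+    // hold the low 6 bits of the first four scales/mins and, in their top two
+    // bits, the high bits of the second four. The utmp shuffle rearranges this
+    // so that scales and mins can be read bytewise.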
uint32_t utmp[4]; + +#if defined __wasm_simd128__ + //const uint8_t * scales = (const uint8_t*)&utmp[0]; + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); // Fixed sign + + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + // Process scales and mins + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + // Sum mins * q8sums + int32_t sumi_mins = 0; + const int16_t * GGML_RESTRICT q8sums = y[i].bsums; + const uint8_t * m = (const uint8_t *)&utmp[2]; + for (int j = 0; j < 16; j += 2) { + sumi_mins += (q8sums[j] + q8sums[j+1]) * m[j/2]; + } + sumf -= dmin * sumi_mins; // Correct subtraction + + v128_t qh0 = wasm_v128_load(qh); + v128_t qh1 = wasm_v128_load(qh + 16); + const uint8_t * sc = (const uint8_t *)utmp; + + int32_t sumi = 0; + + for (int j = 0; j < QK_K/64; ++j) { + const int shift = j * 2; + v128_t qh_shift0 = wasm_u8x16_shr(qh0, shift); + v128_t qh_shift1 = wasm_u8x16_shr(qh1, shift); + + v128_t qh_low0 = wasm_i8x16_shl(wasm_v128_and(qh_shift0, wasm_i8x16_splat(0x01)), 4); + v128_t qh_high0 = wasm_i8x16_shl(wasm_v128_and(qh_shift0, wasm_i8x16_splat(0x02)), 3); + v128_t qh_low1 = wasm_i8x16_shl(wasm_v128_and(qh_shift1, wasm_i8x16_splat(0x01)), 4); + v128_t qh_high1 = wasm_i8x16_shl(wasm_v128_and(qh_shift1, wasm_i8x16_splat(0x02)), 3); + + v128_t q5_0 = wasm_v128_load(q5); + v128_t q5_1 = wasm_v128_load(q5 + 16); + q5 += 32; + + v128_t q5l_0 = wasm_v128_or(wasm_v128_and(q5_0, wasm_i8x16_splat(0x0F)), qh_low0); + v128_t q5h_0 = wasm_v128_or(wasm_u8x16_shr(q5_0, 4), qh_high0); + v128_t q5l_1 = wasm_v128_or(wasm_v128_and(q5_1, wasm_i8x16_splat(0x0F)), qh_low1); + v128_t q5h_1 = wasm_v128_or(wasm_u8x16_shr(q5_1, 4), qh_high1); + + v128_t q8_0 = wasm_v128_load(q8); + v128_t q8_1 = wasm_v128_load(q8 + 16); + v128_t q8_2 = wasm_v128_load(q8 + 32); + v128_t q8_3 = wasm_v128_load(q8 + 48); + q8 += 64; + + // Process low quants + v128_t pl0 = wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_low_i8x16(q5l_0), + wasm_i16x8_extend_low_i8x16(q8_0) + ); + pl0 = wasm_i32x4_add(pl0, wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_high_i8x16(q5l_0), + wasm_i16x8_extend_high_i8x16(q8_0) + )); + v128_t pl1 = wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_low_i8x16(q5l_1), + wasm_i16x8_extend_low_i8x16(q8_1) + ); + pl1 = wasm_i32x4_add(pl1, wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_high_i8x16(q5l_1), + wasm_i16x8_extend_high_i8x16(q8_1) + )); + v128_t sum_low = wasm_i32x4_add(pl0, pl1); + + // Process high quants + v128_t ph0 = wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_low_i8x16(q5h_0), + wasm_i16x8_extend_low_i8x16(q8_2) + ); + ph0 = wasm_i32x4_add(ph0, wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_high_i8x16(q5h_0), + wasm_i16x8_extend_high_i8x16(q8_2) + )); + v128_t ph1 = wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_low_i8x16(q5h_1), + wasm_i16x8_extend_low_i8x16(q8_3) + ); + ph1 = wasm_i32x4_add(ph1, wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_high_i8x16(q5h_1), + wasm_i16x8_extend_high_i8x16(q8_3) + )); + v128_t sum_high = wasm_i32x4_add(ph0, ph1); + + // Accumulate with scale factors + int32_t sl = wasm_i32x4_extract_lane(sum_low, 0) + wasm_i32x4_extract_lane(sum_low, 1) + + wasm_i32x4_extract_lane(sum_low, 2) + 
wasm_i32x4_extract_lane(sum_low, 3); + int32_t sh = wasm_i32x4_extract_lane(sum_high, 0) + wasm_i32x4_extract_lane(sum_high, 1) + + wasm_i32x4_extract_lane(sum_high, 2) + wasm_i32x4_extract_lane(sum_high, 3); + + sumi += sl * sc[2*j] + sh * sc[2*j+1]; + } + + sumf += d * sumi; + } + + *s = sumf; + +#else + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + uint8_t m = 1; + for (int j = 0; j < QK_K/64; ++j) { + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); + for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); + for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0); + a += 32; m <<= 1; + q4 += 32; + } + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/32; ++j) { + int32_t scale = scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q6_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined __wasm_simd128__ + int8_t aux8[QK_K] __attribute__((aligned(16))); + int32_t aux32[8] __attribute__((aligned(16))) = {0}; + float sums[8] __attribute__((aligned(16))) = {0}; + + for (int i = 0; i < nb; ++i) { + // Unpack 6-bit quantized data into aux8 (unchanged) + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + int8_t * a = aux8; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) { + a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; + a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; + a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; + a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 
32; + } + a += 128; + q4 += 64; + qh += 32; + } + + const int8_t * GGML_RESTRICT a_ptr = aux8; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + v128_t acc0 = wasm_i32x4_splat(0); + v128_t acc1 = wasm_i32x4_splat(0); + + for (int j = 0; j < QK_K/16; ++j) { + const int scale = x[i].scales[j]; + const v128_t vscale = wasm_i32x4_splat(scale); + + // Load 16 elements from a and q8 + const v128_t a_vec = wasm_v128_load(a_ptr); + const v128_t q8_vec = wasm_v128_load(q8); + + // Process low 8 elements + v128_t a_low = wasm_i16x8_extend_low_i8x16(a_vec); + v128_t q8_low = wasm_i16x8_extend_low_i8x16(q8_vec); + v128_t prod_low = wasm_i16x8_mul(a_low, q8_low); + v128_t prod_lo_lo = wasm_i32x4_extend_low_i16x8(prod_low); + v128_t prod_lo_hi = wasm_i32x4_extend_high_i16x8(prod_low); + + // Process high 8 elements + v128_t a_high = wasm_i16x8_extend_high_i8x16(a_vec); + v128_t q8_high = wasm_i16x8_extend_high_i8x16(q8_vec); + v128_t prod_high = wasm_i16x8_mul(a_high, q8_high); + v128_t prod_hi_lo = wasm_i32x4_extend_low_i16x8(prod_high); + v128_t prod_hi_hi = wasm_i32x4_extend_high_i16x8(prod_high); + + // Scale and accumulate + prod_lo_lo = wasm_i32x4_mul(prod_lo_lo, vscale); + prod_lo_hi = wasm_i32x4_mul(prod_lo_hi, vscale); + prod_hi_lo = wasm_i32x4_mul(prod_hi_lo, vscale); + prod_hi_hi = wasm_i32x4_mul(prod_hi_hi, vscale); + + acc0 = wasm_i32x4_add(acc0, wasm_i32x4_add(prod_lo_lo, prod_hi_lo)); + acc1 = wasm_i32x4_add(acc1, wasm_i32x4_add(prod_lo_hi, prod_hi_hi)); + + a_ptr += 16; + q8 += 16; + } + + // Store accumulated results + wasm_v128_store(&aux32[0], acc0); + wasm_v128_store(&aux32[4], acc1); + + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) { + sums[l] += d * aux32[l]; + } + } + + // Sum final results + float sumf = 0; + for (int l = 0; l < 8; ++l) { + sumf += sums[l]; + } + *s = sumf; + +#else + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) { + a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; + a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; + a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; + a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; + } + a += 128; + q4 += 64; + qh += 32; + } + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/16; ++j) { + int scale = x[i].scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + diff --git a/ggml/src/ggml-cpu/cpu-feats-x86.cpp b/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp similarity index 100% rename from ggml/src/ggml-cpu/cpu-feats-x86.cpp rename to ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp diff --git a/ggml/src/ggml-cpu/arch/x86/quants.c b/ggml/src/ggml-cpu/arch/x86/quants.c new file mode 100644 index 
0000000000000..e7527c00a8f17
--- /dev/null
+++ b/ggml/src/ggml-cpu/arch/x86/quants.c
@@ -0,0 +1,4311 @@
+#define GGML_COMMON_IMPL_C
+#include "ggml-common.h"
+#include "ggml-quants.h"
+#include "ggml-impl.h"
+#include "ggml-cpu.h"
+#include "simd-mappings.h"
+
+#include "../../quants.h"
+#include "../../ggml-cpu-impl.h"
+
+#include <math.h>
+#include <string.h>
+#include <assert.h>
+#include <stdlib.h> // for qsort
+#include <stdio.h>  // for GGML_ASSERT
+
+#define GROUP_MAX_EPS 1e-15f
+#define GROUP_MAX_EPS_IQ3_XXS 1e-8f
+#define GROUP_MAX_EPS_IQ2_S 1e-8f
+#define GROUP_MAX_EPS_IQ1_M 1e-7f
+#define GROUP_MAX_EPS_IQ1_S 1e-12f
+
+#define UNUSED GGML_UNUSED
+
+// some compilers don't provide _mm256_set_m128i, e.g. gcc 7
+#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
+
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
+// multiply int8_t, add results pairwise twice
+static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
+    // Get absolute values of x vectors
+    const __m128i ax = _mm_sign_epi8(x, x);
+    // Sign the values of the y vectors
+    const __m128i sy = _mm_sign_epi8(y, x);
+    // Perform multiplication and create 16-bit values
+    const __m128i dot = _mm_maddubs_epi16(ax, sy);
+    const __m128i ones = _mm_set1_epi16(1);
+    return _mm_madd_epi16(ones, dot);
+}
+
+#if __AVX__ || __AVX2__ || __AVX512F__
+// horizontally add 8 floats
+static inline float hsum_float_8(const __m256 x) {
+    __m128 res = _mm256_extractf128_ps(x, 1);
+    res = _mm_add_ps(res, _mm256_castps256_ps128(x));
+    res = _mm_add_ps(res, _mm_movehl_ps(res, res));
+    res = _mm_add_ss(res, _mm_movehdup_ps(res));
+    return _mm_cvtss_f32(res);
+}
+
+// horizontally add 8 int32_t
+static inline int hsum_i32_8(const __m256i a) {
+    const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
+    const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128);
+    const __m128i sum64 = _mm_add_epi32(hi64, sum128);
+    const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
+    return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
+}
+
+// horizontally add 4 int32_t
+static inline int hsum_i32_4(const __m128i a) {
+    const __m128i hi64 = _mm_unpackhi_epi64(a, a);
+    const __m128i sum64 = _mm_add_epi32(hi64, a);
+    const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
+    return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
+}
+
+#if defined(__AVX2__) || defined(__AVX512F__)
+// spread 32 bits to 32 bytes { 0x00, 0xFF }
+static inline __m256i bytes_from_bits_32(const uint8_t * x) {
+    uint32_t x32;
+    memcpy(&x32, x, sizeof(uint32_t));
+    const __m256i shuf_mask = _mm256_set_epi64x(
+            0x0303030303030303, 0x0202020202020202,
+            0x0101010101010101, 0x0000000000000000);
+    __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(x32), shuf_mask);
+    const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe);
+    bytes = _mm256_or_si256(bytes, bit_mask);
+    return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1));
+}
+
+// Unpack 32 4-bit fields into 32 bytes
+// The output vector contains 32 bytes, each one in [ 0 ..
15 ] interval +static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) +{ + const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi); + const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp); + const __m256i lowMask = _mm256_set1_epi8( 0xF ); + return _mm256_and_si256(lowMask, bytes); +} + +// add int16_t pairwise and return as float vector +static inline __m256 sum_i16_pairs_float(const __m256i x) { + const __m256i ones = _mm256_set1_epi16(1); + const __m256i summed_pairs = _mm256_madd_epi16(ones, x); + return _mm256_cvtepi32_ps(summed_pairs); +} + +static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { +#if defined(__AVX512VNNI__) && defined(__AVX512VL__) + const __m256i zero = _mm256_setzero_si256(); + const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy); + return _mm256_cvtepi32_ps(summed_pairs); +#elif defined(__AVXVNNI__) + const __m256i zero = _mm256_setzero_si256(); + const __m256i summed_pairs = _mm256_dpbusd_avx_epi32(zero, ax, sy); + return _mm256_cvtepi32_ps(summed_pairs); +#else + // Perform multiplication and create 16-bit values + const __m256i dot = _mm256_maddubs_epi16(ax, sy); + return sum_i16_pairs_float(dot); +#endif +} + +// multiply int8_t, add results pairwise twice and return as float vector +static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { +#if __AVXVNNIINT8__ + const __m256i zero = _mm256_setzero_si256(); + const __m256i summed_pairs = _mm256_dpbssd_epi32(zero, x, y); + return _mm256_cvtepi32_ps(summed_pairs); +#else + // Get absolute values of x vectors + const __m256i ax = _mm256_sign_epi8(x, x); + // Sign the values of the y vectors + const __m256i sy = _mm256_sign_epi8(y, x); + return mul_sum_us8_pairs_float(ax, sy); +#endif +} + +static inline __m128i packNibbles( __m256i bytes ) +{ + // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh +#if __AVX512F__ + const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4); // 0000_0000_abcd_0000 + bytes = _mm256_or_si256(bytes, bytes_srli_4); // 0000_abcd_abcd_efgh + return _mm256_cvtepi16_epi8(bytes); // abcd_efgh +#else + const __m256i lowByte = _mm256_set1_epi16( 0xFF ); + __m256i high = _mm256_andnot_si256( lowByte, bytes ); + __m256i low = _mm256_and_si256( lowByte, bytes ); + high = _mm256_srli_epi16( high, 4 ); + bytes = _mm256_or_si256( low, high ); + + // Compress uint16_t lanes into bytes + __m128i r0 = _mm256_castsi256_si128( bytes ); + __m128i r1 = _mm256_extracti128_si256( bytes, 1 ); + return _mm_packus_epi16( r0, r1 ); +#endif +} +#elif defined(__AVX__) +static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) +{ + // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh + const __m128i lowByte = _mm_set1_epi16( 0xFF ); + __m128i high = _mm_andnot_si128( lowByte, bytes1 ); + __m128i low = _mm_and_si128( lowByte, bytes1 ); + high = _mm_srli_epi16( high, 4 ); + bytes1 = _mm_or_si128( low, high ); + high = _mm_andnot_si128( lowByte, bytes2 ); + low = _mm_and_si128( lowByte, bytes2 ); + high = _mm_srli_epi16( high, 4 ); + bytes2 = _mm_or_si128( low, high ); + + return _mm_packus_epi16( bytes1, bytes2); +} + +static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) { + const __m128i ax = _mm_sign_epi8(x, x); + const __m128i sy = _mm_sign_epi8(y, x); + return _mm_maddubs_epi16(ax, sy); +} + +// spread 32 bits to 32 bytes { 0x00, 0xFF } +static inline __m256i bytes_from_bits_32(const uint8_t * x) { + uint32_t x32; + memcpy(&x32, x, 
sizeof(uint32_t)); + const __m128i shuf_maskl = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000); + const __m128i shuf_maskh = _mm_set_epi64x(0x0303030303030303, 0x0202020202020202); + __m128i bytesl = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskl); + __m128i bytesh = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskh); + const __m128i bit_mask = _mm_set1_epi64x(0x7fbfdfeff7fbfdfe); + bytesl = _mm_or_si128(bytesl, bit_mask); + bytesh = _mm_or_si128(bytesh, bit_mask); + bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1)); + bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1)); + return MM256_SET_M128I(bytesh, bytesl); +} + +// Unpack 32 4-bit fields into 32 bytes +// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval +static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) +{ + // Load 16 bytes from memory + __m128i tmpl = _mm_loadu_si128((const __m128i *)rsi); + __m128i tmph = _mm_srli_epi16(tmpl, 4); + const __m128i lowMask = _mm_set1_epi8(0xF); + tmpl = _mm_and_si128(lowMask, tmpl); + tmph = _mm_and_si128(lowMask, tmph); + return MM256_SET_M128I(tmph, tmpl); +} + +// add int16_t pairwise and return as float vector +static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) { + const __m128i ones = _mm_set1_epi16(1); + const __m128i summed_pairsl = _mm_madd_epi16(ones, xl); + const __m128i summed_pairsh = _mm_madd_epi16(ones, xh); + const __m256i summed_pairs = MM256_SET_M128I(summed_pairsh, summed_pairsl); + return _mm256_cvtepi32_ps(summed_pairs); +} + +static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { + const __m128i axl = _mm256_castsi256_si128(ax); + const __m128i axh = _mm256_extractf128_si256(ax, 1); + const __m128i syl = _mm256_castsi256_si128(sy); + const __m128i syh = _mm256_extractf128_si256(sy, 1); + // Perform multiplication and create 16-bit values + const __m128i dotl = _mm_maddubs_epi16(axl, syl); + const __m128i doth = _mm_maddubs_epi16(axh, syh); + return sum_i16_pairs_float(doth, dotl); +} + +// multiply int8_t, add results pairwise twice and return as float vector +static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { + const __m128i xl = _mm256_castsi256_si128(x); + const __m128i xh = _mm256_extractf128_si256(x, 1); + const __m128i yl = _mm256_castsi256_si128(y); + const __m128i yh = _mm256_extractf128_si256(y, 1); + // Get absolute values of x vectors + const __m128i axl = _mm_sign_epi8(xl, xl); + const __m128i axh = _mm_sign_epi8(xh, xh); + // Sign the values of the y vectors + const __m128i syl = _mm_sign_epi8(yl, xl); + const __m128i syh = _mm_sign_epi8(yh, xh); + // Perform multiplication and create 16-bit values + const __m128i dotl = _mm_maddubs_epi16(axl, syl); + const __m128i doth = _mm_maddubs_epi16(axh, syh); + return sum_i16_pairs_float(doth, dotl); +} + +// larger version of mul_sum_i8_pairs_float where x and y are each represented by four 128-bit vectors +static inline __m256 mul_sum_i8_quad_float(const __m128i x_1_0, const __m128i x_1_1, const __m128i x_2_0, const __m128i x_2_1, + const __m128i y_1_0, const __m128i y_1_1, const __m128i y_2_0, const __m128i y_2_1) { + const __m128i mone = _mm_set1_epi16(1); + + const __m128i p16_1_0 = mul_add_epi8_sse(x_1_0, y_1_0); + const __m128i p16_1_1 = mul_add_epi8_sse(x_1_1, y_1_1); + const __m128i p16_2_0 = mul_add_epi8_sse(x_2_0, y_2_0); + const __m128i p16_2_1 = mul_add_epi8_sse(x_2_1, y_2_1); + const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone); + const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, 
mone); + const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone); + const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone); + const __m128i p_1 = _mm_add_epi32(p_1_0, p_1_1); + const __m128i p_2 = _mm_add_epi32(p_2_0, p_2_1); + return _mm256_cvtepi32_ps(MM256_SET_M128I(p_2, p_1)); +} + +// quad fp16 delta calculation +static inline __m256 quad_fp16_delta_float(const float x0, const float y0, const float x1, const float y1) { + // GGML_CPU_FP16_TO_FP32 is faster than Intel F16C + return _mm256_set_m128(_mm_set1_ps(GGML_CPU_FP16_TO_FP32(x1) * GGML_CPU_FP16_TO_FP32(y1)), + _mm_set1_ps(GGML_CPU_FP16_TO_FP32(x0) * GGML_CPU_FP16_TO_FP32(y0))); +} +#endif +#elif defined(__SSSE3__) +// horizontally add 4x4 floats +static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) { + __m128 res_0 =_mm_hadd_ps(a, b); + __m128 res_1 =_mm_hadd_ps(c, d); + __m128 res =_mm_hadd_ps(res_0, res_1); + res =_mm_hadd_ps(res, res); + res =_mm_hadd_ps(res, res); + + return _mm_cvtss_f32(res); +} +#endif // __AVX__ || __AVX2__ || __AVX512F__ +#endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) + +void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(QK8_0 == 32); + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; + + block_q8_0 * GGML_RESTRICT y = vy; + +#if defined(__AVX2__) || defined(__AVX__) + for (int i = 0; i < nb; i++) { + // Load elements into 4 AVX vectors + __m256 v0 = _mm256_loadu_ps( x ); + __m256 v1 = _mm256_loadu_ps( x + 8 ); + __m256 v2 = _mm256_loadu_ps( x + 16 ); + __m256 v3 = _mm256_loadu_ps( x + 24 ); + x += 32; + + // Compute max(abs(e)) for the block + const __m256 signBit = _mm256_set1_ps( -0.0f ); + __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); + + __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); + max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); + max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); + const float maxScalar = _mm_cvtss_f32( max4 ); + + // Quantize these floats + const float d = maxScalar / 127.f; + y[i].d = GGML_CPU_FP32_TO_FP16(d); + const float id = ( maxScalar != 0.0f ) ? 
127.f / maxScalar : 0.0f; + const __m256 mul = _mm256_set1_ps( id ); + + // Apply the multiplier + v0 = _mm256_mul_ps( v0, mul ); + v1 = _mm256_mul_ps( v1, mul ); + v2 = _mm256_mul_ps( v2, mul ); + v3 = _mm256_mul_ps( v3, mul ); + + // Round to nearest integer + v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); + v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); + v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); + v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); + + // Convert floats to integers + __m256i i0 = _mm256_cvtps_epi32( v0 ); + __m256i i1 = _mm256_cvtps_epi32( v1 ); + __m256i i2 = _mm256_cvtps_epi32( v2 ); + __m256i i3 = _mm256_cvtps_epi32( v3 ); + +#if defined(__AVX2__) + // Convert int32 to int16 + i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 + i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 + // Convert int16 to int8 + i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 + + // We got our precious signed bytes, but the order is now wrong + // These AVX2 pack instructions process 16-byte pieces independently + // The following instruction is fixing the order + const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); + i0 = _mm256_permutevar8x32_epi32( i0, perm ); + + _mm256_storeu_si256((__m256i *)y[i].qs, i0); +#else + // Since we don't have in AVX some necessary functions, + // we split the registers in half and call AVX2 analogs from SSE + __m128i ni0 = _mm256_castsi256_si128( i0 ); + __m128i ni1 = _mm256_extractf128_si256( i0, 1); + __m128i ni2 = _mm256_castsi256_si128( i1 ); + __m128i ni3 = _mm256_extractf128_si256( i1, 1); + __m128i ni4 = _mm256_castsi256_si128( i2 ); + __m128i ni5 = _mm256_extractf128_si256( i2, 1); + __m128i ni6 = _mm256_castsi256_si128( i3 ); + __m128i ni7 = _mm256_extractf128_si256( i3, 1); + + // Convert int32 to int16 + ni0 = _mm_packs_epi32( ni0, ni1 ); + ni2 = _mm_packs_epi32( ni2, ni3 ); + ni4 = _mm_packs_epi32( ni4, ni5 ); + ni6 = _mm_packs_epi32( ni6, ni7 ); + // Convert int16 to int8 + ni0 = _mm_packs_epi16( ni0, ni2 ); + ni4 = _mm_packs_epi16( ni4, ni6 ); + + _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0); + _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); +#endif + } +#else + GGML_UNUSED(nb); + // scalar + quantize_row_q8_0_ref(x, y, k); +#endif +} + +void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(k % QK8_1 == 0); + const int nb = k / QK8_1; + + block_q8_1 * GGML_RESTRICT y = vy; +#if defined(__AVX2__) || defined(__AVX__) + for (int i = 0; i < nb; i++) { + // Load elements into 4 AVX vectors + __m256 v0 = _mm256_loadu_ps( x ); + __m256 v1 = _mm256_loadu_ps( x + 8 ); + __m256 v2 = _mm256_loadu_ps( x + 16 ); + __m256 v3 = _mm256_loadu_ps( x + 24 ); + x += 32; + + // Compute max(abs(e)) for the block + const __m256 signBit = _mm256_set1_ps( -0.0f ); + __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); + + __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); + max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); + max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); + const float max_scalar = _mm_cvtss_f32( max4 ); + + // Quantize these floats + 
const float d = max_scalar / 127.f; + y[i].d = GGML_CPU_FP32_TO_FP16(d); + const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f; + const __m256 mul = _mm256_set1_ps( id ); + + // Apply the multiplier + v0 = _mm256_mul_ps( v0, mul ); + v1 = _mm256_mul_ps( v1, mul ); + v2 = _mm256_mul_ps( v2, mul ); + v3 = _mm256_mul_ps( v3, mul ); + + // Round to nearest integer + v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); + v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); + v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); + v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); + + // Convert floats to integers + __m256i i0 = _mm256_cvtps_epi32( v0 ); + __m256i i1 = _mm256_cvtps_epi32( v1 ); + __m256i i2 = _mm256_cvtps_epi32( v2 ); + __m256i i3 = _mm256_cvtps_epi32( v3 ); + +#if defined(__AVX2__) + // Compute the sum of the quants and set y[i].s + y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3)))); + + // Convert int32 to int16 + i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 + i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 + // Convert int16 to int8 + i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 + + // We got our precious signed bytes, but the order is now wrong + // These AVX2 pack instructions process 16-byte pieces independently + // The following instruction is fixing the order + const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); + i0 = _mm256_permutevar8x32_epi32( i0, perm ); + + _mm256_storeu_si256((__m256i *)y[i].qs, i0); +#else + // Since we don't have in AVX some necessary functions, + // we split the registers in half and call AVX2 analogs from SSE + __m128i ni0 = _mm256_castsi256_si128( i0 ); + __m128i ni1 = _mm256_extractf128_si256( i0, 1); + __m128i ni2 = _mm256_castsi256_si128( i1 ); + __m128i ni3 = _mm256_extractf128_si256( i1, 1); + __m128i ni4 = _mm256_castsi256_si128( i2 ); + __m128i ni5 = _mm256_extractf128_si256( i2, 1); + __m128i ni6 = _mm256_castsi256_si128( i3 ); + __m128i ni7 = _mm256_extractf128_si256( i3, 1); + + // Compute the sum of the quants and set y[i].s + const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3)); + const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7)); + y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_4(_mm_add_epi32(s0, s1))); + + // Convert int32 to int16 + ni0 = _mm_packs_epi32( ni0, ni1 ); + ni2 = _mm_packs_epi32( ni2, ni3 ); + ni4 = _mm_packs_epi32( ni4, ni5 ); + ni6 = _mm_packs_epi32( ni6, ni7 ); + // Convert int16 to int8 + ni0 = _mm_packs_epi16( ni0, ni2 ); + ni4 = _mm_packs_epi16( ni4, ni6 ); + + _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0); + _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); +#endif + } +#else + GGML_UNUSED(nb); + // scalar + quantize_row_q8_1_ref(x, y, k); +#endif +} + +// placeholder implementation for Apple targets +void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { + quantize_row_q8_K_ref(x, y, k); +} + +//===================================== Dot products ================================= + +// +// Helper functions +// + +#if __AVX__ || __AVX2__ || __AVX512F__ + +// shuffles to pick the required scales in dot products +static inline __m256i get_scale_shuffle_q3k(int i) { + static const uint8_t k_shuffle[128] = { + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 
0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, + 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, + 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, + 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, + }; + return _mm256_loadu_si256((const __m256i*)k_shuffle + i); +} +static inline __m256i get_scale_shuffle_k4(int i) { + static const uint8_t k_shuffle[256] = { + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, + 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, + 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, + 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, + 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, + 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, + 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, + 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15 + }; + return _mm256_loadu_si256((const __m256i*)k_shuffle + i); +} +static inline __m128i get_scale_shuffle(int i) { + static const uint8_t k_shuffle[128] = { + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, + 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, + 10,10,10,10,10,10,10,10, 11,11,11,11,11,11,11,11, + 12,12,12,12,12,12,12,12, 13,13,13,13,13,13,13,13, + 14,14,14,14,14,14,14,14, 15,15,15,15,15,15,15,15 + }; + return _mm_loadu_si128((const __m128i*)k_shuffle + i); +} +#endif + +void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0; + +#if defined(__AVX2__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + // Main loop + for (; ib < nb; ++ib) { + /* Compute combined scale for the block */ + const __m256 d = _mm256_set1_ps( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) ); + + __m256i qx = bytes_from_nibbles_32(x[ib].qs); + + // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. 
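
Aside, not part of the patch: the q4_0 byte layout that bytes_from_nibbles_32 and the offset below undo, written out in scalar form. A minimal sketch mirroring the scalar tail loop at the end of this function; the helper name is ours.

    #include <stdint.h>

    // Each q4_0 byte carries two 4-bit quants biased by +8; unpacking and
    // subtracting 8 recovers the signed values in [-8 .. +7] that the SIMD
    // path above produces 32 at a time.
    static inline void q4_0_unpack_byte(uint8_t packed, int8_t * v_lo, int8_t * v_hi) {
        *v_lo = (int8_t)((packed & 0x0F) - 8); // low nibble -> element j
        *v_hi = (int8_t)((packed >>   4) - 8); // high nibble -> element j + qk/2
    }
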
+ const __m256i off = _mm256_set1_epi8( 8 ); + qx = _mm256_sub_epi8( qx, off ); + + __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs); + + const __m256 q = mul_sum_i8_pairs_float(qx, qy); + + /* Multiply q with scale and accumulate */ + acc = _mm256_fmadd_ps( d, q, acc ); + } + + sumf = hsum_float_8(acc); +#elif defined(__AVX__) + __m256 accum = _mm256_setzero_ps(); + for (; ib + 1 < nb; ib += 2) { + const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs); + const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs); + const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs); + const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1); + const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs); + const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1); + + const __m128i q4b_1_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_1), _mm_set1_epi8(8)); + const __m128i q4b_1_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_1, 4)), _mm_set1_epi8(8)); + const __m128i q4b_2_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_2), _mm_set1_epi8(8)); + const __m128i q4b_2_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_2, 4)), _mm_set1_epi8(8)); + + const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0); + const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1); + const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0); + const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1); + const __m128i p_1 = _mm_add_epi16(p16_1_0, p16_1_1); + const __m128i p_2 = _mm_add_epi16(p16_2_0, p16_2_1); + const __m256 p = sum_i16_pairs_float(p_2, p_1); + + const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d); + accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum); + } + + sumf = hsum_float_8(accum); +#elif defined(__SSSE3__) + // set constants + const __m128i lowMask = _mm_set1_epi8(0xF); + const __m128i off = _mm_set1_epi8(8); + + // Initialize accumulator with zeros + __m128 acc_0 = _mm_setzero_ps(); + __m128 acc_1 = _mm_setzero_ps(); + __m128 acc_2 = _mm_setzero_ps(); + __m128 acc_3 = _mm_setzero_ps(); + + for (; ib + 1 < nb; ib += 2) { + _mm_prefetch(&x[ib] + sizeof(block_q4_0), _MM_HINT_T0); + _mm_prefetch(&y[ib] + sizeof(block_q8_0), _MM_HINT_T0); + + // Compute combined scale for the block 0 and 1 + const __m128 d_0_1 = _mm_set1_ps( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) ); + + const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[ib].qs); + + __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1); + __m128i by_0 = _mm_loadu_si128((const __m128i *)y[ib].qs); + bx_0 = _mm_sub_epi8(bx_0, off); + const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); + + __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4)); + __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[ib].qs + 16)); + bx_1 = _mm_sub_epi8(bx_1, off); + const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1); + + _mm_prefetch(&x[ib] + 2 * sizeof(block_q4_0), _MM_HINT_T0); + _mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0); + + // Compute combined scale for the block 2 and 3 + const __m128 d_2_3 = _mm_set1_ps( GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) ); + + const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs); + + __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3); + __m128i by_2 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs); + bx_2 = _mm_sub_epi8(bx_2, off); + const 
__m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2); + + __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4)); + __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[ib + 1].qs + 16)); + bx_3 = _mm_sub_epi8(bx_3, off); + const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3); + + // Convert int32_t to float + __m128 p0 = _mm_cvtepi32_ps(i32_0); + __m128 p1 = _mm_cvtepi32_ps(i32_1); + __m128 p2 = _mm_cvtepi32_ps(i32_2); + __m128 p3 = _mm_cvtepi32_ps(i32_3); + + // Apply the scale + __m128 p0_d = _mm_mul_ps( d_0_1, p0 ); + __m128 p1_d = _mm_mul_ps( d_0_1, p1 ); + __m128 p2_d = _mm_mul_ps( d_2_3, p2 ); + __m128 p3_d = _mm_mul_ps( d_2_3, p3 ); + + // Accumulate + acc_0 = _mm_add_ps(p0_d, acc_0); + acc_1 = _mm_add_ps(p1_d, acc_1); + acc_2 = _mm_add_ps(p2_d, acc_2); + acc_3 = _mm_add_ps(p3_d, acc_3); + } + + sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3); + +#endif + for (; ib < nb; ++ib) { + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[ib].qs[j] & 0x0F) - 8; + const int v1 = (x[ib].qs[j] >> 4) - 8; + + sumi0 += (v0 * y[ib].qs[j]); + sumi1 += (v1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d); + } + + *s = sumf; +} + +void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_1; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_1 * GGML_RESTRICT x = vx; + const block_q8_1 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0; + +#if defined(__AVX2__) || defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + float summs = 0; + + // Main loop + for (; ib < nb; ++ib) { + const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d); + const float d1 = GGML_CPU_FP16_TO_FP32(y[ib].d); + + summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s); + + const __m256 d0v = _mm256_set1_ps( d0 ); + const __m256 d1v = _mm256_set1_ps( d1 ); + + // Compute combined scales + const __m256 d0d1 = _mm256_mul_ps( d0v, d1v ); + + // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes + const __m256i qx = bytes_from_nibbles_32(x[ib].qs); + const __m256i qy = _mm256_loadu_si256( (const __m256i *)y[ib].qs ); + + const __m256 xy = mul_sum_us8_pairs_float(qx, qy); + + // Accumulate d0*d1*x*y +#if defined(__AVX2__) + acc = _mm256_fmadd_ps( d0d1, xy, acc ); +#else + acc = _mm256_add_ps( _mm256_mul_ps( d0d1, xy ), acc ); +#endif + } + + sumf = hsum_float_8(acc) + summs; + +#endif + for (; ib < nb; ++ib) { + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[ib].qs[j] & 0x0F); + const int v1 = (x[ib].qs[j] >> 4); + + sumi0 += (v0 * y[ib].qs[j]); + sumi1 += (v1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); + } + + *s = sumf; +} + +void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + int ib = 0; + float sumf = 0; + + assert(n % qk == 0); + assert(qk == QK5_0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_0 *
GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + +#if defined(__AVX2__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + // Main loop + for (; ib < nb; ++ib) { + /* Compute combined scale for the block */ + const __m256 d = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); + + __m256i qx = bytes_from_nibbles_32(x[ib].qs); + __m256i bxhi = bytes_from_bits_32(x[ib].qh); + bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0)); + qx = _mm256_or_si256(qx, bxhi); + + __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs); + + const __m256 q = mul_sum_i8_pairs_float(qx, qy); + + /* Multiply q with scale and accumulate */ + acc = _mm256_fmadd_ps(d, q, acc); + } + + sumf = hsum_float_8(acc); +#elif defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + __m128i mask = _mm_set1_epi8((char)0xF0); + + // Main loop + for (; ib < nb; ++ib) { + /* Compute combined scale for the block */ + const __m256 d = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); + + __m256i bx_0 = bytes_from_nibbles_32(x[ib].qs); + const __m256i bxhi = bytes_from_bits_32(x[ib].qh); + __m128i bxhil = _mm256_castsi256_si128(bxhi); + __m128i bxhih = _mm256_extractf128_si256(bxhi, 1); + bxhil = _mm_andnot_si128(bxhil, mask); + bxhih = _mm_andnot_si128(bxhih, mask); + __m128i bxl = _mm256_castsi256_si128(bx_0); + __m128i bxh = _mm256_extractf128_si256(bx_0, 1); + bxl = _mm_or_si128(bxl, bxhil); + bxh = _mm_or_si128(bxh, bxhih); + bx_0 = MM256_SET_M128I(bxh, bxl); + + const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[ib].qs); + + const __m256 q = mul_sum_i8_pairs_float(bx_0, by_0); + + /* Multiply q with scale and accumulate */ + acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc); + } + + sumf = hsum_float_8(acc); + +#endif + for (; ib < nb; ++ib) { + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); + + const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16); + const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16); + + sumi0 += (x0 * y[ib].qs[j]); + sumi1 += (x1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi; + } + + *s = sumf; +} + +void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_1; + const int nb = n / qk; + + int ib = 0; + float sumf = 0; + + assert(n % qk == 0); + assert(qk == QK5_1); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_1 * GGML_RESTRICT x = vx; + const block_q8_1 * GGML_RESTRICT y = vy; + +#if defined(__AVX2__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + float summs = 0.0f; + + // Main loop + for (; ib < nb; ++ib) { + const __m256 dx = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d)); + + summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s); + + __m256i qx = bytes_from_nibbles_32(x[ib].qs); + __m256i bxhi = bytes_from_bits_32(x[ib].qh); + bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10)); + qx = _mm256_or_si256(qx, bxhi); + + const __m256 dy = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib].d)); + const __m256i qy = 
_mm256_loadu_si256((const __m256i *)y[ib].qs); + + const __m256 q = mul_sum_us8_pairs_float(qx, qy); + + acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc); + } + + sumf = hsum_float_8(acc) + summs; +#elif defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + __m128i mask = _mm_set1_epi8(0x10); + + float summs = 0.0f; + + // Main loop + for (; ib < nb; ++ib) { + const __m256 dx = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d)); + + summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s); + + __m256i bx_0 = bytes_from_nibbles_32(x[ib].qs); + const __m256i bxhi = bytes_from_bits_32(x[ib].qh); + __m128i bxhil = _mm256_castsi256_si128(bxhi); + __m128i bxhih = _mm256_extractf128_si256(bxhi, 1); + bxhil = _mm_and_si128(bxhil, mask); + bxhih = _mm_and_si128(bxhih, mask); + __m128i bxl = _mm256_castsi256_si128(bx_0); + __m128i bxh = _mm256_extractf128_si256(bx_0, 1); + bxl = _mm_or_si128(bxl, bxhil); + bxh = _mm_or_si128(bxh, bxhih); + bx_0 = MM256_SET_M128I(bxh, bxl); + + const __m256 dy = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib].d)); + const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[ib].qs); + + const __m256 q = mul_sum_us8_pairs_float(bx_0, by_0); + + acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc); + } + + sumf = hsum_float_8(acc) + summs; + +#endif + for (; ib < nb; ++ib) { + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + + const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0; + const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1; + + sumi0 += (x0 * y[ib].qs[j]); + sumi1 += (x1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); + } + + *s = sumf; +} + +void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q8_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0; + +#if defined(__AVX2__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + // Main loop + for (; ib < nb; ++ib) { + // Compute combined scale for the block + const __m256 d = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); + __m256i qx = _mm256_loadu_si256((const __m256i *)x[ib].qs); + __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs); + + const __m256 q = mul_sum_i8_pairs_float(qx, qy); + + // Multiply q with scale and accumulate + acc = _mm256_fmadd_ps( d, q, acc ); + } + + sumf = hsum_float_8(acc); +#elif defined(__AVX__) + __m256 accum = _mm256_setzero_ps(); + + for (; ib + 1 < nb; ib += 2) { + const __m128i qx_1_0 = _mm_loadu_si128((const __m128i *)x[ib].qs); + const __m128i qx_1_1 = _mm_loadu_si128((const __m128i *)x[ib].qs + 1); + const __m128i qx_2_0 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs); + const __m128i qx_2_1 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs + 1); + const __m128i qy_1_0 = _mm_loadu_si128((const __m128i *)y[ib].qs); + const __m128i qy_1_1 = _mm_loadu_si128((const __m128i *)y[ib].qs + 1); + const __m128i qy_2_0 
= _mm_loadu_si128((const __m128i *)y[ib + 1].qs); + const __m128i qy_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1); + + const __m256 p = mul_sum_i8_quad_float(qx_1_0, qx_1_1, qx_2_0, qx_2_1, qy_1_0, qy_1_1, qy_2_0, qy_2_1); + const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d); + accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum); + } + + sumf = hsum_float_8(accum); + +#endif + for (; ib < nb; ++ib) { + int sumi = 0; + + for (int j = 0; j < qk; j++) { + sumi += x[ib].qs[j]*y[ib].qs[j]; + } + + sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)); + } + + *s = sumf; +} + +void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_tq1_0 * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__AVX2__) + __m256 sumf = _mm256_setzero_ps(); + + for (int i = 0; i < nb; ++i) { + // 16-bit sums + __m256i sumi0 = _mm256_setzero_si256(); + __m256i sumi1 = _mm256_setzero_si256(); + __m256i sumi2 = _mm256_setzero_si256(); + + // first 32 bytes of 5 elements + { + __m256i qx0 = _mm256_loadu_si256((const __m256i *) (x[i].qs)); + // 8-bit multiplies with shifts, masks and adds + __m256i qx1 = _mm256_add_epi8(qx0, _mm256_add_epi8(qx0, qx0)); // 1 * 3 + __m256i qx2 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx0, 3), _mm256_set1_epi8(-8)), qx0); // 1 * 9 + __m256i qx3 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx1, 3), _mm256_set1_epi8(-8)), qx1); // 3 * 9 + __m256i qx4 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx2, 3), _mm256_set1_epi8(-8)), qx2); // 9 * 9 + + // TODO: can _mm256_mulhi_epu16 be faster even if 16-bits? 
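
Aside, not part of the patch: the ternary-digit extraction that the avg/sub sequence below vectorizes, restated as scalar C. It is the same pow3 trick used by the scalar fallback at the end of this function; the helper name is ours.

    #include <stdint.h>

    // A tq1_0 byte packs five base-3 digits in fixed point. Multiplying by
    // 3^l (wrapping mod 256) shifts digit l to the top, and (q * 3) >> 8
    // reads it out as 0, 1 or 2; subtracting 1 maps it to {-1, 0, +1}.
    static inline int tq1_0_digit(uint8_t packed, int l) {
        static const uint8_t pow3[5] = {1, 3, 9, 27, 81};
        const uint8_t  q  = (uint8_t)(packed * pow3[l]); // wrap-around is intended
        const uint16_t xi = ((uint16_t) q * 3) >> 8;     // top ternary digit
        return (int) xi - 1;
    }
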
+ + // Cancel the +1 from avg so that it behaves like a halving add + qx0 = _mm256_subs_epu8(qx0, _mm256_set1_epi8(1)); + qx1 = _mm256_subs_epu8(qx1, _mm256_set1_epi8(1)); + qx2 = _mm256_subs_epu8(qx2, _mm256_set1_epi8(1)); + qx3 = _mm256_subs_epu8(qx3, _mm256_set1_epi8(1)); + qx4 = _mm256_subs_epu8(qx4, _mm256_set1_epi8(1)); + // Multiply by 3 and get the top 2 bits + qx0 = _mm256_avg_epu8(qx0, _mm256_avg_epu8(qx0, _mm256_setzero_si256())); + qx1 = _mm256_avg_epu8(qx1, _mm256_avg_epu8(qx1, _mm256_setzero_si256())); + qx2 = _mm256_avg_epu8(qx2, _mm256_avg_epu8(qx2, _mm256_setzero_si256())); + qx3 = _mm256_avg_epu8(qx3, _mm256_avg_epu8(qx3, _mm256_setzero_si256())); + qx4 = _mm256_avg_epu8(qx4, _mm256_avg_epu8(qx4, _mm256_setzero_si256())); + qx0 = _mm256_and_si256(_mm256_srli_epi16(qx0, 6), _mm256_set1_epi8(3)); + qx1 = _mm256_and_si256(_mm256_srli_epi16(qx1, 6), _mm256_set1_epi8(3)); + qx2 = _mm256_and_si256(_mm256_srli_epi16(qx2, 6), _mm256_set1_epi8(3)); + qx3 = _mm256_and_si256(_mm256_srli_epi16(qx3, 6), _mm256_set1_epi8(3)); + qx4 = _mm256_and_si256(_mm256_srli_epi16(qx4, 6), _mm256_set1_epi8(3)); + + const __m256i qy0 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 0)); + const __m256i qy1 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 32)); + const __m256i qy2 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 64)); + const __m256i qy3 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 96)); + const __m256i qy4 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 128)); + + qx0 = _mm256_maddubs_epi16(qx0, qy0); + qx1 = _mm256_maddubs_epi16(qx1, qy1); + qx2 = _mm256_maddubs_epi16(qx2, qy2); + qx3 = _mm256_maddubs_epi16(qx3, qy3); + qx4 = _mm256_maddubs_epi16(qx4, qy4); + + sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(qx0, qx1)); + sumi1 = _mm256_add_epi16(sumi1, _mm256_add_epi16(qx2, qx3)); + sumi2 = _mm256_add_epi16(sumi2, qx4); + } + + // last 16 bytes of 5-element, along with the 4 bytes of 4 elements + { + __m128i qx0 = _mm_loadu_si128((const __m128i *) (x[i].qs + 32)); + uint32_t qh; + memcpy(&qh, x[i].qh, sizeof(qh)); // potentially unaligned + __m256i qx5_l = _mm256_cvtepu8_epi16(_mm_set1_epi32(qh)); + __m128i qx1 = _mm_add_epi8(qx0, _mm_add_epi8(qx0, qx0)); // 1 * 3 + __m128i qx2 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx0, 3), _mm_set1_epi8(-8)), qx0); // 1 * 9 + __m128i qx3 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx1, 3), _mm_set1_epi8(-8)), qx1); // 3 * 9 + __m128i qx4 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx2, 3), _mm_set1_epi8(-8)), qx2); // 9 * 9 + __m256i qx01 = MM256_SET_M128I(qx1, qx0); + __m256i qx23 = MM256_SET_M128I(qx3, qx2); + + // avx2 does not have 8-bit multiplies, so 16-bit it is.
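
Aside, not part of the patch: the widen-multiply-narrow pattern that the next statements apply to qx5_l, as a free-standing sketch. Assumes AVX2; the helper name is hypothetical.

    #include <immintrin.h>

    // With no 8-bit SIMD multiply, bytes are zero-extended to 16-bit lanes,
    // multiplied there, masked back to their low byte, and re-packed.
    static inline __m128i mul_u8_keep_low(__m128i a_u8, __m128i b_u8) {
        const __m256i a16 = _mm256_cvtepu8_epi16(a_u8);   // 16 bytes -> 16 words
        const __m256i b16 = _mm256_cvtepu8_epi16(b_u8);
        __m256i prod = _mm256_mullo_epi16(a16, b16);      // 16-bit products
        prod = _mm256_and_si256(prod, _mm256_set1_epi16(0xFF)); // keep low bytes
        return _mm_packus_epi16(_mm256_castsi256_si128(prod),   // narrow back
                                _mm256_extracti128_si256(prod, 1));
    }
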
+ qx5_l = _mm256_mullo_epi16(qx5_l, _mm256_set_epi16(27, 27, 27, 27, 9, 9, 9, 9, 3, 3, 3, 3, 1, 1, 1, 1)); + qx5_l = _mm256_and_si256(qx5_l, _mm256_set1_epi16(0xFF)); + __m128i qx5 = _mm_packus_epi16(_mm256_castsi256_si128(qx5_l), _mm256_extracti128_si256(qx5_l, 1)); + + __m256i qx45 = MM256_SET_M128I(qx5, qx4); + + // Cancel the +1 from avg so that it behaves like a halving add + qx01 = _mm256_subs_epu8(qx01, _mm256_set1_epi8(1)); + qx23 = _mm256_subs_epu8(qx23, _mm256_set1_epi8(1)); + qx45 = _mm256_subs_epu8(qx45, _mm256_set1_epi8(1)); + // Multiply by 3 and get the top 2 bits + qx01 = _mm256_avg_epu8(qx01, _mm256_avg_epu8(qx01, _mm256_setzero_si256())); + qx23 = _mm256_avg_epu8(qx23, _mm256_avg_epu8(qx23, _mm256_setzero_si256())); + qx45 = _mm256_avg_epu8(qx45, _mm256_avg_epu8(qx45, _mm256_setzero_si256())); + qx01 = _mm256_and_si256(_mm256_srli_epi16(qx01, 6), _mm256_set1_epi8(3)); + qx23 = _mm256_and_si256(_mm256_srli_epi16(qx23, 6), _mm256_set1_epi8(3)); + qx45 = _mm256_and_si256(_mm256_srli_epi16(qx45, 6), _mm256_set1_epi8(3)); + + const __m256i qy01 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 160)); + const __m256i qy23 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 192)); + const __m256i qy45 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 224)); + + qx01 = _mm256_maddubs_epi16(qx01, qy01); + qx23 = _mm256_maddubs_epi16(qx23, qy23); + qx45 = _mm256_maddubs_epi16(qx45, qy45); + + sumi0 = _mm256_add_epi16(sumi0, qx01); + sumi1 = _mm256_add_epi16(sumi1, qx23); + sumi2 = _mm256_add_epi16(sumi2, qx45); + } + + const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums); + const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d)); + + sumi0 = _mm256_sub_epi16(sumi0, ysum); + sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(sumi1, sumi2)); + sumi0 = _mm256_madd_epi16(sumi0, _mm256_set1_epi16(1)); + + sumf = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(sumi0), d), sumf); + } + + *s = hsum_float_8(sumf); + +#else + const uint8_t pow3[6] = {1, 3, 9, 27, 81, 243}; + + float sumf = 0.0f; + + for (int i = 0; i < nb; ++i) { + int sum = 0; + + for (size_t j = 0; j < sizeof(x->qs) - sizeof(x->qs) % 32; j += 32) { + for (size_t l = 0; l < 5; ++l) { + for (size_t m = 0; m < 32; ++m) { + uint8_t q = x[i].qs[j + m] * pow3[l]; + uint16_t xi = ((uint16_t) q * 3) >> 8; + sum += (xi - 1) * y[i].qs[j*5 + l*32 + m]; + } + } + } + for (size_t j = sizeof(x->qs) - sizeof(x->qs) % 32; j < sizeof(x->qs); j += 16) { + for (size_t l = 0; l < 5; ++l) { + for (size_t m = 0; m < 16; ++m) { + uint8_t q = x[i].qs[j + m] * pow3[l]; + uint16_t xi = ((uint16_t) q * 3) >> 8; + sum += (xi - 1) * y[i].qs[j*5 + l*16 + m]; + } + } + } + + for (size_t l = 0; l < 4; ++l) { + for (size_t j = 0; j < sizeof(x->qh); ++j) { + uint8_t q = x[i].qh[j] * pow3[l]; + uint16_t xi = ((uint16_t) q * 3) >> 8; + sum += (xi - 1) * y[i].qs[sizeof(x->qs)*5 + l*sizeof(x->qh) + j]; + } + } + + sumf += (float) sum * (GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d); + } + + *s = sumf; +#endif +} + +void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_tq2_0 * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__AVX2__) + __m256 sumf = _mm256_setzero_ps(); + + for (int i = 0; i < nb; ++i) { + // 16-bit sums, because 256*127 still fits + __m256i sumi0 = 
_mm256_setzero_si256(); + __m256i sumi1 = _mm256_setzero_si256(); + + for (size_t j = 0; j < sizeof(x->qs); j += 32) { + __m256i qx0 = _mm256_loadu_si256((const __m256i *) (x[i].qs + j)); + __m256i qx1 = _mm256_srli_epi16(qx0, 2); + __m256i qx2 = _mm256_srli_epi16(qx0, 4); + __m256i qx3 = _mm256_srli_epi16(qx0, 6); + + // 0, 1, 2 (should not be 3) + qx0 = _mm256_and_si256(qx0, _mm256_set1_epi8(3)); + qx1 = _mm256_and_si256(qx1, _mm256_set1_epi8(3)); + qx2 = _mm256_and_si256(qx2, _mm256_set1_epi8(3)); + qx3 = _mm256_and_si256(qx3, _mm256_set1_epi8(3)); + + const __m256i qy0 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 0)); + const __m256i qy1 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 32)); + const __m256i qy2 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 64)); + const __m256i qy3 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 96)); + + qx0 = _mm256_maddubs_epi16(qx0, qy0); + qx1 = _mm256_maddubs_epi16(qx1, qy1); + qx2 = _mm256_maddubs_epi16(qx2, qy2); + qx3 = _mm256_maddubs_epi16(qx3, qy3); + + sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(qx0, qx1)); + sumi1 = _mm256_add_epi16(sumi1, _mm256_add_epi16(qx2, qx3)); + } + + const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums); + const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d)); + + sumi0 = _mm256_add_epi16(sumi0, sumi1); + sumi0 = _mm256_sub_epi16(sumi0, ysum); + sumi0 = _mm256_madd_epi16(sumi0, _mm256_set1_epi16(1)); + + sumf = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(sumi0), d), sumf); + } + + *s = hsum_float_8(sumf); + +#else + float sumf = 0.0f; + + for (int i = 0; i < nb; ++i) { + int32_t sumi = 0; + + for (size_t j = 0; j < sizeof(x->qs); j += 32) { + for (size_t l = 0; l < 4; ++l) { + for (size_t k = 0; k < 32; ++k) { + sumi += y[i].qs[j*4 + l*32 + k] * (((x[i].qs[j + k] >> (l*2)) & 3) - 1); + } + } + } + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + + sumf += (float) sumi * d; + } + + *s = sumf; +#endif +} + +void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q2_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined __AVX2__ + + const __m256i m3 = _mm256_set1_epi8(3); + const __m128i m4 = _mm_set1_epi8(0xF); + + __m256 acc = _mm256_setzero_ps(); + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales); + const __m128i scales8 = _mm_and_si128(mins_and_scales, m4); + const __m128i mins8 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4); + const __m256i mins = _mm256_cvtepi8_epi16(mins8); + const __m256i prod = _mm256_madd_epi16(mins, _mm256_loadu_si256((const __m256i*)y[i].bsums)); + + acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(prod), acc); + + const __m256i all_scales = _mm256_cvtepi8_epi16(scales8); + const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0); + const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1); + const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)}; + + __m256i sumi = 
_mm256_setzero_si256(); + + for (int j = 0; j < QK_K/128; ++j) { + + const __m256i q2bits = _mm256_loadu_si256((const __m256i*)q2); q2 += 32; + + const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + + const __m256i q2_0 = _mm256_and_si256(q2bits, m3); + const __m256i q2_1 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 2), m3); + const __m256i q2_2 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 4), m3); + const __m256i q2_3 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 6), m3); + + __m256i p0 = _mm256_maddubs_epi16(q2_0, q8_0); + __m256i p1 = _mm256_maddubs_epi16(q2_1, q8_1); + __m256i p2 = _mm256_maddubs_epi16(q2_2, q8_2); + __m256i p3 = _mm256_maddubs_epi16(q2_3, q8_3); + + p0 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(0)), p0); + p1 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(1)), p1); + p2 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(2)), p2); + p3 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(3)), p3); + + p0 = _mm256_add_epi32(p0, p1); + p2 = _mm256_add_epi32(p2, p3); + + sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p0, p2)); + } + + acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc); + + } + + *s = hsum_float_8(acc); + +#elif defined __AVX__ + + const __m128i m3 = _mm_set1_epi8(0x3); + const __m128i m4 = _mm_set1_epi8(0xF); + const __m128i m2 = _mm_set1_epi8(0x2); + + __m256 acc = _mm256_setzero_ps(); + + for (int i = 0; i < nb; ++i) { + + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + // load mins and scales from block_q2_K.scales[QK_K/16] + const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales); + const __m128i scales16 = _mm_and_si128(mins_and_scales, m4); + const __m128i mins16 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4); + const __m128i mins_0 = _mm_cvtepi8_epi16(mins16); + const __m128i mins_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(mins16, mins16)); + + // summs = y[i].bsums * (x[i].scales >> 4) in 16bits*8*2 to 32bits*4*2 + const __m128i summs_0 = _mm_madd_epi16(mins_0, _mm_loadu_si128((const __m128i*)&y[i].bsums[0])); + const __m128i summs_1 = _mm_madd_epi16(mins_1, _mm_loadu_si128((const __m128i*)&y[i].bsums[8])); + + // sumf += -dmin * summs in 32bits*8 + acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(MM256_SET_M128I(summs_1, summs_0))), acc); + + const __m128i scales_0 = _mm_cvtepi8_epi16(scales16); + const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales16, scales16)); + const __m128i scales[2] = { scales_0, scales_1 }; + + __m128i sumi_0 = _mm_setzero_si128(); + __m128i sumi_1 = _mm_setzero_si128(); + + for (int j = 0; j < QK_K/128; ++j) { + + // load Q8 quants int8*16*8 from block_q8_K.qs[QK_K] + const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const 
__m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + + // load 2bits*16*8 from block_q2_K.qs[QK_K/4] + __m128i q2bits = _mm_loadu_si128((const __m128i*)q2); q2 += 16; + const __m128i q2_0 = _mm_and_si128(q2bits, m3); + const __m128i q2_2 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3); + const __m128i q2_4 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3); + const __m128i q2_6 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3); + q2bits = _mm_loadu_si128((const __m128i*)q2); q2 += 16; + const __m128i q2_1 = _mm_and_si128(q2bits, m3); + const __m128i q2_3 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3); + const __m128i q2_5 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3); + const __m128i q2_7 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3); + + // isuml = q8[l] * ((q2[l] >> shift) & 3) in 8bits*16*8 to 16bits*8*8 + __m128i p0 = _mm_maddubs_epi16(q2_0, q8_0); + __m128i p1 = _mm_maddubs_epi16(q2_1, q8_1); + __m128i p2 = _mm_maddubs_epi16(q2_2, q8_2); + __m128i p3 = _mm_maddubs_epi16(q2_3, q8_3); + __m128i p4 = _mm_maddubs_epi16(q2_4, q8_4); + __m128i p5 = _mm_maddubs_epi16(q2_5, q8_5); + __m128i p6 = _mm_maddubs_epi16(q2_6, q8_6); + __m128i p7 = _mm_maddubs_epi16(q2_7, q8_7); + + // isum += (x[i].scales[is++] & 0xF) * isuml in 16bits*8*8 to 32bits*4*8 + __m128i shuffle = _mm_set1_epi16(0x0100); + p0 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p0); + shuffle = _mm_add_epi16(shuffle, m2); + p1 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p1); + shuffle = _mm_add_epi16(shuffle, m2); + p2 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p2); + shuffle = _mm_add_epi16(shuffle, m2); + p3 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p3); + shuffle = _mm_add_epi16(shuffle, m2); + p4 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p4); + shuffle = _mm_add_epi16(shuffle, m2); + p5 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p5); + shuffle = _mm_add_epi16(shuffle, m2); + p6 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p6); + shuffle = _mm_add_epi16(shuffle, m2); + p7 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p7); + + p0 = _mm_add_epi32(p0, p1); + p2 = _mm_add_epi32(p2, p3); + p4 = _mm_add_epi32(p4, p5); + p6 = _mm_add_epi32(p6, p7); + + // isum in 32bits*4*2 + sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p0, p2)); + sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p4, p6)); + } + + // sumf += dall * isum - dmin * summs in 32bits + __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); + acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dall), _mm256_cvtepi32_ps(sumi)), acc); + } + + *s = hsum_float_8(acc); + +#else + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + + const uint8_t * q2 = x[i].qs; + const int8_t * q8 = y[i].qs; + const uint8_t * sc = x[i].scales; + + int summs = 0; + for (int j = 0; j < 16; ++j) { + summs += y[i].bsums[j] * (sc[j] >> 4); + } + + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + int isum = 0; + int is = 0; + int d; + for (int k = 0; k < QK_K/128; ++k) { + int shift = 0; + for (int j = 0; j < 4; ++j) { + d = sc[is++] & 0xF; + int isuml = 0; + for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3); + isum += d * isuml; + d = sc[is++] & 0xF; + isuml = 0; + for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3); + isum += d * isuml; + shift += 2; + q8 += 32; 
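
Aside, not part of the patch: the net effect of the scalar q2_K loop in progress here, condensed into one superblock. A sketch under the q2_K layout (16 sub-blocks of 16 values, one scale/min byte each); parameter names are ours, and sub_dot[j] stands in for the inner q2*q8 dot product.

    #include <stdint.h>

    // sc[j] packs a 4-bit scale (low nibble) and a 4-bit min (high nibble).
    // The mins collapse into a single dmin*summs correction because the
    // per-sub-block sums of q8 are precomputed in bsums.
    static inline float q2_K_superblock_dot(float dall, float dmin,
                                            const uint8_t sc[16],
                                            const int16_t bsums[16],
                                            const int     sub_dot[16]) {
        int isum = 0, summs = 0;
        for (int j = 0; j < 16; ++j) {
            isum  += (sc[j] & 0xF) * sub_dot[j];
            summs += (sc[j] >> 4)  * bsums[j];
        }
        return dall * isum - dmin * summs;
    }
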
+ } + q2 += 32; + } + sumf += dall * isum - dmin * summs; + } + *s = sumf; +#endif +} + +void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const uint32_t kmask1 = 0x03030303; + const uint32_t kmask2 = 0x0f0f0f0f; + + const block_q3_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined __AVX2__ + + const __m256i m3 = _mm256_set1_epi8(3); + const __m256i mone = _mm256_set1_epi8(1); + const __m128i m32 = _mm_set1_epi8(32); + + __m256 acc = _mm256_setzero_ps(); + + uint32_t aux[3]; + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + // Set up scales + memcpy(aux, x[i].scales, 12); + __m128i scales128 = _mm_set_epi32( + ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4), + ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4), + (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4), + (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4)); + scales128 = _mm_sub_epi8(scales128, m32); + const __m256i all_scales = _mm256_cvtepi8_epi16(scales128); + const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0); + const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1); + const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)}; + + // high bit + const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].hmask); + + // integer accumulator + __m256i sumi = _mm256_setzero_si256(); + + int bit = 0; + int is = 0; + + for (int j = 0; j < QK_K/128; ++j) { + // load low 2 bits + const __m256i q3bits = _mm256_loadu_si256((const __m256i*)q3); q3 += 32; + + // prepare low and high bits + const __m256i q3l_0 = _mm256_and_si256(q3bits, m3); + const __m256i q3h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2); + ++bit; + + const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 2), m3); + const __m256i q3h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2); + ++bit; + + const __m256i q3l_2 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 4), m3); + const __m256i q3h_2 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2); + ++bit; + + const __m256i q3l_3 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 6), m3); + const __m256i q3h_3 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2); + ++bit; + + // load Q8 quants + const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + + // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16, + // and then subtract. 
The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set, + // and 2 if the high bit was set) + __m256i q8s_0 = _mm256_maddubs_epi16(q3h_0, q8_0); + __m256i q8s_1 = _mm256_maddubs_epi16(q3h_1, q8_1); + __m256i q8s_2 = _mm256_maddubs_epi16(q3h_2, q8_2); + __m256i q8s_3 = _mm256_maddubs_epi16(q3h_3, q8_3); + + __m256i p16_0 = _mm256_maddubs_epi16(q3l_0, q8_0); + __m256i p16_1 = _mm256_maddubs_epi16(q3l_1, q8_1); + __m256i p16_2 = _mm256_maddubs_epi16(q3l_2, q8_2); + __m256i p16_3 = _mm256_maddubs_epi16(q3l_3, q8_3); + + p16_0 = _mm256_sub_epi16(p16_0, q8s_0); + p16_1 = _mm256_sub_epi16(p16_1, q8s_1); + p16_2 = _mm256_sub_epi16(p16_2, q8s_2); + p16_3 = _mm256_sub_epi16(p16_3, q8s_3); + + // multiply with scales + p16_0 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 0)), p16_0); + p16_1 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 1)), p16_1); + p16_2 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 2)), p16_2); + p16_3 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 3)), p16_3); + + // accumulate + p16_0 = _mm256_add_epi32(p16_0, p16_1); + p16_2 = _mm256_add_epi32(p16_2, p16_3); + sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_2)); + + } + + // multiply with block scale and accumulate + acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc); + + } + + *s = hsum_float_8(acc); + +#elif defined __AVX__ + + const __m128i m3 = _mm_set1_epi8(3); + const __m128i mone = _mm_set1_epi8(1); + const __m128i m32 = _mm_set1_epi8(32); + const __m128i m2 = _mm_set1_epi8(2); + + __m256 acc = _mm256_setzero_ps(); + + const uint32_t *aux; + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + // Set up scales + aux = (const uint32_t *)x[i].scales; + __m128i scales128 = _mm_set_epi32( + ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4), + ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4), + (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4), + (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4)); + scales128 = _mm_sub_epi8(scales128, m32); + const __m128i scales_0 = _mm_cvtepi8_epi16(scales128); + const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales128, scales128)); + const __m128i scales[2] = { scales_0, scales_1 }; + + // high bit *128*2 from block_q3_K.hmask[QK_K/8] + const __m128i hbits_0 = _mm_loadu_si128((const __m128i*)&x[i].hmask[0]); + const __m128i hbits_1 = _mm_loadu_si128((const __m128i*)&x[i].hmask[16]); + + // integer accumulator + __m128i sumi_0 = _mm_setzero_si128(); + __m128i sumi_1 = _mm_setzero_si128(); + + for (int j = 0; j < QK_K/128; ++j) { + // load low 2 bits *64*2 from block_q3_K.qs[QK_K/4] + const __m128i q3bits_0 = _mm_loadu_si128((const __m128i*)q3); q3 += 16; + const __m128i q3bits_1 = _mm_loadu_si128((const __m128i*)q3); q3 += 16; + + // prepare low and high bits + const int bit = j << 2; + + const __m128i q3l_0 = _mm_and_si128(q3bits_0, m3); + const __m128i q3l_1 = _mm_and_si128(q3bits_1, m3); + const __m128i q3h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit)), bit), 2); + const __m128i q3h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit)), bit), 2); + + const __m128i q3l_2 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 2), m3); + const __m128i q3l_3 = 
_mm_and_si128(_mm_srli_epi16(q3bits_1, 2), m3); + const __m128i q3h_2 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+1)), bit+1), 2); + const __m128i q3h_3 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+1)), bit+1), 2); + + const __m128i q3l_4 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 4), m3); + const __m128i q3l_5 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 4), m3); + const __m128i q3h_4 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+2)), bit+2), 2); + const __m128i q3h_5 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+2)), bit+2), 2); + + const __m128i q3l_6 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 6), m3); + const __m128i q3l_7 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 6), m3); + const __m128i q3h_6 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+3)), bit+3), 2); + const __m128i q3h_7 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+3)), bit+3), 2); + + // load Q8 quants from block_q8_K.qs[QK_K] + const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + + // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16, + // and then subtract. 
The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set, + // and 2 if the high bit was set) + __m128i q8s_0 = _mm_maddubs_epi16(q3h_0, q8_0); + __m128i q8s_1 = _mm_maddubs_epi16(q3h_1, q8_1); + __m128i q8s_2 = _mm_maddubs_epi16(q3h_2, q8_2); + __m128i q8s_3 = _mm_maddubs_epi16(q3h_3, q8_3); + __m128i q8s_4 = _mm_maddubs_epi16(q3h_4, q8_4); + __m128i q8s_5 = _mm_maddubs_epi16(q3h_5, q8_5); + __m128i q8s_6 = _mm_maddubs_epi16(q3h_6, q8_6); + __m128i q8s_7 = _mm_maddubs_epi16(q3h_7, q8_7); + + __m128i p16_0 = _mm_maddubs_epi16(q3l_0, q8_0); + __m128i p16_1 = _mm_maddubs_epi16(q3l_1, q8_1); + __m128i p16_2 = _mm_maddubs_epi16(q3l_2, q8_2); + __m128i p16_3 = _mm_maddubs_epi16(q3l_3, q8_3); + __m128i p16_4 = _mm_maddubs_epi16(q3l_4, q8_4); + __m128i p16_5 = _mm_maddubs_epi16(q3l_5, q8_5); + __m128i p16_6 = _mm_maddubs_epi16(q3l_6, q8_6); + __m128i p16_7 = _mm_maddubs_epi16(q3l_7, q8_7); + + p16_0 = _mm_sub_epi16(p16_0, q8s_0); + p16_1 = _mm_sub_epi16(p16_1, q8s_1); + p16_2 = _mm_sub_epi16(p16_2, q8s_2); + p16_3 = _mm_sub_epi16(p16_3, q8s_3); + p16_4 = _mm_sub_epi16(p16_4, q8s_4); + p16_5 = _mm_sub_epi16(p16_5, q8s_5); + p16_6 = _mm_sub_epi16(p16_6, q8s_6); + p16_7 = _mm_sub_epi16(p16_7, q8s_7); + + // multiply with scales + __m128i shuffle = _mm_set1_epi16(0x0100); + p16_0 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_0); + shuffle = _mm_add_epi16(shuffle, m2); + p16_1 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_1); + shuffle = _mm_add_epi16(shuffle, m2); + p16_2 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_2); + shuffle = _mm_add_epi16(shuffle, m2); + p16_3 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_3); + shuffle = _mm_add_epi16(shuffle, m2); + p16_4 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_4); + shuffle = _mm_add_epi16(shuffle, m2); + p16_5 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_5); + shuffle = _mm_add_epi16(shuffle, m2); + p16_6 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_6); + shuffle = _mm_add_epi16(shuffle, m2); + p16_7 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_7); + + // accumulate + p16_0 = _mm_add_epi32(p16_0, p16_1); + p16_2 = _mm_add_epi32(p16_2, p16_3); + p16_4 = _mm_add_epi32(p16_4, p16_5); + p16_6 = _mm_add_epi32(p16_6, p16_7); + sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2)); + sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_4, p16_6)); + + } + + // multiply with block scale and accumulate + __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); + acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc); + + } + + *s = hsum_float_8(acc); + +#else + // scalar version + // This function is written like this so the compiler can manage to vectorize most of it + // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the + // manually vectorized version above. Every other version I tried would run at least 4 times slower. + // The ideal situation would be if we could just write the code once, and the compiler would + // automatically produce the best possible set of machine instructions, instead of us having to manually + // write vectorized versions for AVX, ARM_NEON, etc. 
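+    // Decode sketch for the scalar path (matches the loops below): each 2-bit
+    // quant from qs is paired with one hmask bit, giving values in [-4, 3]:
+    //   q  = (q3[l] >> shift) & 3;        // low 2 bits
+    //   q -= (hm[l] & m) ? 0 : 4;         // high bit clear -> subtract 4
+    // Each group of 16 quants is then weighted by its 6-bit scale minus 32,
+    // and the per-block fp16 super-scale d is applied at the end.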
+ + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + uint32_t auxs[4]; + const int8_t * scales = (const int8_t*)auxs; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + uint8_t m = 1; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + q3 += 32; + } + a = aux8; + + memcpy(auxs, x[i].scales, 12); + uint32_t tmp = auxs[2]; + auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); + auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); + auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4); + auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); + for (int j = 0; j < QK_K/16; ++j) { + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; + +#endif + +} + +void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + uint32_t utmp[4]; + +#if defined __AVX2__ + + const __m256i m4 = _mm256_set1_epi8(0xF); + + __m256 acc = _mm256_setzero_ps(); + __m128 acc_m = _mm_setzero_ps(); + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0])); + + const __m256i q8sums = _mm256_loadu_si256((const __m256i*)y[i].bsums); + const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1)); + const __m128i prod = _mm_madd_epi16(_mm256_extracti128_si256(mins_and_scales, 1), q8s); + acc_m = _mm_fmadd_ps(_mm_set1_ps(dmin), 
_mm_cvtepi32_ps(prod), acc_m); + + const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0); + const __m256i scales = MM256_SET_M128I(sc128, sc128); + + __m256i sumi = _mm256_setzero_si256(); + + for (int j = 0; j < QK_K/64; ++j) { + + const __m256i scale_l = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+0)); + const __m256i scale_h = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+1)); + + const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4); q4 += 32; + const __m256i q4l = _mm256_and_si256(q4bits, m4); + const __m256i q4h = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), m4); + + const __m256i q8l = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + __m256i p16l = _mm256_maddubs_epi16(q4l, q8l); + p16l = _mm256_madd_epi16(scale_l, p16l); + + const __m256i q8h = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + __m256i p16h = _mm256_maddubs_epi16(q4h, q8h); + p16h = _mm256_madd_epi16(scale_h, p16h); + const __m256i sumj = _mm256_add_epi32(p16l, p16h); + + sumi = _mm256_add_epi32(sumi, sumj); + } + + __m256 vd = _mm256_set1_ps(d); + acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi), acc); + + } + + acc_m = _mm_add_ps(acc_m, _mm_movehl_ps(acc_m, acc_m)); + acc_m = _mm_add_ss(acc_m, _mm_movehdup_ps(acc_m)); + + *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m); + +#elif defined __AVX__ + + const __m128i m4 = _mm_set1_epi8(0xF); + const __m128i m2 = _mm_set1_epi8(0x2); + + __m256 acc = _mm256_setzero_ps(); + __m128 acc_m = _mm_setzero_ps(); + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + const __m128i utmps = _mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]); + const __m128i scales = _mm_cvtepu8_epi16(utmps); + const __m128i mins = _mm_cvtepu8_epi16(_mm_unpackhi_epi64(utmps, utmps)); + + const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)&y[i].bsums[0]); + const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)&y[i].bsums[8]); + const __m128i q8s = _mm_hadd_epi16(q8sums_0, q8sums_1); + const __m128i prod = _mm_madd_epi16(mins, q8s); + acc_m = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod)), acc_m); + + __m128i sumi_0 = _mm_setzero_si128(); + __m128i sumi_1 = _mm_setzero_si128(); + + __m128i shuffle = _mm_set1_epi16(0x0100); + for (int j = 0; j < QK_K/64; ++j) { + + const __m128i scale_l = _mm_shuffle_epi8(scales, shuffle); + shuffle = _mm_add_epi16(shuffle, m2); + const __m128i scale_h = _mm_shuffle_epi8(scales, shuffle); + shuffle = _mm_add_epi16(shuffle, m2); + + __m128i q4bits = _mm_loadu_si128((const __m128i*)q4); q4 += 16; + const __m128i q4l_0 = _mm_and_si128(q4bits, m4); + const __m128i q4h_0 = _mm_and_si128(_mm_srli_epi16(q4bits, 4), m4); + q4bits = _mm_loadu_si128((const __m128i*)q4); q4 += 16; + const __m128i q4l_1 = _mm_and_si128(q4bits, m4); + const __m128i q4h_1 = _mm_and_si128(_mm_srli_epi16(q4bits, 4), m4); + + const __m128i q8l_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + __m128i p16l = _mm_maddubs_epi16(q4l_0, q8l_0); + p16l = _mm_madd_epi16(scale_l, p16l); + sumi_0 = _mm_add_epi32(sumi_0, p16l); + const __m128i q8l_1 = _mm_loadu_si128((const __m128i*)q8); 
q8 += 16; + p16l = _mm_maddubs_epi16(q4l_1, q8l_1); + p16l = _mm_madd_epi16(scale_l, p16l); + sumi_1 = _mm_add_epi32(sumi_1, p16l); + + const __m128i q8h_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + __m128i p16h = _mm_maddubs_epi16(q4h_0, q8h_0); + p16h = _mm_madd_epi16(scale_h, p16h); + sumi_0 = _mm_add_epi32(sumi_0, p16h); + const __m128i q8h_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + p16h = _mm_maddubs_epi16(q4h_1, q8h_1); + p16h = _mm_madd_epi16(scale_h, p16h); + sumi_1 = _mm_add_epi32(sumi_1, p16h); + + } + + __m256 vd = _mm256_set1_ps(d); + __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); + acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc); + + } + + acc_m = _mm_add_ps(acc_m, _mm_movehl_ps(acc_m, acc_m)); + acc_m = _mm_add_ss(acc_m, _mm_movehdup_ps(acc_m)); + + *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m); + +#else + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + for (int j = 0; j < QK_K/64; ++j) { + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); + a += 32; + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); + a += 32; q4 += 32; + } + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/32; ++j) { + int32_t scale = scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + uint32_t utmp[4]; + +#if defined __AVX2__ + + const __m256i m4 = _mm256_set1_epi8(0xF); + const __m128i mzero = _mm_setzero_si128(); + const __m256i mone = _mm256_set1_epi8(1); + + __m256 acc = _mm256_setzero_ps(); + + float summs = 0.f; + + for (int i = 0; i < 
nb; ++i) { + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0])); + + const __m256i q8sums = _mm256_loadu_si256((const __m256i*)y[i].bsums); + const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1)); + const __m128i prod = _mm_madd_epi16(_mm256_extracti128_si256(mins_and_scales, 1), q8s); + const __m128i hsum = _mm_hadd_epi32(_mm_hadd_epi32(prod, mzero), mzero); + summs += dmin * _mm_extract_epi32(hsum, 0); + + const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0); + const __m256i scales = MM256_SET_M128I(sc128, sc128); + + const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].qh); + __m256i hmask = mone; + + __m256i sumi = _mm256_setzero_si256(); + + int bit = 0; + + for (int j = 0; j < QK_K/64; ++j) { + + const __m256i scale_0 = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+0)); + const __m256i scale_1 = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+1)); + + const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5); q5 += 32; + + const __m256i q5l_0 = _mm256_and_si256(q5bits, m4); + const __m256i q5h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), bit++), 4); + const __m256i q5_0 = _mm256_add_epi8(q5l_0, q5h_0); + hmask = _mm256_slli_epi16(hmask, 1); + + const __m256i q5l_1 = _mm256_and_si256(_mm256_srli_epi16(q5bits, 4), m4); + const __m256i q5h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), bit++), 4); + const __m256i q5_1 = _mm256_add_epi8(q5l_1, q5h_1); + hmask = _mm256_slli_epi16(hmask, 1); + + const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + + __m256i p16_0 = _mm256_maddubs_epi16(q5_0, q8_0); + __m256i p16_1 = _mm256_maddubs_epi16(q5_1, q8_1); + + p16_0 = _mm256_madd_epi16(scale_0, p16_0); + p16_1 = _mm256_madd_epi16(scale_1, p16_1); + + sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1)); + + } + + __m256 vd = _mm256_set1_ps(d); + acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi), acc); + + } + + *s = hsum_float_8(acc) + summs; + +#elif defined __AVX__ + + const __m128i m4 = _mm_set1_epi8(0xF); + const __m128i mzero = _mm_setzero_si128(); + const __m128i mone = _mm_set1_epi8(1); + const __m128i m2 = _mm_set1_epi8(2); + + __m256 acc = _mm256_setzero_ps(); + + float summs = 0.f; + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + const __m128i utmps = _mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]); + const __m128i scales = _mm_cvtepu8_epi16(utmps); + const __m128i 
mins = _mm_cvtepu8_epi16(_mm_unpackhi_epi64(utmps, utmps)); + + const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)&y[i].bsums[0]); + const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)&y[i].bsums[8]); + const __m128i q8s = _mm_hadd_epi16(q8sums_0, q8sums_1); + const __m128i prod = _mm_madd_epi16(mins, q8s); + const __m128i hsum = _mm_hadd_epi32(_mm_hadd_epi32(prod, mzero), mzero); + summs += dmin * _mm_extract_epi32(hsum, 0); + + const __m128i hbits_0 = _mm_loadu_si128((const __m128i*)&x[i].qh[0]); + const __m128i hbits_1 = _mm_loadu_si128((const __m128i*)&x[i].qh[16]); + __m128i hmask = mone; + + __m128i sumi_0 = _mm_setzero_si128(); + __m128i sumi_1 = _mm_setzero_si128(); + + int bit = 0; + + __m128i shuffle = _mm_set1_epi16(0x0100); + for (int j = 0; j < QK_K/64; ++j) { + + const __m128i scale_0 = _mm_shuffle_epi8(scales, shuffle); + shuffle = _mm_add_epi16(shuffle, m2); + const __m128i scale_1 = _mm_shuffle_epi8(scales, shuffle); + shuffle = _mm_add_epi16(shuffle, m2); + + const __m128i q5bits_0 = _mm_loadu_si128((const __m128i*)q5); q5 += 16; + const __m128i q5bits_1 = _mm_loadu_si128((const __m128i*)q5); q5 += 16; + + __m128i q5l_0 = _mm_and_si128(q5bits_0, m4); + __m128i q5l_1 = _mm_and_si128(q5bits_1, m4); + __m128i q5h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_0, hmask), bit), 4); + __m128i q5h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_1, hmask), bit++), 4); + __m128i q5_0 = _mm_add_epi8(q5l_0, q5h_0); + __m128i q5_1 = _mm_add_epi8(q5l_1, q5h_1); + hmask = _mm_slli_epi16(hmask, 1); + + __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + __m128i p16_0 = _mm_maddubs_epi16(q5_0, q8_0); + __m128i p16_1 = _mm_maddubs_epi16(q5_1, q8_1); + p16_0 = _mm_madd_epi16(scale_0, p16_0); + p16_1 = _mm_madd_epi16(scale_0, p16_1); + + q5l_0 = _mm_and_si128(_mm_srli_epi16(q5bits_0, 4), m4); + q5l_1 = _mm_and_si128(_mm_srli_epi16(q5bits_1, 4), m4); + q5h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_0, hmask), bit), 4); + q5h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_1, hmask), bit++), 4); + q5_0 = _mm_add_epi8(q5l_0, q5h_0); + q5_1 = _mm_add_epi8(q5l_1, q5h_1); + hmask = _mm_slli_epi16(hmask, 1); + + q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + __m128i p16_2 = _mm_maddubs_epi16(q5_0, q8_0); + __m128i p16_3 = _mm_maddubs_epi16(q5_1, q8_1); + p16_2 = _mm_madd_epi16(scale_1, p16_2); + p16_3 = _mm_madd_epi16(scale_1, p16_3); + + sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2)); + sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3)); + + } + + __m256 vd = _mm256_set1_ps(d); + __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); + acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc); + + } + + *s = hsum_float_8(acc) + summs; + +#else + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + uint8_t m = 1; + for (int j = 0; j < QK_K/64; ++j) { + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); + for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 
16 : 0); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); + for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0); + a += 32; m <<= 1; + q4 += 32; + } + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/32; ++j) { + int32_t scale = scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q6_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined __AVX2__ + + const __m256i m4 = _mm256_set1_epi8(0xF); + const __m256i m2 = _mm256_set1_epi8(3); + const __m256i m32s = _mm256_set1_epi8(32); + + __m256 acc = _mm256_setzero_ps(); + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales); + + __m256i sumi = _mm256_setzero_si256(); + + int is = 0; + + for (int j = 0; j < QK_K/128; ++j) { + + const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0)); + const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1)); + const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2)); + const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3)); + is += 4; + + const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32; + const __m256i q4bits2 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32; + const __m256i q4bitsH = _mm256_loadu_si256((const __m256i*)qh); qh += 32; + + const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(q4bitsH, m2), 4); + const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 2), m2), 4); + const __m256i q4h_2 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 4), m2), 4); + const __m256i q4h_3 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 6), m2), 4); + + const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0); + const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(q4bits2, m4), q4h_1); + const __m256i q4_2 = 
_mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_2); + const __m256i q4_3 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits2, 4), m4), q4h_3); + + const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + + __m256i q8s_0 = _mm256_maddubs_epi16(m32s, q8_0); + __m256i q8s_1 = _mm256_maddubs_epi16(m32s, q8_1); + __m256i q8s_2 = _mm256_maddubs_epi16(m32s, q8_2); + __m256i q8s_3 = _mm256_maddubs_epi16(m32s, q8_3); + + __m256i p16_0 = _mm256_maddubs_epi16(q4_0, q8_0); + __m256i p16_1 = _mm256_maddubs_epi16(q4_1, q8_1); + __m256i p16_2 = _mm256_maddubs_epi16(q4_2, q8_2); + __m256i p16_3 = _mm256_maddubs_epi16(q4_3, q8_3); + + p16_0 = _mm256_sub_epi16(p16_0, q8s_0); + p16_1 = _mm256_sub_epi16(p16_1, q8s_1); + p16_2 = _mm256_sub_epi16(p16_2, q8s_2); + p16_3 = _mm256_sub_epi16(p16_3, q8s_3); + + p16_0 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_0), p16_0); + p16_1 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_1), p16_1); + p16_2 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_2), p16_2); + p16_3 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_3), p16_3); + + sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1)); + sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_2, p16_3)); + + } + + acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc); + } + + *s = hsum_float_8(acc); + +#elif defined __AVX__ + + const __m128i m3 = _mm_set1_epi8(3); + const __m128i m15 = _mm_set1_epi8(15); + + __m256 acc = _mm256_setzero_ps(); + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + // handle the q6_k -32 offset separately using bsums + const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)y[i].bsums); + const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)y[i].bsums + 1); + const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales); + const __m128i scales_16_0 = _mm_cvtepi8_epi16(scales); + const __m128i scales_16_1 = _mm_cvtepi8_epi16(_mm_bsrli_si128(scales, 8)); + const __m128i q8sclsub_0 = _mm_slli_epi32(_mm_madd_epi16(q8sums_0, scales_16_0), 5); + const __m128i q8sclsub_1 = _mm_slli_epi32(_mm_madd_epi16(q8sums_1, scales_16_1), 5); + + __m128i sumi_0 = _mm_setzero_si128(); + __m128i sumi_1 = _mm_setzero_si128(); + + int is = 0; + + for (int j = 0; j < QK_K/128; ++j) { + + const __m128i q4bitsH_0 = _mm_loadu_si128((const __m128i*)qh); qh += 16; + const __m128i q4bitsH_1 = _mm_loadu_si128((const __m128i*)qh); qh += 16; + + const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, m3), 4); + const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, m3), 4); + const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(12)), 2); + const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(12)), 2); + const __m128i q4h_4 = _mm_and_si128(q4bitsH_0, _mm_set1_epi8(48)); + const __m128i q4h_5 = _mm_and_si128(q4bitsH_1, _mm_set1_epi8(48)); + const __m128i q4h_6 = _mm_srli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(-64)), 2); + const __m128i q4h_7 = _mm_srli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(-64)), 2); + + const __m128i q4bits1_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; 
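+            // Three more 16-byte loads follow, completing the 64 low-nibble bytes
+            // of this 128-quant chunk. Each 6-bit quant is rebuilt below as
+            //   (ql & 0xF) | (high 2 bits << 4)
+            // with the implicit -32 offset folded in at the end through the
+            // bsums-based q8sclsub correction computed above.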
+ const __m128i q4bits1_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; + const __m128i q4bits2_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; + const __m128i q4bits2_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; + + const __m128i q4_0 = _mm_or_si128(_mm_and_si128(q4bits1_0, m15), q4h_0); + const __m128i q4_1 = _mm_or_si128(_mm_and_si128(q4bits1_1, m15), q4h_1); + const __m128i q4_2 = _mm_or_si128(_mm_and_si128(q4bits2_0, m15), q4h_2); + const __m128i q4_3 = _mm_or_si128(_mm_and_si128(q4bits2_1, m15), q4h_3); + const __m128i q4_4 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m15), q4h_4); + const __m128i q4_5 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m15), q4h_5); + const __m128i q4_6 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m15), q4h_6); + const __m128i q4_7 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m15), q4h_7); + + const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + + __m128i p16_0 = _mm_maddubs_epi16(q4_0, q8_0); + __m128i p16_1 = _mm_maddubs_epi16(q4_1, q8_1); + __m128i p16_2 = _mm_maddubs_epi16(q4_2, q8_2); + __m128i p16_3 = _mm_maddubs_epi16(q4_3, q8_3); + __m128i p16_4 = _mm_maddubs_epi16(q4_4, q8_4); + __m128i p16_5 = _mm_maddubs_epi16(q4_5, q8_5); + __m128i p16_6 = _mm_maddubs_epi16(q4_6, q8_6); + __m128i p16_7 = _mm_maddubs_epi16(q4_7, q8_7); + + const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0)); + const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1)); + const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2)); + const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3)); + is += 4; + + p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0); + p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_0, 8)), p16_1); + p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2); + p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_1, 8)), p16_3); + p16_4 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_2), p16_4); + p16_5 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_2, 8)), p16_5); + p16_6 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_3), p16_6); + p16_7 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_3, 8)), p16_7); + + sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2)); + sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3)); + sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_4, p16_6)); + sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_5, p16_7)); + + } + + sumi_0 = _mm_sub_epi32(sumi_0, q8sclsub_0); + sumi_1 = _mm_sub_epi32(sumi_1, q8sclsub_1); + const __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); + acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi)), acc); + } + + *s = hsum_float_8(acc); + +#else + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = 
x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) { + a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; + a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; + a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; + a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; + } + a += 128; + q4 += 64; + qh += 32; + } + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/16; ++j) { + int scale = x[i].scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +#if defined (__AVX__) || defined (__AVX2__) +static const int8_t keven_signs_q2xs[1024] = { + 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, + 1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1, + 1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, -1, + 1, 1, -1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, 1, + 1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, -1, + 1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, 1, + 1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, 1, + 1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, -1, + 1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, -1, + 1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, 1, + 1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, 1, + 1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, -1, + 1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, 1, + 1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, -1, + 1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, -1, + 1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, 1, + 1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, -1, + 1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, 1, + 1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, 1, + 1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, -1, + 1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, 1, + 
1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, -1, + 1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, -1, + 1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, 1, + 1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, 1, + 1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, -1, + 1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, -1, + 1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, 1, + 1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, -1, + 1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, 1, + 1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1, + 1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, +}; +#endif + +void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq2_xxs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__AVX2__) + + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + uint32_t aux32[4]; + const uint8_t * aux8 = (const uint8_t *)aux32; + + __m256 accumf = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + __m256i sumi1 = _mm256_setzero_si256(); + __m256i sumi2 = _mm256_setzero_si256(); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8; + const __m256i q2_1 = _mm256_set_epi64x(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]); + const __m256i q2_2 = _mm256_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]); + const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127], + signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]); + const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127], + signs64[(aux32[3] >> 7) & 127], signs64[(aux32[3] >> 0) & 127]); + const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1); + const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2); + const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); + const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); + const uint16_t ls1 = aux32[1] >> 28; + const uint16_t ls2 = aux32[3] >> 28; + const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1)); + const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1)); + sumi1 = _mm256_add_epi32(sumi1, p1); + sumi2 = _mm256_add_epi32(sumi2, 
p2); + } + + accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); + + } + + *s = 0.125f * hsum_float_8(accumf); + +#elif defined(__AVX__) + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + uint32_t aux32[4]; + const uint8_t * aux8 = (const uint8_t *)aux32; + + __m256 accumf = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + __m128i sumi1_0 = _mm_setzero_si128(); + __m128i sumi1_1 = _mm_setzero_si128(); + __m128i sumi2_0 = _mm_setzero_si128(); + __m128i sumi2_1 = _mm_setzero_si128(); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8; + const __m128i q2_1_0 = _mm_set_epi64x(iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]); + const __m128i q2_1_1 = _mm_set_epi64x(iq2xxs_grid[aux8[3]], iq2xxs_grid[aux8[2]]); + const __m128i q2_2_0 = _mm_set_epi64x(iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]); + const __m128i q2_2_1 = _mm_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]]); + const __m128i s2_1_0 = _mm_set_epi64x(signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]); + const __m128i s2_1_1 = _mm_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127]); + const __m128i s2_2_0 = _mm_set_epi64x(signs64[(aux32[3] >> 7) & 127], signs64[(aux32[3] >> 0) & 127]); + const __m128i s2_2_1 = _mm_set_epi64x(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127]); + const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, s2_1_0); + const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, s2_1_1); + const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, s2_2_0); + const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, s2_2_1); + const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0); + const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1); + const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0); + const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1); + const uint16_t ls1 = aux32[1] >> 28; + const uint16_t ls2 = aux32[3] >> 28; + const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1)); + const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1)); + const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1)); + const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1)); + sumi1_0 = _mm_add_epi32(sumi1_0, p1_0); + sumi1_1 = _mm_add_epi32(sumi1_1, p1_1); + sumi2_0 = _mm_add_epi32(sumi2_0, p2_0); + sumi2_1 = _mm_add_epi32(sumi2_1, p2_1); + } + + accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf); + + } + + *s = 0.125f * hsum_float_8(accumf); + +#else + + uint32_t aux32[2]; + const uint8_t * aux8 = (const uint8_t *)aux32; + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + memcpy(aux32, q2, 2*sizeof(uint32_t)); + q2 += 4; + const uint32_t ls = 2*(aux32[1] >> 28) + 1; + 
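+            // Block layout sketch: aux32[0] carries four 8-bit indices into
+            // iq2xxs_grid, while aux32[1] packs 4*7 sign bits plus a 4-bit scale
+            // in the top nibble. The scale enters as 2*s + 1; the shared 1/8
+            // factor is deferred to the final 0.125f multiply.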
int32_t sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]); + const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127]; + for (int j = 0; j < 8; ++j) { + sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + bsum += sumi * ls; + } + sumf += d * bsum; + } + *s = 0.125f * sumf; +#endif +} + +void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq2_xs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__AVX2__) + + const __m256i mone = _mm256_set1_epi8(1); + static const char block_sign_shuffle_mask_1[32] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + }; + static const char block_sign_shuffle_mask_2[32] = { + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, + 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, + }; + static const uint8_t bit_selector_mask_bytes[32] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + }; + + const __m256i bit_selector_mask = _mm256_loadu_si256((const __m256i*)bit_selector_mask_bytes); + const __m256i block_sign_shuffle_1 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_1); + const __m256i block_sign_shuffle_2 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_2); + + static const uint8_t k_bit_helper[32] = { + 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, + 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, + }; + const __m256i bit_helper = _mm256_loadu_si256((const __m256i*)k_bit_helper); + const __m256i m511 = _mm256_set1_epi16(511); + const __m128i m4 = _mm_set1_epi8(0xf); + const __m128i m1 = _mm_set1_epi8(1); + + uint64_t aux64; + + // somewhat hacky, but gives a significant boost in performance + __m256i aux_gindex; + const uint16_t * gindex = (const uint16_t *)&aux_gindex; + + __m256 accumf = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + memcpy(&aux64, x[i].scales, 8); + __m128i stmp = _mm_set1_epi64x(aux64); + stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4)); + const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1); + + __m256i sumi1 = _mm256_setzero_si256(); + __m256i sumi2 = _mm256_setzero_si256(); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) { + + const __m256i q2_data = _mm256_loadu_si256((const __m256i*)q2); q2 += 16; + aux_gindex = _mm256_and_si256(q2_data, m511); + + const __m256i partial_sign_bits = _mm256_srli_epi16(q2_data, 9); + const __m256i partial_sign_bits_upper = _mm256_srli_epi16(q2_data, 13); + const __m256i partial_sign_bits_for_counting = _mm256_xor_si256(partial_sign_bits, partial_sign_bits_upper); + + const __m256i odd_bits = 
_mm256_shuffle_epi8(bit_helper, partial_sign_bits_for_counting); + const __m256i full_sign_bits = _mm256_or_si256(partial_sign_bits, odd_bits); + + const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i q8_3 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i q8_4 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + + const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[gindex[ 3]], iq2xs_grid[gindex[ 2]], + iq2xs_grid[gindex[ 1]], iq2xs_grid[gindex[ 0]]); + const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[gindex[ 7]], iq2xs_grid[gindex[ 6]], + iq2xs_grid[gindex[ 5]], iq2xs_grid[gindex[ 4]]); + const __m256i q2_3 = _mm256_set_epi64x(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]], + iq2xs_grid[gindex[ 9]], iq2xs_grid[gindex[ 8]]); + const __m256i q2_4 = _mm256_set_epi64x(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]], + iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]); + + const __m128i full_signs_l = _mm256_castsi256_si128(full_sign_bits); + const __m128i full_signs_h = _mm256_extractf128_si256(full_sign_bits, 1); + const __m256i full_signs_1 = MM256_SET_M128I(full_signs_l, full_signs_l); + const __m256i full_signs_2 = MM256_SET_M128I(full_signs_h, full_signs_h); + + __m256i signs; + signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_1); + signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask); + const __m256i q8s_1 = _mm256_sign_epi8(q8_1, _mm256_or_si256(signs, mone)); + + signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_2); + signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask); + const __m256i q8s_2 = _mm256_sign_epi8(q8_2, _mm256_or_si256(signs, mone)); + + signs = _mm256_shuffle_epi8(full_signs_2, block_sign_shuffle_1); + signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask); + const __m256i q8s_3 = _mm256_sign_epi8(q8_3, _mm256_or_si256(signs, mone)); + + signs = _mm256_shuffle_epi8(full_signs_2, block_sign_shuffle_2); + signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask); + const __m256i q8s_4 = _mm256_sign_epi8(q8_4, _mm256_or_si256(signs, mone)); + + const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); + const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); + const __m256i dot3 = _mm256_maddubs_epi16(q2_3, q8s_3); + const __m256i dot4 = _mm256_maddubs_epi16(q2_4, q8s_4); + + const __m256i sc1 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0))); + const __m256i sc2 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1))); + const __m256i sc3 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2))); + const __m256i sc4 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3))); + + sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot1, sc1)); + sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot2, sc2)); + sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot3, sc3)); + sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot4, sc4)); + } + + accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); + + } + + *s = 0.125f * hsum_float_8(accumf); + +#elif defined(__AVX__) + const __m128i mone = _mm_set1_epi8(1); + static const char block_sign_shuffle_mask_1[32] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x04, 
0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + }; + static const char block_sign_shuffle_mask_2[32] = { + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, + 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, + }; + static const uint8_t bit_selector_mask_bytes[32] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + }; + + const __m128i bit_selector_mask_0 = _mm_loadu_si128((const __m128i*)bit_selector_mask_bytes); + const __m128i bit_selector_mask_1 = _mm_loadu_si128((const __m128i*)bit_selector_mask_bytes + 1); + const __m128i block_sign_shuffle_1_0 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_1); + const __m128i block_sign_shuffle_1_1 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_1 + 1); + const __m128i block_sign_shuffle_2_0 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_2); + const __m128i block_sign_shuffle_2_1 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_2 + 1); + + static const uint8_t k_bit_helper[32] = { + 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, + 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, + }; + const __m128i bit_helper_0 = _mm_loadu_si128((const __m128i*)k_bit_helper); + const __m128i bit_helper_1 = _mm_loadu_si128((const __m128i*)k_bit_helper + 1); + const __m128i m511 = _mm_set1_epi16(511); + const __m128i m4 = _mm_set1_epi8(0xf); + const __m128i m1 = _mm_set1_epi8(1); + + uint64_t aux64; + + // somewhat hacky, but gives a significant boost in performance + __m256i aux_gindex; + const uint16_t * gindex = (const uint16_t *)&aux_gindex; + + __m256 accumf = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + memcpy(&aux64, x[i].scales, 8); + __m128i stmp = _mm_set1_epi64x(aux64); + stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4)); + const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1); + + __m128i sumi1_0 = _mm_setzero_si128(); + __m128i sumi1_1 = _mm_setzero_si128(); + __m128i sumi2_0 = _mm_setzero_si128(); + __m128i sumi2_1 = _mm_setzero_si128(); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) { + + const __m128i q2_data_0 = _mm_loadu_si128((const __m128i*)q2); + const __m128i q2_data_1 = _mm_loadu_si128((const __m128i*)q2 + 1); q2 += 16; + aux_gindex = MM256_SET_M128I(_mm_and_si128(q2_data_1, m511), _mm_and_si128(q2_data_0, m511)); + + const __m128i partial_sign_bits_0 = _mm_srli_epi16(q2_data_0, 9); + const __m128i partial_sign_bits_1 = _mm_srli_epi16(q2_data_1, 9); + const __m128i partial_sign_bits_upper_0 = _mm_srli_epi16(q2_data_0, 13); + const __m128i partial_sign_bits_upper_1 = _mm_srli_epi16(q2_data_1, 13); + const __m128i partial_sign_bits_for_counting_0 = _mm_xor_si128(partial_sign_bits_0, partial_sign_bits_upper_0); + const __m128i partial_sign_bits_for_counting_1 = _mm_xor_si128(partial_sign_bits_1, partial_sign_bits_upper_1); + + const __m128i odd_bits_0 = _mm_shuffle_epi8(bit_helper_0, partial_sign_bits_for_counting_0); + const __m128i odd_bits_1 = _mm_shuffle_epi8(bit_helper_1, partial_sign_bits_for_counting_1); + 
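+            // iq2_xs stores only 7 of the 8 sign bits per group of 8 quants; the
+            // 8th is implied by even sign parity. XOR-ing bits 9..12 with bits
+            // 13..15 folds the 7 stored bits into 4, and k_bit_helper is a 4-bit
+            // popcount-parity table, so the shuffle yields 0x80 (the missing sign
+            // bit) exactly when the stored parity is odd.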
const __m128i full_sign_bits_0 = _mm_or_si128(partial_sign_bits_0, odd_bits_0); + const __m128i full_sign_bits_1 = _mm_or_si128(partial_sign_bits_1, odd_bits_1); + + const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_3_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_3_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_4_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_4_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + + const __m128i q2_1_0 = _mm_set_epi64x(iq2xs_grid[gindex[1]], iq2xs_grid[gindex[0]]); + const __m128i q2_1_1 = _mm_set_epi64x(iq2xs_grid[gindex[3]], iq2xs_grid[gindex[2]]); + const __m128i q2_2_0 = _mm_set_epi64x(iq2xs_grid[gindex[5]], iq2xs_grid[gindex[4]]); + const __m128i q2_2_1 = _mm_set_epi64x(iq2xs_grid[gindex[7]], iq2xs_grid[gindex[6]]); + const __m128i q2_3_0 = _mm_set_epi64x(iq2xs_grid[gindex[9]], iq2xs_grid[gindex[8]]); + const __m128i q2_3_1 = _mm_set_epi64x(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]]); + const __m128i q2_4_0 = _mm_set_epi64x(iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]); + const __m128i q2_4_1 = _mm_set_epi64x(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]]); + + // AVX2 full_signs_1 is full_sign_bits_0 here + // AVX2 full_signs_2 is full_sign_bits_1 here + __m128i signs_0, signs_1; + signs_0 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_1_0); + signs_1 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_1_1); + signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0); + signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1); + const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, _mm_or_si128(signs_0, mone)); + const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, _mm_or_si128(signs_1, mone)); + + signs_0 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_2_0); + signs_1 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_2_1); + signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0); + signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1); + const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, _mm_or_si128(signs_0, mone)); + const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, _mm_or_si128(signs_1, mone)); + + signs_0 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_1_0); + signs_1 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_1_1); + signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0); + signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1); + const __m128i q8s_3_0 = _mm_sign_epi8(q8_3_0, _mm_or_si128(signs_0, mone)); + const __m128i q8s_3_1 = _mm_sign_epi8(q8_3_1, _mm_or_si128(signs_1, mone)); + + signs_0 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_2_0); + signs_1 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_2_1); + signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0); + signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1); + const __m128i q8s_4_0 = _mm_sign_epi8(q8_4_0, _mm_or_si128(signs_0, mone)); + const __m128i q8s_4_1 = _mm_sign_epi8(q8_4_1, _mm_or_si128(signs_1, mone)); + + const __m128i 
dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0); + const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1); + const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0); + const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1); + const __m128i dot3_0 = _mm_maddubs_epi16(q2_3_0, q8s_3_0); + const __m128i dot3_1 = _mm_maddubs_epi16(q2_3_1, q8s_3_1); + const __m128i dot4_0 = _mm_maddubs_epi16(q2_4_0, q8s_4_0); + const __m128i dot4_1 = _mm_maddubs_epi16(q2_4_1, q8s_4_1); + + __m128i sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0)); + const __m128i sc1_0 = _mm_cvtepi8_epi16(sc_tmp); + const __m128i sc1_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8)); + sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1)); + const __m128i sc2_0 = _mm_cvtepi8_epi16(sc_tmp); + const __m128i sc2_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8)); + sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2)); + const __m128i sc3_0 = _mm_cvtepi8_epi16(sc_tmp); + const __m128i sc3_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8)); + sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3)); + const __m128i sc4_0 = _mm_cvtepi8_epi16(sc_tmp); + const __m128i sc4_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8)); + + sumi1_0 = _mm_add_epi32(sumi1_0, _mm_madd_epi16(dot1_0, sc1_0)); + sumi1_1 = _mm_add_epi32(sumi1_1, _mm_madd_epi16(dot1_1, sc1_1)); + sumi2_0 = _mm_add_epi32(sumi2_0, _mm_madd_epi16(dot2_0, sc2_0)); + sumi2_1 = _mm_add_epi32(sumi2_1, _mm_madd_epi16(dot2_1, sc2_1)); + sumi1_0 = _mm_add_epi32(sumi1_0, _mm_madd_epi16(dot3_0, sc3_0)); + sumi1_1 = _mm_add_epi32(sumi1_1, _mm_madd_epi16(dot3_1, sc3_1)); + sumi2_0 = _mm_add_epi32(sumi2_0, _mm_madd_epi16(dot4_0, sc4_0)); + sumi2_1 = _mm_add_epi32(sumi2_1, _mm_madd_epi16(dot4_1, sc4_1)); + } + + accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf); + + } + + *s = 0.125f * hsum_float_8(accumf); + +#else + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const uint8_t * GGML_RESTRICT sc = x[i].scales; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1; + const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1; + int32_t sumi = 0; + for (int l = 0; l < 2; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); + const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; + for (int j = 0; j < 8; ++j) { + sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + bsum += sumi * ls1; + sumi = 0; + for (int l = 2; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); + const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; + for (int j = 0; j < 8; ++j) { + sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? 
-1 : 1); + } + q8 += 8; + } + bsum += sumi * ls2; + q2 += 4; + } + sumf += d * bsum; + } + *s = 0.125f * sumf; +#endif +} + +void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq2_s * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__AVX2__) + + static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 + }; + + static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + }; + + const __m128i m4 = _mm_set1_epi8(0xf); + const __m128i m1 = _mm_set1_epi8(1); + + const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1); + const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2); + + uint64_t aux64; + + __m256 accumf = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8); + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + memcpy(&aux64, x[i].scales, 8); + const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1); + const __m256i scales16 = _mm256_cvtepi8_epi16(scales8); // 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15 + + __m256i sumi1 = _mm256_setzero_si256(); + __m256i sumi2 = _mm256_setzero_si256(); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i q2_1 = _mm256_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)], + iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)], + iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)], + iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]); + const __m256i q2_2 = _mm256_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)], + iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)], + iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)], + iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]); + qs += 8; + + __m256i aux256 = _mm256_set1_epi32(signs[0] | ((uint32_t) signs[1] << 16)); + aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2); + const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2); + const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1); + + aux256 = _mm256_set1_epi32(signs[2] | ((uint32_t) signs[3] << 16)); + aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2); + const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2); + const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2); + + signs += 4; + + const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); // blocks 2*ib32+0, 2*ib32+1 + const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); // blocks 2*ib32+2, 2*ib32+3 + + const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+0))); + const __m256i p2 = _mm256_madd_epi16(dot2, 
_mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+1))); + sumi1 = _mm256_add_epi32(sumi1, p1); + sumi2 = _mm256_add_epi32(sumi2, p2); + } + + accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); + + } + + *s = 0.125f * hsum_float_8(accumf); + +#elif defined(__AVX__) + static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 + }; + + static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + }; + + const __m128i m4 = _mm_set1_epi8(0xf); + const __m128i m1 = _mm_set1_epi8(1); + + const __m128i mask1_0 = _mm_loadu_si128((const __m128i*)k_mask1); + const __m128i mask1_1 = _mm_loadu_si128((const __m128i*)k_mask1 + 1); + const __m128i mask2_0 = _mm_loadu_si128((const __m128i*)k_mask2); + const __m128i mask2_1 = _mm_loadu_si128((const __m128i*)k_mask2 + 1); + + uint64_t aux64; + + __m256 accumf = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8); + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + memcpy(&aux64, x[i].scales, 8); + const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1); + const __m128i scales16_0 = _mm_cvtepi8_epi16(scales8); + const __m128i scales16_1 = _mm_cvtepi8_epi16(_mm_srli_si128(scales8, 8)); + + __m128i sumi1_0 = _mm_setzero_si128(); + __m128i sumi1_1 = _mm_setzero_si128(); + __m128i sumi2_0 = _mm_setzero_si128(); + __m128i sumi2_1 = _mm_setzero_si128(); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q2_1_0 = _mm_set_epi64x(iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)], + iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]); + const __m128i q2_1_1 = _mm_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)], + iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)]); + const __m128i q2_2_0 = _mm_set_epi64x(iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)], + iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]); + const __m128i q2_2_1 = _mm_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)], + iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)]); + qs += 8; + + __m128i aux128_0 = _mm_set1_epi32(signs[0] | ((uint32_t) signs[1] << 16)); + __m128i aux128_1 = aux128_0; + aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0); + aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1); + const __m128i s2_1_0 = _mm_cmpeq_epi8(aux128_0, mask2_0); + const __m128i s2_1_1 = _mm_cmpeq_epi8(aux128_1, mask2_1); + const __m128i q8s_1_0 = _mm_sub_epi8(_mm_xor_si128(s2_1_0, q8_1_0), s2_1_0); + const __m128i q8s_1_1 = _mm_sub_epi8(_mm_xor_si128(s2_1_1, q8_1_1), s2_1_1); + + aux128_0 = _mm_set1_epi32(signs[2] | ((uint32_t) signs[3] << 16)); + aux128_1 = aux128_0; + aux128_0 = 
_mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0); + aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1); + const __m128i s2_2_0 = _mm_cmpeq_epi8(aux128_0, mask2_0); + const __m128i s2_2_1 = _mm_cmpeq_epi8(aux128_1, mask2_1); + const __m128i q8s_2_0 = _mm_sub_epi8(_mm_xor_si128(s2_2_0, q8_2_0), s2_2_0); + const __m128i q8s_2_1 = _mm_sub_epi8(_mm_xor_si128(s2_2_1, q8_2_1), s2_2_1); + + signs += 4; + + const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0); + const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1); + const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0); + const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1); + + const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_shuffle_epi8(scales16_0, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+0), 0))); + const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_shuffle_epi8(scales16_1, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+0), 1))); + const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_shuffle_epi8(scales16_0, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+1), 0))); + const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_shuffle_epi8(scales16_1, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+1), 1))); + sumi1_0 = _mm_add_epi32(sumi1_0, p1_0); + sumi1_1 = _mm_add_epi32(sumi1_1, p1_1); + sumi2_0 = _mm_add_epi32(sumi2_0, p2_0); + sumi2_1 = _mm_add_epi32(sumi2_1, p2_1); + } + + accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf); + + } + + *s = 0.125f * hsum_float_8(accumf); + +#else + + float sumf = 0; + for (int i = 0; i < nb; i++) { + + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint8_t * qh = x[i].qh; + const uint8_t * signs = qs + QK_K/8; + + int bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf); + int ls2 = 1 + 2*(x[i].scales[ib32] >> 4); + int sumi1 = 0, sumi2 = 0; + for (int l = 0; l < 2; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300))); + for (int j = 0; j < 8; ++j) { + sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + for (int l = 2; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300))); + for (int j = 0; j < 8; ++j) { + sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? 
-1 : 1); + } + q8 += 8; + } + bsum += ls1 * sumi1 + ls2 * sumi2; + qs += 4; + signs += 4; + } + + sumf += d * bsum; + } + + *s = 0.125f * sumf; + +#endif + +} + +void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq3_xxs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__AVX2__) + + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + uint32_t aux32[2]; + + __m256 accumf = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + __m256i sumi1 = _mm256_setzero_si256(); + __m256i sumi2 = _mm256_setzero_si256(); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i q2_1 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]], + iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]); + q3 += 8; + const __m256i q2_2 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]], + iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]); + q3 += 8; + memcpy(aux32, gas, 8); gas += 8; + const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127], + signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]); + const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127], + signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]); + const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1); + const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2); + const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); + const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); + const uint16_t ls1 = aux32[0] >> 28; + const uint16_t ls2 = aux32[1] >> 28; + const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1)); + const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1)); + sumi1 = _mm256_add_epi32(sumi1, p1); + sumi2 = _mm256_add_epi32(sumi2, p2); + } + + accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); + + } + + *s = 0.25f * hsum_float_8(accumf); + +#elif defined(__AVX__) + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + uint32_t aux32[2]; + + __m256 accumf = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + __m128i sumi1_0 = _mm_setzero_si128(); + __m128i sumi1_1 = _mm_setzero_si128(); + __m128i sumi2_0 = _mm_setzero_si128(); + __m128i sumi2_1 = _mm_setzero_si128(); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + 
const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q2_1_0 = _mm_set_epi32(iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]); + const __m128i q2_1_1 = _mm_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]]); + q3 += 8; + const __m128i q2_2_0 = _mm_set_epi32(iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]); + const __m128i q2_2_1 = _mm_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]]); + q3 += 8; + memcpy(aux32, gas, 8); gas += 8; + const __m128i s2_1_0 = _mm_set_epi64x(signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]); + const __m128i s2_1_1 = _mm_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127]); + const __m128i s2_2_0 = _mm_set_epi64x(signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]); + const __m128i s2_2_1 = _mm_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127]); + const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, s2_1_0); + const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, s2_1_1); + const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, s2_2_0); + const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, s2_2_1); + const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0); + const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1); + const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0); + const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1); + const uint16_t ls1 = aux32[0] >> 28; + const uint16_t ls2 = aux32[1] >> 28; + const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1)); + const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1)); + const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1)); + const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1)); + sumi1_0 = _mm_add_epi32(sumi1_0, p1_0); + sumi1_1 = _mm_add_epi32(sumi1_1, p1_1); + sumi2_0 = _mm_add_epi32(sumi2_0, p2_0); + sumi2_1 = _mm_add_epi32(sumi2_1, p2_1); + } + + accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf); + + } + + *s = 0.25f * hsum_float_8(accumf); + +#else + + uint32_t aux32; + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t); + const uint32_t ls = 2*(aux32 >> 28) + 1; + int32_t sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]); + const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]); + const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127]; + for (int j = 0; j < 4; ++j) { + sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1); + sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? 
-1 : 1); + } + q8 += 8; + } + q3 += 8; + bsum += sumi * ls; + } + sumf += d * bsum; + } + *s = 0.25f * sumf; +#endif +} + +void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq3_s * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__AVX2__) + + static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 + }; + + static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + }; + + const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1); + const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2); + + const __m256i idx_shift = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + const __m256i idx_mask = _mm256_set1_epi32(256); + + typedef union { + __m256i vec[2]; + uint32_t index[16]; + } index_t; + + index_t idx; + + __m256 accumf = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + __m256i sumi1 = _mm256_setzero_si256(); + __m256i sumi2 = _mm256_setzero_si256(); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i idx_l = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)qs)); qs += 16; + idx.vec[0] = _mm256_set1_epi32(qh[ib32+0]); + idx.vec[1] = _mm256_set1_epi32(qh[ib32+1]); + idx.vec[0] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[0], idx_shift), idx_mask); + idx.vec[1] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[1], idx_shift), idx_mask); + idx.vec[0] = _mm256_or_si256(idx.vec[0], _mm256_cvtepi16_epi32(_mm256_castsi256_si128(idx_l))); + idx.vec[1] = _mm256_or_si256(idx.vec[1], _mm256_cvtepi16_epi32(_mm256_extractf128_si256(idx_l, 1))); + + // At least on my CPU (Ryzen 7950X), using _mm256_i32gather_epi32 is slower than _mm256_set_epi32. Strange.
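+ // A plausible explanation (not verified here): iq3s_grid is small enough to stay hot in L1, so the eight scalar loads behind each _mm256_set_epi32 schedule cheaply, while vpgatherdd decodes to many uops on this core. + // In scalar form the two lookups below are simply: + // uint32_t q2[16]; + // for (int k = 0; k < 16; ++k) q2[k] = iq3s_grid[idx.index[k]]; + // with q2[0..7] packed into q2_1 and q2[8..15] into q2_2.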
+ //const __m256i q2_1 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[0], 4); + //const __m256i q2_2 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[1], 4); + const __m256i q2_1 = _mm256_set_epi32( + iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]], + iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]] + ); + const __m256i q2_2 = _mm256_set_epi32( + iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]], + iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[ 9]], iq3s_grid[idx.index[ 8]] + ); + + __m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16)); + aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2); + const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2); + const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1); + + aux256 = _mm256_set1_epi32(signs[2] | (signs[3] << 16)); + aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2); + const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2); + const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2); + + signs += 4; + + const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); + const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); + const uint16_t ls1 = x[i].scales[ib32/2] & 0xf; + const uint16_t ls2 = x[i].scales[ib32/2] >> 4; + const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1)); + const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1)); + sumi1 = _mm256_add_epi32(sumi1, p1); + sumi2 = _mm256_add_epi32(sumi2, p2); + } + + accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); + + } + + *s = hsum_float_8(accumf); + +#elif defined(__AVX__) + static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 + }; + + static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + }; + + const __m128i mask1_0 = _mm_loadu_si128((const __m128i*)k_mask1); + const __m128i mask1_1 = _mm_loadu_si128((const __m128i*)k_mask1 + 1); + const __m128i mask2_0 = _mm_loadu_si128((const __m128i*)k_mask2); + const __m128i mask2_1 = _mm_loadu_si128((const __m128i*)k_mask2 + 1); + + const __m128i idx_mul_0 = _mm_set_epi32(32, 64, 128, 256); + const __m128i idx_mul_1 = _mm_set_epi32(2, 4, 8, 16); + const __m128i idx_mask = _mm_set1_epi32(256); + + typedef union { + __m128i vec[4]; + uint32_t index[16]; + } index_t; + + index_t idx; + + __m256 accumf = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + __m128i sumi1_0 = _mm_setzero_si128(); + __m128i sumi1_1 = _mm_setzero_si128(); + __m128i sumi2_0 = _mm_setzero_si128(); + __m128i sumi2_1 = _mm_setzero_si128(); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_1_1 = _mm_loadu_si128((const __m128i 
*)q8); q8 += 16; + const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i qs_tmp = _mm_loadu_si128((const __m128i *)qs); + const __m128i idx_l_0 = _mm_cvtepu8_epi16(qs_tmp); + const __m128i idx_l_1 = _mm_cvtepu8_epi16(_mm_srli_si128(qs_tmp, 8)); qs += 16; + idx.vec[0] = _mm_set1_epi32(qh[ib32+0]); + idx.vec[1] = idx.vec[0]; + idx.vec[2] = _mm_set1_epi32(qh[ib32+1]); + idx.vec[3] = idx.vec[2]; + + idx.vec[0] = _mm_and_si128(_mm_mullo_epi32(idx.vec[0], idx_mul_0), idx_mask); + idx.vec[1] = _mm_and_si128(_mm_mullo_epi32(idx.vec[1], idx_mul_1), idx_mask); + idx.vec[2] = _mm_and_si128(_mm_mullo_epi32(idx.vec[2], idx_mul_0), idx_mask); + idx.vec[3] = _mm_and_si128(_mm_mullo_epi32(idx.vec[3], idx_mul_1), idx_mask); + + idx.vec[0] = _mm_or_si128(idx.vec[0], _mm_cvtepi16_epi32(idx_l_0)); + idx.vec[1] = _mm_or_si128(idx.vec[1], _mm_cvtepi16_epi32(_mm_srli_si128(idx_l_0, 8))); + idx.vec[2] = _mm_or_si128(idx.vec[2], _mm_cvtepi16_epi32(idx_l_1)); + idx.vec[3] = _mm_or_si128(idx.vec[3], _mm_cvtepi16_epi32(_mm_srli_si128(idx_l_1, 8))); + + const __m128i q2_1_0 = _mm_set_epi32(iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]); + const __m128i q2_1_1 = _mm_set_epi32(iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]]); + const __m128i q2_2_0 = _mm_set_epi32(iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[9]], iq3s_grid[idx.index[8]]); + const __m128i q2_2_1 = _mm_set_epi32(iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]]); + + __m128i aux128_0 = _mm_set1_epi32(signs[0] | (signs[1] << 16)); + __m128i aux128_1 = aux128_0; + aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0); + aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1); + const __m128i s2_1_0 = _mm_cmpeq_epi8(aux128_0, mask2_0); + const __m128i s2_1_1 = _mm_cmpeq_epi8(aux128_1, mask2_1); + const __m128i q8s_1_0 = _mm_sub_epi8(_mm_xor_si128(s2_1_0, q8_1_0), s2_1_0); + const __m128i q8s_1_1 = _mm_sub_epi8(_mm_xor_si128(s2_1_1, q8_1_1), s2_1_1); + + aux128_0 = _mm_set1_epi32(signs[2] | (signs[3] << 16)); + aux128_1 = aux128_0; + aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0); + aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1); + const __m128i s2_2_0 = _mm_cmpeq_epi8(aux128_0, mask2_0); + const __m128i s2_2_1 = _mm_cmpeq_epi8(aux128_1, mask2_1); + const __m128i q8s_2_0 = _mm_sub_epi8(_mm_xor_si128(s2_2_0, q8_2_0), s2_2_0); + const __m128i q8s_2_1 = _mm_sub_epi8(_mm_xor_si128(s2_2_1, q8_2_1), s2_2_1); + + signs += 4; + + const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0); + const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1); + const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0); + const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1); + const uint16_t ls1 = x[i].scales[ib32/2] & 0xf; + const uint16_t ls2 = x[i].scales[ib32/2] >> 4; + const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1)); + const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1)); + const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1)); + const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1)); + sumi1_0 = _mm_add_epi32(sumi1_0, p1_0); + sumi1_1 = _mm_add_epi32(sumi1_1, p1_1); + sumi2_0 = _mm_add_epi32(sumi2_0, p2_0); + sumi2_1 = _mm_add_epi32(sumi2_1, 
p2_1); + } + + accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf); + + } + + *s = hsum_float_8(accumf); + +#else + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint8_t * GGML_RESTRICT signs = x[i].signs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1; + const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1; + int32_t sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256))); + const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256))); + for (int j = 0; j < 4; ++j) { + sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1); + sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1); + } + q8 += 8; + } + qs += 8; + signs += 4; + bsum += sumi * ls1; + sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256))); + const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256))); + for (int j = 0; j < 4; ++j) { + sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1); + sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1); + } + q8 += 8; + } + qs += 8; + signs += 4; + bsum += sumi * ls2; + } + sumf += d * bsum; + } + *s = sumf; +#endif +} + +#if defined(__AVX2__) +static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) { + const __m256i ax = _mm256_sign_epi8(x, x); + const __m256i sy = _mm256_sign_epi8(y, x); + return _mm256_maddubs_epi16(ax, sy); +} +#endif + +void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq1_s * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined __AVX2__ + + __m256 accum = _mm256_setzero_ps(); + float accum1 = 0; + for (int i = 0; i < nb; ++i) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint16_t * qh = x[i].qh; + + __m256i sumi = _mm256_setzero_si256(); + int sumi1 = 0; + for (int ib = 0; ib < QK_K/32; ib += 2) { +#ifdef __BMI2__ + const uint64_t packed_idx1 = _pdep_u64(*(const uint32_t *)qs, 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib], 0x700070007000700ULL); + const uint64_t packed_idx2 = _pdep_u64(*(const uint32_t *)(qs + 4), 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib + 1], 0x700070007000700ULL); + const uint16_t *idx1 = (const uint16_t *)(&packed_idx1); + const uint16_t *idx2 = (const uint16_t *)(&packed_idx2); + const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[idx1[3]], iq1s_grid[idx1[2]], iq1s_grid[idx1[1]], iq1s_grid[idx1[0]]); + const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[idx2[3]], iq1s_grid[idx2[2]], iq1s_grid[idx2[1]], iq1s_grid[idx2[0]]); +#else + const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)], + iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], 
iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]); + const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)], + iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]); +#endif + qs += 8; + const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + + const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1); + const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2); + const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1; + const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1; + const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(ls1)); + const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(ls2)); + + sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p1, p2)); + sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1 + + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2; + } + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + accum = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi), accum); + accum1 += d * sumi1; + + } + + *s = hsum_float_8(accum) + IQ1S_DELTA * accum1; + +#elif defined __AVX__ + __m256 accum = _mm256_setzero_ps(); + float accum1 = 0; + for (int i = 0; i < nb; ++i) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint16_t * qh = x[i].qh; + + __m128i sumi1_0 = _mm_setzero_si128(); + __m128i sumi1_1 = _mm_setzero_si128(); + int sumi1 = 0; + for (int ib = 0; ib < QK_K/32; ib += 2) { + const __m128i q1b_1_0 = _mm_set_epi64x(iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]); + const __m128i q1b_1_1 = _mm_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)]); + const __m128i q1b_2_0 = _mm_set_epi64x(iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]); + const __m128i q1b_2_1 = _mm_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)]); + qs += 8; + const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + + const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0); + const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1); + const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0); + const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1); + const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1; + const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1; + const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(ls1)); + const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(ls1)); + const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(ls2)); + const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(ls2)); + + sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0)); + sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1)); + sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1 + + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? 
-1 : 1) * ls2; + } + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum); + accum1 += d * sumi1; + + } + + *s = hsum_float_8(accum) + IQ1S_DELTA * accum1; + +#else + + float sumf = 0; + for (int i = 0; i < nb; i++) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint16_t * qh = x[i].qh; + + int sumi = 0, sumi1 = 0; + for (int ib = 0; ib < QK_K/32; ++ib) { + const int ls = 2*((qh[ib] >> 12) & 7) + 1; + const int delta = qh[ib] & 0x8000 ? -1 : 1; + int lsum = 0; + for (int l = 0; l < 4; ++l) { + const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8))); + for (int j = 0; j < 8; ++j) { + lsum += q8[j] * grid[j]; + } + q8 += 8; + } + sumi += ls * lsum; + sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]); + qs += 4; + } + + sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1); + } + + *s = sumf; + +#endif +} + +void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq1_m * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + iq1m_scale_t scale; + +#if defined __AVX2__ + + const __m256i mask = _mm256_set1_epi16(0x7); + const __m256i mone = _mm256_set1_epi16(1); + const __m256i mone8 = _mm256_set1_epi8(1); + const __m256i mtwo8 = _mm256_set1_epi8(2); + // VPSHUFB cannot cross 128-bit lanes so odd shifts go to upper half. + const __m256i scales_shift = _mm256_set_epi64x(9, 3, 6, 0); + + __m256 accum1 = _mm256_setzero_ps(); + __m256 accum2 = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint8_t * qh = x[i].qh; + const uint16_t * sc = (const uint16_t *)x[i].scales; + + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + // Extract 3-bit scales (16 values) + __m256i scales = _mm256_set1_epi64x(*(const uint64_t*)sc); + scales = _mm256_srlv_epi64(scales, scales_shift); + scales = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scales, mask), 1), mone); + + // Indices to repeat each scale 8 times. 
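+ // 0x0100 is the byte pair {0x00,0x01}, so scales_idx1 makes VPSHUFB broadcast the first 16-bit scale of each 128-bit half into all eight lanes of that half; scales_idx2 picks the scale one qword (8 bytes) further in, and the mtwo8 increments below advance both indices to the next 16-bit scale on every iteration.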
+ __m256i scales_idx1 = _mm256_set1_epi16(0x0100); + __m256i scales_idx2 = _mm256_add_epi8(scales_idx1, _mm256_set1_epi8(8)); + + __m256i sumi1 = _mm256_setzero_si256(); + __m256i sumi2 = _mm256_setzero_si256(); + for (int ib = 0; ib < QK_K/32; ib += 2) { +#ifdef __BMI2__ + const uint64_t packed_idx1 = _pdep_u64(*(const uint32_t *)qs, 0x00ff00ff00ff00ffULL) + | _pdep_u64(*(const uint16_t*)(qh) & 0x7777, 0xf000f000f000f00ULL); + const uint64_t packed_idx2 = _pdep_u64(*(const uint32_t *)(qs + 4), 0x00ff00ff00ff00ffULL) + | _pdep_u64(*(const uint16_t*)(qh + 2) & 0x7777, 0xf000f000f000f00ULL); + const uint16_t *idx1 = (const uint16_t *)(&packed_idx1); + const uint16_t *idx2 = (const uint16_t *)(&packed_idx2); + const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[idx1[3]], iq1s_grid[idx1[2]], iq1s_grid[idx1[1]], iq1s_grid[idx1[0]]); + const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[idx2[3]], iq1s_grid[idx2[2]], iq1s_grid[idx2[1]], iq1s_grid[idx2[0]]); + + // Convert signs to bytes 0x81 (negative) or 0x01 (positive) + const uint64_t delta_sign = _pdep_u64(*(const uint32_t*)(qh) & 0x88888888, 0xf0f0f0f0f0f0f0f0ULL); + const __m256i delta1 = _mm256_or_si256(mone8, _mm256_cvtepi8_epi64(_mm_set1_epi32(delta_sign))); + const __m256i delta2 = _mm256_or_si256(mone8, _mm256_cvtepi8_epi64(_mm_set1_epi32(delta_sign >> 32))); +#else + const __m256i q1b_1 = _mm256_set_epi64x( + iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)], + iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)] + ); + const __m256i q1b_2 = _mm256_set_epi64x( + iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)], + iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)] + ); + + const __m256i delta1 = _mm256_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, + qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101, + qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, + qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101); + const __m256i delta2 = _mm256_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, + qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101, + qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, + qh[2] & 0x08 ? 
0xffffffffffffffff : 0x0101010101010101); +#endif + const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + + const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1); + const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2); + const __m256i dot3 = _mm256_maddubs_epi16(mone8, _mm256_sign_epi8(q8b_1, delta1)); + const __m256i dot4 = _mm256_maddubs_epi16(mone8, _mm256_sign_epi8(q8b_2, delta2)); + + __m256i scale1 = _mm256_shuffle_epi8(scales, scales_idx1); + __m256i scale2 = _mm256_shuffle_epi8(scales, scales_idx2); + + scales_idx1 = _mm256_add_epi8(scales_idx1, mtwo8); + scales_idx2 = _mm256_add_epi8(scales_idx2, mtwo8); + + const __m256i p1 = _mm256_madd_epi16(dot1, scale1); + const __m256i p2 = _mm256_madd_epi16(dot2, scale2); + const __m256i p3 = _mm256_madd_epi16(dot3, scale1); + const __m256i p4 = _mm256_madd_epi16(dot4, scale2); + + sumi1 = _mm256_add_epi32(sumi1, _mm256_add_epi32(p1, p2)); + sumi2 = _mm256_add_epi32(sumi2, _mm256_add_epi32(p3, p4)); + + qs += 8; qh += 4; + } + + const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(scale.f16)); + + accum1 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi1), accum1); + accum2 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi2), accum2); + } + + *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2); + +#elif defined __AVX__ + const __m128i mask = _mm_set1_epi16(0x7); + const __m128i mone = _mm_set1_epi16(1); + + __m256 accum1 = _mm256_setzero_ps(); + __m256 accum2 = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint8_t * qh = x[i].qh; + const uint16_t * sc = (const uint16_t *)x[i].scales; + + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + + __m128i sumi1_0 = _mm_setzero_si128(); + __m128i sumi1_1 = _mm_setzero_si128(); + __m128i sumi2_0 = _mm_setzero_si128(); + __m128i sumi2_1 = _mm_setzero_si128(); + for (int ib = 0; ib < QK_K/32; ib += 2) { + const __m128i q1b_1_0 = _mm_set_epi64x( + iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]); + const __m128i q1b_1_1 = _mm_set_epi64x( + iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)]); + const __m128i q1b_2_0 = _mm_set_epi64x( + iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]); + const __m128i q1b_2_1 = _mm_set_epi64x( + iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)]); + const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + + const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0); + const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1); + const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0); + const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1); + + const __m128i delta1_0 = _mm_set_epi64x(qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, + qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101); + const __m128i delta1_1 = _mm_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, + qh[1] & 0x08 ? 
0xffffffffffffffff : 0x0101010101010101); + const __m128i delta2_0 = _mm_set_epi64x(qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, + qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101); + const __m128i delta2_1 = _mm_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, + qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101); + + const __m128i dot3_0 = mul_add_epi8_sse(delta1_0, q8b_1_0); + const __m128i dot3_1 = mul_add_epi8_sse(delta1_1, q8b_1_1); + const __m128i dot4_0 = mul_add_epi8_sse(delta2_0, q8b_2_0); + const __m128i dot4_1 = mul_add_epi8_sse(delta2_1, q8b_2_1); + + __m128i scale1_0 = _mm_set1_epi16(sc[ib/2] >> 0); + __m128i scale1_1 = _mm_set1_epi16(sc[ib/2] >> 3); + __m128i scale2_0 = _mm_set1_epi16(sc[ib/2] >> 6); + __m128i scale2_1 = _mm_set1_epi16(sc[ib/2] >> 9); + + scale1_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_0, mask), 1), mone); + scale1_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_1, mask), 1), mone); + scale2_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_0, mask), 1), mone); + scale2_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_1, mask), 1), mone); + const __m128i p1_0 = _mm_madd_epi16(dot1_0, scale1_0); + const __m128i p1_1 = _mm_madd_epi16(dot1_1, scale1_1); + const __m128i p2_0 = _mm_madd_epi16(dot2_0, scale2_0); + const __m128i p2_1 = _mm_madd_epi16(dot2_1, scale2_1); + const __m128i p3_0 = _mm_madd_epi16(dot3_0, scale1_0); + const __m128i p3_1 = _mm_madd_epi16(dot3_1, scale1_1); + const __m128i p4_0 = _mm_madd_epi16(dot4_0, scale2_0); + const __m128i p4_1 = _mm_madd_epi16(dot4_1, scale2_1); + + sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0)); + sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1)); + sumi2_0 = _mm_add_epi32(sumi2_0, _mm_add_epi32(p3_0, p4_0)); + sumi2_1 = _mm_add_epi32(sumi2_1, _mm_add_epi32(p3_1, p4_1)); + + qs += 8; qh += 4; + } + + const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(scale.f16)); + + accum1 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum1); + accum2 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi2_1, sumi2_0))), accum2); + } + + *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2); + +#else + + int sum1[2], sum2[2], delta[4]; + + float sumf = 0; + for (int i = 0; i < nb; i++) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint8_t * qh = x[i].qh; + const uint16_t * sc = (const uint16_t *)x[i].scales; + + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + + int sumi1 = 0, sumi2 = 0; + for (int ib = 0; ib < QK_K/32; ++ib) { + delta[0] = qh[0] & 0x08 ? -1 : 1; + delta[1] = qh[0] & 0x80 ? -1 : 1; + delta[2] = qh[1] & 0x08 ? -1 : 1; + delta[3] = qh[1] & 0x80 ? 
-1 : 1; + sum1[0] = sum1[1] = sum2[0] = sum2[1] = 0; + for (int l = 0; l < 4; ++l) { + const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((uint16_t)qh[l/2] << (8 - 4*(l%2))) & 0x700))); + int lsum1 = 0, lsum2 = 0; + for (int j = 0; j < 8; ++j) { + lsum1 += q8[j] * grid[j]; + lsum2 += q8[j]; + } + q8 += 8; + sum1[l/2] += lsum1; + sum2[l/2] += lsum2*delta[l]; + } + + const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1; + const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1; + + sumi1 += sum1[0] * ls1 + sum1[1] * ls2; + sumi2 += sum2[0] * ls1 + sum2[1] * ls2; + qs += 4; + qh += 2; + } + + sumf += GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2); + } + + *s = sumf; + +#endif +} + +void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + assert(n % QK4_NL == 0); + static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same"); + + const block_iq4_nl * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + const int nb = n / QK4_NL; + + int ib = 0; + float sumf = 0; + +#if defined __AVX2__ + + const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl); + const __m128i m4b = _mm_set1_epi8(0x0f); + const __m256i mone = _mm256_set1_epi16(1); + + __m256 accum1 = _mm256_setzero_ps(); + __m256 accum2 = _mm256_setzero_ps(); + for (; ib + 1 < nb; ib += 2) { + const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[ib + 0].qs); + const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[ib + 1].qs); + const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[ib + 0].qs); + const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[ib + 1].qs); + const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)), + _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b))); + const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)), + _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b))); + const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1); + const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2); + const __m256i p_1 = _mm256_madd_epi16(p16_1, mone); + const __m256i p_2 = _mm256_madd_epi16(p16_2, mone); + accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*GGML_CPU_FP16_TO_FP32(x[ib + 0].d)), + _mm256_cvtepi32_ps(p_1), accum1); + accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*GGML_CPU_FP16_TO_FP32(x[ib + 1].d)), + _mm256_cvtepi32_ps(p_2), accum2); + } + + sumf = hsum_float_8(_mm256_add_ps(accum1, accum2)); + +#elif defined __AVX__ + const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl); + const __m128i m4b = _mm_set1_epi8(0x0f); + + __m256 accum = _mm256_setzero_ps(); + for (; ib + 1 < nb; ib += 2) { + const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs); + const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs); + const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs); + const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1); + const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs); + const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1); + + const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)); + const __m128i q4b_1_1 = 
_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)); + const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)); + const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)); + + const __m256 p = mul_sum_i8_quad_float(q4b_1_0, q4b_1_1, q4b_2_0, q4b_2_1, q8b_1_0, q8b_1_1, q8b_2_0, q8b_2_1); + const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d); + accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum); + } + + sumf = hsum_float_8(accum); + +#endif + for (; ib < nb; ++ib) { + const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d); + int sumi1 = 0, sumi2 = 0; + for (int j = 0; j < QK4_NL/2; ++j) { + sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf]; + sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4]; + } + sumf += d * (sumi1 + sumi2); + } + *s = sumf; +} + +void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + assert(n % QK_K == 0); + + const block_iq4_xs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined __AVX2__ + + const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl); + const __m128i m4b = _mm_set1_epi8(0x0f); + + __m256 accum = _mm256_setzero_ps(); + for (int ibl = 0; ibl < nb; ++ibl) { + const uint8_t * qs = x[ibl].qs; + const int8_t * q8 = y[ibl].qs; + uint16_t sh = x[ibl].scales_h; + __m256i sumi1 = _mm256_setzero_si256(); + __m256i sumi2 = _mm256_setzero_si256(); + for (int ib = 0; ib < QK_K/32; ib += 2) { + const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)qs); qs += 16; + const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)qs); qs += 16; + const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)), + _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b))); + const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)), + _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b))); + const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1); + const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2); + const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32; + const int16_t ls2 = ((x[ibl].scales_l[ib/2] >> 4) | ((sh << 2) & 0x30)) - 32; + sh >>= 4; + const __m256i p_1 = _mm256_madd_epi16(p16_1, _mm256_set1_epi16(ls1)); + const __m256i p_2 = _mm256_madd_epi16(p16_2, _mm256_set1_epi16(ls2)); + sumi1 = _mm256_add_epi32(p_1, sumi1); + sumi2 = _mm256_add_epi32(p_2, sumi2); + } + accum = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ibl].d)*y[ibl].d), + _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accum); + } + + *s = hsum_float_8(accum); + +#elif defined __AVX__ + const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl); + const __m128i m4b = _mm_set1_epi8(0x0f); + + __m256 accum = _mm256_setzero_ps(); + for (int ibl = 0; ibl < nb; ++ibl) { + const uint8_t * qs = x[ibl].qs; + const int8_t * q8 = y[ibl].qs; + uint16_t sh = x[ibl].scales_h; + __m128i sumi1_0 = _mm_setzero_si128(); + __m128i sumi1_1 = _mm_setzero_si128(); + __m128i sumi2_0 = 
_mm_setzero_si128(); + __m128i sumi2_1 = _mm_setzero_si128(); + for (int ib = 0; ib < QK_K/32; ib += 2) { + const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)qs); qs += 16; + const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)qs); qs += 16; + const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)); + const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)); + const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)); + const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)); + const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0); + const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1); + const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0); + const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1); + const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32; + const int16_t ls2 = ((x[ibl].scales_l[ib/2] >> 4) | ((sh << 2) & 0x30)) - 32; + sh >>= 4; + const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, _mm_set1_epi16(ls1)); + const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, _mm_set1_epi16(ls1)); + const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, _mm_set1_epi16(ls2)); + const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, _mm_set1_epi16(ls2)); + sumi1_0 = _mm_add_epi32(p_1_0, sumi1_0); + sumi1_1 = _mm_add_epi32(p_1_1, sumi1_1); + sumi2_0 = _mm_add_epi32(p_2_0, sumi2_0); + sumi2_1 = _mm_add_epi32(p_2_1, sumi2_1); + } + __m128i sumi12_0 = _mm_add_epi32(sumi1_0, sumi2_0); + __m128i sumi12_1 = _mm_add_epi32(sumi1_1, sumi2_1); + accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ibl].d)*y[ibl].d), + _mm256_cvtepi32_ps(MM256_SET_M128I(sumi12_1, sumi12_0))), accum); + } + + *s = hsum_float_8(accum); + +#else + float sumf = 0; + for (int ibl = 0; ibl < nb; ++ibl) { + const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d; + uint16_t h = x[ibl].scales_h; + const uint8_t * qs = x[ibl].qs; + const int8_t * q8 = y[ibl].qs; + for (int ib = 0; ib < QK_K/32; ib += 2) { + const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30); + const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30); + h >>= 4; + const float d1 = d4d8*(ls1 - 32); + const float d2 = d4d8*(ls2 - 32); + int sumi1 = 0, sumi2 = 0; + for (int j = 0; j < 16; ++j) { + sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf]; + sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4]; + } + sumf += d1 * (sumi1 + sumi2); + qs += 16; + q8 += 32; + sumi1 = sumi2 = 0; + for (int j = 0; j < 16; ++j) { + sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf]; + sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4]; + } + sumf += d2 * (sumi1 + sumi2); + qs += 16; + q8 += 32; + } + } + *s = sumf; +#endif +} + diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp b/ggml/src/ggml-cpu/arch/x86/repack.cpp similarity index 67% rename from ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp rename to ggml/src/ggml-cpu/arch/x86/repack.cpp index 0a3ff867cfeca..c00c1e541cb44 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +++ b/ggml/src/ggml-cpu/arch/x86/repack.cpp @@ -3,72 +3,20 @@ #include "ggml-common.h" #include "ggml-backend-impl.h" -#include "ggml-quants.h" #include "ggml-impl.h" 
#include "ggml-cpu.h" #include "ggml-cpu-impl.h" -#include "ggml-cpu-traits.h" +#include "simd-mappings.h" +#include "traits.h" #include #include #include -#include #include // for qsort #include // for GGML_ASSERT -#include "ggml-cpu-aarch64.h" - -// TODO: move to include file? -template constexpr int QK_0() { - if constexpr (K == 4) { - return QK4_0; - } - if constexpr (K == 8) { - return QK8_0; - } - return -1; -} - -template struct block { - ggml_half d[N]; // deltas for N qK_0 blocks - int8_t qs[(QK_0() * N * K) / 8]; // quants for N qK_0 blocks -}; - -// control size -static_assert(sizeof(block<4, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 2, "wrong block<4,4> size/padding"); -static_assert(sizeof(block<4, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<4,8> size/padding"); -static_assert(sizeof(block<8, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<8,4> size/padding"); -static_assert(sizeof(block<8, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong block<8,8> size/padding"); - -using block_q4_0x4 = block<4, 4>; -using block_q4_0x8 = block<4, 8>; -using block_q8_0x4 = block<8, 4>; -using block_q8_0x8 = block<8, 8>; - - -struct block_q4_Kx8 { - ggml_half d[8]; // super-block scale for quantized scales - ggml_half dmin[8]; // super-block scale for quantized mins - uint8_t scales[96]; // scales and mins, quantized with 6 bits - uint8_t qs[1024]; // 4--bit quants -}; - -static_assert(sizeof(block_q4_Kx8) == sizeof(ggml_half) * 16 + K_SCALE_SIZE * 8 + QK_K * 4, "wrong q4_K block size/padding"); - -struct block_q8_Kx4 { - float d[4]; // delta - int8_t qs[QK_K * 4]; // quants - int16_t bsums[QK_K / 4]; // sum of quants in groups of 16 -}; - -static_assert(sizeof(block_q8_Kx4) == sizeof(float) * 4 + QK_K * 4 + (QK_K / 4) * sizeof(int16_t), "wrong q8_K block size/padding"); - -struct block_iq4_nlx4 { - ggml_half d[4]; // deltas for 4 iq4_nl blocks - uint8_t qs[QK4_NL * 2]; // nibbles / quants for 4 iq4_nl blocks -}; - -static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding"); +#define GGML_CPU_CLANG_WORKAROUND +#include "../../repack.h" #if defined(__GNUC__) #pragma GCC diagnostic ignored "-Woverlength-strings" @@ -76,27 +24,6 @@ static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wro #define UNUSED GGML_UNUSED -static inline int nearest_int(float fval) { - assert(fabsf(fval) <= 4194303.f); - float val = fval + 12582912.f; - int i; memcpy(&i, &val, sizeof(int)); - return (i & 0x007fffff) - 0x00400000; -} - -// Functions to create the interleaved data layout formats - -// interleave 4 block_q4_0s in blocks of blck_size_interleave -// returns an interleaved block_q4_0x4 -// in the interleaved block_q4_0x4, place deltas for 4 block_q4_0 blocks -// first, then interleave quants from 4 block_q4_0s in blocks of blck_size_interleave -// -// - in : an array of block_q4_0 pointers -// - blck_size_interleave : the block_q4_0 quants bytes are interleaved in blocks of -// blck_size_interleave bytes -// - xor_mask : the mask to convert the nibbles in block_q4_0 quants bytes -// from bias offset form to pure sign form (this saves subtract -// operations durin unpacking) -// #if defined(__AVX__) #if defined(__F16C__) #if defined(__AVX512F__) @@ -113,11 +40,11 @@ static inline __m512 __avx512_f32cx8x2_load(ggml_fp16_t *x, ggml_fp16_t *y) { float tmp[16]; for (int i = 0; i < 8; i++) { - tmp[i] = GGML_FP16_TO_FP32(x[i]); + tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]); } for (int i = 0; i < 8; i++) { - tmp[i + 8] = 
GGML_FP16_TO_FP32(y[i]); + tmp[i + 8] = GGML_CPU_FP16_TO_FP32(y[i]); } return _mm512_loadu_ps(tmp); @@ -128,10 +55,10 @@ static inline __m512 __avx512_repeat_f32cx16_load(__m128i x) { _mm_storeu_si128((__m128i*)tmphalf, x); for (int i = 0; i < 4; i++) { - tmp[i] = GGML_FP16_TO_FP32(tmphalf[i]); - tmp[i + 4] = GGML_FP16_TO_FP32(tmphalf[i]); - tmp[i + 8] = GGML_FP16_TO_FP32(tmphalf[i]); - tmp[i + 12] = GGML_FP16_TO_FP32(tmphalf[i]); + tmp[i] = GGML_CPU_FP16_TO_FP32(tmphalf[i]); + tmp[i + 4] = GGML_CPU_FP16_TO_FP32(tmphalf[i]); + tmp[i + 8] = GGML_CPU_FP16_TO_FP32(tmphalf[i]); + tmp[i + 12] = GGML_CPU_FP16_TO_FP32(tmphalf[i]); } return _mm512_loadu_ps(tmp); @@ -141,7 +68,7 @@ static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) { float tmp[8]; for (int i = 0; i < 8; i++) { - tmp[i] = GGML_FP16_TO_FP32(x[i]); + tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]); } return _mm256_loadu_ps(tmp); @@ -150,8 +77,8 @@ static inline __m256 __avx_repeat_f32cx8_load(ggml_fp16_t *x) { float tmp[8]; for (int i = 0; i < 4; i++) { - tmp[i] = GGML_FP16_TO_FP32(x[i]); - tmp[i + 4] = GGML_FP16_TO_FP32(x[i]); + tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]); + tmp[i + 4] = GGML_CPU_FP16_TO_FP32(x[i]); } return _mm256_loadu_ps(tmp); @@ -162,7 +89,7 @@ static inline __m256 __avx_rearranged_f32cx8_load(ggml_fp16_t *x, __m128i arrang _mm_storeu_si128((__m128i*)tmphalf, _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *) x), arrangeMask)); for (int i = 0; i < 8; i++) { - tmp[i] = GGML_FP16_TO_FP32(tmphalf[i]); + tmp[i] = GGML_CPU_FP16_TO_FP32(tmphalf[i]); } return _mm256_loadu_ps(tmp); @@ -178,6 +105,12 @@ static inline __m256 __avx_rearranged_f32cx8_load(ggml_fp16_t *x, __m128i arrang #endif #endif +static inline int nearest_int(float fval) { + assert(fabsf(fval) <= 4194303.f); + float val = fval + 12582912.f; + int i; memcpy(&i, &val, sizeof(int)); + return (i & 0x007fffff) - 0x00400000; +} #if defined(__AVX2__) || defined(__AVX512F__) #if defined(__AVX512F__) @@ -242,188 +175,14 @@ static inline __m256i mul_sum_i8_pairs_acc_int32x8(const __m256i acc, const __m2 } #endif -static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; - -static void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { - assert(QK8_0 == 32); - assert(k % QK8_0 == 0); - const int nb = k / QK8_0; - - block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy; - -#if defined(__ARM_NEON) - float32x4_t srcv[4][8]; - float id[4]; - - for (int i = 0; i < nb; i++) { - float32x4_t asrcv[8]; - float32x4_t amaxv[8]; - - for (int row_iter = 0; row_iter < 4; row_iter++) { - for (int j = 0; j < 8; j++) srcv[row_iter][j] = vld1q_f32(x + row_iter * k + i * 32 + 4 * j); - for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[row_iter][j]); - - for (int j = 0; j < 4; j++) amaxv[2 * j] = vmaxq_f32(asrcv[2 * j], asrcv[2 * j + 1]); - for (int j = 0; j < 2; j++) amaxv[4 * j] = vmaxq_f32(amaxv[4 * j], amaxv[4 * j + 2]); - for (int j = 0; j < 1; j++) amaxv[8 * j] = vmaxq_f32(amaxv[8 * j], amaxv[8 * j + 4]); - - const float amax = vmaxvq_f32(amaxv[0]); - - const float d = amax / ((1 << 7) - 1); - id[row_iter] = d ? 
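/* nearest_int uses the classic magic-constant rounding trick: for |fval| <= 2^22 - 1,
   adding 12582912.0f (1.5 * 2^23) leaves the rounded integer in the low mantissa
   bits, so masking with 0x007fffff and subtracting the 0x00400000 bias recovers a
   round-to-nearest result without calling lrintf. E.g. nearest_int(3.7f):
   bits(3.7f + 12582912.0f) & 0x007fffff = 0x400004, minus 0x00400000 gives 4. */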
1.0f / d : 0.0f; - - y[i].d[row_iter] = GGML_FP32_TO_FP16(d); - } - - for (int j = 0; j < 8; j++) { - float32x4_t v = vmulq_n_f32(srcv[0][j], id[0]); - int32x4_t vi = vcvtnq_s32_f32(v); - y[i].qs[16 * j + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[16 * j + 1] = vgetq_lane_s32(vi, 1); - y[i].qs[16 * j + 2] = vgetq_lane_s32(vi, 2); - y[i].qs[16 * j + 3] = vgetq_lane_s32(vi, 3); - - v = vmulq_n_f32(srcv[1][j], id[1]); - vi = vcvtnq_s32_f32(v); - y[i].qs[16 * j + 4] = vgetq_lane_s32(vi, 0); - y[i].qs[16 * j + 5] = vgetq_lane_s32(vi, 1); - y[i].qs[16 * j + 6] = vgetq_lane_s32(vi, 2); - y[i].qs[16 * j + 7] = vgetq_lane_s32(vi, 3); - - v = vmulq_n_f32(srcv[2][j], id[2]); - vi = vcvtnq_s32_f32(v); - y[i].qs[16 * j + 8] = vgetq_lane_s32(vi, 0); - y[i].qs[16 * j + 9] = vgetq_lane_s32(vi, 1); - y[i].qs[16 * j + 10] = vgetq_lane_s32(vi, 2); - y[i].qs[16 * j + 11] = vgetq_lane_s32(vi, 3); - - v = vmulq_n_f32(srcv[3][j], id[3]); - vi = vcvtnq_s32_f32(v); - y[i].qs[16 * j + 12] = vgetq_lane_s32(vi, 0); - y[i].qs[16 * j + 13] = vgetq_lane_s32(vi, 1); - y[i].qs[16 * j + 14] = vgetq_lane_s32(vi, 2); - y[i].qs[16 * j + 15] = vgetq_lane_s32(vi, 3); - } - } -#else - // scalar - const int blck_size_interleave = 4; - float srcv[4][QK8_0]; - float id[4]; - - for (int i = 0; i < nb; i++) { - for (int row_iter = 0; row_iter < 4; row_iter++) { - float amax = 0.0f; // absolute max - - for (int j = 0; j < QK8_0; j++) { - srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j]; - amax = MAX(amax, fabsf(srcv[row_iter][j])); - } - - const float d = amax / ((1 << 7) - 1); - id[row_iter] = d ? 1.0f / d : 0.0f; - - y[i].d[row_iter] = GGML_FP32_TO_FP16(d); - } - - for (int j = 0; j < QK8_0 * 4; j++) { - int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave; - int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave; - src_offset += (j % blck_size_interleave); - - float x0 = srcv[src_id][src_offset] * id[src_id]; - y[i].qs[j] = roundf(x0); - } - } -#endif -} - -static void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { +void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { assert(QK8_0 == 32); assert(k % QK8_0 == 0); const int nb = k / QK8_0; block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy; -#if defined(__ARM_NEON) - float32x4_t srcv[4][8]; - float id[4]; - - for (int i = 0; i < nb; i++) { - float32x4_t asrcv[8]; - float32x4_t amaxv[8]; - - for (int row_iter = 0; row_iter < 4; row_iter++) { - for (int j = 0; j < 8; j++) srcv[row_iter][j] = vld1q_f32(x + row_iter * k + i * 32 + 4 * j); - for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[row_iter][j]); - - for (int j = 0; j < 4; j++) amaxv[2 * j] = vmaxq_f32(asrcv[2 * j], asrcv[2 * j + 1]); - for (int j = 0; j < 2; j++) amaxv[4 * j] = vmaxq_f32(amaxv[4 * j], amaxv[4 * j + 2]); - for (int j = 0; j < 1; j++) amaxv[8 * j] = vmaxq_f32(amaxv[8 * j], amaxv[8 * j + 4]); - - const float amax = vmaxvq_f32(amaxv[0]); - - const float d = amax / ((1 << 7) - 1); - id[row_iter] = d ? 
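/* Row quantization recipe visible above: d = amax / 127 maps the largest magnitude
   onto the int8 range, the cached inverse id = 1/d turns each step into a multiply,
   and q = round(x * id) lands in [-127, 127] with d stored as fp16. For amax = 2.54,
   d = 0.02 and an input of 1.0 quantizes to round(1.0 * 50) = 50. */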
1.0f / d : 0.0f; - - y[i].d[row_iter] = GGML_FP32_TO_FP16(d); - } - - for (int j = 0; j < 4; j++) { - float32x4_t v = vmulq_n_f32(srcv[0][2 * j], id[0]); - int32x4_t vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 1] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 2] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 3] = vgetq_lane_s32(vi, 3); - v = vmulq_n_f32(srcv[0][2 * j + 1], id[0]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 4] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 5] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 6] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 7] = vgetq_lane_s32(vi, 3); - - v = vmulq_n_f32(srcv[1][2 * j], id[1]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 8] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 9] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 10] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 11] = vgetq_lane_s32(vi, 3); - v = vmulq_n_f32(srcv[1][2 * j + 1], id[1]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 12] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 13] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 14] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 15] = vgetq_lane_s32(vi, 3); - - v = vmulq_n_f32(srcv[2][2 * j], id[2]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 16] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 17] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 18] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 19] = vgetq_lane_s32(vi, 3); - v = vmulq_n_f32(srcv[2][2 * j + 1], id[2]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 20] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 21] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 22] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 23] = vgetq_lane_s32(vi, 3); - - v = vmulq_n_f32(srcv[3][2 * j], id[3]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 24] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 25] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 26] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 27] = vgetq_lane_s32(vi, 3); - v = vmulq_n_f32(srcv[3][2 * j + 1], id[3]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 28] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 29] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 30] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 31] = vgetq_lane_s32(vi, 3); - } - } -#elif defined(__AVX2__) || defined(__AVX__) +#if defined(__AVX2__) || defined(__AVX__) float id[4]; __m256 srcv[4][4]; __m256 idvec[4]; @@ -453,7 +212,7 @@ static void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGM id[row_iter] = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; //d ? 1.0f / d : 0.0f; // Store the scale for the individual block - y[i].d[row_iter] = GGML_FP32_TO_FP16(d); + y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d); // Store the values in blocks of eight values - Aim is to use these later for block interleaving srcv[row_iter][0] = v0; @@ -520,6 +279,7 @@ static void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGM #endif } } + #else // scalar const int blck_size_interleave = 8; @@ -538,7 +298,7 @@ static void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGM const float d = amax / ((1 << 7) - 1); id[row_iter] = d ? 
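/* The scalar tail below reproduces the interleaved layout the SIMD paths assume:
   block_q8_0x4 packs four source rows, and with blck_size_interleave = 8 the quants
   land as 8-byte runs (row 0 bytes 0-7, row 1 bytes 8-15, ...), so a GEMM kernel
   can load one contiguous vector spanning the same column block of all four rows. */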
1.0f / d : 0.0f; - y[i].d[row_iter] = GGML_FP32_TO_FP16(d); + y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d); } for (int j = 0; j < QK8_0 * 4; j++) { @@ -553,7 +313,7 @@ static void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGM #endif } -static void ggml_quantize_mat_q8_K_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { +void ggml_quantize_mat_q8_K_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { assert(QK_K == 256); assert(k % QK_K == 0); const int nb = k / QK_K; @@ -817,203 +577,7 @@ static void ggml_quantize_mat_q8_K_4x8(const float * GGML_RESTRICT x, void * GGM #endif } -template <int64_t INTER_SIZE, ggml_type PARAM_TYPE> -void ggml_quantize_mat_t(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row); - -template <> void ggml_quantize_mat_t<4, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) { - assert(nrow == 4); - UNUSED(nrow); - ggml_quantize_mat_q8_0_4x4(x, vy, n_per_row); -} - -template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) { - assert(nrow == 4); - UNUSED(nrow); - ggml_quantize_mat_q8_0_4x8(x, vy, n_per_row); -} - -template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) { - assert(nrow == 4); - UNUSED(nrow); - ggml_quantize_mat_q8_K_4x8(x, vy, n_per_row); -} - -static void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { - const int qk = QK8_0; - const int nb = n / qk; - const int ncols_interleaved = 4; - const int blocklen = 4; - - assert (n % qk == 0); - assert (nc % ncols_interleaved == 0); - - UNUSED(s); - UNUSED(bs); - UNUSED(vx); - UNUSED(vy); - UNUSED(nr); - UNUSED(nc); - UNUSED(nb); - UNUSED(ncols_interleaved); - UNUSED(blocklen); - -#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) - if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { - const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx; - - for (int c = 0; c < nc; c += ncols_interleaved) { - const block_q8_0 * a_ptr = (const block_q8_0 *) vy; - float32x4_t acc = vdupq_n_f32(0); - for (int b = 0; b < nb; b++) { - int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs); - int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16); - int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32); - int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48); - float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d); - - int8x16_t a0 = vld1q_s8(a_ptr->qs); - int8x16_t a1 = vld1q_s8(a_ptr->qs + qk/2); - float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d); - - int32x4_t ret = vdupq_n_s32(0); - - ret = vdotq_laneq_s32(ret, b0 << 4, a0, 0); - ret = vdotq_laneq_s32(ret, b1 << 4, a0, 1); - ret = vdotq_laneq_s32(ret, b2 << 4, a0, 2); - ret = vdotq_laneq_s32(ret, b3 << 4, a0, 3); - - ret = vdotq_laneq_s32(ret, b0 & 0xf0U, a1, 0); - ret = vdotq_laneq_s32(ret, b1 & 0xf0U, a1, 1); - ret = vdotq_laneq_s32(ret, b2 & 0xf0U, a1, 2); - ret = vdotq_laneq_s32(ret, b3 & 0xf0U, a1, 3); - - acc = vfmaq_f32(acc, vcvtq_n_f32_s32(ret, 4), - vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd))); - a_ptr++; - b_ptr++; - } - vst1q_f32(s, acc); - s += ncols_interleaved; - } - return; - } -#endif // #if ! ((defined(_MSC_VER)) && ! 
defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) - float sumf[4]; - int sumi; - - const block_q8_0 * a_ptr = (const block_q8_0 *) vy; - for (int x = 0; x < nc / ncols_interleaved; x++) { - const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); - - for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; - for (int l = 0; l < nb; l++) { - for (int k = 0; k < (qk / (2 * blocklen)); k++) { - for (int j = 0; j < ncols_interleaved; j++) { - sumi = 0; - for (int i = 0; i < blocklen; ++i) { - const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); - const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); - sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; - } - sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d); - } - } - } - for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; - } -} - -static void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { - const int qk = QK8_0; - const int nb = n / qk; - const int ncols_interleaved = 4; - const int blocklen = 8; - - assert (n % qk == 0); - assert (nc % ncols_interleaved == 0); - - UNUSED(s); - UNUSED(bs); - UNUSED(vx); - UNUSED(vy); - UNUSED(nr); - UNUSED(nc); - UNUSED(nb); - UNUSED(ncols_interleaved); - UNUSED(blocklen); - -#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) - if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { - const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx; - - for (int c = 0; c < nc; c += ncols_interleaved) { - const block_q8_0 * a_ptr = (const block_q8_0 *) vy; - float32x4_t acc = vdupq_n_f32(0); - for (int b = 0; b < nb; b++) { - int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs); - int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16); - int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32); - int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48); - float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d); - - int8x16_t a0 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs); - int8x16_t a1 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 1); - int8x16_t a2 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 2); - int8x16_t a3 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 3); - float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d); - - int32x4_t ret0 = vdupq_n_s32(0); - int32x4_t ret1 = vdupq_n_s32(0); - - ret0 = vdotq_s32(ret0, b0 << 4, a0); - ret1 = vdotq_s32(ret1, b1 << 4, a0); - ret0 = vdotq_s32(ret0, b2 << 4, a1); - ret1 = vdotq_s32(ret1, b3 << 4, a1); - - ret0 = vdotq_s32(ret0, b0 & 0xf0U, a2); - ret1 = vdotq_s32(ret1, b1 & 0xf0U, a2); - ret0 = vdotq_s32(ret0, b2 & 0xf0U, a3); - ret1 = vdotq_s32(ret1, b3 & 0xf0U, a3); - - int32x4_t ret = vpaddq_s32(ret0, ret1); - - acc = vfmaq_f32(acc, vcvtq_n_f32_s32(ret, 4), - vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd))); - a_ptr++; - b_ptr++; - } - vst1q_f32(s, acc); - s += ncols_interleaved; - } - return; - } -#endif // #if ! ((defined(_MSC_VER)) && ! 
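/* Shared q4_0 trick in these fallbacks: the repacked nibbles are already in signed
   form, so (int8_t)(b << 4) and (int8_t)(b & 0xF0) evaluate to 16 * q_lo and
   16 * q_hi; after the multiply-accumulate, the >> 4 (or the fixed-point
   vcvtq_n_f32_s32(..., 4) / scvtf #0x4 in the SIMD paths) divides the factor of
   16 back out, avoiding a per-byte unpack and subtract. */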
defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) - float sumf[4]; - int sumi; - - const block_q8_0 * a_ptr = (const block_q8_0 *) vy; - for (int x = 0; x < nc / ncols_interleaved; x++) { - const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); - - for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; - for (int l = 0; l < nb; l++) { - for (int k = 0; k < (qk / (2 * blocklen)); k++) { - for (int j = 0; j < ncols_interleaved; j++) { - sumi = 0; - for (int i = 0; i < blocklen; ++i) { - const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); - const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); - sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; - } - sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d); - } - } - } - for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; - } -} - -static void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { +void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { const int qk = QK8_0; const int nb = n / qk; const int ncols_interleaved = 8; @@ -1032,75 +596,7 @@ static void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c UNUSED(ncols_interleaved); UNUSED(blocklen); -#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) -#if defined(__ARM_FEATURE_SVE) - if (ggml_cpu_has_sve() && ggml_cpu_get_sve_cnt() == QK8_0) { - const void * b_ptr = vx; - const void * a_ptr = vy; - float * res_ptr = s; - - __asm__ __volatile__( - "ptrue p0.b\n" - "add %x[b_ptr], %x[b_ptr], #0x10\n" - "1:" // Column loop - "add x22, %x[a_ptr], #0x2\n" - "mov z31.b, #0x0\n" - "mov x21, %x[nb]\n" - "2:" // Block loop - "ld1b { z30.b }, p0/Z, [%x[b_ptr]]\n" - "ld1b { z29.b }, p0/Z, [%x[b_ptr], #1, MUL VL]\n" - "mov z28.s, #0x0\n" - "mov z27.s, #0x0\n" - "ld1rd { z26.d }, p0/Z, [x22]\n" - "ld1b { z25.b }, p0/Z, [%x[b_ptr], #2, MUL VL]\n" - "sub x20, x22, #0x2\n" - "sub x21, x21, #0x1\n" - "ld1b { z24.b }, p0/Z, [%x[b_ptr], #3, MUL VL]\n" - "ld1rd { z23.d }, p0/Z, [x22, #8]\n" - "lsl z22.b, z30.b, #0x4\n" - "lsl z16.b, z29.b, #0x4\n" - "and z30.b, z30.b, #0xf0\n" - "and z29.b, z29.b, #0xf0\n" - "ld1rd { z21.d }, p0/Z, [x22, #16]\n" - "ld1rd { z20.d }, p0/Z, [x22, #24]\n" - "lsl z19.b, z25.b, #0x4\n" - "and z25.b, z25.b, #0xf0\n" - "ld1rh { z17.h }, p0/Z, [x20]\n" - "ld1h { z18.s }, p0/Z, [%x[b_ptr], #-1, MUL VL]\n" - "sdot z28.s, z22.b, z26.b\n" - "sdot z27.s, z16.b, z26.b\n" - "lsl z16.b, z24.b, #0x4\n" - "add x22, x22, #0x22\n" - "and z24.b, z24.b, #0xf0\n" - "add %x[b_ptr], %x[b_ptr], #0x90\n" - "fcvt z17.s, p0/m, z17.h\n" - "fcvt z18.s, p0/m, z18.h\n" - "sdot z28.s, z19.b, z23.b\n" - "sdot z27.s, z16.b, z23.b\n" - "fmul z18.s, z18.s, z17.s\n" - "sdot z28.s, z30.b, z21.b\n" - "sdot z27.s, z29.b, z21.b\n" - "sdot z28.s, z25.b, z20.b\n" - "sdot z27.s, z24.b, z20.b\n" - "uzp1 z17.s, z28.s, z27.s\n" - "uzp2 z16.s, z28.s, z27.s\n" - "add z17.s, z17.s, z16.s\n" - "asr z17.s, z17.s, #0x4\n" - "scvtf z17.s, p0/m, z17.s\n" - "fmla z31.s, p0/M, z17.s, z18.s\n" - "cbnz x21, 2b\n" - "sub %x[nc], %x[nc], #0x8\n" - "st1w { z31.s }, p0, [%x[res_ptr]]\n" - "add %x[res_ptr], %x[res_ptr], #0x20\n" - "cbnz %x[nc], 
1b\n" - : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc) - : [a_ptr] "r" (a_ptr), [nb] "r" (nb) - : "memory", "p0", "x20", "x21", "x22", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" - ); - return; - } -#endif // #if defined(__ARM_FEATURE_SVE) -#elif defined(__AVX2__) +#if defined(__AVX2__) // Lookup table to convert signed nibbles to signed bytes __m256i signextendlut = _mm256_castsi128_si256(_mm_set_epi8(-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0)); signextendlut = _mm256_permute2f128_si256(signextendlut, signextendlut, 0); @@ -1152,7 +648,7 @@ static void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c const __m256 col_scale_f32 = GGML_F32Cx8_REARRANGE_LOAD(b_ptr[b].d, changemask); // Load and convert to FP32 scale from block_q8_0 - const __m256 row_scale_f32 = _mm256_set1_ps(GGML_FP16_TO_FP32(a_ptr[b].d)); + const __m256 row_scale_f32 = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(a_ptr[b].d)); // Load the block values in block_q8_0 in batches of 16 bytes and replicate the same across 256 bit vector __m256i lhs_vec_0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)a_ptr[b].qs)); @@ -1191,74 +687,8 @@ static void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c } } return; -#elif defined __riscv_v - if (__riscv_vlenb() >= QK4_0) { - const size_t vl = QK4_0; - - const block_q8_0 * a_ptr = (const block_q8_0 *) vy; - for (int x = 0; x < nc / ncols_interleaved; x++) { - const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb); - vfloat32m1_t sumf = __riscv_vfmv_v_f_f32m1(0.0, vl / 4); - for (int l = 0; l < nb; l++) { - const int64_t a0 = *(const int64_t *)&a_ptr[l].qs[0]; - const int64_t a1 = *(const int64_t *)&a_ptr[l].qs[8]; - const int64_t a2 = *(const int64_t *)&a_ptr[l].qs[16]; - const int64_t a3 = *(const int64_t *)&a_ptr[l].qs[24]; - __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment - const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a0, vl / 4)); - const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a1, vl / 4)); - const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a2, vl / 4)); - const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a3, vl / 4)); - - const vint8m4_t rhs_raw_vec = __riscv_vle8_v_i8m4((const int8_t *)b_ptr[l].qs, vl * 4); - const vint8m4_t rhs_vec_lo = __riscv_vsra_vx_i8m4(__riscv_vsll_vx_i8m4(rhs_raw_vec, 4, vl * 4), 4, vl * 4); - const vint8m4_t rhs_vec_hi = __riscv_vsra_vx_i8m4(rhs_raw_vec, 4, vl * 4); - const vint8m2_t rhs_vec_lo_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 0); - const vint8m2_t rhs_vec_lo_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 1); - const vint8m2_t rhs_vec_hi_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 0); - const vint8m2_t rhs_vec_hi_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 1); - - const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2); - const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2); - const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2); - const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2); - - const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_hi_m)); - const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl); 
- const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl); - const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl); - const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2); - const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2); - const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2); - const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2); - const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4); - const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4)); - const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4)); - const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4); - const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4); - - // vector version needs Zvfhmin extension - const float a_scale = GGML_FP16_TO_FP32(a_ptr[l].d); - const float b_scales[8] = { - GGML_FP16_TO_FP32(b_ptr[l].d[0]), - GGML_FP16_TO_FP32(b_ptr[l].d[1]), - GGML_FP16_TO_FP32(b_ptr[l].d[2]), - GGML_FP16_TO_FP32(b_ptr[l].d[3]), - GGML_FP16_TO_FP32(b_ptr[l].d[4]), - GGML_FP16_TO_FP32(b_ptr[l].d[5]), - GGML_FP16_TO_FP32(b_ptr[l].d[6]), - GGML_FP16_TO_FP32(b_ptr[l].d[7]) - }; - const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4); - const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scale, vl / 4); - sumf = __riscv_vfmacc_vv_f32m1(sumf, tmp1, b_scales_vec, vl / 4); - } - __riscv_vse32_v_f32m1(s + x * ncols_interleaved, sumf, vl / 4); - } - return; - } -#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) +#endif { float sumf[8]; int sumi; @@ -1277,7 +707,7 @@ static void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; } - sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d); + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); } } } @@ -1286,7 +716,7 @@ static void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c } } -static void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { +void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { const int qk = QK_K; const int nb = n / qk; const int ncols_interleaved = 8; @@ -1543,13 +973,13 @@ static void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, c sumi2 = sumi2 * scales_1[j]; sumi += sumi1 + sumi2; } - sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d; + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d; } } for (int sb = 0; sb < 8; sb++) { uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16; for (int j = 0; j < ncols_interleaved; j++) { - sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d; + sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d; } } } @@ -1560,14 +990,14 @@ static void ggml_gemv_q4_K_8x8_q8_K(int 
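/* q4_K bookkeeping in the fallback above: sumf accumulates sumi * d[j] * d8 per
   sub-block while sum_minf collects mins[j] * bsums * dmin[j] * d8; the store
   (elided from this hunk) presumably writes sumf - sum_minf, matching q4_K's
   scale-and-min super-block encoding. */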
n, float * GGML_RESTRICT s, size_t bs, c #endif } - -static void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { +void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { const int qk = QK8_0; const int nb = n / qk; - const int ncols_interleaved = 4; - const int blocklen = 4; + const int ncols_interleaved = 8; + const int blocklen = 8; assert (n % qk == 0); + assert (nr % 4 == 0); assert (nc % ncols_interleaved == 0); UNUSED(s); @@ -1580,1529 +1010,49 @@ static void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, UNUSED(ncols_interleaved); UNUSED(blocklen); -#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) - if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { - const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl); - const block_q8_0 * a_ptr = (const block_q8_0 *) vy; - float * res_ptr = s; +#if defined(__AVX2__) || defined(__AVX512F__) + { + const block_q4_0x8 * b_ptr_start = (const block_q4_0x8 *)vx; + const block_q8_0x4 * a_ptr_start = (const block_q8_0x4 *)vy; + int64_t b_nb = n / QK4_0; + int64_t y = 0; + // Mask to mask out nibbles from packed bytes + const __m256i m4b = _mm256_set1_epi8(0x0F); + const __m128i loadMask = _mm_blend_epi32(_mm_setzero_si128(), _mm_set1_epi32(0xFFFFFFFF), 3); + // Lookup table to convert signed nibbles to signed bytes + __m256i signextendlut = _mm256_castsi128_si256(_mm_set_epi8(-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0)); + signextendlut = _mm256_permute2f128_si256(signextendlut, signextendlut, 0); + // Permute mask used for easier vector processing at later stages + __m256i requiredOrder = _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4); + int64_t xstart = 0; + int anr = nr - nr%16; // Used to align nr with boundary of 16 + #ifdef __AVX512F__ + int anc = nc - nc%16; // Used to align nc with boundary of 16 + // Mask to mask out nibbles from packed bytes expanded to 512 bit length + const __m512i m4bexpanded = _mm512_set1_epi8(0x0F); + // Lookup table to convert signed nibbles to signed bytes expanded to 512 bit length + __m512i signextendlutexpanded = _mm512_inserti32x8(_mm512_castsi256_si512(signextendlut), signextendlut, 1); - for (int x = 0; x < nc / ncols_interleaved; x++) { - const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb); + // Take group of four block_q8_0x4 structures at each pass of the loop and perform dot product operation + for (; y < anr / 4; y += 4) { - float32x4_t sumf = vdupq_n_f32(0); - for (int l = 0; l < nb; l++) { - uint8x16_t b_0 = vld1q_u8(b_ptr[l].qs + 0); - uint8x16_t b_1 = vld1q_u8(b_ptr[l].qs + 16); - uint8x16_t b_2 = vld1q_u8(b_ptr[l].qs + 32); - uint8x16_t b_3 = vld1q_u8(b_ptr[l].qs + 48); - - int8x16_t b_0_hi = vqtbl1q_s8(kvalues, b_0 >> 4); - int8x16_t b_0_lo = vqtbl1q_s8(kvalues, b_0 & 0x0F); - int8x16_t b_1_hi = vqtbl1q_s8(kvalues, b_1 >> 4); - int8x16_t b_1_lo = vqtbl1q_s8(kvalues, b_1 & 0x0F); - int8x16_t b_2_hi = vqtbl1q_s8(kvalues, b_2 >> 4); - int8x16_t b_2_lo = vqtbl1q_s8(kvalues, b_2 & 0x0F); - int8x16_t b_3_hi = vqtbl1q_s8(kvalues, b_3 >> 4); - int8x16_t b_3_lo = vqtbl1q_s8(kvalues, b_3 & 0x0F); - - int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 0); - int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16); - - int32x4_t sumi = vdupq_n_s32(0); - sumi = vdotq_laneq_s32(sumi, b_0_lo, a_0, 0); - sumi = 
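/* Both lookup strategies here decode 4-bit codes with one shuffle: the NEON path
   uses vqtbl1q_s8 against the kvalues table for iq4_nl's non-linear levels, while
   the AVX2 path uses pshufb against signextendlut, whose byte order
   (-1, -2, ..., -8, 7, 6, ..., 0 listed high-to-low) maps a masked nibble n to n
   for n <= 7 and to n - 16 otherwise, i.e. a sign extension. */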
vdotq_laneq_s32(sumi, b_0_hi, a_1, 0); - sumi = vdotq_laneq_s32(sumi, b_1_lo, a_0, 1); - sumi = vdotq_laneq_s32(sumi, b_1_hi, a_1, 1); - sumi = vdotq_laneq_s32(sumi, b_2_lo, a_0, 2); - sumi = vdotq_laneq_s32(sumi, b_2_hi, a_1, 2); - sumi = vdotq_laneq_s32(sumi, b_3_lo, a_0, 3); - sumi = vdotq_laneq_s32(sumi, b_3_hi, a_1, 3); - - float32x4_t a_d = vcvt_f32_f16(vld1_dup_f16((const float16_t *)&a_ptr[l].d)); - float32x4_t b_d = vcvt_f32_f16(vld1_f16((const float16_t *)b_ptr[l].d)); - float32x4_t d = a_d * b_d; - - sumf = vmlaq_f32(sumf, d, vcvtq_f32_s32(sumi)); + const block_q8_0x4 * a_ptrs[4]; + + a_ptrs[0] = a_ptr_start + (y * nb); + for (int i = 0; i < 3; ++i) { + a_ptrs[i + 1] = a_ptrs[i] + nb; } - vst1q_f32(res_ptr + x * 4, sumf); - } - return; - } -#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) - { - float sumf[4]; - int sumi; + // Take group of two block_q4_0x8 structures at each pass of the loop and perform dot product operation + for (int64_t x = 0; x < anc / 8; x += 2) { - const block_q8_0 * a_ptr = (const block_q8_0 *) vy; - for (int x = 0; x < nc / ncols_interleaved; x++) { - const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb); + const block_q4_0x8 * b_ptr_0 = b_ptr_start + ((x) * b_nb); + const block_q4_0x8 * b_ptr_1 = b_ptr_start + ((x + 1) * b_nb); - for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; - for (int l = 0; l < nb; l++) { - for (int k = 0; k < (qk / (2 * blocklen)); k++) { - for (int j = 0; j < ncols_interleaved; j++) { - sumi = 0; - for (int i = 0; i < blocklen; ++i) { - const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F]; - const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4]; - sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])); - } - sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d); - } - } - } - for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; - } - } -} - -static void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { - const int qk = QK8_0; - const int nb = n / qk; - const int ncols_interleaved = 4; - const int blocklen = 4; - - assert (n % qk == 0); - assert (nr % 4 == 0); - assert (nc % ncols_interleaved == 0); - - UNUSED(s); - UNUSED(bs); - UNUSED(vx); - UNUSED(vy); - UNUSED(nr); - UNUSED(nc); - UNUSED(nb); - UNUSED(ncols_interleaved); - UNUSED(blocklen); - -#if ! ((defined(_MSC_VER)) && ! 
defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) - if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { - const void * b_ptr = vx; - const void * a_ptr = vy; - float * res_ptr = s; - size_t res_stride = bs * sizeof(float); - - __asm__ __volatile__( - "mov x10, %x[nr]\n" - "mov x9, #0x88\n" - "cmp x10, #0x10\n" - "mul x9, %x[nb], x9\n" - "blt 4f\n" - "1:" // Row loop - "add x28, %x[b_ptr], #0x8\n" - "mov x27, %x[nc]\n" - "add x26, %x[res_ptr], %x[res_stride], LSL #4\n" - "2:" // Column loop - "add x25, %x[a_ptr], #0x8\n" - "movi v15.16b, #0x0\n" - "movi v19.16b, #0x0\n" - "mov x24, %x[nb]\n" - "add x23, x25, x9\n" - "movi v18.16b, #0x0\n" - "movi v14.16b, #0x0\n" - "add x22, x23, x9\n" - "movi v11.16b, #0x0\n" - "movi v13.16b, #0x0\n" - "add x21, x22, x9\n" - "movi v23.16b, #0x0\n" - "movi v16.16b, #0x0\n" - "movi v25.16b, #0x0\n" - "movi v7.16b, #0x0\n" - "movi v0.16b, #0x0\n" - "movi v4.16b, #0x0\n" - "movi v5.16b, #0x0\n" - "movi v21.16b, #0x0\n" - "movi v8.16b, #0x0\n" - "movi v1.16b, #0x0\n" - "3:" // Block loop - "ldr q3, [x28, #0x0]\n" - "ldr q31, [x25, #0x0]\n" - "movi v28.16b, #0x4\n" - "movi v10.4s, #0x0\n" - "ldr q22, [x28, #0x10]\n" - "ldr q6, [x25, #0x10]\n" - "movi v29.4s, #0x0\n" - "movi v9.4s, #0x0\n" - "ldr q27, [x28, #0x20]\n" - "ldr q30, [x28, #0x30]\n" - "movi v20.4s, #0x0\n" - "movi v24.16b, #0xf0\n" - "ldr d2, [x25, #-0x8]\n" - "ldr d26, [x23, #-0x8]\n" - "sshl v12.16b, v3.16b, v28.16b\n" - "sub x20, x28, #0x8\n" - "ldr d17, [x20, #0x0]\n" - "and v3.16b, v3.16b, v24.16b\n" - "subs x24, x24, #0x1\n" - "add x28, x28, #0x48\n" - ".inst 0x4f9fe18a // sdot v10.4s, v12.16b, v31.4b[0]\n" - ".inst 0x4fbfe19d // sdot v29.4s, v12.16b, v31.4b[1]\n" - ".inst 0x4f9fe989 // sdot v9.4s, v12.16b, v31.4b[2]\n" - ".inst 0x4fbfe994 // sdot v20.4s, v12.16b, v31.4b[3]\n" - "sshl v31.16b, v22.16b, v28.16b\n" - "and v22.16b, v22.16b, v24.16b\n" - "fcvtl v17.4s, v17.4h\n" - "fcvtl v2.4s, v2.4h\n" - "fcvtl v26.4s, v26.4h\n" - ".inst 0x4f86e3ea // sdot v10.4s, v31.16b, v6.4b[0]\n" - ".inst 0x4fa6e3fd // sdot v29.4s, v31.16b, v6.4b[1]\n" - ".inst 0x4f86ebe9 // sdot v9.4s, v31.16b, v6.4b[2]\n" - ".inst 0x4fa6ebf4 // sdot v20.4s, v31.16b, v6.4b[3]\n" - "sshl v6.16b, v27.16b, v28.16b\n" - "sshl v28.16b, v30.16b, v28.16b\n" - "and v27.16b, v27.16b, v24.16b\n" - "and v30.16b, v30.16b, v24.16b\n" - "ldr q24, [x25, #0x20]\n" - ".inst 0x4f98e0ca // sdot v10.4s, v6.16b, v24.4b[0]\n" - ".inst 0x4fb8e0dd // sdot v29.4s, v6.16b, v24.4b[1]\n" - ".inst 0x4f98e8c9 // sdot v9.4s, v6.16b, v24.4b[2]\n" - ".inst 0x4fb8e8d4 // sdot v20.4s, v6.16b, v24.4b[3]\n" - "ldr q24, [x25, #0x30]\n" - ".inst 0x4f98e38a // sdot v10.4s, v28.16b, v24.4b[0]\n" - ".inst 0x4fb8e39d // sdot v29.4s, v28.16b, v24.4b[1]\n" - ".inst 0x4f98eb89 // sdot v9.4s, v28.16b, v24.4b[2]\n" - ".inst 0x4fb8eb94 // sdot v20.4s, v28.16b, v24.4b[3]\n" - "ldr q24, [x25, #0x40]\n" - ".inst 0x4f98e06a // sdot v10.4s, v3.16b, v24.4b[0]\n" - ".inst 0x4fb8e07d // sdot v29.4s, v3.16b, v24.4b[1]\n" - ".inst 0x4f98e869 // sdot v9.4s, v3.16b, v24.4b[2]\n" - ".inst 0x4fb8e874 // sdot v20.4s, v3.16b, v24.4b[3]\n" - "ldr q24, [x25, #0x50]\n" - ".inst 0x4f98e2ca // sdot v10.4s, v22.16b, v24.4b[0]\n" - ".inst 0x4fb8e2dd // sdot v29.4s, v22.16b, v24.4b[1]\n" - ".inst 0x4f98eac9 // sdot v9.4s, v22.16b, v24.4b[2]\n" - ".inst 0x4fb8ead4 // sdot v20.4s, v22.16b, v24.4b[3]\n" - "ldr q24, [x25, #0x60]\n" - ".inst 0x4f98e36a // sdot v10.4s, v27.16b, v24.4b[0]\n" - ".inst 0x4fb8e37d // sdot v29.4s, v27.16b, v24.4b[1]\n" - ".inst 0x4f98eb69 // sdot v9.4s, 
v27.16b, v24.4b[2]\n" - ".inst 0x4fb8eb74 // sdot v20.4s, v27.16b, v24.4b[3]\n" - "ldr q24, [x25, #0x70]\n" - "add x25, x25, #0x88\n" - ".inst 0x4f98e3ca // sdot v10.4s, v30.16b, v24.4b[0]\n" - ".inst 0x4fb8e3dd // sdot v29.4s, v30.16b, v24.4b[1]\n" - ".inst 0x4f98ebc9 // sdot v9.4s, v30.16b, v24.4b[2]\n" - ".inst 0x4fb8ebd4 // sdot v20.4s, v30.16b, v24.4b[3]\n" - "fmul v24.4s, v17.4s, v2.s[0]\n" - "scvtf v10.4s, v10.4s, #0x4\n" - "scvtf v29.4s, v29.4s, #0x4\n" - "scvtf v9.4s, v9.4s, #0x4\n" - "scvtf v20.4s, v20.4s, #0x4\n" - "fmla v15.4s, v10.4s, v24.4s\n" - "ldr q24, [x23, #0x0]\n" - "fmul v10.4s, v17.4s, v2.s[1]\n" - "fmla v19.4s, v29.4s, v10.4s\n" - "ldr q10, [x23, #0x10]\n" - "fmul v29.4s, v17.4s, v2.s[2]\n" - "fmul v2.4s, v17.4s, v2.s[3]\n" - "fmla v18.4s, v9.4s, v29.4s\n" - "movi v9.4s, #0x0\n" - "movi v29.4s, #0x0\n" - ".inst 0x4f98e189 // sdot v9.4s, v12.16b, v24.4b[0]\n" - ".inst 0x4fb8e19d // sdot v29.4s, v12.16b, v24.4b[1]\n" - "fmla v14.4s, v20.4s, v2.4s\n" - "movi v20.4s, #0x0\n" - "movi v2.4s, #0x0\n" - ".inst 0x4f98e994 // sdot v20.4s, v12.16b, v24.4b[2]\n" - ".inst 0x4fb8e982 // sdot v2.4s, v12.16b, v24.4b[3]\n" - "ldr q24, [x23, #0x20]\n" - ".inst 0x4f8ae3e9 // sdot v9.4s, v31.16b, v10.4b[0]\n" - ".inst 0x4faae3fd // sdot v29.4s, v31.16b, v10.4b[1]\n" - ".inst 0x4f8aebf4 // sdot v20.4s, v31.16b, v10.4b[2]\n" - ".inst 0x4faaebe2 // sdot v2.4s, v31.16b, v10.4b[3]\n" - "ldr q10, [x23, #0x30]\n" - ".inst 0x4f98e0c9 // sdot v9.4s, v6.16b, v24.4b[0]\n" - ".inst 0x4fb8e0dd // sdot v29.4s, v6.16b, v24.4b[1]\n" - ".inst 0x4f98e8d4 // sdot v20.4s, v6.16b, v24.4b[2]\n" - ".inst 0x4fb8e8c2 // sdot v2.4s, v6.16b, v24.4b[3]\n" - "ldr q24, [x23, #0x40]\n" - ".inst 0x4f8ae389 // sdot v9.4s, v28.16b, v10.4b[0]\n" - ".inst 0x4faae39d // sdot v29.4s, v28.16b, v10.4b[1]\n" - ".inst 0x4f8aeb94 // sdot v20.4s, v28.16b, v10.4b[2]\n" - ".inst 0x4faaeb82 // sdot v2.4s, v28.16b, v10.4b[3]\n" - "ldr q10, [x23, #0x50]\n" - ".inst 0x4f98e069 // sdot v9.4s, v3.16b, v24.4b[0]\n" - ".inst 0x4fb8e07d // sdot v29.4s, v3.16b, v24.4b[1]\n" - ".inst 0x4f98e874 // sdot v20.4s, v3.16b, v24.4b[2]\n" - ".inst 0x4fb8e862 // sdot v2.4s, v3.16b, v24.4b[3]\n" - "ldr q24, [x23, #0x60]\n" - ".inst 0x4f8ae2c9 // sdot v9.4s, v22.16b, v10.4b[0]\n" - ".inst 0x4faae2dd // sdot v29.4s, v22.16b, v10.4b[1]\n" - ".inst 0x4f8aead4 // sdot v20.4s, v22.16b, v10.4b[2]\n" - ".inst 0x4faaeac2 // sdot v2.4s, v22.16b, v10.4b[3]\n" - "ldr q10, [x23, #0x70]\n" - "add x23, x23, #0x88\n" - ".inst 0x4f98e369 // sdot v9.4s, v27.16b, v24.4b[0]\n" - ".inst 0x4fb8e37d // sdot v29.4s, v27.16b, v24.4b[1]\n" - ".inst 0x4f98eb74 // sdot v20.4s, v27.16b, v24.4b[2]\n" - ".inst 0x4fb8eb62 // sdot v2.4s, v27.16b, v24.4b[3]\n" - "ldr q24, [x22, #0x0]\n" - ".inst 0x4f8ae3c9 // sdot v9.4s, v30.16b, v10.4b[0]\n" - ".inst 0x4faae3dd // sdot v29.4s, v30.16b, v10.4b[1]\n" - ".inst 0x4f8aebd4 // sdot v20.4s, v30.16b, v10.4b[2]\n" - ".inst 0x4faaebc2 // sdot v2.4s, v30.16b, v10.4b[3]\n" - "fmul v10.4s, v17.4s, v26.s[0]\n" - "scvtf v9.4s, v9.4s, #0x4\n" - "scvtf v29.4s, v29.4s, #0x4\n" - "scvtf v20.4s, v20.4s, #0x4\n" - "scvtf v2.4s, v2.4s, #0x4\n" - "fmla v11.4s, v9.4s, v10.4s\n" - "ldr q9, [x22, #0x10]\n" - "fmul v10.4s, v17.4s, v26.s[1]\n" - "fmla v13.4s, v29.4s, v10.4s\n" - "ldr d29, [x22, #-0x8]\n" - "fmul v10.4s, v17.4s, v26.s[2]\n" - "fmul v26.4s, v17.4s, v26.s[3]\n" - "fcvtl v29.4s, v29.4h\n" - "fmla v23.4s, v20.4s, v10.4s\n" - "movi v20.4s, #0x0\n" - "movi v10.4s, #0x0\n" - "fmla v16.4s, v2.4s, v26.4s\n" - "movi v26.4s, #0x0\n" - "movi v2.4s, #0x0\n" - 
".inst 0x4f98e194 // sdot v20.4s, v12.16b, v24.4b[0]\n" - ".inst 0x4fb8e18a // sdot v10.4s, v12.16b, v24.4b[1]\n" - ".inst 0x4f98e99a // sdot v26.4s, v12.16b, v24.4b[2]\n" - ".inst 0x4fb8e982 // sdot v2.4s, v12.16b, v24.4b[3]\n" - "ldr q24, [x22, #0x20]\n" - ".inst 0x4f89e3f4 // sdot v20.4s, v31.16b, v9.4b[0]\n" - ".inst 0x4fa9e3ea // sdot v10.4s, v31.16b, v9.4b[1]\n" - ".inst 0x4f89ebfa // sdot v26.4s, v31.16b, v9.4b[2]\n" - ".inst 0x4fa9ebe2 // sdot v2.4s, v31.16b, v9.4b[3]\n" - "ldr q9, [x22, #0x30]\n" - ".inst 0x4f98e0d4 // sdot v20.4s, v6.16b, v24.4b[0]\n" - ".inst 0x4fb8e0ca // sdot v10.4s, v6.16b, v24.4b[1]\n" - ".inst 0x4f98e8da // sdot v26.4s, v6.16b, v24.4b[2]\n" - ".inst 0x4fb8e8c2 // sdot v2.4s, v6.16b, v24.4b[3]\n" - "ldr q24, [x22, #0x40]\n" - ".inst 0x4f89e394 // sdot v20.4s, v28.16b, v9.4b[0]\n" - ".inst 0x4fa9e38a // sdot v10.4s, v28.16b, v9.4b[1]\n" - ".inst 0x4f89eb9a // sdot v26.4s, v28.16b, v9.4b[2]\n" - ".inst 0x4fa9eb82 // sdot v2.4s, v28.16b, v9.4b[3]\n" - "ldr q9, [x22, #0x50]\n" - ".inst 0x4f98e074 // sdot v20.4s, v3.16b, v24.4b[0]\n" - ".inst 0x4fb8e06a // sdot v10.4s, v3.16b, v24.4b[1]\n" - ".inst 0x4f98e87a // sdot v26.4s, v3.16b, v24.4b[2]\n" - ".inst 0x4fb8e862 // sdot v2.4s, v3.16b, v24.4b[3]\n" - "ldr q24, [x22, #0x60]\n" - ".inst 0x4f89e2d4 // sdot v20.4s, v22.16b, v9.4b[0]\n" - ".inst 0x4fa9e2ca // sdot v10.4s, v22.16b, v9.4b[1]\n" - ".inst 0x4f89eada // sdot v26.4s, v22.16b, v9.4b[2]\n" - ".inst 0x4fa9eac2 // sdot v2.4s, v22.16b, v9.4b[3]\n" - "ldr q9, [x22, #0x70]\n" - "add x22, x22, #0x88\n" - ".inst 0x4f98e374 // sdot v20.4s, v27.16b, v24.4b[0]\n" - ".inst 0x4fb8e36a // sdot v10.4s, v27.16b, v24.4b[1]\n" - ".inst 0x4f98eb7a // sdot v26.4s, v27.16b, v24.4b[2]\n" - ".inst 0x4fb8eb62 // sdot v2.4s, v27.16b, v24.4b[3]\n" - "ldr q24, [x21, #0x0]\n" - ".inst 0x4f89e3d4 // sdot v20.4s, v30.16b, v9.4b[0]\n" - ".inst 0x4fa9e3ca // sdot v10.4s, v30.16b, v9.4b[1]\n" - ".inst 0x4f89ebda // sdot v26.4s, v30.16b, v9.4b[2]\n" - ".inst 0x4fa9ebc2 // sdot v2.4s, v30.16b, v9.4b[3]\n" - "fmul v9.4s, v17.4s, v29.s[0]\n" - "scvtf v20.4s, v20.4s, #0x4\n" - "scvtf v10.4s, v10.4s, #0x4\n" - "scvtf v26.4s, v26.4s, #0x4\n" - "scvtf v2.4s, v2.4s, #0x4\n" - "fmla v25.4s, v20.4s, v9.4s\n" - "ldr q9, [x21, #0x10]\n" - "fmul v20.4s, v17.4s, v29.s[1]\n" - "fmla v7.4s, v10.4s, v20.4s\n" - "ldr d20, [x21, #-0x8]\n" - "fmul v10.4s, v17.4s, v29.s[2]\n" - "fmul v29.4s, v17.4s, v29.s[3]\n" - "fcvtl v20.4s, v20.4h\n" - "fmla v0.4s, v26.4s, v10.4s\n" - "movi v26.4s, #0x0\n" - "movi v10.4s, #0x0\n" - "fmla v4.4s, v2.4s, v29.4s\n" - "movi v2.4s, #0x0\n" - "movi v29.4s, #0x0\n" - ".inst 0x4f98e19a // sdot v26.4s, v12.16b, v24.4b[0]\n" - ".inst 0x4fb8e18a // sdot v10.4s, v12.16b, v24.4b[1]\n" - ".inst 0x4f98e982 // sdot v2.4s, v12.16b, v24.4b[2]\n" - ".inst 0x4fb8e99d // sdot v29.4s, v12.16b, v24.4b[3]\n" - "ldr q12, [x21, #0x20]\n" - "fmul v24.4s, v17.4s, v20.s[0]\n" - ".inst 0x4f89e3fa // sdot v26.4s, v31.16b, v9.4b[0]\n" - ".inst 0x4fa9e3ea // sdot v10.4s, v31.16b, v9.4b[1]\n" - ".inst 0x4f89ebe2 // sdot v2.4s, v31.16b, v9.4b[2]\n" - ".inst 0x4fa9ebfd // sdot v29.4s, v31.16b, v9.4b[3]\n" - "ldr q9, [x21, #0x30]\n" - "fmul v31.4s, v17.4s, v20.s[1]\n" - ".inst 0x4f8ce0da // sdot v26.4s, v6.16b, v12.4b[0]\n" - ".inst 0x4face0ca // sdot v10.4s, v6.16b, v12.4b[1]\n" - ".inst 0x4f8ce8c2 // sdot v2.4s, v6.16b, v12.4b[2]\n" - ".inst 0x4face8dd // sdot v29.4s, v6.16b, v12.4b[3]\n" - "ldr q12, [x21, #0x40]\n" - "fmul v6.4s, v17.4s, v20.s[2]\n" - "fmul v20.4s, v17.4s, v20.s[3]\n" - ".inst 0x4f89e39a // 
sdot v26.4s, v28.16b, v9.4b[0]\n" - ".inst 0x4fa9e38a // sdot v10.4s, v28.16b, v9.4b[1]\n" - ".inst 0x4f89eb82 // sdot v2.4s, v28.16b, v9.4b[2]\n" - ".inst 0x4fa9eb9d // sdot v29.4s, v28.16b, v9.4b[3]\n" - "ldr q9, [x21, #0x50]\n" - ".inst 0x4f8ce07a // sdot v26.4s, v3.16b, v12.4b[0]\n" - ".inst 0x4face06a // sdot v10.4s, v3.16b, v12.4b[1]\n" - ".inst 0x4f8ce862 // sdot v2.4s, v3.16b, v12.4b[2]\n" - ".inst 0x4face87d // sdot v29.4s, v3.16b, v12.4b[3]\n" - "ldr q12, [x21, #0x60]\n" - ".inst 0x4f89e2da // sdot v26.4s, v22.16b, v9.4b[0]\n" - ".inst 0x4fa9e2ca // sdot v10.4s, v22.16b, v9.4b[1]\n" - ".inst 0x4f89eac2 // sdot v2.4s, v22.16b, v9.4b[2]\n" - ".inst 0x4fa9eadd // sdot v29.4s, v22.16b, v9.4b[3]\n" - "ldr q17, [x21, #0x70]\n" - "add x21, x21, #0x88\n" - ".inst 0x4f8ce37a // sdot v26.4s, v27.16b, v12.4b[0]\n" - ".inst 0x4face36a // sdot v10.4s, v27.16b, v12.4b[1]\n" - ".inst 0x4f8ceb62 // sdot v2.4s, v27.16b, v12.4b[2]\n" - ".inst 0x4faceb7d // sdot v29.4s, v27.16b, v12.4b[3]\n" - ".inst 0x4f91e3da // sdot v26.4s, v30.16b, v17.4b[0]\n" - ".inst 0x4fb1e3ca // sdot v10.4s, v30.16b, v17.4b[1]\n" - ".inst 0x4f91ebc2 // sdot v2.4s, v30.16b, v17.4b[2]\n" - ".inst 0x4fb1ebdd // sdot v29.4s, v30.16b, v17.4b[3]\n" - "scvtf v26.4s, v26.4s, #0x4\n" - "scvtf v10.4s, v10.4s, #0x4\n" - "fmla v5.4s, v26.4s, v24.4s\n" - "scvtf v2.4s, v2.4s, #0x4\n" - "scvtf v29.4s, v29.4s, #0x4\n" - "fmla v21.4s, v10.4s, v31.4s\n" - "fmla v8.4s, v2.4s, v6.4s\n" - "fmla v1.4s, v29.4s, v20.4s\n" - "bgt 3b\n" - "mov x20, %x[res_ptr]\n" - "subs x27, x27, #0x4\n" - "add %x[res_ptr], %x[res_ptr], #0x10\n" - "str q15, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q19, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q18, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q14, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q11, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q13, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q23, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q16, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q25, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q7, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q0, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q4, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q5, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q21, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q8, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q1, [x20, #0x0]\n" - "bne 2b\n" - "mov x20, #0x4\n" - "sub x10, x10, #0x10\n" - "cmp x10, #0x10\n" - "mov %x[res_ptr], x26\n" - "madd %x[a_ptr], x20, x9, %x[a_ptr]\n" - "bge 1b\n" - "4:" // Row loop skip - "cbz x10, 9f\n" - "5:" // Row tail: Row loop - "add x24, %x[b_ptr], #0x8\n" - "mov x23, %x[nc]\n" - "add x22, %x[res_ptr], %x[res_stride], LSL #2\n" - "6:" // Row tail: Column loop - "movi v15.16b, #0x0\n" - "movi v19.16b, #0x0\n" - "add x25, %x[a_ptr], #0x8\n" - "mov x21, %x[nb]\n" - "movi v18.16b, #0x0\n" - "movi v14.16b, #0x0\n" - "7:" // Row tail: Block loop - "ldr q7, [x24, #0x0]\n" - "ldr q5, [x25, #0x0]\n" - "movi v9.16b, #0x4\n" - "movi v4.4s, #0x0\n" - "ldr q3, [x24, #0x10]\n" - "ldr q2, [x25, #0x10]\n" - "movi v1.4s, #0x0\n" - "movi v0.4s, #0x0\n" - "ldr q13, [x24, #0x20]\n" - "ldr q31, [x25, #0x20]\n" - "movi v30.4s, #0x0\n" - "movi v29.16b, #0xf0\n" - "ldr q28, [x24, #0x30]\n" - "ldr q27, [x25, #0x30]\n" - "sshl v20.16b, v7.16b, v9.16b\n" - "sub x20, x24, #0x8\n" - "ldr q26, [x25, #0x40]\n" - 
"ldr q25, [x25, #0x50]\n" - "sshl v17.16b, v3.16b, v9.16b\n" - "and v7.16b, v7.16b, v29.16b\n" - "ldr q24, [x25, #0x60]\n" - "ldr q16, [x25, #0x70]\n" - "sshl v22.16b, v13.16b, v9.16b\n" - "and v3.16b, v3.16b, v29.16b\n" - "ldr d21, [x20, #0x0]\n" - "ldr d12, [x25, #-0x8]\n" - ".inst 0x4f85e284 // sdot v4.4s, v20.16b, v5.4b[0]\n" - ".inst 0x4fa5e281 // sdot v1.4s, v20.16b, v5.4b[1]\n" - ".inst 0x4f85ea80 // sdot v0.4s, v20.16b, v5.4b[2]\n" - ".inst 0x4fa5ea9e // sdot v30.4s, v20.16b, v5.4b[3]\n" - "sshl v9.16b, v28.16b, v9.16b\n" - "subs x21, x21, #0x1\n" - "and v13.16b, v13.16b, v29.16b\n" - "and v28.16b, v28.16b, v29.16b\n" - "add x25, x25, #0x88\n" - "add x24, x24, #0x48\n" - "fcvtl v21.4s, v21.4h\n" - "fcvtl v12.4s, v12.4h\n" - ".inst 0x4f82e224 // sdot v4.4s, v17.16b, v2.4b[0]\n" - ".inst 0x4fa2e221 // sdot v1.4s, v17.16b, v2.4b[1]\n" - ".inst 0x4f82ea20 // sdot v0.4s, v17.16b, v2.4b[2]\n" - ".inst 0x4fa2ea3e // sdot v30.4s, v17.16b, v2.4b[3]\n" - "fmul v11.4s, v21.4s, v12.s[0]\n" - "fmul v23.4s, v21.4s, v12.s[1]\n" - "fmul v17.4s, v21.4s, v12.s[2]\n" - ".inst 0x4f9fe2c4 // sdot v4.4s, v22.16b, v31.4b[0]\n" - "fmul v6.4s, v21.4s, v12.s[3]\n" - ".inst 0x4fbfe2c1 // sdot v1.4s, v22.16b, v31.4b[1]\n" - ".inst 0x4f9feac0 // sdot v0.4s, v22.16b, v31.4b[2]\n" - ".inst 0x4fbfeade // sdot v30.4s, v22.16b, v31.4b[3]\n" - ".inst 0x4f9be124 // sdot v4.4s, v9.16b, v27.4b[0]\n" - ".inst 0x4fbbe121 // sdot v1.4s, v9.16b, v27.4b[1]\n" - ".inst 0x4f9be920 // sdot v0.4s, v9.16b, v27.4b[2]\n" - ".inst 0x4fbbe93e // sdot v30.4s, v9.16b, v27.4b[3]\n" - ".inst 0x4f9ae0e4 // sdot v4.4s, v7.16b, v26.4b[0]\n" - ".inst 0x4fbae0e1 // sdot v1.4s, v7.16b, v26.4b[1]\n" - ".inst 0x4f9ae8e0 // sdot v0.4s, v7.16b, v26.4b[2]\n" - ".inst 0x4fbae8fe // sdot v30.4s, v7.16b, v26.4b[3]\n" - ".inst 0x4f99e064 // sdot v4.4s, v3.16b, v25.4b[0]\n" - ".inst 0x4fb9e061 // sdot v1.4s, v3.16b, v25.4b[1]\n" - ".inst 0x4f99e860 // sdot v0.4s, v3.16b, v25.4b[2]\n" - ".inst 0x4fb9e87e // sdot v30.4s, v3.16b, v25.4b[3]\n" - ".inst 0x4f98e1a4 // sdot v4.4s, v13.16b, v24.4b[0]\n" - ".inst 0x4fb8e1a1 // sdot v1.4s, v13.16b, v24.4b[1]\n" - ".inst 0x4f98e9a0 // sdot v0.4s, v13.16b, v24.4b[2]\n" - ".inst 0x4fb8e9be // sdot v30.4s, v13.16b, v24.4b[3]\n" - ".inst 0x4f90e384 // sdot v4.4s, v28.16b, v16.4b[0]\n" - ".inst 0x4fb0e381 // sdot v1.4s, v28.16b, v16.4b[1]\n" - ".inst 0x4f90eb80 // sdot v0.4s, v28.16b, v16.4b[2]\n" - ".inst 0x4fb0eb9e // sdot v30.4s, v28.16b, v16.4b[3]\n" - "scvtf v4.4s, v4.4s, #0x4\n" - "scvtf v1.4s, v1.4s, #0x4\n" - "scvtf v0.4s, v0.4s, #0x4\n" - "fmla v15.4s, v4.4s, v11.4s\n" - "scvtf v30.4s, v30.4s, #0x4\n" - "fmla v19.4s, v1.4s, v23.4s\n" - "fmla v18.4s, v0.4s, v17.4s\n" - "fmla v14.4s, v30.4s, v6.4s\n" - "bgt 7b\n" - "mov x20, %x[res_ptr]\n" - "cmp x10, #0x1\n" - "str q15, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "ble 8f\n" - "cmp x10, #0x2\n" - "str q19, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "ble 8f\n" - "cmp x10, #0x3\n" - "str q18, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "ble 8f\n" - "str q14, [x20, #0x0]\n" - "8:" // Row tail: Accumulator store skip - "subs x23, x23, #0x4\n" - "add %x[res_ptr], %x[res_ptr], #0x10\n" - "bne 6b\n" - "subs x10, x10, #0x4\n" - "add %x[a_ptr], %x[a_ptr], x9\n" - "mov %x[res_ptr], x22\n" - "bgt 5b\n" - "9:" // Row tail: Row loop skip - : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr) - : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", 
"v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" - ); - return; - } -#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) - { - float sumf[4][4]; - int sumi; - - for (int y = 0; y < nr / 4; y++) { - const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); - for (int x = 0; x < nc / ncols_interleaved; x++) { - const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); - for (int m = 0; m < 4; m++) { - for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; - } - for (int l = 0; l < nb; l++) { - for (int k = 0; k < (qk / (2 * blocklen)); k++) { - for (int m = 0; m < 4; m++) { - for (int j = 0; j < ncols_interleaved; j++) { - sumi = 0; - for (int i = 0; i < blocklen; ++i) { - const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); - const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); - sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + - (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; - } - sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]); - } - } - } - } - for (int m = 0; m < 4; m++) { - for (int j = 0; j < ncols_interleaved; j++) - s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; - } - } - } - } -} - -static void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { - const int qk = QK8_0; - const int nb = n / qk; - const int ncols_interleaved = 4; - const int blocklen = 8; - - assert (n % qk == 0); - assert (nr % 4 == 0); - assert (nc % ncols_interleaved == 0); - - UNUSED(s); - UNUSED(bs); - UNUSED(vx); - UNUSED(vy); - UNUSED(nr); - UNUSED(nc); - UNUSED(nb); - UNUSED(ncols_interleaved); - UNUSED(blocklen); - -#if ! ((defined(_MSC_VER)) && ! 
defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) - if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) { - const void * b_ptr = vx; - const void * a_ptr = vy; - float * res_ptr = s; - size_t res_stride = bs * sizeof(float); - - __asm__ __volatile__( - "mov x10, %x[nr]\n" - "mov x9, #0x88\n" - "cmp x10, #0x10\n" - "mul x9, %x[nb], x9\n" - "blt 4f\n" - "1:" // Row loop - "add x28, %x[b_ptr], #0x8\n" - "mov x27, %x[nc]\n" - "add x26, %x[res_ptr], %x[res_stride], LSL #4\n" - "2:" // Column loop - "add x25, %x[a_ptr], #0x8\n" - "movi v2.16b, #0x0\n" - "movi v10.16b, #0x0\n" - "mov x24, %x[nb]\n" - "add x23, x25, x9\n" - "movi v12.16b, #0x0\n" - "movi v28.16b, #0x0\n" - "add x22, x23, x9\n" - "movi v11.16b, #0x0\n" - "movi v13.16b, #0x0\n" - "add x21, x22, x9\n" - "movi v22.16b, #0x0\n" - "movi v23.16b, #0x0\n" - "movi v25.16b, #0x0\n" - "movi v5.16b, #0x0\n" - "movi v7.16b, #0x0\n" - "movi v4.16b, #0x0\n" - "movi v6.16b, #0x0\n" - "movi v30.16b, #0x0\n" - "movi v24.16b, #0x0\n" - "movi v14.16b, #0x0\n" - "3:" // Block loop - "ldr q21, [x28, #0x0]\n" - "ldr q16, [x28, #0x10]\n" - "movi v1.16b, #0x4\n" - "movi v19.4s, #0x0\n" - "ldr q27, [x25, #0x0]\n" - "ldr q15, [x25, #0x10]\n" - "movi v26.4s, #0x0\n" - "movi v18.4s, #0x0\n" - "ldr q29, [x28, #0x20]\n" - "ldr q3, [x28, #0x30]\n" - "movi v17.4s, #0x0\n" - "movi v0.16b, #0xf0\n" - "ldr d20, [x25, #-0x8]\n" - "ldr d9, [x23, #-0x8]\n" - "sshl v8.16b, v21.16b, v1.16b\n" - "sshl v31.16b, v16.16b, v1.16b\n" - "and v21.16b, v21.16b, v0.16b\n" - "and v16.16b, v16.16b, v0.16b\n" - "sub x20, x28, #0x8\n" - "subs x24, x24, #0x1\n" - "add x28, x28, #0x48\n" - ".inst 0x4e88a773 // smmla v19.4s, v27.16b, v8.16b\n" - ".inst 0x4e9fa77a // smmla v26.4s, v27.16b, v31.16b\n" - "ldr q27, [x25, #0x20]\n" - ".inst 0x4e88a5f2 // smmla v18.4s, v15.16b, v8.16b\n" - ".inst 0x4e9fa5f1 // smmla v17.4s, v15.16b, v31.16b\n" - "sshl v15.16b, v29.16b, v1.16b\n" - "sshl v1.16b, v3.16b, v1.16b\n" - "and v29.16b, v29.16b, v0.16b\n" - "and v3.16b, v3.16b, v0.16b\n" - "ldr q0, [x25, #0x30]\n" - "fcvtl v20.4s, v20.4h\n" - ".inst 0x4e8fa773 // smmla v19.4s, v27.16b, v15.16b\n" - "fcvtl v9.4s, v9.4h\n" - ".inst 0x4e81a77a // smmla v26.4s, v27.16b, v1.16b\n" - "ldr q27, [x25, #0x40]\n" - ".inst 0x4e8fa412 // smmla v18.4s, v0.16b, v15.16b\n" - ".inst 0x4e81a411 // smmla v17.4s, v0.16b, v1.16b\n" - "ldr q0, [x25, #0x50]\n" - ".inst 0x4e95a773 // smmla v19.4s, v27.16b, v21.16b\n" - ".inst 0x4e90a77a // smmla v26.4s, v27.16b, v16.16b\n" - "ldr q27, [x25, #0x60]\n" - ".inst 0x4e95a412 // smmla v18.4s, v0.16b, v21.16b\n" - ".inst 0x4e90a411 // smmla v17.4s, v0.16b, v16.16b\n" - "ldr q0, [x25, #0x70]\n" - "add x25, x25, #0x88\n" - ".inst 0x4e9da773 // smmla v19.4s, v27.16b, v29.16b\n" - ".inst 0x4e83a77a // smmla v26.4s, v27.16b, v3.16b\n" - "ldr d27, [x20, #0x0]\n" - ".inst 0x4e9da412 // smmla v18.4s, v0.16b, v29.16b\n" - ".inst 0x4e83a411 // smmla v17.4s, v0.16b, v3.16b\n" - "fcvtl v27.4s, v27.4h\n" - "uzp1 v0.2d, v19.2d, v26.2d\n" - "uzp2 v26.2d, v19.2d, v26.2d\n" - "fmul v19.4s, v27.4s, v20.s[0]\n" - "scvtf v0.4s, v0.4s, #0x4\n" - "scvtf v26.4s, v26.4s, #0x4\n" - "fmla v2.4s, v0.4s, v19.4s\n" - "ldr q19, [x23, #0x0]\n" - "uzp1 v0.2d, v18.2d, v17.2d\n" - "uzp2 v18.2d, v18.2d, v17.2d\n" - "fmul v17.4s, v27.4s, v20.s[1]\n" - "scvtf v0.4s, v0.4s, #0x4\n" - "scvtf v18.4s, v18.4s, #0x4\n" - "fmla v10.4s, v26.4s, v17.4s\n" - "ldr q17, [x23, #0x10]\n" - "fmul v26.4s, v27.4s, v20.s[2]\n" - "fmul v20.4s, v27.4s, v20.s[3]\n" - "fmla v12.4s, 
v0.4s, v26.4s\n" - "ldr d0, [x22, #-0x8]\n" - "ldr d26, [x21, #-0x8]\n" - "fcvtl v0.4s, v0.4h\n" - "fmla v28.4s, v18.4s, v20.4s\n" - "movi v20.4s, #0x0\n" - "movi v18.4s, #0x0\n" - ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n" - ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n" - "ldr q19, [x23, #0x20]\n" - "fcvtl v26.4s, v26.4h\n" - ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n" - ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n" - "ldr q19, [x23, #0x40]\n" - ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n" - ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n" - "ldr q19, [x23, #0x60]\n" - ".inst 0x4e9da674 // smmla v20.4s, v19.16b, v29.16b\n" - ".inst 0x4e83a672 // smmla v18.4s, v19.16b, v3.16b\n" - "uzp1 v19.2d, v20.2d, v18.2d\n" - "scvtf v19.4s, v19.4s, #0x4\n" - "uzp2 v20.2d, v20.2d, v18.2d\n" - "fmul v18.4s, v27.4s, v9.s[0]\n" - "scvtf v20.4s, v20.4s, #0x4\n" - "fmla v11.4s, v19.4s, v18.4s\n" - "ldr q18, [x22, #0x0]\n" - "fmul v19.4s, v27.4s, v9.s[1]\n" - "fmla v13.4s, v20.4s, v19.4s\n" - "movi v19.4s, #0x0\n" - "movi v20.4s, #0x0\n" - ".inst 0x4e88a633 // smmla v19.4s, v17.16b, v8.16b\n" - ".inst 0x4e9fa634 // smmla v20.4s, v17.16b, v31.16b\n" - "ldr q17, [x23, #0x30]\n" - ".inst 0x4e8fa633 // smmla v19.4s, v17.16b, v15.16b\n" - ".inst 0x4e81a634 // smmla v20.4s, v17.16b, v1.16b\n" - "ldr q17, [x23, #0x50]\n" - ".inst 0x4e95a633 // smmla v19.4s, v17.16b, v21.16b\n" - ".inst 0x4e90a634 // smmla v20.4s, v17.16b, v16.16b\n" - "ldr q17, [x23, #0x70]\n" - "add x23, x23, #0x88\n" - ".inst 0x4e9da633 // smmla v19.4s, v17.16b, v29.16b\n" - ".inst 0x4e83a634 // smmla v20.4s, v17.16b, v3.16b\n" - "uzp1 v17.2d, v19.2d, v20.2d\n" - "scvtf v17.4s, v17.4s, #0x4\n" - "uzp2 v20.2d, v19.2d, v20.2d\n" - "fmul v19.4s, v27.4s, v9.s[2]\n" - "fmul v9.4s, v27.4s, v9.s[3]\n" - "scvtf v20.4s, v20.4s, #0x4\n" - "fmla v22.4s, v17.4s, v19.4s\n" - "ldr q17, [x22, #0x10]\n" - "movi v19.4s, #0x0\n" - ".inst 0x4e88a653 // smmla v19.4s, v18.16b, v8.16b\n" - "fmla v23.4s, v20.4s, v9.4s\n" - "movi v20.4s, #0x0\n" - "movi v9.4s, #0x0\n" - ".inst 0x4e9fa654 // smmla v20.4s, v18.16b, v31.16b\n" - "ldr q18, [x22, #0x20]\n" - ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n" - ".inst 0x4e8fa653 // smmla v19.4s, v18.16b, v15.16b\n" - ".inst 0x4e81a654 // smmla v20.4s, v18.16b, v1.16b\n" - "ldr q18, [x22, #0x40]\n" - ".inst 0x4e95a653 // smmla v19.4s, v18.16b, v21.16b\n" - ".inst 0x4e90a654 // smmla v20.4s, v18.16b, v16.16b\n" - "ldr q18, [x22, #0x60]\n" - ".inst 0x4e9da653 // smmla v19.4s, v18.16b, v29.16b\n" - ".inst 0x4e83a654 // smmla v20.4s, v18.16b, v3.16b\n" - "movi v18.4s, #0x0\n" - ".inst 0x4e9fa632 // smmla v18.4s, v17.16b, v31.16b\n" - "ldr q17, [x22, #0x30]\n" - ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n" - ".inst 0x4e81a632 // smmla v18.4s, v17.16b, v1.16b\n" - "ldr q17, [x22, #0x50]\n" - ".inst 0x4e95a629 // smmla v9.4s, v17.16b, v21.16b\n" - ".inst 0x4e90a632 // smmla v18.4s, v17.16b, v16.16b\n" - "ldr q17, [x22, #0x70]\n" - "add x22, x22, #0x88\n" - ".inst 0x4e9da629 // smmla v9.4s, v17.16b, v29.16b\n" - ".inst 0x4e83a632 // smmla v18.4s, v17.16b, v3.16b\n" - "uzp1 v17.2d, v19.2d, v20.2d\n" - "uzp2 v20.2d, v19.2d, v20.2d\n" - "fmul v19.4s, v27.4s, v0.s[0]\n" - "scvtf v17.4s, v17.4s, #0x4\n" - "scvtf v20.4s, v20.4s, #0x4\n" - "fmla v25.4s, v17.4s, v19.4s\n" - "ldr q19, [x21, #0x0]\n" - "fmul v17.4s, v27.4s, v0.s[1]\n" - "fmla v5.4s, v20.4s, v17.4s\n" - "ldr q17, [x21, #0x10]\n" - "uzp1 v20.2d, v9.2d, v18.2d\n" - "uzp2 v9.2d, v9.2d, v18.2d\n" - "fmul v18.4s, 
v27.4s, v0.s[2]\n" - "fmul v0.4s, v27.4s, v0.s[3]\n" - "scvtf v20.4s, v20.4s, #0x4\n" - "scvtf v9.4s, v9.4s, #0x4\n" - "fmla v7.4s, v20.4s, v18.4s\n" - "movi v20.4s, #0x0\n" - "movi v18.4s, #0x0\n" - ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n" - ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n" - "ldr q19, [x21, #0x20]\n" - "fmla v4.4s, v9.4s, v0.4s\n" - "movi v9.4s, #0x0\n" - "movi v0.4s, #0x0\n" - ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n" - "fmul v8.4s, v27.4s, v26.s[0]\n" - ".inst 0x4e9fa620 // smmla v0.4s, v17.16b, v31.16b\n" - "ldr q17, [x21, #0x30]\n" - ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n" - "fmul v31.4s, v27.4s, v26.s[1]\n" - ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n" - "ldr q19, [x21, #0x40]\n" - ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n" - "fmul v15.4s, v27.4s, v26.s[2]\n" - "fmul v27.4s, v27.4s, v26.s[3]\n" - ".inst 0x4e81a620 // smmla v0.4s, v17.16b, v1.16b\n" - "ldr q1, [x21, #0x50]\n" - ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n" - ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n" - "ldr q26, [x21, #0x60]\n" - ".inst 0x4e95a429 // smmla v9.4s, v1.16b, v21.16b\n" - ".inst 0x4e90a420 // smmla v0.4s, v1.16b, v16.16b\n" - "ldr q21, [x21, #0x70]\n" - "add x21, x21, #0x88\n" - ".inst 0x4e9da754 // smmla v20.4s, v26.16b, v29.16b\n" - ".inst 0x4e83a752 // smmla v18.4s, v26.16b, v3.16b\n" - ".inst 0x4e9da6a9 // smmla v9.4s, v21.16b, v29.16b\n" - ".inst 0x4e83a6a0 // smmla v0.4s, v21.16b, v3.16b\n" - "uzp1 v29.2d, v20.2d, v18.2d\n" - "uzp2 v21.2d, v20.2d, v18.2d\n" - "scvtf v29.4s, v29.4s, #0x4\n" - "uzp1 v18.2d, v9.2d, v0.2d\n" - "uzp2 v16.2d, v9.2d, v0.2d\n" - "scvtf v21.4s, v21.4s, #0x4\n" - "fmla v6.4s, v29.4s, v8.4s\n" - "scvtf v18.4s, v18.4s, #0x4\n" - "scvtf v16.4s, v16.4s, #0x4\n" - "fmla v30.4s, v21.4s, v31.4s\n" - "fmla v24.4s, v18.4s, v15.4s\n" - "fmla v14.4s, v16.4s, v27.4s\n" - "bgt 3b\n" - "mov x20, %x[res_ptr]\n" - "subs x27, x27, #0x4\n" - "add %x[res_ptr], %x[res_ptr], #0x10\n" - "str q2, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q10, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q12, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q28, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q11, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q13, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q22, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q23, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q25, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q5, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q7, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q4, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q6, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q30, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q24, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q14, [x20, #0x0]\n" - "bne 2b\n" - "mov x20, #0x4\n" - "sub x10, x10, #0x10\n" - "cmp x10, #0x10\n" - "mov %x[res_ptr], x26\n" - "madd %x[a_ptr], x20, x9, %x[a_ptr]\n" - "bge 1b\n" - "4:" // Row loop skip - "cbz x10, 9f\n" - "5:" // Row tail: Row loop - "add x24, %x[b_ptr], #0x8\n" - "mov x23, %x[nc]\n" - "add x22, %x[res_ptr], %x[res_stride], LSL #2\n" - "6:" // Row tail: Column loop - "movi v2.16b, #0x0\n" - "movi v10.16b, #0x0\n" - "add x25, %x[a_ptr], #0x8\n" - "mov x21, %x[nb]\n" - "movi v12.16b, #0x0\n" - "movi v28.16b, #0x0\n" - "7:" // Row tail: Block loop - "ldr q6, [x24, #0x0]\n" - "ldr q5, 
[x24, #0x10]\n" - "movi v17.16b, #0x4\n" - "movi v8.4s, #0x0\n" - "ldr q4, [x25, #0x0]\n" - "ldr q13, [x25, #0x10]\n" - "movi v27.4s, #0x0\n" - "movi v0.4s, #0x0\n" - "ldr q31, [x24, #0x20]\n" - "ldr q14, [x24, #0x30]\n" - "movi v29.4s, #0x0\n" - "movi v22.16b, #0xf0\n" - "ldr q11, [x25, #0x20]\n" - "ldr q23, [x25, #0x30]\n" - "sshl v21.16b, v6.16b, v17.16b\n" - "sshl v16.16b, v5.16b, v17.16b\n" - "ldr q20, [x25, #0x40]\n" - "ldr q26, [x25, #0x50]\n" - "and v6.16b, v6.16b, v22.16b\n" - "and v5.16b, v5.16b, v22.16b\n" - "ldr q25, [x25, #0x60]\n" - "ldr q3, [x25, #0x70]\n" - "sshl v19.16b, v31.16b, v17.16b\n" - "sshl v18.16b, v14.16b, v17.16b\n" - "ldr d17, [x25, #-0x8]\n" - ".inst 0x4e95a488 // smmla v8.4s, v4.16b, v21.16b\n" - ".inst 0x4e90a49b // smmla v27.4s, v4.16b, v16.16b\n" - "and v31.16b, v31.16b, v22.16b\n" - ".inst 0x4e95a5a0 // smmla v0.4s, v13.16b, v21.16b\n" - ".inst 0x4e90a5bd // smmla v29.4s, v13.16b, v16.16b\n" - "and v14.16b, v14.16b, v22.16b\n" - "sub x20, x24, #0x8\n" - "ldr d16, [x20, #0x0]\n" - "subs x21, x21, #0x1\n" - "add x25, x25, #0x88\n" - "fcvtl v17.4s, v17.4h\n" - "add x24, x24, #0x48\n" - ".inst 0x4e93a568 // smmla v8.4s, v11.16b, v19.16b\n" - ".inst 0x4e92a57b // smmla v27.4s, v11.16b, v18.16b\n" - ".inst 0x4e93a6e0 // smmla v0.4s, v23.16b, v19.16b\n" - ".inst 0x4e92a6fd // smmla v29.4s, v23.16b, v18.16b\n" - "fcvtl v16.4s, v16.4h\n" - ".inst 0x4e86a688 // smmla v8.4s, v20.16b, v6.16b\n" - ".inst 0x4e85a69b // smmla v27.4s, v20.16b, v5.16b\n" - "fmul v23.4s, v16.4s, v17.s[0]\n" - "fmul v21.4s, v16.4s, v17.s[1]\n" - "fmul v1.4s, v16.4s, v17.s[2]\n" - "fmul v20.4s, v16.4s, v17.s[3]\n" - ".inst 0x4e86a740 // smmla v0.4s, v26.16b, v6.16b\n" - ".inst 0x4e85a75d // smmla v29.4s, v26.16b, v5.16b\n" - ".inst 0x4e9fa728 // smmla v8.4s, v25.16b, v31.16b\n" - ".inst 0x4e8ea73b // smmla v27.4s, v25.16b, v14.16b\n" - ".inst 0x4e9fa460 // smmla v0.4s, v3.16b, v31.16b\n" - ".inst 0x4e8ea47d // smmla v29.4s, v3.16b, v14.16b\n" - "uzp1 v19.2d, v8.2d, v27.2d\n" - "uzp2 v18.2d, v8.2d, v27.2d\n" - "scvtf v19.4s, v19.4s, #0x4\n" - "uzp1 v17.2d, v0.2d, v29.2d\n" - "uzp2 v16.2d, v0.2d, v29.2d\n" - "scvtf v18.4s, v18.4s, #0x4\n" - "fmla v2.4s, v19.4s, v23.4s\n" - "scvtf v17.4s, v17.4s, #0x4\n" - "scvtf v16.4s, v16.4s, #0x4\n" - "fmla v10.4s, v18.4s, v21.4s\n" - "fmla v12.4s, v17.4s, v1.4s\n" - "fmla v28.4s, v16.4s, v20.4s\n" - "bgt 7b\n" - "mov x20, %x[res_ptr]\n" - "cmp x10, #0x1\n" - "str q2, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "ble 8f\n" - "cmp x10, #0x2\n" - "str q10, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "ble 8f\n" - "cmp x10, #0x3\n" - "str q12, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "ble 8f\n" - "str q28, [x20, #0x0]\n" - "8:" // Row tail: Accumulator store skip - "subs x23, x23, #0x4\n" - "add %x[res_ptr], %x[res_ptr], #0x10\n" - "bne 6b\n" - "subs x10, x10, #0x4\n" - "add %x[a_ptr], %x[a_ptr], x9\n" - "mov %x[res_ptr], x22\n" - "bgt 5b\n" - "9:" // Row tail: Row loop skip - : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr) - : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" - ); - return; - } -#endif // #if ! ((defined(_MSC_VER)) && ! 
defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) - float sumf[4][4]; - int sumi; - - for (int y = 0; y < nr / 4; y++) { - const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); - for (int x = 0; x < nc / ncols_interleaved; x++) { - const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); - for (int m = 0; m < 4; m++) { - for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; - } - for (int l = 0; l < nb; l++) { - for (int k = 0; k < (qk / (2 * blocklen)); k++) { - for (int m = 0; m < 4; m++) { - for (int j = 0; j < ncols_interleaved; j++) { - sumi = 0; - for (int i = 0; i < blocklen; ++i) { - const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); - const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); - sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + - (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; - } - sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]); - } - } - } - } - for (int m = 0; m < 4; m++) { - for (int j = 0; j < ncols_interleaved; j++) - s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; - } - } - } -} - -static void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { - const int qk = QK8_0; - const int nb = n / qk; - const int ncols_interleaved = 8; - const int blocklen = 8; - - assert (n % qk == 0); - assert (nr % 4 == 0); - assert (nc % ncols_interleaved == 0); - - UNUSED(s); - UNUSED(bs); - UNUSED(vx); - UNUSED(vy); - UNUSED(nr); - UNUSED(nc); - UNUSED(nb); - UNUSED(ncols_interleaved); - UNUSED(blocklen); - -#if ! ((defined(_MSC_VER)) && ! 
defined(__clang__)) && defined(__aarch64__) -#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) - if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0) { - const void * b_ptr = vx; - const void * a_ptr = vy; - float * res_ptr = s; - size_t res_stride = bs * sizeof(float); - - __asm__ __volatile__( - "mov x20, #0x4\n" - "mov x13, %x[nr]\n" - "mov z28.s, #-0x4\n" - "mov x12, #0x88\n" - "ptrue p1.b\n" - "whilelt p0.s, XZR, x20\n" - "cmp x13, #0x10\n" - "mul x12, %x[nb], x12\n" - "blt 4f\n" - "1:" // Row loop - "add x11, %x[b_ptr], #0x10\n" - "mov x10, %x[nc]\n" - "add x9, %x[res_ptr], %x[res_stride], LSL #4\n" - "2:" // Column loop - "add x28, %x[a_ptr], #0x8\n" - "mov z24.b, #0x0\n" - "mov z15.b, #0x0\n" - "mov x27, %x[nb]\n" - "add x26, x28, x12\n" - "mov z12.b, #0x0\n" - "mov z0.b, #0x0\n" - "add x25, x26, x12\n" - "mov z13.b, #0x0\n" - "mov z1.b, #0x0\n" - "add x24, x25, x12\n" - "mov z20.b, #0x0\n" - "mov z25.b, #0x0\n" - "mov z11.b, #0x0\n" - "mov z16.b, #0x0\n" - "mov z19.b, #0x0\n" - "mov z26.b, #0x0\n" - "mov z8.b, #0x0\n" - "mov z29.b, #0x0\n" - "mov z27.b, #0x0\n" - "mov z10.b, #0x0\n" - "3:" // Block loop - "ld1b { z30.b }, p1/Z, [x11]\n" - "ld1b { z21.b }, p1/Z, [x11, #1, MUL VL]\n" - "mov z18.s, #0x0\n" - "mov z7.s, #0x0\n" - "ld1rqb { z3.b }, p1/Z, [x28]\n" - "ld1rqb { z5.b }, p1/Z, [x28, #16]\n" - "mov z9.s, #0x0\n" - "mov z22.s, #0x0\n" - "ld1b { z4.b }, p1/Z, [x11, #2, MUL VL]\n" - "ld1b { z17.b }, p1/Z, [x11, #3, MUL VL]\n" - "sub x20, x11, #0x10\n" - "sub x23, x28, #0x8\n" - "lsl z31.b, z30.b, #0x4\n" - "lsl z6.b, z21.b, #0x4\n" - "ld1h { z23.s }, p1/Z, [x20]\n" - "sub x22, x26, #0x8\n" - "and z30.b, z30.b, #0xf0\n" - "and z21.b, z21.b, #0xf0\n" - "sub x21, x25, #0x8\n" - "sub x20, x24, #0x8\n" - "lsl z14.b, z4.b, #0x4\n" - "lsl z2.b, z17.b, #0x4\n" - "subs x27, x27, #0x1\n" - "add x11, x11, #0x90\n" - ".inst 0x451f9872 // smmla z18.s, z3.b, z31.b\n" - ".inst 0x45069867 // smmla z7.s, z3.b, z6.b\n" - "ld1rqb { z3.b }, p1/Z, [x28, #32]\n" - "and z4.b, z4.b, #0xf0\n" - ".inst 0x451f98a9 // smmla z9.s, z5.b, z31.b\n" - ".inst 0x450698b6 // smmla z22.s, z5.b, z6.b\n" - "ld1rqb { z5.b }, p1/Z, [x28, #48]\n" - "and z17.b, z17.b, #0xf0\n" - "fcvt z23.s, p1/m, z23.h\n" - ".inst 0x450e9872 // smmla z18.s, z3.b, z14.b\n" - ".inst 0x45029867 // smmla z7.s, z3.b, z2.b\n" - "ld1rqb { z3.b }, p1/Z, [x28, #64]\n" - ".inst 0x450e98a9 // smmla z9.s, z5.b, z14.b\n" - ".inst 0x450298b6 // smmla z22.s, z5.b, z2.b\n" - "ld1rqb { z5.b }, p1/Z, [x28, #80]\n" - "fscale z23.s, p1/m, z23.s, z28.s\n" - ".inst 0x451e9872 // smmla z18.s, z3.b, z30.b\n" - ".inst 0x45159867 // smmla z7.s, z3.b, z21.b\n" - "ld1rqb { z3.b }, p1/Z, [x28, #96]\n" - ".inst 0x451e98a9 // smmla z9.s, z5.b, z30.b\n" - ".inst 0x451598b6 // smmla z22.s, z5.b, z21.b\n" - "ld1rqb { z5.b }, p1/Z, [x28, #112]\n" - "add x28, x28, #0x88\n" - ".inst 0x45049872 // smmla z18.s, z3.b, z4.b\n" - ".inst 0x45119867 // smmla z7.s, z3.b, z17.b\n" - "ld1h { z3.s }, p0/Z, [x23]\n" - ".inst 0x450498a9 // smmla z9.s, z5.b, z4.b\n" - ".inst 0x451198b6 // smmla z22.s, z5.b, z17.b\n" - "fcvt z3.s, p1/m, z3.h\n" - "uzp1 z5.d, z18.d, z7.d\n" - "uzp2 z18.d, z18.d, z7.d\n" - "mov z3.q, z3.q[0]\n" - "uzp1 z7.d, z9.d, z22.d\n" - "uzp2 z22.d, z9.d, z22.d\n" - "fmul z9.s, z23.s, z3.s[0]\n" - "scvtf z5.s, p1/m, z5.s\n" - "scvtf z18.s, p1/m, z18.s\n" - "scvtf z7.s, p1/m, z7.s\n" - "scvtf z22.s, p1/m, z22.s\n" - "fmla z24.s, p1/M, z5.s, z9.s\n" - "ld1rqb { z5.b }, p1/Z, [x26]\n" - "fmul z9.s, z23.s, z3.s[1]\n" 
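// NOTE (editorial annotation, not part of the original listing): each iteration of the
// SVE block loop above follows one pattern: LSL #4 moves the low nibble of every packed
// Q4_0 byte into the high bits (scaling it by 16), while AND #0xf0 keeps the high nibble
// in place (also scaled by 16); SMMLA accumulates 2x2 int32 tiles of dot products, which
// UZP1/UZP2 de-interleave into per-row sums. The factor of 16 carried by every product is
// compensated on the scale side: the fp16 block deltas are widened with FCVT and then
// multiplied by 2^-4 through FSCALE (z28 holds -4) before SCVTF + FMLA accumulate the
// scaled sums into the float result registers.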
- "fmla z15.s, p1/M, z18.s, z9.s\n" - "ld1rqb { z18.b }, p1/Z, [x26, #16]\n" - "fmul z9.s, z23.s, z3.s[2]\n" - "fmul z3.s, z23.s, z3.s[3]\n" - "fmla z12.s, p1/M, z7.s, z9.s\n" - "mov z9.s, #0x0\n" - "ld1h { z7.s }, p0/Z, [x22]\n" - ".inst 0x451f98a9 // smmla z9.s, z5.b, z31.b\n" - "fmla z0.s, p1/M, z22.s, z3.s\n" - "mov z22.s, #0x0\n" - "ld1h { z3.s }, p0/Z, [x21]\n" - ".inst 0x450698b6 // smmla z22.s, z5.b, z6.b\n" - "ld1rqb { z5.b }, p1/Z, [x26, #32]\n" - "fcvt z7.s, p1/m, z7.h\n" - "fcvt z3.s, p1/m, z3.h\n" - ".inst 0x450e98a9 // smmla z9.s, z5.b, z14.b\n" - ".inst 0x450298b6 // smmla z22.s, z5.b, z2.b\n" - "ld1rqb { z5.b }, p1/Z, [x26, #64]\n" - "mov z7.q, z7.q[0]\n" - "mov z3.q, z3.q[0]\n" - ".inst 0x451e98a9 // smmla z9.s, z5.b, z30.b\n" - ".inst 0x451598b6 // smmla z22.s, z5.b, z21.b\n" - "ld1rqb { z5.b }, p1/Z, [x26, #96]\n" - ".inst 0x450498a9 // smmla z9.s, z5.b, z4.b\n" - ".inst 0x451198b6 // smmla z22.s, z5.b, z17.b\n" - "uzp1 z5.d, z9.d, z22.d\n" - "scvtf z5.s, p1/m, z5.s\n" - "uzp2 z22.d, z9.d, z22.d\n" - "fmul z9.s, z23.s, z7.s[0]\n" - "scvtf z22.s, p1/m, z22.s\n" - "fmla z13.s, p1/M, z5.s, z9.s\n" - "ld1rqb { z9.b }, p1/Z, [x25]\n" - "fmul z5.s, z23.s, z7.s[1]\n" - "fmla z1.s, p1/M, z22.s, z5.s\n" - "mov z5.s, #0x0\n" - "mov z22.s, #0x0\n" - ".inst 0x451f9a45 // smmla z5.s, z18.b, z31.b\n" - ".inst 0x45069a56 // smmla z22.s, z18.b, z6.b\n" - "ld1rqb { z18.b }, p1/Z, [x26, #48]\n" - ".inst 0x450e9a45 // smmla z5.s, z18.b, z14.b\n" - ".inst 0x45029a56 // smmla z22.s, z18.b, z2.b\n" - "ld1rqb { z18.b }, p1/Z, [x26, #80]\n" - ".inst 0x451e9a45 // smmla z5.s, z18.b, z30.b\n" - ".inst 0x45159a56 // smmla z22.s, z18.b, z21.b\n" - "ld1rqb { z18.b }, p1/Z, [x26, #112]\n" - "add x26, x26, #0x88\n" - ".inst 0x45049a45 // smmla z5.s, z18.b, z4.b\n" - ".inst 0x45119a56 // smmla z22.s, z18.b, z17.b\n" - "uzp1 z18.d, z5.d, z22.d\n" - "scvtf z18.s, p1/m, z18.s\n" - "uzp2 z22.d, z5.d, z22.d\n" - "fmul z5.s, z23.s, z7.s[2]\n" - "fmul z7.s, z23.s, z7.s[3]\n" - "scvtf z22.s, p1/m, z22.s\n" - "fmla z20.s, p1/M, z18.s, z5.s\n" - "ld1rqb { z18.b }, p1/Z, [x25, #16]\n" - "ld1h { z5.s }, p0/Z, [x20]\n" - "fcvt z5.s, p1/m, z5.h\n" - "fmla z25.s, p1/M, z22.s, z7.s\n" - "mov z22.s, #0x0\n" - "mov z7.s, #0x0\n" - ".inst 0x451f9936 // smmla z22.s, z9.b, z31.b\n" - ".inst 0x45069927 // smmla z7.s, z9.b, z6.b\n" - "ld1rqb { z9.b }, p1/Z, [x25, #32]\n" - "mov z5.q, z5.q[0]\n" - ".inst 0x450e9936 // smmla z22.s, z9.b, z14.b\n" - ".inst 0x45029927 // smmla z7.s, z9.b, z2.b\n" - "ld1rqb { z9.b }, p1/Z, [x25, #64]\n" - ".inst 0x451e9936 // smmla z22.s, z9.b, z30.b\n" - ".inst 0x45159927 // smmla z7.s, z9.b, z21.b\n" - "ld1rqb { z9.b }, p1/Z, [x25, #96]\n" - ".inst 0x45049936 // smmla z22.s, z9.b, z4.b\n" - ".inst 0x45119927 // smmla z7.s, z9.b, z17.b\n" - "uzp1 z9.d, z22.d, z7.d\n" - "scvtf z9.s, p1/m, z9.s\n" - "uzp2 z22.d, z22.d, z7.d\n" - "fmul z7.s, z23.s, z3.s[0]\n" - "scvtf z22.s, p1/m, z22.s\n" - "fmla z11.s, p1/M, z9.s, z7.s\n" - "ld1rqb { z9.b }, p1/Z, [x24]\n" - "fmul z7.s, z23.s, z3.s[1]\n" - "fmla z16.s, p1/M, z22.s, z7.s\n" - "mov z22.s, #0x0\n" - "mov z7.s, #0x0\n" - ".inst 0x451f9a56 // smmla z22.s, z18.b, z31.b\n" - ".inst 0x45069a47 // smmla z7.s, z18.b, z6.b\n" - "ld1rqb { z18.b }, p1/Z, [x25, #48]\n" - ".inst 0x450e9a56 // smmla z22.s, z18.b, z14.b\n" - ".inst 0x45029a47 // smmla z7.s, z18.b, z2.b\n" - "ld1rqb { z18.b }, p1/Z, [x25, #80]\n" - ".inst 0x451e9a56 // smmla z22.s, z18.b, z30.b\n" - ".inst 0x45159a47 // smmla z7.s, z18.b, z21.b\n" - "ld1rqb { z18.b }, p1/Z, [x25, #112]\n" - "add 
x25, x25, #0x88\n" - ".inst 0x45049a56 // smmla z22.s, z18.b, z4.b\n" - ".inst 0x45119a47 // smmla z7.s, z18.b, z17.b\n" - "uzp1 z18.d, z22.d, z7.d\n" - "scvtf z18.s, p1/m, z18.s\n" - "uzp2 z7.d, z22.d, z7.d\n" - "fmul z22.s, z23.s, z3.s[2]\n" - "fmul z3.s, z23.s, z3.s[3]\n" - "scvtf z7.s, p1/m, z7.s\n" - "fmla z19.s, p1/M, z18.s, z22.s\n" - "ld1rqb { z18.b }, p1/Z, [x24, #16]\n" - "fmul z22.s, z23.s, z5.s[0]\n" - "fmla z26.s, p1/M, z7.s, z3.s\n" - "mov z3.s, #0x0\n" - "mov z7.s, #0x0\n" - ".inst 0x451f9923 // smmla z3.s, z9.b, z31.b\n" - ".inst 0x45069927 // smmla z7.s, z9.b, z6.b\n" - "ld1rqb { z9.b }, p1/Z, [x24, #32]\n" - ".inst 0x450e9923 // smmla z3.s, z9.b, z14.b\n" - ".inst 0x45029927 // smmla z7.s, z9.b, z2.b\n" - "mov z9.s, #0x0\n" - ".inst 0x451f9a49 // smmla z9.s, z18.b, z31.b\n" - "mov z31.s, #0x0\n" - ".inst 0x45069a5f // smmla z31.s, z18.b, z6.b\n" - "ld1rqb { z6.b }, p1/Z, [x24, #48]\n" - "ld1rqb { z18.b }, p1/Z, [x24, #64]\n" - ".inst 0x450e98c9 // smmla z9.s, z6.b, z14.b\n" - "fmul z14.s, z23.s, z5.s[1]\n" - ".inst 0x450298df // smmla z31.s, z6.b, z2.b\n" - "ld1rqb { z6.b }, p1/Z, [x24, #80]\n" - "fmul z2.s, z23.s, z5.s[2]\n" - "fmul z23.s, z23.s, z5.s[3]\n" - ".inst 0x451e9a43 // smmla z3.s, z18.b, z30.b\n" - ".inst 0x45159a47 // smmla z7.s, z18.b, z21.b\n" - "ld1rqb { z5.b }, p1/Z, [x24, #96]\n" - ".inst 0x451e98c9 // smmla z9.s, z6.b, z30.b\n" - ".inst 0x451598df // smmla z31.s, z6.b, z21.b\n" - "ld1rqb { z18.b }, p1/Z, [x24, #112]\n" - "add x24, x24, #0x88\n" - ".inst 0x450498a3 // smmla z3.s, z5.b, z4.b\n" - ".inst 0x451198a7 // smmla z7.s, z5.b, z17.b\n" - ".inst 0x45049a49 // smmla z9.s, z18.b, z4.b\n" - ".inst 0x45119a5f // smmla z31.s, z18.b, z17.b\n" - "uzp1 z18.d, z3.d, z7.d\n" - "uzp2 z5.d, z3.d, z7.d\n" - "scvtf z18.s, p1/m, z18.s\n" - "uzp1 z6.d, z9.d, z31.d\n" - "uzp2 z9.d, z9.d, z31.d\n" - "scvtf z5.s, p1/m, z5.s\n" - "fmla z8.s, p1/M, z18.s, z22.s\n" - "scvtf z6.s, p1/m, z6.s\n" - "scvtf z9.s, p1/m, z9.s\n" - "fmla z29.s, p1/M, z5.s, z14.s\n" - "fmla z27.s, p1/M, z6.s, z2.s\n" - "fmla z10.s, p1/M, z9.s, z23.s\n" - "bgt 3b\n" - "mov x20, %x[res_ptr]\n" - "subs x10, x10, #0x8\n" - "add %x[res_ptr], %x[res_ptr], #0x20\n" - "st1w { z24.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z15.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z12.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z0.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z13.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z1.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z20.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z25.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z11.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z16.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z19.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z26.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z8.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z29.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z27.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z10.s }, p1, [x20]\n" - "bne 2b\n" - "mov x20, #0x4\n" - "sub x13, x13, #0x10\n" - "cmp x13, #0x10\n" - "mov %x[res_ptr], x9\n" - "madd %x[a_ptr], x20, x12, %x[a_ptr]\n" - "bge 1b\n" - "4:" // Row loop skip - "cbz x13, 9f\n" - "5:" // Row tail: Row loop - "add x25, %x[b_ptr], #0x10\n" - "mov x24, %x[nc]\n" - "add x23, %x[res_ptr], %x[res_stride], 
LSL #2\n" - "6:" // Row tail: Column loop - "mov z24.b, #0x0\n" - "mov z15.b, #0x0\n" - "add x28, %x[a_ptr], #0x8\n" - "mov x22, %x[nb]\n" - "mov z12.b, #0x0\n" - "mov z0.b, #0x0\n" - "7:" // Row tail: Block loop - "ld1b { z3.b }, p1/Z, [x25]\n" - "ld1b { z6.b }, p1/Z, [x25, #1, MUL VL]\n" - "mov z2.s, #0x0\n" - "mov z25.s, #0x0\n" - "ld1rqb { z26.b }, p1/Z, [x28]\n" - "ld1rqb { z21.b }, p1/Z, [x28, #16]\n" - "mov z27.s, #0x0\n" - "mov z19.s, #0x0\n" - "ld1b { z29.b }, p1/Z, [x25, #2, MUL VL]\n" - "ld1b { z16.b }, p1/Z, [x25, #3, MUL VL]\n" - "sub x21, x25, #0x10\n" - "sub x20, x28, #0x8\n" - "lsl z20.b, z3.b, #0x4\n" - "lsl z4.b, z6.b, #0x4\n" - "ld1rqb { z10.b }, p1/Z, [x28, #32]\n" - "ld1rqb { z23.b }, p1/Z, [x28, #48]\n" - "and z3.b, z3.b, #0xf0\n" - "and z6.b, z6.b, #0xf0\n" - "ld1rqb { z11.b }, p1/Z, [x28, #64]\n" - "ld1rqb { z7.b }, p1/Z, [x28, #80]\n" - "lsl z8.b, z29.b, #0x4\n" - "lsl z14.b, z16.b, #0x4\n" - "ld1rqb { z18.b }, p1/Z, [x28, #96]\n" - "ld1rqb { z30.b }, p1/Z, [x28, #112]\n" - ".inst 0x45149b42 // smmla z2.s, z26.b, z20.b\n" - ".inst 0x45049b59 // smmla z25.s, z26.b, z4.b\n" - "and z29.b, z29.b, #0xf0\n" - "ld1h { z17.s }, p1/Z, [x21]\n" - ".inst 0x45149abb // smmla z27.s, z21.b, z20.b\n" - ".inst 0x45049ab3 // smmla z19.s, z21.b, z4.b\n" - "and z16.b, z16.b, #0xf0\n" - "ld1h { z4.s }, p0/Z, [x20]\n" - "subs x22, x22, #0x1\n" - "add x28, x28, #0x88\n" - "fcvt z17.s, p1/m, z17.h\n" - "add x25, x25, #0x90\n" - ".inst 0x45089942 // smmla z2.s, z10.b, z8.b\n" - ".inst 0x450e9959 // smmla z25.s, z10.b, z14.b\n" - "fcvt z4.s, p1/m, z4.h\n" - ".inst 0x45089afb // smmla z27.s, z23.b, z8.b\n" - ".inst 0x450e9af3 // smmla z19.s, z23.b, z14.b\n" - "fscale z17.s, p1/m, z17.s, z28.s\n" - "mov z4.q, z4.q[0]\n" - ".inst 0x45039962 // smmla z2.s, z11.b, z3.b\n" - ".inst 0x45069979 // smmla z25.s, z11.b, z6.b\n" - "fmul z23.s, z17.s, z4.s[0]\n" - "fmul z9.s, z17.s, z4.s[1]\n" - "fmul z21.s, z17.s, z4.s[2]\n" - "fmul z4.s, z17.s, z4.s[3]\n" - ".inst 0x450398fb // smmla z27.s, z7.b, z3.b\n" - ".inst 0x450698f3 // smmla z19.s, z7.b, z6.b\n" - ".inst 0x451d9a42 // smmla z2.s, z18.b, z29.b\n" - ".inst 0x45109a59 // smmla z25.s, z18.b, z16.b\n" - ".inst 0x451d9bdb // smmla z27.s, z30.b, z29.b\n" - ".inst 0x45109bd3 // smmla z19.s, z30.b, z16.b\n" - "uzp1 z31.d, z2.d, z25.d\n" - "uzp2 z13.d, z2.d, z25.d\n" - "scvtf z31.s, p1/m, z31.s\n" - "uzp1 z17.d, z27.d, z19.d\n" - "uzp2 z18.d, z27.d, z19.d\n" - "scvtf z13.s, p1/m, z13.s\n" - "fmla z24.s, p1/M, z31.s, z23.s\n" - "scvtf z17.s, p1/m, z17.s\n" - "scvtf z18.s, p1/m, z18.s\n" - "fmla z15.s, p1/M, z13.s, z9.s\n" - "fmla z12.s, p1/M, z17.s, z21.s\n" - "fmla z0.s, p1/M, z18.s, z4.s\n" - "bgt 7b\n" - "mov x20, %x[res_ptr]\n" - "cmp x13, #0x1\n" - "st1w { z24.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "ble 8f\n" - "cmp x13, #0x2\n" - "st1w { z15.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "ble 8f\n" - "cmp x13, #0x3\n" - "st1w { z12.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "ble 8f\n" - "st1w { z0.s }, p1, [x20]\n" - "8:" // Row tail: Accumulator store skip - "subs x24, x24, #0x8\n" - "add %x[res_ptr], %x[res_ptr], #0x20\n" - "bne 6b\n" - "subs x13, x13, #0x4\n" - "add %x[a_ptr], %x[a_ptr], x12\n" - "mov %x[res_ptr], x23\n" - "bgt 5b\n" - "9:" // Row tail: Row loop skip - : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr) - : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc) - : "cc", "memory", "p0", "p1", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", 
"x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" - ); - return; - } -#endif // #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) -#elif defined(__AVX2__) || defined(__AVX512F__) - { - const block_q4_0x8 * b_ptr_start = (const block_q4_0x8 *)vx; - const block_q8_0x4 * a_ptr_start = (const block_q8_0x4 *)vy; - int64_t b_nb = n / QK4_0; - int64_t y = 0; - // Mask to mask out nibbles from packed bytes - const __m256i m4b = _mm256_set1_epi8(0x0F); - const __m128i loadMask = _mm_blend_epi32(_mm_setzero_si128(), _mm_set1_epi32(0xFFFFFFFF), 3); - // Lookup table to convert signed nibbles to signed bytes - __m256i signextendlut = _mm256_castsi128_si256(_mm_set_epi8(-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0)); - signextendlut = _mm256_permute2f128_si256(signextendlut, signextendlut, 0); - // Permute mask used for easier vector processing at later stages - __m256i requiredOrder = _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4); - int64_t xstart = 0; - int anr = nr - nr%16; // Used to align nr with boundary of 16 - #ifdef __AVX512F__ - int anc = nc - nc%16; // Used to align nc with boundary of 16 - // Mask to mask out nibbles from packed bytes expanded to 512 bit length - const __m512i m4bexpanded = _mm512_set1_epi8(0x0F); - // Lookup table to convert signed nibbles to signed bytes expanded to 512 bit length - __m512i signextendlutexpanded = _mm512_inserti32x8(_mm512_castsi256_si512(signextendlut), signextendlut, 1); - - // Take group of four block_q8_0x4 structures at each pass of the loop and perform dot product operation - for (; y < anr / 4; y += 4) { - - const block_q8_0x4 * a_ptrs[4]; - - a_ptrs[0] = a_ptr_start + (y * nb); - for (int i = 0; i < 3; ++i) { - a_ptrs[i + 1] = a_ptrs[i] + nb; - } - - // Take group of two block_q4_0x8 structures at each pass of the loop and perform dot product operation - for (int64_t x = 0; x < anc / 8; x += 2) { - - const block_q4_0x8 * b_ptr_0 = b_ptr_start + ((x) * b_nb); - const block_q4_0x8 * b_ptr_1 = b_ptr_start + ((x + 1) * b_nb); - - // Master FP accumulators - __m512 acc_rows[16]; - for (int i = 0; i < 16; i++) { - acc_rows[i] = _mm512_setzero_ps(); + // Master FP accumulators + __m512 acc_rows[16]; + for (int i = 0; i < 16; i++) { + acc_rows[i] = _mm512_setzero_ps(); } for (int64_t b = 0; b < nb; b++) { @@ -3783,207 +1733,7 @@ static void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c } return; } -#elif defined __riscv_v - if (__riscv_vlenb() >= QK4_0) { - const size_t vl = QK4_0; - - for (int y = 0; y < nr / 4; y++) { - const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); - for (int x = 0; x < nc / ncols_interleaved; x++) { - const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb); - vfloat32m1_t sumf0 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4); - vfloat32m1_t sumf1 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4); - vfloat32m1_t sumf2 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4); - vfloat32m1_t sumf3 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4); - for (int l = 0; l < nb; l++) { - const vint8m4_t rhs_raw_vec = __riscv_vle8_v_i8m4((const int8_t *)b_ptr[l].qs, vl * 4); - const vint8m4_t rhs_vec_lo = __riscv_vsra_vx_i8m4(__riscv_vsll_vx_i8m4(rhs_raw_vec, 4, vl * 4), 4, vl * 4); - const vint8m4_t rhs_vec_hi = __riscv_vsra_vx_i8m4(rhs_raw_vec, 4, vl * 4); - const vint8m2_t rhs_vec_lo_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 
0); - const vint8m2_t rhs_vec_lo_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 1); - const vint8m2_t rhs_vec_hi_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 0); - const vint8m2_t rhs_vec_hi_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 1); - - // vector version needs Zvfhmin extension - const float a_scales[4] = { - GGML_FP16_TO_FP32(a_ptr[l].d[0]), - GGML_FP16_TO_FP32(a_ptr[l].d[1]), - GGML_FP16_TO_FP32(a_ptr[l].d[2]), - GGML_FP16_TO_FP32(a_ptr[l].d[3]) - }; - const float b_scales[8] = { - GGML_FP16_TO_FP32(b_ptr[l].d[0]), - GGML_FP16_TO_FP32(b_ptr[l].d[1]), - GGML_FP16_TO_FP32(b_ptr[l].d[2]), - GGML_FP16_TO_FP32(b_ptr[l].d[3]), - GGML_FP16_TO_FP32(b_ptr[l].d[4]), - GGML_FP16_TO_FP32(b_ptr[l].d[5]), - GGML_FP16_TO_FP32(b_ptr[l].d[6]), - GGML_FP16_TO_FP32(b_ptr[l].d[7]) - }; - const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4); - - const int64_t A0 = *(const int64_t *)&a_ptr[l].qs[0]; - const int64_t A4 = *(const int64_t *)&a_ptr[l].qs[32]; - const int64_t A8 = *(const int64_t *)&a_ptr[l].qs[64]; - const int64_t Ac = *(const int64_t *)&a_ptr[l].qs[96]; - __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment - vint16m4_t sumi_l0; - { - const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A0, vl / 4)); - const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A4, vl / 4)); - const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A8, vl / 4)); - const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ac, vl / 4)); - const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2); - const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2); - const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2); - const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2); - - sumi_l0 = sumi_hi_m; - } - - { - const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l0)); - const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl); - const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl); - const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl); - const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2); - const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2); - const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2); - const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2); - const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4); - const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4)); - const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4)); - const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4); - const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4); - - const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[0], vl / 4); - sumf0 = __riscv_vfmacc_vv_f32m1(sumf0, tmp1, b_scales_vec, vl / 4); - } - - const int64_t A1 = *(const int64_t *)&a_ptr[l].qs[8]; - const int64_t A5 = *(const int64_t *)&a_ptr[l].qs[40]; - const int64_t A9 = *(const int64_t *)&a_ptr[l].qs[72]; - const int64_t Ad = *(const int64_t *)&a_ptr[l].qs[104]; - __asm__ 
__volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment - vint16m4_t sumi_l1; - { - const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A1, vl / 4)); - const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A5, vl / 4)); - const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A9, vl / 4)); - const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ad, vl / 4)); - const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2); - const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2); - const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2); - const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2); - - sumi_l1 = sumi_hi_m; - } - - { - const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l1)); - const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl); - const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl); - const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl); - const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2); - const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2); - const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2); - const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2); - const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4); - const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4)); - const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4)); - const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4); - const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4); - - const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[1], vl / 4); - sumf1 = __riscv_vfmacc_vv_f32m1(sumf1, tmp1, b_scales_vec, vl / 4); - } - - const int64_t A2 = *(const int64_t *)&a_ptr[l].qs[16]; - const int64_t A6 = *(const int64_t *)&a_ptr[l].qs[48]; - const int64_t Aa = *(const int64_t *)&a_ptr[l].qs[80]; - const int64_t Ae = *(const int64_t *)&a_ptr[l].qs[112]; - __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment - vint16m4_t sumi_l2; - { - const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A2, vl / 4)); - const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A6, vl / 4)); - const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Aa, vl / 4)); - const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ae, vl / 4)); - const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2); - const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2); - const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2); - const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2); - - sumi_l2 = sumi_hi_m; - } - - { - const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l2)); - const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 
0, vl); - const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl); - const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl); - const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2); - const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2); - const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2); - const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2); - const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4); - const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4)); - const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4)); - const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4); - const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4); - - const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[2], vl / 4); - sumf2 = __riscv_vfmacc_vv_f32m1(sumf2, tmp1, b_scales_vec, vl / 4); - } - - const int64_t A3 = *(const int64_t *)&a_ptr[l].qs[24]; - const int64_t A7 = *(const int64_t *)&a_ptr[l].qs[56]; - const int64_t Ab = *(const int64_t *)&a_ptr[l].qs[88]; - const int64_t Af = *(const int64_t *)&a_ptr[l].qs[120]; - __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment - vint16m4_t sumi_l3; - { - const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A3, vl / 4)); - const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A7, vl / 4)); - const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ab, vl / 4)); - const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Af, vl / 4)); - const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2); - const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2); - const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2); - const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2); - - sumi_l3 = sumi_hi_m; - } - { - const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l3)); - const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl); - const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl); - const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl); - const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2); - const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2); - const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2); - const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2); - const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4); - const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4)); - const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4)); - const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4); - const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4); - - const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[3], vl / 4); - sumf3 = __riscv_vfmacc_vv_f32m1(sumf3, tmp1, b_scales_vec, vl / 
4); - } - } - __riscv_vse32_v_f32m1(&s[(y * 4 + 0) * bs + x * ncols_interleaved], sumf0, vl / 4); - __riscv_vse32_v_f32m1(&s[(y * 4 + 1) * bs + x * ncols_interleaved], sumf1, vl / 4); - __riscv_vse32_v_f32m1(&s[(y * 4 + 2) * bs + x * ncols_interleaved], sumf2, vl / 4); - __riscv_vse32_v_f32m1(&s[(y * 4 + 3) * bs + x * ncols_interleaved], sumf3, vl / 4); - } - } - - return; - } #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) float sumf[4][8]; int sumi; @@ -4006,7 +1756,7 @@ static void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; } - sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]); + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); } } } @@ -4019,7 +1769,7 @@ static void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c } } -static void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { +void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { const int qk = QK_K; const int nb = n / qk; const int ncols_interleaved = 8; @@ -5510,7 +3260,7 @@ static void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, c sumi2 = sumi2 * scales_1[j]; sumi += sumi1 + sumi2; } - sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m]; + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m]; } } } @@ -5519,7 +3269,7 @@ static void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, c for(int m = 0; m < 4; m++) { const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6); for(int j = 0; j < ncols_interleaved; j++) { - sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m]; + sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m]; } } } @@ -5533,899 +3283,3 @@ static void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, c } #endif } - -static void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { - const int qk = QK8_0; - const int nb = n / qk; - const int ncols_interleaved = 4; - const int blocklen = 4; - - assert (n % qk == 0); - assert (nr % 4 == 0); - assert (nc % ncols_interleaved == 0); - - UNUSED(s); - UNUSED(bs); - UNUSED(vx); - UNUSED(vy); - UNUSED(nr); - UNUSED(nc); - UNUSED(nb); - UNUSED(ncols_interleaved); - UNUSED(blocklen); - -#if ! ((defined(_MSC_VER)) && ! 
defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) - if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { - const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl); - - for (int y = 0; y < nr / 4; y++) { - const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); - for (int x = 0; x < nc / ncols_interleaved; x++) { - const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb); - - float32x4_t sumf[4]; - for (int m = 0; m < 4; m++) { - sumf[m] = vdupq_n_f32(0); - } - - for (int l = 0; l < nb; l++) { - float32x4_t a_d = vcvt_f32_f16(vld1_f16((const float16_t *)a_ptr[l].d)); - float32x4_t b_d = vcvt_f32_f16(vld1_f16((const float16_t *)b_ptr[l].d)); - - int32x4_t sumi_0 = vdupq_n_s32(0); - int32x4_t sumi_1 = vdupq_n_s32(0); - int32x4_t sumi_2 = vdupq_n_s32(0); - int32x4_t sumi_3 = vdupq_n_s32(0); - - for (int k = 0; k < 4; k++) { - int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 16 * k + 0); - int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16 * k + 64); - - uint8x16_t b = vld1q_u8(b_ptr[l].qs + 16 * k); - int8x16_t b_hi = vqtbl1q_s8(kvalues, b >> 4); - int8x16_t b_lo = vqtbl1q_s8(kvalues, b & 0xF); - - sumi_0 = vdotq_laneq_s32(sumi_0, b_lo, a_0, 0); - sumi_1 = vdotq_laneq_s32(sumi_1, b_lo, a_0, 1); - sumi_2 = vdotq_laneq_s32(sumi_2, b_lo, a_0, 2); - sumi_3 = vdotq_laneq_s32(sumi_3, b_lo, a_0, 3); - sumi_0 = vdotq_laneq_s32(sumi_0, b_hi, a_1, 0); - sumi_1 = vdotq_laneq_s32(sumi_1, b_hi, a_1, 1); - sumi_2 = vdotq_laneq_s32(sumi_2, b_hi, a_1, 2); - sumi_3 = vdotq_laneq_s32(sumi_3, b_hi, a_1, 3); - } - - sumf[0] = vmlaq_f32(sumf[0], vmulq_laneq_f32(b_d, a_d, 0), vcvtq_f32_s32(sumi_0)); - sumf[1] = vmlaq_f32(sumf[1], vmulq_laneq_f32(b_d, a_d, 1), vcvtq_f32_s32(sumi_1)); - sumf[2] = vmlaq_f32(sumf[2], vmulq_laneq_f32(b_d, a_d, 2), vcvtq_f32_s32(sumi_2)); - sumf[3] = vmlaq_f32(sumf[3], vmulq_laneq_f32(b_d, a_d, 3), vcvtq_f32_s32(sumi_3)); - } - - for (int m = 0; m < 4; m++) { - vst1q_f32(s + (y * 4 + m) * bs + x * 4, sumf[m]); - } - } - } - return; - } -#endif // #if ! ((defined(_MSC_VER)) && ! 
defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
-    {
-        float sumf[4][4];
-        int sumi;
-
-        for (int y = 0; y < nr / 4; y++) {
-            const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
-            for (int x = 0; x < nc / ncols_interleaved; x++) {
-                const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
-                for (int m = 0; m < 4; m++) {
-                    for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
-                }
-                for (int l = 0; l < nb; l++) {
-                    for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                        for (int m = 0; m < 4; m++) {
-                            for (int j = 0; j < ncols_interleaved; j++) {
-                                sumi = 0;
-                                for (int i = 0; i < blocklen; ++i) {
-                                    const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
-                                    const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
-                                    sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
-                                             (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
-                                }
-                                sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
-                            }
-                        }
-                    }
-                }
-                for (int m = 0; m < 4; m++) {
-                    for (int j = 0; j < ncols_interleaved; j++)
-                        s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
-                }
-            }
-        }
-    }
-}
-
-static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
-    block_q4_0x4 out;
-
-    for (int i = 0; i < 4; i++) {
-        out.d[i] = in[i].d;
-    }
-
-    const int end = QK4_0 * 2 / blck_size_interleave;
-
-    if (blck_size_interleave == 8) {
-        const uint64_t xor_mask = 0x8888888888888888ULL;
-        for (int i = 0; i < end; ++i) {
-            int src_id = i % 4;
-            int src_offset = (i / 4) * blck_size_interleave;
-            int dst_offset = i * blck_size_interleave;
-
-            uint64_t elems;
-            // Using memcpy to avoid unaligned memory accesses
-            memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
-            elems ^= xor_mask;
-            memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
-        }
-    } else if (blck_size_interleave == 4) {
-        const uint32_t xor_mask = 0x88888888;
-        for (int i = 0; i < end; ++i) {
-            int src_id = i % 4;
-            int src_offset = (i / 4) * blck_size_interleave;
-            int dst_offset = i * blck_size_interleave;
-
-            uint32_t elems;
-            memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t));
-            elems ^= xor_mask;
-            memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t));
-        }
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    return out;
-}
-
-// interleave 8 block_q4_0s in blocks of blck_size_interleave
-// returns an interleaved block_q4_0x8
-// in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
-// first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
-static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) {
-    block_q4_0x8 out;
-
-    for (int i = 0; i < 8; i++) {
-        out.d[i] = in[i].d;
-    }
-
-    const int end = QK4_0 * 4 / blck_size_interleave;
-    const uint64_t xor_mask = 0x8888888888888888ULL;
-
-    for (int i = 0; i < end; ++i) {
-        int src_id = i % 8;
-        int src_offset = (i / 8) * blck_size_interleave;
-        int dst_offset = i * blck_size_interleave;
-
-        uint64_t elems;
-        memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
-        elems ^= xor_mask;
-        memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
-    }
-
-    return out;
-}
-
-static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_interleave) {
-    block_q4_Kx8 out;
-    // Delta (scale) and dmin values of the eight Q4_K structures are copied onto the output interleaved structure
-    for (int i = 0; i < 8; i++) {
-        out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
-    }
-
-    for (int i = 0; i < 8; i++) {
-        out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
-    }
-
-    const int end = QK_K * 4 / blck_size_interleave;
-
-    // Interleave Q4_K quants by taking 8 bytes at a time
-    for (int i = 0; i < end; ++i) {
-        int src_id = i % 8;
-        int src_offset = (i / 8) * blck_size_interleave;
-        int dst_offset = i * blck_size_interleave;
-
-        uint64_t elems;
-        memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
-        memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
-    }
-
-    // The logic below unpacks and rearranges the scale and min values of the eight Q4_K structures.
-    // A Q4_K structure packs its 8 scales and 8 mins into 12 bytes (6 bits per value).
-    // The output Q4_Kx8 structure holds 96 bytes of scales: every 12-byte group contains the
-    // scales and mins of the corresponding sub-block of all eight inputs, e.g. the first 12 bytes
-    // hold the 8 scales and 8 mins of the first sub-block of each Q4_K structure.
-    uint8_t s[8], m[8];
-
-    for (int i = 0; i < 4; i++) {
-        for (int j = 0; j < 8; j++) {
-            s[j] = in[j].scales[i] & 63;
-            m[j] = in[j].scales[i + 4] & 63;
-        }
-
-        out.scales[i * 12]      = (s[0] & 63) + ((s[4] & 48) << 2);
-        out.scales[i * 12 + 1]  = (s[1] & 63) + ((s[5] & 48) << 2);
-        out.scales[i * 12 + 2]  = (s[2] & 63) + ((s[6] & 48) << 2);
-        out.scales[i * 12 + 3]  = (s[3] & 63) + ((s[7] & 48) << 2);
-        out.scales[i * 12 + 4]  = (m[0] & 63) + ((m[4] & 48) << 2);
-        out.scales[i * 12 + 5]  = (m[1] & 63) + ((m[5] & 48) << 2);
-        out.scales[i * 12 + 6]  = (m[2] & 63) + ((m[6] & 48) << 2);
-        out.scales[i * 12 + 7]  = (m[3] & 63) + ((m[7] & 48) << 2);
-        out.scales[i * 12 + 8]  = (s[4] & 15) + ((m[4] & 15) << 4);
-        out.scales[i * 12 + 9]  = (s[5] & 15) + ((m[5] & 15) << 4);
-        out.scales[i * 12 + 10] = (s[6] & 15) + ((m[6] & 15) << 4);
-        out.scales[i * 12 + 11] = (s[7] & 15) + ((m[7] & 15) << 4);
-    }
-
-    for (int i = 0; i < 4; i++) {
-        for (int j = 0; j < 8; j++) {
-            s[j] = ((in[j].scales[i] & 192) >> 2) | (in[j].scales[i + 8] & 15);
-            m[j] = ((in[j].scales[i + 4] & 192) >> 2) | ((in[j].scales[i + 8] & 240) >> 4);
-        }
-
-        out.scales[i * 12 + 48] = (s[0] & 63) + ((s[4] & 48) << 2);
-        out.scales[i * 12 + 49] = (s[1] & 63) + ((s[5] & 48) << 2);
-        out.scales[i * 12 + 50] = (s[2] & 63) + ((s[6] & 48) << 2);
-        out.scales[i * 12 + 51] = (s[3] & 63) + ((s[7] & 48) << 2);
-        out.scales[i * 12 + 52] = (m[0] & 63) + ((m[4] & 48) << 2);
-        out.scales[i * 12 + 53] = (m[1] & 63) + ((m[5] & 48) << 2);
-        out.scales[i * 12 + 54] = (m[2] & 63) + ((m[6] & 48) << 2);
-        out.scales[i * 12 + 55] = (m[3] & 63) + ((m[7] & 48) << 2);
-        out.scales[i * 12 + 56] = (s[4] & 15) + ((m[4] & 15) << 4);
-        out.scales[i * 12 + 57] = (s[5] & 15) + ((m[5] & 15) << 4);
-        out.scales[i * 12 + 58] = (s[6] & 15) + ((m[6] & 15) << 4);
-        out.scales[i * 12 + 59] = (s[7] & 15) + ((m[7] & 15) << 4);
-    }
-
-    return out;
-}
-
-static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
-    GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
-    GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
-    constexpr int nrows_interleaved = 4;
-
-    block_q4_0x4 * dst = (block_q4_0x4 *)t->data;
-    const block_q4_0 * src = (const block_q4_0 *)data;
-    block_q4_0 dst_tmp[4];
-    int nrow = ggml_nrows(t);
-    int nblocks = t->ne[0] / QK4_0;
-
-    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
-
-    if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
-        return -1;
-    }
-
-    for (int b = 0; b < nrow; b += nrows_interleaved) {
-        for (int64_t x = 0; x < nblocks; x++) {
-            for (int i = 0; i < nrows_interleaved; i++) {
-                dst_tmp[i] = src[x + i * nblocks];
-            }
-            *dst++ = make_block_q4_0x4(dst_tmp, interleave_block);
-        }
-        src += nrows_interleaved * nblocks;
-    }
-    return 0;
-
-    GGML_UNUSED(data_size);
-}
-static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
-    GGML_ASSERT(t->type == GGML_TYPE_Q4_K);
-    GGML_ASSERT(interleave_block == 8);
-    constexpr int nrows_interleaved = 8;
-
-    block_q4_Kx8 * dst = (block_q4_Kx8 *)t->data;
-    const block_q4_K * src = (const block_q4_K *) data;
-    block_q4_K dst_tmp[8];
-    int nrow = ggml_nrows(t);
-    int nblocks = t->ne[0] / QK_K;
-
-    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_K));
-
-    if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
-        return -1;
-    }
-
-    for (int b = 0; b < nrow; b += nrows_interleaved) {
-        for (int64_t x = 0; x < nblocks; x++) {
-            for (int i = 0; i < nrows_interleaved; i++) {
-                dst_tmp[i] = src[x + i * nblocks];
-            }
-            *dst++ = make_block_q4_Kx8(dst_tmp, interleave_block);
-        }
-        src += nrows_interleaved * nblocks;
-    }
-    return 0;
-
-    GGML_UNUSED(data_size);
-}
-
-static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
-    GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
-    GGML_ASSERT(interleave_block == 8);
-    constexpr int nrows_interleaved = 8;
-
-    block_q4_0x8 * dst = (block_q4_0x8 *)t->data;
-    const block_q4_0 * src = (const block_q4_0 *) data;
-    block_q4_0 dst_tmp[8];
-    int nrow = ggml_nrows(t);
-    int nblocks = t->ne[0] / QK4_0;
-
-    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
-
-    if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
-        return -1;
-    }
-
-    for (int b = 0; b < nrow; b += nrows_interleaved) {
-        for (int64_t x = 0; x < nblocks; x++) {
-            for (int i = 0; i < nrows_interleaved; i++) {
-                dst_tmp[i] = src[x + i * nblocks];
-            }
-            *dst++ = make_block_q4_0x8(dst_tmp, interleave_block);
-        }
-        src += nrows_interleaved * nblocks;
-    }
-    return 0;
-
-    GGML_UNUSED(data_size);
-}
-
-static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_size_interleave) {
-    block_iq4_nlx4 out;
-
-    for (int i = 0; i < 4; i++) {
-        out.d[i] = in[i].d;
-    }
-
-    const int end = QK4_NL * 2 / blck_size_interleave;
-
-    // TODO: this branch seems wrong
-    //if (blck_size_interleave == 8) {
-    //    for (int i = 0; i < end; ++i) {
-    //        int src_id = i % 4;
-    //        int src_offset = (i / 4) * blck_size_interleave;
-    //        int dst_offset = i * blck_size_interleave;
-
-    //        // Using memcpy to avoid unaligned memory accesses
-    //        memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
-    //    }
-    //} else
-    if (blck_size_interleave == 4) {
-        for (int i = 0; i < end; ++i) {
-            int src_id = i % 4;
-            int src_offset = (i / 4) * blck_size_interleave;
-            int dst_offset = i * blck_size_interleave;
-
-            memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint32_t));
-        }
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    return out;
-}
-
-static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
-    GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
-    //GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
-    GGML_ASSERT(interleave_block == 4);
-
-    block_iq4_nlx4 * dst = (block_iq4_nlx4 *)t->data;
-    const block_iq4_nl * src = (const block_iq4_nl *)data;
-    block_iq4_nl dst_tmp[4];
-    int nrow = ggml_nrows(t);
-    int nrows_interleaved = 4;
-    int nblocks = t->ne[0] / QK4_0;
-
-    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
-
-    if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
-        return -1;
-    }
-
-    for (int b = 0; b < nrow; b += nrows_interleaved) {
-        for (int64_t x = 0; x < nblocks; x++) {
-            for (int i = 0; i < nrows_interleaved; i++) {
-                dst_tmp[i] = src[x + i * nblocks];
-            }
-            *dst++ = make_block_iq4_nlx4(dst_tmp, interleave_block);
-        }
-        src += nrows_interleaved * nblocks;
-    }
-    return 0;
-
-    GGML_UNUSED(data_size);
-}
-
-namespace ggml::cpu::aarch64 {
-// repack
-template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
-int repack(struct ggml_tensor *, const void *, size_t);
-
-// TODO: generalise.
-template <> int repack<block_q4_0, 4, 4, GGML_TYPE_Q8_0>(struct ggml_tensor * t, const void * data, size_t data_size) {
-    return repack_q4_0_to_q4_0_4_bl(t, 4, data, data_size);
-}
-
-template <> int repack<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(struct ggml_tensor * t, const void * data, size_t data_size) {
-    return repack_q4_0_to_q4_0_4_bl(t, 8, data, data_size);
-}
-
-template <> int repack<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(struct ggml_tensor * t, const void * data, size_t data_size) {
-    return repack_q4_0_to_q4_0_8_bl(t, 8, data, data_size);
-}
-
-template <> int repack<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(struct ggml_tensor * t, const void * data, size_t data_size) {
-    return repack_q4_K_to_q4_K_8_bl(t, 8, data, data_size);
-}
-
-template <> int repack<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(struct ggml_tensor * t, const void * data, size_t data_size) {
-    return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size);
-}
-
-// TODO: needs to be revisited
-//template <> int repack<block_iq4_nl, 8, 4, GGML_TYPE_Q8_0>(struct ggml_tensor * t, const void * data, size_t data_size) {
-//    return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size);
-//}
-
-// gemv
-template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
-void gemv(int, float *, size_t, const void *, const void *, int, int);
-
-template <> void gemv<block_q4_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
-    ggml_gemv_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
-}
-
-template <> void gemv<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
-    ggml_gemv_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
-}
-
-template <> void gemv<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
-    ggml_gemv_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
-}
-
-template <> void gemv<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
-    ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
-}
-
-template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
-    ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
-}
-
-// gemm
-template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
-void gemm(int, float *, size_t, const void *, const void *, int, int);
-
-template <> void gemm<block_q4_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
-    ggml_gemm_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
-}
-
-template <> void gemm<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
-    ggml_gemm_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
-}
-
-template <> void gemm<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
-    ggml_gemm_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
-}
-
-template <> void gemm<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
-    ggml_gemm_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
-}
-
-template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
-    ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
-}
-
-class tensor_traits_base : public ggml::cpu::tensor_traits {
-  public:
-    virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
-};
-
-template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE> class tensor_traits : public tensor_traits_base {
-
-    bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
-        // not really a GGML_TYPE_Q8_0, but it has the same size.
-        switch (op->op) {
-            case GGML_OP_MUL_MAT:
-                size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
-                return true;
-            case GGML_OP_MUL_MAT_ID:
-                size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
-                size = GGML_PAD(size, sizeof(int64_t)); // + padding for the next block.
-                size += sizeof(int64_t) * (1 + op->src[0]->ne[2]) * op->src[1]->ne[2];
-                return true;
-            default:
-                // GGML_ABORT("fatal error");
-                break;
-        }
-        return false;
-    }
-
-    bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override {
-        switch (op->op) {
-            case GGML_OP_MUL_MAT:
-                forward_mul_mat(params, op);
-                return true;
-            case GGML_OP_MUL_MAT_ID:
-                forward_mul_mat_id(params, op);
-                return true;
-            default:
-                // GGML_ABORT("fatal error");
-                break;
-        }
-        return false;
-    }
-
-    void forward_mul_mat(ggml_compute_params * params, ggml_tensor * op) {
-        const ggml_tensor * src0 = op->src[0];
-        const ggml_tensor * src1 = op->src[1];
-        ggml_tensor * dst = op;
-
-        GGML_TENSOR_BINARY_OP_LOCALS
-
-        const int ith = params->ith;
-        const int nth = params->nth;
-
-        GGML_ASSERT(ne0 == ne01);
-        GGML_ASSERT(ne1 == ne11);
-        GGML_ASSERT(ne2 == ne12);
-        GGML_ASSERT(ne3 == ne13);
-
-        // dst cannot be transposed or permuted
-        GGML_ASSERT(nb0 == sizeof(float));
-        GGML_ASSERT(nb0 <= nb1);
-        GGML_ASSERT(nb1 <= nb2);
-        GGML_ASSERT(nb2 <= nb3);
-
-        GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-        GGML_ASSERT(ggml_n_dims(op->src[0]) == 2);
-        // GGML_ASSERT(ggml_n_dims(op->src[1]) == 2);
-
-        char * wdata = static_cast<char *>(params->wdata);
-        const size_t nbw1 = ggml_row_size(PARAM_TYPE, ne10);
-
-        assert(params->wsize >= nbw1 * ne11);
-
-        const ggml_from_float_t from_float = ggml_get_type_traits_cpu(PARAM_TYPE)->from_float;
-
-        int64_t i11_processed = 0;
-        for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
-            ggml_quantize_mat_t<4, PARAM_TYPE>((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), 4, ne10);
-        }
-
-        i11_processed = ne11 - ne11 % 4;
-        for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
-            from_float((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), ne10);
-        }
-
-        ggml_barrier(params->threadpool);
-
-        const void * src1_wdata = params->wdata;
-        const size_t src1_col_stride = ggml_row_size(PARAM_TYPE, ne10);
-        int64_t src0_start = (ith * ne01) / nth;
-        int64_t src0_end   = ((ith + 1) * ne01) / nth;
-        src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start;
-        src0_end   = (src0_end   % NB_COLS) ? src0_end   + NB_COLS - (src0_end   % NB_COLS) : src0_end;
-        if (src0_start >= src0_end) {
-            return;
-        }
-
-        // If there are more than three rows in src1, use gemm; otherwise, use gemv.
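        // NOTE (editorial annotation, not part of the original source): src1 rows were
        // quantized above in groups of 4 (block_q8_0x4), so the GEMM kernel below is fed
        // only the largest multiple of 4 rows (ne11 - ne11 % 4); the remaining 0-3 rows
        // are dispatched one at a time to the GEMV kernel. For example, with ne11 == 11,
        // rows 0-7 go through gemm() and rows 8, 9 and 10 through gemv().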
- if (ne11 > 3) { - gemm(ne00, - (float *) ((char *) dst->data) + src0_start, ne01, - (const char *) src0->data + src0_start * nb01, - (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start); - } - for (int iter = ne11 - ne11 % 4; iter < ne11; iter++) { - gemv(ne00, - (float *) ((char *) dst->data + (iter * nb1)) + src0_start, ne01, - (const char *) src0->data + src0_start * nb01, - (const char *) src1_wdata + (src1_col_stride * iter), 1, - src0_end - src0_start); - } - } - - void forward_mul_mat_id(ggml_compute_params * params, ggml_tensor * op) { - const ggml_tensor * src0 = op->src[0]; - const ggml_tensor * src1 = op->src[1]; - const ggml_tensor * ids = op->src[2]; - ggml_tensor * dst = op; - - GGML_TENSOR_BINARY_OP_LOCALS - - const int ith = params->ith; - const int nth = params->nth; - - const ggml_from_float_t from_float = ggml_get_type_traits_cpu(PARAM_TYPE)->from_float; - - // we don't support permuted src0 or src1 - GGML_ASSERT(nb00 == ggml_type_size(src0->type)); - GGML_ASSERT(nb10 == ggml_type_size(src1->type)); - - // dst cannot be transposed or permuted - GGML_ASSERT(nb0 == sizeof(float)); - GGML_ASSERT(nb0 <= nb1); - GGML_ASSERT(nb1 <= nb2); - GGML_ASSERT(nb2 <= nb3); - - GGML_ASSERT(ne03 == 1); - GGML_ASSERT(ne13 == 1); - GGML_ASSERT(ne3 == 1); - - GGML_ASSERT(src1->type == GGML_TYPE_F32); - - // row groups - const int n_ids = ids->ne[0]; // n_expert_used - const int n_as = ne02; // n_expert - - const size_t nbw1 = ggml_row_size(PARAM_TYPE, ne10); - const size_t nbw2 = nbw1*ne11; - const size_t nbw3 = nbw2*ne12; - - struct mmid_row_mapping { - int32_t i1; - int32_t i2; - }; - - GGML_ASSERT(params->wsize >= (GGML_PAD(nbw3, sizeof(int64_t)) + n_as * sizeof(int64_t) + - n_as * ne12 * sizeof(mmid_row_mapping))); - - auto * wdata = (char *) params->wdata; - auto * wdata_src1_end = (char *) wdata + GGML_PAD(nbw3, sizeof(int64_t)); - auto * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as] - - struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) (matrix_row_counts + n_as); // [n_as][ne12] - - // src1: float32 => param type - for (int64_t i12 = 0; i12 < ne12; ++i12) { - for (int64_t i11 = ith; i11 < ne11; i11 += nth) { - from_float((float *)((char *) src1->data + i12 * nb12 + i11 * nb11), - (void *) (wdata + i12 * nbw2 + i11 * nbw1), - ne10); - } - } - -#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id) * ne12 + (i1)] - - if (ith == 0) { - // initialize matrix_row_counts - memset(matrix_row_counts, 0, n_as * sizeof(int64_t)); - - // group rows by src0 matrix - for (int32_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) { - for (int32_t id = 0; id < n_ids; ++id) { - const int32_t i02 = - *(const int32_t *) ((const char *) ids->data + iid1 * ids->nb[1] + id * ids->nb[0]); - - GGML_ASSERT(i02 >= 0 && i02 < n_as); - - MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = { id, iid1 }; - matrix_row_counts[i02] += 1; - } - } - } - - ggml_barrier(params->threadpool); - - // compute each matrix multiplication in sequence - for (int cur_a = 0; cur_a < n_as; ++cur_a) { - const int64_t cne1 = matrix_row_counts[cur_a]; - - if (cne1 == 0) { - continue; - } - - const auto * src0_cur = (const char *) src0->data + cur_a*nb02; - - //const int64_t nr0 = ne01; // src0 rows - const int64_t nr1 = cne1; // src1 rows - - int64_t src0_cur_start = (ith * ne01) / nth; - int64_t src0_cur_end = ((ith + 1) * ne01) / nth; - - src0_cur_start = (src0_cur_start % NB_COLS) ? src0_cur_start + NB_COLS - (src0_cur_start % NB_COLS) : src0_cur_start; - src0_cur_end = (src0_cur_end % NB_COLS) ? 
src0_cur_end + NB_COLS - (src0_cur_end % NB_COLS) : src0_cur_end; - - if (src0_cur_start >= src0_cur_end) { - return; - } - - for (int ir1 = 0; ir1 < nr1; ir1++) { - struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1); - - const int id = row_mapping.i1; // selected expert index - - const int64_t i11 = id % ne11; - const int64_t i12 = row_mapping.i2; // row index in src1 - - const int64_t i1 = id; // selected expert index - const int64_t i2 = i12; // row - - const auto * src1_col = (const char *) wdata + (i11 * nbw1 + i12 * nbw2); - - gemv(ne00, - (float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01, - src0_cur + src0_cur_start * nb01, - src1_col, 1, src0_cur_end - src0_cur_start); - } - } -#undef MMID_MATRIX_ROW - } - - int repack(struct ggml_tensor * t, const void * data, size_t data_size) override { - GGML_LOG_DEBUG("%s: repack tensor %s with %s_%dx%d\n", __func__, t->name, ggml_type_name(t->type), - (int) NB_COLS, (int) INTER_SIZE); - return ggml::cpu::aarch64::repack(t, data, data_size); - } -}; - -// instance for Q4 -static const tensor_traits q4_0_4x4_q8_0; -static const tensor_traits q4_0_4x8_q8_0; -static const tensor_traits q4_0_8x8_q8_0; -static const tensor_traits q4_K_8x8_q8_K; - -// instance for IQ4 -static const tensor_traits iq4_nl_4x4_q8_0; - -} // namespace ggml::cpu::aarch64 - -static const ggml::cpu::tensor_traits * ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur) { - if (cur->type == GGML_TYPE_Q4_0) { - if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) { - if (cur->ne[1] % 8 == 0) { - return &ggml::cpu::aarch64::q4_0_8x8_q8_0; - } - } - if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) { - if (cur->ne[1] % 4 == 0) { - return &ggml::cpu::aarch64::q4_0_4x8_q8_0; - } - } - if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { - if (cur->ne[1] % 4 == 0) { - return &ggml::cpu::aarch64::q4_0_4x4_q8_0; - } - } - } else if (cur->type == GGML_TYPE_Q4_K) { - if (ggml_cpu_has_avx2()) { - if (cur->ne[1] % 8 == 0) { - return &ggml::cpu::aarch64::q4_K_8x8_q8_K; - } - } - } else if (cur->type == GGML_TYPE_IQ4_NL) { - if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { - if (cur->ne[1] % 4 == 0) { - return &ggml::cpu::aarch64::iq4_nl_4x4_q8_0; - } - } - } - - return nullptr; -} - -static enum ggml_status ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { - tensor->extra = (void *) const_cast(ggml_aarch64_get_optimal_repack_type(tensor)); - - GGML_UNUSED(buffer); - return GGML_STATUS_SUCCESS; -} - -static void ggml_backend_cpu_aarch64_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, - const void * data, size_t offset, size_t size) { - GGML_ASSERT(offset == 0); - GGML_ASSERT(size == ggml_nbytes(tensor)); - - auto tensor_traits = (ggml::cpu::aarch64::tensor_traits_base *) tensor->extra; - auto OK = tensor_traits->repack(tensor, data, size); - - GGML_ASSERT(OK == 0); - GGML_UNUSED(buffer); -} - -static const char * ggml_backend_cpu_aarch64_buffer_type_get_name(ggml_backend_buffer_type_t buft) { - return "CPU_AARCH64"; - - GGML_UNUSED(buft); -} - -static ggml_backend_buffer_t ggml_backend_cpu_aarch64_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); - - if (buffer == nullptr) { - return nullptr; - } - - buffer->buft = buft; - buffer->iface.init_tensor = 
ggml_backend_cpu_aarch64_buffer_init_tensor; - buffer->iface.set_tensor = ggml_backend_cpu_aarch64_buffer_set_tensor; - buffer->iface.get_tensor = nullptr; - buffer->iface.cpy_tensor = nullptr; - return buffer; -} - -static size_t ggml_backend_cpu_aarch64_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - return TENSOR_ALIGNMENT; - - GGML_UNUSED(buft); -} - -namespace ggml::cpu::aarch64 { -class extra_buffer_type : ggml::cpu::extra_buffer_type { - bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override { - if ( op->op == GGML_OP_MUL_MAT && - op->src[0]->buffer && - (ggml_n_dims(op->src[0]) == 2) && - op->src[0]->buffer->buft == ggml_backend_cpu_aarch64_buffer_type() && - ggml_aarch64_get_optimal_repack_type(op->src[0]) - ) { - if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) { - return false; - } - if (op->src[1]->type == GGML_TYPE_F32) { - return true; - } - //if (op->src[1]->type == GGML_TYPE_Q8_0) { - // return true; - //} - // may be possible if Q8_0 packed... - } else if (op->op == GGML_OP_MUL_MAT_ID - && op->src[0]->buffer - && (ggml_n_dims(op->src[0]) == 3) - && op->src[0]->buffer->buft == ggml_backend_cpu_aarch64_buffer_type() - && ggml_aarch64_get_optimal_repack_type(op->src[0]) - ) { - if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) { - return false; - } - if (op->src[1]->type == GGML_TYPE_F32) { - return true; - } - //if (op->src[1]->type == GGML_TYPE_Q8_0) { - // return true; - //} - } - return false; - } - - ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override { - if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_MUL_MAT_ID) { - if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_aarch64_buffer_type()) { - return (ggml::cpu::tensor_traits *) op->src[0]->extra; - } - } - return nullptr; - } -}; -} // namespace ggml::cpu::aarch64 - -ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void) { - static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_aarch64 = { - /* .iface = */ { - /* .get_name = */ ggml_backend_cpu_aarch64_buffer_type_get_name, - /* .alloc_buffer = */ ggml_backend_cpu_aarch64_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_cpu_aarch64_buffer_type_get_alignment, - /* .get_max_size = */ nullptr, // defaults to SIZE_MAX - /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes - /* .is_host = */ nullptr, - }, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), - /* .context = */ new ggml::cpu::aarch64::extra_buffer_type(), - }; - - return &ggml_backend_cpu_buffer_type_aarch64; -} diff --git a/ggml/src/ggml-cpu/common.h b/ggml/src/ggml-cpu/common.h index 3df01c1edffeb..353563dc35c5d 100644 --- a/ggml/src/ggml-cpu/common.h +++ b/ggml/src/ggml-cpu/common.h @@ -1,9 +1,10 @@ #pragma once #include "ggml.h" -#include "ggml-cpu-traits.h" +#include "traits.h" #include "ggml-cpu-impl.h" #include "ggml-impl.h" +#include "simd-mappings.h" #ifdef __cplusplus @@ -12,11 +13,11 @@ // convenience functions/macros for use in template calls // note: these won't be required after the 'traits' lookup table is used. 
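// Sketch (illustrative, not from the patch): the wrappers below are
// one-liners around ggml macros, but the underlying conversion is worth
// seeing in scalar form. BF16 keeps the top 16 bits of an IEEE-754 float
// with round-to-nearest-even; ggml's real converter also special-cases NaN.
// Assumes <string.h>/<stdint.h>; the name is hypothetical:
static inline uint16_t f32_to_bf16_scalar(float x) {
    uint32_t u;
    memcpy(&u, &x, sizeof(u));      // bit-cast float -> uint32
    u += 0x7FFF + ((u >> 16) & 1);  // round to nearest, ties to even
    return (uint16_t) (u >> 16);    // sign + exponent + top 7 mantissa bits
}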
static inline ggml_fp16_t f32_to_f16(float x) { - return GGML_FP32_TO_FP16(x); + return GGML_CPU_FP32_TO_FP16(x); } static inline float f16_to_f32(ggml_fp16_t x) { - return GGML_FP16_TO_FP32(x); + return GGML_CPU_FP16_TO_FP32(x); } static inline ggml_bf16_t f32_to_bf16(float x) { diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.h b/ggml/src/ggml-cpu/ggml-cpu-aarch64.h deleted file mode 100644 index 6e84c826b4091..0000000000000 --- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +++ /dev/null @@ -1,8 +0,0 @@ -#pragma once - -#include "ggml-cpu-traits.h" -#include "ggml.h" - -// GGML internal header - -ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void); diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h index b3f1b5ca79092..d839cf5c55e81 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-impl.h +++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h @@ -62,11 +62,17 @@ struct ggml_compute_params { #if defined(__s390x__) && defined(__VEC__) #ifndef __VXE__ #define __VXE__ -#endif +#endif // __VXE__ #ifndef __VXE2__ #define __VXE2__ -#endif -#endif +#endif // __VXE2__ +#endif // __s390x__ && __VEC__ + +#if defined(__s390x__) && defined(GGML_NNPA) +#ifndef __NNPA__ +#define __NNPA__ +#endif // __NNPA__ +#endif // __s390x__ && GGML_NNPA #if defined(__ARM_FEATURE_SVE) #include @@ -371,7 +377,7 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) #define vec_xor(a, b) ((a) ^ (b)) // Vector XOR #endif -typedef signed char char8x16_t __attribute__((vector_size(16))); +typedef signed char char8x16_t __attribute__((vector_size(16))); typedef unsigned char uchar8x16_t __attribute__((vector_size(16))); typedef int8_t int8x16_t __attribute__((vector_size(16))); @@ -382,10 +388,10 @@ typedef uint8_t uint8x16_t __attribute__((vector_size(16))); typedef uint16_t uint16x8_t __attribute__((vector_size(16))); typedef uint32_t uint32x4_t __attribute__((vector_size(16))); -typedef float float32x4_t __attribute__((vector_size(16))); -typedef double double64x2_t __attribute((vector_size(16))); +typedef float float32x4_t __attribute__((vector_size(16))); +typedef double double64x2_t __attribute__((vector_size(16))); -typedef signed long long long64x2_t __attribute((vector_size(16))); +typedef signed long long long64x2_t __attribute__((vector_size(16))); typedef unsigned long long ulong64x2_t __attribute__((vector_size(16))); typedef struct ggml_uint8x16x2_t { @@ -503,6 +509,9 @@ static __m256 __lasx_xvreplfr2vr_s(const float val) { // TODO: move to ggml-threading void ggml_barrier(struct ggml_threadpool * tp); +void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value); +int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value); + #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml-cpu/ggml-cpu-quants.c b/ggml/src/ggml-cpu/ggml-cpu-quants.c deleted file mode 100644 index 40bded4767b47..0000000000000 --- a/ggml/src/ggml-cpu/ggml-cpu-quants.c +++ /dev/null @@ -1,13891 +0,0 @@ -#define GGML_COMMON_IMPL_C -#include "ggml-common.h" - -#include "ggml-quants.h" -#include "ggml-cpu-quants.h" -#include "ggml-impl.h" -#include "ggml-cpu-impl.h" -#include "ggml-cpu.h" - -#include -#include -#include -#include -#include // for qsort -#include // for GGML_ASSERT - -#define GROUP_MAX_EPS 1e-15f -#define GROUP_MAX_EPS_IQ3_XXS 1e-8f -#define GROUP_MAX_EPS_IQ2_S 1e-8f -#define GROUP_MAX_EPS_IQ1_M 1e-7f -#define GROUP_MAX_EPS_IQ1_S 1e-12f - -#define UNUSED GGML_UNUSED - -// some compilers don't provide _mm256_set_m128i, e.g. 
gcc 7 -#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1) - -#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) -// multiply int8_t, add results pairwise twice -static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) { - // Get absolute values of x vectors - const __m128i ax = _mm_sign_epi8(x, x); - // Sign the values of the y vectors - const __m128i sy = _mm_sign_epi8(y, x); - // Perform multiplication and create 16-bit values - const __m128i dot = _mm_maddubs_epi16(ax, sy); - const __m128i ones = _mm_set1_epi16(1); - return _mm_madd_epi16(ones, dot); -} - -#if __AVX__ || __AVX2__ || __AVX512F__ -// horizontally add 8 floats -static inline float hsum_float_8(const __m256 x) { - __m128 res = _mm256_extractf128_ps(x, 1); - res = _mm_add_ps(res, _mm256_castps256_ps128(x)); - res = _mm_add_ps(res, _mm_movehl_ps(res, res)); - res = _mm_add_ss(res, _mm_movehdup_ps(res)); - return _mm_cvtss_f32(res); -} - -// horizontally add 8 int32_t -static inline int hsum_i32_8(const __m256i a) { - const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1)); - const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128); - const __m128i sum64 = _mm_add_epi32(hi64, sum128); - const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); - return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); -} - -// horizontally add 4 int32_t -static inline int hsum_i32_4(const __m128i a) { - const __m128i hi64 = _mm_unpackhi_epi64(a, a); - const __m128i sum64 = _mm_add_epi32(hi64, a); - const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); - return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); -} - -#if defined(__AVX2__) || defined(__AVX512F__) -// spread 32 bits to 32 bytes { 0x00, 0xFF } -static inline __m256i bytes_from_bits_32(const uint8_t * x) { - uint32_t x32; - memcpy(&x32, x, sizeof(uint32_t)); - const __m256i shuf_mask = _mm256_set_epi64x( - 0x0303030303030303, 0x0202020202020202, - 0x0101010101010101, 0x0000000000000000); - __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(x32), shuf_mask); - const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe); - bytes = _mm256_or_si256(bytes, bit_mask); - return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1)); -} - -// Unpack 32 4-bit fields into 32 bytes -// The output vector contains 32 bytes, each one in [ 0 .. 
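// Sketch (illustrative, not from the patch): the abs/sign pairing in
// mul_sum_i8_pairs(_float) exists because _mm_maddubs_epi16 multiplies an
// *unsigned* byte vector by a signed one. The identity
// x*y == |x| * (sign(x)*y) moves x's sign onto y so the first operand can be
// treated as unsigned. Scalar form of one pair, with a hypothetical name
// (ignoring the int16 saturation of the intrinsic):
static inline int32_t mul_sum_i8_pair_scalar(int8_t x0, int8_t y0, int8_t x1, int8_t y1) {
    const uint8_t ax0 = (uint8_t) (x0 < 0 ? -x0 : x0);      // |x|
    const uint8_t ax1 = (uint8_t) (x1 < 0 ? -x1 : x1);
    const int     sy0 = x0 < 0 ? -y0 : (x0 == 0 ? 0 : y0);  // sign(x)*y
    const int     sy1 = x1 < 0 ? -y1 : (x1 == 0 ? 0 : y1);
    return (int32_t) ax0 * sy0 + (int32_t) ax1 * sy1;       // == x0*y0 + x1*y1
}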
15 ] interval -static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) -{ - const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi); - const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp); - const __m256i lowMask = _mm256_set1_epi8( 0xF ); - return _mm256_and_si256(lowMask, bytes); -} - -// add int16_t pairwise and return as float vector -static inline __m256 sum_i16_pairs_float(const __m256i x) { - const __m256i ones = _mm256_set1_epi16(1); - const __m256i summed_pairs = _mm256_madd_epi16(ones, x); - return _mm256_cvtepi32_ps(summed_pairs); -} - -static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { -#if defined(__AVX512VNNI__) && defined(__AVX512VL__) - const __m256i zero = _mm256_setzero_si256(); - const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy); - return _mm256_cvtepi32_ps(summed_pairs); -#elif defined(__AVXVNNI__) - const __m256i zero = _mm256_setzero_si256(); - const __m256i summed_pairs = _mm256_dpbusd_avx_epi32(zero, ax, sy); - return _mm256_cvtepi32_ps(summed_pairs); -#else - // Perform multiplication and create 16-bit values - const __m256i dot = _mm256_maddubs_epi16(ax, sy); - return sum_i16_pairs_float(dot); -#endif -} - -// multiply int8_t, add results pairwise twice and return as float vector -static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { -#if __AVXVNNIINT8__ - const __m256i zero = _mm256_setzero_si256(); - const __m256i summed_pairs = _mm256_dpbssd_epi32(zero, x, y); - return _mm256_cvtepi32_ps(summed_pairs); -#else - // Get absolute values of x vectors - const __m256i ax = _mm256_sign_epi8(x, x); - // Sign the values of the y vectors - const __m256i sy = _mm256_sign_epi8(y, x); - return mul_sum_us8_pairs_float(ax, sy); -#endif -} - -static inline __m128i packNibbles( __m256i bytes ) -{ - // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh -#if __AVX512F__ - const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4); // 0000_0000_abcd_0000 - bytes = _mm256_or_si256(bytes, bytes_srli_4); // 0000_abcd_abcd_efgh - return _mm256_cvtepi16_epi8(bytes); // abcd_efgh -#else - const __m256i lowByte = _mm256_set1_epi16( 0xFF ); - __m256i high = _mm256_andnot_si256( lowByte, bytes ); - __m256i low = _mm256_and_si256( lowByte, bytes ); - high = _mm256_srli_epi16( high, 4 ); - bytes = _mm256_or_si256( low, high ); - - // Compress uint16_t lanes into bytes - __m128i r0 = _mm256_castsi256_si128( bytes ); - __m128i r1 = _mm256_extracti128_si256( bytes, 1 ); - return _mm_packus_epi16( r0, r1 ); -#endif -} -#elif defined(__AVX__) -static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) -{ - // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh - const __m128i lowByte = _mm_set1_epi16( 0xFF ); - __m128i high = _mm_andnot_si128( lowByte, bytes1 ); - __m128i low = _mm_and_si128( lowByte, bytes1 ); - high = _mm_srli_epi16( high, 4 ); - bytes1 = _mm_or_si128( low, high ); - high = _mm_andnot_si128( lowByte, bytes2 ); - low = _mm_and_si128( lowByte, bytes2 ); - high = _mm_srli_epi16( high, 4 ); - bytes2 = _mm_or_si128( low, high ); - - return _mm_packus_epi16( bytes1, bytes2); -} - -static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) { - const __m128i ax = _mm_sign_epi8(x, x); - const __m128i sy = _mm_sign_epi8(y, x); - return _mm_maddubs_epi16(ax, sy); -} - -// spread 32 bits to 32 bytes { 0x00, 0xFF } -static inline __m256i bytes_from_bits_32(const uint8_t * x) { - uint32_t x32; - memcpy(&x32, x, 
sizeof(uint32_t)); - const __m128i shuf_maskl = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000); - const __m128i shuf_maskh = _mm_set_epi64x(0x0303030303030303, 0x0202020202020202); - __m128i bytesl = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskl); - __m128i bytesh = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskh); - const __m128i bit_mask = _mm_set1_epi64x(0x7fbfdfeff7fbfdfe); - bytesl = _mm_or_si128(bytesl, bit_mask); - bytesh = _mm_or_si128(bytesh, bit_mask); - bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1)); - bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1)); - return MM256_SET_M128I(bytesh, bytesl); -} - -// Unpack 32 4-bit fields into 32 bytes -// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval -static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) -{ - // Load 16 bytes from memory - __m128i tmpl = _mm_loadu_si128((const __m128i *)rsi); - __m128i tmph = _mm_srli_epi16(tmpl, 4); - const __m128i lowMask = _mm_set1_epi8(0xF); - tmpl = _mm_and_si128(lowMask, tmpl); - tmph = _mm_and_si128(lowMask, tmph); - return MM256_SET_M128I(tmph, tmpl); -} - -// add int16_t pairwise and return as float vector -static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) { - const __m128i ones = _mm_set1_epi16(1); - const __m128i summed_pairsl = _mm_madd_epi16(ones, xl); - const __m128i summed_pairsh = _mm_madd_epi16(ones, xh); - const __m256i summed_pairs = MM256_SET_M128I(summed_pairsh, summed_pairsl); - return _mm256_cvtepi32_ps(summed_pairs); -} - -static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { - const __m128i axl = _mm256_castsi256_si128(ax); - const __m128i axh = _mm256_extractf128_si256(ax, 1); - const __m128i syl = _mm256_castsi256_si128(sy); - const __m128i syh = _mm256_extractf128_si256(sy, 1); - // Perform multiplication and create 16-bit values - const __m128i dotl = _mm_maddubs_epi16(axl, syl); - const __m128i doth = _mm_maddubs_epi16(axh, syh); - return sum_i16_pairs_float(doth, dotl); -} - -// multiply int8_t, add results pairwise twice and return as float vector -static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { - const __m128i xl = _mm256_castsi256_si128(x); - const __m128i xh = _mm256_extractf128_si256(x, 1); - const __m128i yl = _mm256_castsi256_si128(y); - const __m128i yh = _mm256_extractf128_si256(y, 1); - // Get absolute values of x vectors - const __m128i axl = _mm_sign_epi8(xl, xl); - const __m128i axh = _mm_sign_epi8(xh, xh); - // Sign the values of the y vectors - const __m128i syl = _mm_sign_epi8(yl, xl); - const __m128i syh = _mm_sign_epi8(yh, xh); - // Perform multiplication and create 16-bit values - const __m128i dotl = _mm_maddubs_epi16(axl, syl); - const __m128i doth = _mm_maddubs_epi16(axh, syh); - return sum_i16_pairs_float(doth, dotl); -} - -// larger version of mul_sum_i8_pairs_float where x and y are each represented by four 128-bit vectors -static inline __m256 mul_sum_i8_quad_float(const __m128i x_1_0, const __m128i x_1_1, const __m128i x_2_0, const __m128i x_2_1, - const __m128i y_1_0, const __m128i y_1_1, const __m128i y_2_0, const __m128i y_2_1) { - const __m128i mone = _mm_set1_epi16(1); - - const __m128i p16_1_0 = mul_add_epi8_sse(x_1_0, y_1_0); - const __m128i p16_1_1 = mul_add_epi8_sse(x_1_1, y_1_1); - const __m128i p16_2_0 = mul_add_epi8_sse(x_2_0, y_2_0); - const __m128i p16_2_1 = mul_add_epi8_sse(x_2_1, y_2_1); - const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone); - const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, 
mone); - const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone); - const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone); - const __m128i p_1 = _mm_add_epi32(p_1_0, p_1_1); - const __m128i p_2 = _mm_add_epi32(p_2_0, p_2_1); - return _mm256_cvtepi32_ps(MM256_SET_M128I(p_2, p_1)); -} - -// quad fp16 delta calculation -static inline __m256 quad_fp16_delta_float(const float x0, const float y0, const float x1, const float y1) { - // GGML_FP16_TO_FP32 is faster than Intel F16C - return _mm256_set_m128(_mm_set1_ps(GGML_FP16_TO_FP32(x1) * GGML_FP16_TO_FP32(y1)), - _mm_set1_ps(GGML_FP16_TO_FP32(x0) * GGML_FP16_TO_FP32(y0))); -} -#endif -#elif defined(__SSSE3__) -// horizontally add 4x4 floats -static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) { - __m128 res_0 =_mm_hadd_ps(a, b); - __m128 res_1 =_mm_hadd_ps(c, d); - __m128 res =_mm_hadd_ps(res_0, res_1); - res =_mm_hadd_ps(res, res); - res =_mm_hadd_ps(res, res); - - return _mm_cvtss_f32(res); -} -#endif // __AVX__ || __AVX2__ || __AVX512F__ -#endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) - -#if defined(__ARM_NEON) || defined(__wasm_simd128__) || defined(__POWER9_VECTOR__) -#define B1(c,s,n) 0x ## n ## c , 0x ## n ## s -#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s) -#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s) -#define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s) -#define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s) -#define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s) -#define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s) -#define B8(c,s ) B7(c,s, c), B7(c,s, s) - -// precomputed tables for expanding 8bits to 8 bytes: -static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4 -static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 -#endif - -#if defined(__loongarch_sx) - -static __m128i lsx_packs_w(__m128i a, __m128i b) { - __m128i tmp, tmp1; - tmp = __lsx_vsat_w(a, 15); - tmp1 = __lsx_vsat_w(b, 15); - return __lsx_vpickev_h(tmp1, tmp); -} - -static __m128i lsx_packs_h(__m128i a, __m128i b) { - __m128i tmp, tmp1; - tmp = __lsx_vsat_h(a, 7); - tmp1 = __lsx_vsat_h(b, 7); - return __lsx_vpickev_b(tmp1, tmp); -} - -static __m128i lsx_packus_h(__m128i a, __m128i b) { - __m128i tmp, tmp1; - tmp = __lsx_vsat_hu(a, 7); - tmp1 = __lsx_vsat_hu(b, 7); - return __lsx_vpickev_b(tmp1, tmp); -} - -static __m128i lsx_maddubs_h(__m128i a, __m128i b) { - __m128i tmp1, tmp2; - tmp1 = __lsx_vmulwev_h_b(a, b); - tmp2 = __lsx_vmulwod_h_b(a, b); - return __lsx_vsadd_h(tmp1, tmp2); -} - -static __m128i lsx_madd_h(__m128i a, __m128i b) { - __m128i tmp1, tmp2; - tmp1 = __lsx_vmulwev_w_h(a, b); - tmp2 = __lsx_vmulwod_w_h(a, b); - return __lsx_vadd_w(tmp1, tmp2); -} - -static __m128i lsx_set_w(int32_t a, int32_t b, int32_t c, int32_t d) { - v4i32 __ret = {d, c, b, a}; - return (__m128i)__ret; -} - -static __m128i lsx_shuffle_b(__m128i a, __m128i b) { - __m128i mask_f, zero, tmp0, tmp2, mask; - int f = 0x8f; - mask_f = __lsx_vreplgr2vr_b(f); - zero = __lsx_vldi(0); - tmp0 = __lsx_vand_v(b, mask_f); // get mask with low 4 bit and sign bits - tmp0 = __lsx_vori_b(tmp0, 0x10); // make each mask or with 0x10 prepare for positive - mask = __lsx_vsle_b(zero, tmp0); // if mask >= 0, set mask - tmp2 = __lsx_vand_v(tmp0, mask); // maskout the in2 < ones - return __lsx_vshuf_b(a, zero, tmp2); -} - -static __m128i lsx_hadd_h(__m128i a, __m128i b) { - __m128i tmp1 = __lsx_vpickev_h(b, a); - __m128i tmp2 = __lsx_vpickod_h(b, a); - return __lsx_vadd_h(tmp1, tmp2); -} - -static 
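// Sketch (illustrative, not from the patch): the B1..B8 macro tower above
// enumerates, at compile time, the 8-byte expansion of every possible byte
// value. Assuming byte i of each table entry corresponds to bit i of the
// index, a runtime equivalent of table_b2b_0[b] would be (table_b2b_1 is
// the complement; the name is hypothetical):
static inline uint64_t b2b_0_scalar(uint8_t b) {
    uint64_t out = 0;
    for (int i = 0; i < 8; ++i) {
        out |= (uint64_t) (((b >> i) & 1) << 4) << (8 * i);  // ( b) << 4
    }
    return out;
}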
__m128i lsx_hadd_w(__m128i a, __m128i b) { - __m128i tmp1 = __lsx_vpickev_w(b, a); - __m128i tmp2 = __lsx_vpickod_w(b, a); - return __lsx_vadd_w(tmp1, tmp2); -} - -static __m128 lsx_hadd_s(__m128 a, __m128 b) { - __m128 tmp1 = (__m128)__lsx_vpickev_w((__m128i)b, (__m128i)a); - __m128 tmp2 = (__m128)__lsx_vpickod_w((__m128i)b, (__m128i)a); - - return __lsx_vfadd_s(tmp1, tmp2); -} - -static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) { - __m128 res_0 =lsx_hadd_s(a, b); - __m128 res_1 =lsx_hadd_s(c, d); - __m128 res =lsx_hadd_s(res_0, res_1); - res =lsx_hadd_s(res, res); - res =lsx_hadd_s(res, res); - - return ((v4f32)res)[0]; -} -#endif - -#if defined(__loongarch_asx) - -#ifdef __clang__ -#define VREGS_PREFIX "$vr" -#define XREGS_PREFIX "$xr" -#else // GCC -#define VREGS_PREFIX "$f" -#define XREGS_PREFIX "$f" -#endif -#define __ALL_REGS "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31" -// Convert __m128i to __m256i -static inline __m256i ____m256i(__m128i in) { - __m256i out = __lasx_xvldi(0); - __asm__ volatile ( - ".irp i," __ALL_REGS "\n\t" - " .ifc %[out], " XREGS_PREFIX"\\i \n\t" - " .irp j," __ALL_REGS "\n\t" - " .ifc %[in], " VREGS_PREFIX "\\j \n\t" - " xvpermi.q $xr\\i, $xr\\j, 0x20 \n\t" - " .endif \n\t" - " .endr \n\t" - " .endif \n\t" - ".endr \n\t" - : [out] "+f" (out) : [in] "f" (in) - ); - return out; -} -// Convert two __m128i to __m256i -static inline __m256i lasx_set_q(__m128i inhi, __m128i inlo) { - __m256i out; - __asm__ volatile ( - ".irp i," __ALL_REGS "\n\t" - " .ifc %[hi], " VREGS_PREFIX "\\i \n\t" - " .irp j," __ALL_REGS "\n\t" - " .ifc %[lo], " VREGS_PREFIX "\\j \n\t" - " xvpermi.q $xr\\i, $xr\\j, 0x20 \n\t" - " .endif \n\t" - " .endr \n\t" - " .endif \n\t" - ".endr \n\t" - ".ifnc %[out], %[hi] \n\t" - ".irp i," __ALL_REGS "\n\t" - " .ifc %[out], " XREGS_PREFIX "\\i \n\t" - " .irp j," __ALL_REGS "\n\t" - " .ifc %[hi], " VREGS_PREFIX "\\j \n\t" - " xvori.b $xr\\i, $xr\\j, 0 \n\t" - " .endif \n\t" - " .endr \n\t" - " .endif \n\t" - ".endr \n\t" - ".endif \n\t" - : [out] "=f" (out), [hi] "+f" (inhi) - : [lo] "f" (inlo) - ); - return out; -} -// Convert __m256i low part to __m128i -static inline __m128i lasx_extracti128_lo(__m256i in) { - __m128i out; - __asm__ volatile ( - ".ifnc %[out], %[in] \n\t" - ".irp i," __ALL_REGS "\n\t" - " .ifc %[out], " VREGS_PREFIX "\\i \n\t" - " .irp j," __ALL_REGS "\n\t" - " .ifc %[in], " XREGS_PREFIX "\\j \n\t" - " vori.b $vr\\i, $vr\\j, 0 \n\t" - " .endif \n\t" - " .endr \n\t" - " .endif \n\t" - ".endr \n\t" - ".endif \n\t" - : [out] "=f" (out) : [in] "f" (in) - ); - return out; -} -// Convert __m256i high part to __m128i -static inline __m128i lasx_extracti128_hi(__m256i in) { - __m128i out; - __asm__ volatile ( - ".irp i," __ALL_REGS "\n\t" - " .ifc %[out], " VREGS_PREFIX "\\i \n\t" - " .irp j," __ALL_REGS "\n\t" - " .ifc %[in], " XREGS_PREFIX "\\j \n\t" - " xvpermi.q $xr\\i, $xr\\j, 0x11 \n\t" - " .endif \n\t" - " .endr \n\t" - " .endif \n\t" - ".endr \n\t" - : [out] "=f" (out) : [in] "f" (in) - ); - return out; -} - -static __m256i lasx_set_w(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) { - v8i32 __ret = {e0, e1, e2, e3, e4, e5, e6, e7}; - return (__m256i)__ret; -} - -static __m256i lasx_set_d(int64_t a, int64_t b, int64_t c, int64_t d) { - v4i64 __ret = {d, c, b, a}; - return (__m256i)__ret; -} - -static __m256i lasx_insertf128( __m128i x, __m128i y) { - return lasx_set_q(x, y); -} - -static __m256i lasx_shuffle_b(__m256i a, 
__m256i b) { - __m256i mask_f, zero, tmp0, tmp2, mask; - int f = 0x8f; - mask_f = __lasx_xvreplgr2vr_b(f); - zero = __lasx_xvldi(0); - tmp0 = __lasx_xvand_v(b, mask_f); // get mask with low 4 bit and sign bits - tmp0 = __lasx_xvori_b(tmp0, 0x10); // make each mask or with 0x10 prepare for positive - mask = __lasx_xvsle_b(zero, tmp0); // if mask >= 0, set mask - tmp2 = __lasx_xvand_v(tmp0, mask); // maskout the in2 < ones - return __lasx_xvshuf_b(a, zero, tmp2); -} - -static __m256i lasx_extu8_16(__m128i a) { - return __lasx_vext2xv_hu_bu(____m256i(a)); -} - -static __m256i lasx_ext8_16(__m128i a) { - return __lasx_vext2xv_h_b(____m256i(a)); -} - -static __m256i lasx_ext16_32(__m128i a) { - return __lasx_vext2xv_w_h(____m256i(a)); -} - -static __m128i lasx_extracti128( __m256i a, int pos) { - __m128i ret; - if( pos == 0) - { - ret = lasx_extracti128_lo(a); - } else { - ret = lasx_extracti128_hi(a); - } - return ret; -} - -static __m128 lasx_extractf128( __m256 a, int pos) { - __m128 ret; - if( pos == 0) - { - ret = (__m128)lasx_extracti128_lo((__m256i)a); - } else { - ret = (__m128)lasx_extracti128_hi((__m256i)a); - } - return ret; -} - -static __m256i lasx_maddubs_h(__m256i a, __m256i b) { - __m256i tmp1, tmp2; - tmp1 = __lasx_xvmulwev_h_b(a, b); - tmp2 = __lasx_xvmulwod_h_b(a, b); - return __lasx_xvsadd_h(tmp1, tmp2); -} - -static __m256i lasx_madd_h(__m256i a, __m256i b) { - __m256i tmp1, tmp2; - tmp1 = __lasx_xvmulwev_w_h(a, b); - tmp2 = __lasx_xvmulwod_w_h(a, b); - return __lasx_xvadd_w(tmp1, tmp2); -} - -static __m256i lasx_packs_w(__m256i a, __m256i b) { - __m256i tmp, tmp1; - tmp = __lasx_xvsat_w(a, 15); - tmp1 = __lasx_xvsat_w(b, 15); - return __lasx_xvpickev_h(tmp1, tmp); -} - -static __m256i lasx_packs_h(__m256i a, __m256i b) { - __m256i tmp, tmp1; - tmp = __lasx_xvsat_h(a, 7); - tmp1 = __lasx_xvsat_h(b, 7); - return __lasx_xvpickev_b(tmp1, tmp); -} - -static inline __m256i lasx_madd_h_b(__m256i a, __m256i b) { - __m256i tmp1, tmp2; - tmp1 = __lasx_xvmulwev_h_b(a, b); - tmp2 = __lasx_xvmulwod_h_b(a, b); - return __lasx_xvadd_h(tmp1, tmp2); -} - -static inline __m256i lasx_xvrepl128vei_h(__m256i a, const unsigned int b) { - switch (b) { - case 0: return __lasx_xvrepl128vei_h(a, 0); - case 1: return __lasx_xvrepl128vei_h(a, 1); - case 2: return __lasx_xvrepl128vei_h(a, 2); - case 3: return __lasx_xvrepl128vei_h(a, 3); - case 4: return __lasx_xvrepl128vei_h(a, 4); - case 5: return __lasx_xvrepl128vei_h(a, 5); - case 6: return __lasx_xvrepl128vei_h(a, 6); - case 7: return __lasx_xvrepl128vei_h(a, 7); - default: __builtin_unreachable(); - } -} - -static inline __m256i lasx_xvandi_b_bit(__m256i a, const unsigned int b) { - switch (b) { - case 0: return __lasx_xvandi_b(a, 1 << 0); - case 1: return __lasx_xvandi_b(a, 1 << 1); - case 2: return __lasx_xvandi_b(a, 1 << 2); - case 3: return __lasx_xvandi_b(a, 1 << 3); - case 4: return __lasx_xvandi_b(a, 1 << 4); - case 5: return __lasx_xvandi_b(a, 1 << 5); - case 6: return __lasx_xvandi_b(a, 1 << 6); - case 7: return __lasx_xvandi_b(a, 1 << 7); - default: __builtin_unreachable(); - } -} - -// multiply int8_t, add results pairwise twice -static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) { - // Get absolute values of x vectors - const __m128i ax = __lsx_vsigncov_b(x, x); - // Sign the values of the y vectors - const __m128i sy = __lsx_vsigncov_b(x, y); - // Perform multiplication and create 16-bit values - const __m128i dot = lsx_maddubs_h(ax, sy); - const __m128i ones = __lsx_vreplgr2vr_h(1); - return lsx_madd_h(ones, 
dot); -} - -// horizontally add 8 floats -static inline float hsum_float_8(const __m256 x) { - __m128 res = lasx_extractf128(x, 1); - res = __lsx_vfadd_s(res, lasx_extractf128(x, 0)); - res = __lsx_vfadd_s(res, (__m128)__lsx_vpickod_d((__m128i)res, (__m128i)res)); - res = __lsx_vfadd_s(res, (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w(res, 1), 0)); - return ((v4f32)res)[0]; -} - -// horizontally add 8 int32_t -static inline int hsum_i32_8(const __m256i a) { - - __m256i tmp1 = __lasx_xvpermi_q(a, a, 0x11); - __m256i tmp2 = __lasx_xvpermi_q(a, a, 0x00); - - __m128i tmp1_128 = lasx_extracti128_lo(tmp1); - __m128i tmp2_128 = lasx_extracti128_lo(tmp2); - - __m128i sum128 = __lsx_vadd_w(tmp1_128, tmp2_128); - - __m128i ev = __lsx_vpickev_w(sum128, sum128); - __m128i od = __lsx_vpickod_w(sum128, sum128); - __m128i sum64 = __lsx_vadd_w(ev, od); - - int sum64_1, sum64_2; - sum64_1 = __lsx_vpickve2gr_w(sum64, 0); - sum64_2 = __lsx_vpickve2gr_w(sum64, 1); - - return sum64_1 + sum64_2; -} - -// horizontally add 4 int32_t -static inline int hsum_i32_4(const __m128i a) { - __m128i ev = __lsx_vpickev_w(a, a); - __m128i od = __lsx_vpickod_w(a, a); - __m128i sum64 = __lsx_vadd_w(ev, od); - - int sum64_1, sum64_2; - sum64_1 = __lsx_vpickve2gr_w(sum64, 0); - sum64_2 = __lsx_vpickve2gr_w(sum64, 1); - - return sum64_1 + sum64_2; -} - -// spread 32 bits to 32 bytes { 0x00, 0xFF } -static inline __m256i bytes_from_bits_32(const uint8_t * x) { - - uint32_t x32; - memcpy(&x32, x, sizeof(uint32_t)); - const __m256i shuf_mask = lasx_set_d( - 0x0303030303030303, 0x0202020202020202, - 0x0101010101010101, 0x0000000000000000); - - __m256i bytes = lasx_shuffle_b(__lasx_xvreplgr2vr_w(x32), shuf_mask); - const __m256i bit_mask = __lasx_xvreplgr2vr_d(0x7fbfdfeff7fbfdfe); - bytes = __lasx_xvor_v(bytes, bit_mask); - return __lasx_xvseq_b(bytes, __lasx_xvreplgr2vr_d(-1)); -} - -// Unpack 32 4-bit fields into 32 bytes -// The output vector contains 32 bytes, each one in [ 0 .. 
15 ] interval -static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) { - const __m128i lo = __lsx_vld((const __m128i *)rsi, 0); - __m128i hi = __lsx_vsrli_h(lo, 4); - return __lasx_xvandi_b(lasx_insertf128(hi, lo), 0xf); -} - -// add int16_t pairwise and return as float vector -static inline __m256 sum_i16_pairs_float(const __m256i x) { - __m256i v = __lasx_xvpackod_h(x, x); - __m256i summed_pairs = __lasx_xvaddwev_w_h(x, v); - return __lasx_xvffint_s_w(summed_pairs); -} - -static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { - // Perform multiplication and create 16-bit values - const __m256i dot = lasx_maddubs_h(ax, sy); - return sum_i16_pairs_float(dot); -} - -// multiply int8_t, add results pairwise twice and return as float vector -static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { - const __m256i dot = lasx_madd_h_b(x, y); - return sum_i16_pairs_float(dot); -} - -static inline __m128i packNibbles( __m256i bytes ) { - // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh - const __m256i lowByte = __lasx_xvreplgr2vr_h(0xFF); - __m256i high = __lasx_xvandn_v(lowByte, bytes); - __m256i low = __lasx_xvand_v(lowByte, bytes); - high = __lasx_xvsrli_h(high, 4); - bytes = __lasx_xvor_v(low, high); - // Compress uint16_t lanes into bytes - __m128i *r0 = (__m128i *)&bytes; - __m256i tmp_h128 = __lasx_xvpermi_q(bytes, bytes, 0x11); - __m128i *r1 = (__m128i *)&tmp_h128; - - __m128i zero = __lsx_vldi(0); - __m128i tmp, tmp2, tmp3; - - tmp = __lsx_vmax_h(zero, *r0); - tmp2 = __lsx_vsat_hu(tmp, 7); - - tmp = __lsx_vmax_h(zero, *r1); - tmp3 = __lsx_vsat_hu(tmp, 7); - return __lsx_vpickev_b(tmp3, tmp2); -} -#endif //__loongarch_asx - -void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { - quantize_row_q4_0_ref(x, y, k); -} - -void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { - quantize_row_q4_1_ref(x, y, k); -} - -void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { - quantize_row_q5_0_ref(x, y, k); -} - -void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { - quantize_row_q5_1_ref(x, y, k); -} - -void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { - assert(QK8_0 == 32); - assert(k % QK8_0 == 0); - const int nb = k / QK8_0; - - block_q8_0 * GGML_RESTRICT y = vy; - -#if defined(__ARM_NEON) - for (int i = 0; i < nb; i++) { - float32x4_t srcv [8]; - float32x4_t asrcv[8]; - float32x4_t amaxv[8]; - - for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i*32 + 4*j); - for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]); - - for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]); - for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]); - for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]); - - const float amax = vmaxvq_f32(amaxv[0]); - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 
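// Sketch (illustrative, not from the patch): every SIMD branch of
// quantize_row_q8_0/q8_1 below implements the same scalar recipe as the
// quantize_row_q8_*_ref fallbacks. Per 32-float block: find the absolute
// maximum, derive the scale d = amax/127, round each value to a signed byte;
// Q8_1 additionally stores s = d * sum(q) for dot-product bias terms.
// Assumes <math.h> and QK8_0 == 32; roundf rounds half away from zero,
// while the SIMD paths round to nearest-even, so results can differ by one
// on exact ties. The name is hypothetical:
static void quantize_block_q8_sketch(const float * x, int8_t * q, float * d_out, float * s_out) {
    float amax = 0.0f;
    for (int j = 0; j < 32; ++j) {
        const float ax = fabsf(x[j]);
        if (ax > amax) amax = ax;
    }
    const float d  = amax / 127.0f;
    const float id = d ? 1.0f / d : 0.0f;       // avoid div-by-zero for all-zero blocks
    int sum = 0;
    for (int j = 0; j < 32; ++j) {
        const int v = (int) roundf(x[j] * id);  // in [-127, 127]
        q[j] = (int8_t) v;
        sum += v;
    }
    *d_out = d;                            // stored as FP16 in the real structs
    if (s_out) *s_out = d * (float) sum;   // Q8_1 only
}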
1.0f/d : 0.0f; - - y[i].d = GGML_FP32_TO_FP16(d); - - for (int j = 0; j < 8; j++) { - const float32x4_t v = vmulq_n_f32(srcv[j], id); - const int32x4_t vi = vcvtnq_s32_f32(v); - - y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); - y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); - y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); - } - } -#elif defined __wasm_simd128__ - for (int i = 0; i < nb; i++) { - v128_t srcv [8]; - v128_t asrcv[8]; - v128_t amaxv[8]; - - for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j); - for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]); - - for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]); - for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]); - for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]); - - const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0), - wasm_f32x4_extract_lane(amaxv[0], 1)), - MAX(wasm_f32x4_extract_lane(amaxv[0], 2), - wasm_f32x4_extract_lane(amaxv[0], 3))); - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = GGML_FP32_TO_FP16(d); - - for (int j = 0; j < 8; j++) { - const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id)); - const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v); - - y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0); - y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1); - y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2); - y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3); - } - } -#elif defined(__AVX2__) || defined(__AVX__) - for (int i = 0; i < nb; i++) { - // Load elements into 4 AVX vectors - __m256 v0 = _mm256_loadu_ps( x ); - __m256 v1 = _mm256_loadu_ps( x + 8 ); - __m256 v2 = _mm256_loadu_ps( x + 16 ); - __m256 v3 = _mm256_loadu_ps( x + 24 ); - x += 32; - - // Compute max(abs(e)) for the block - const __m256 signBit = _mm256_set1_ps( -0.0f ); - __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); - - __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); - max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); - max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); - const float maxScalar = _mm_cvtss_f32( max4 ); - - // Quantize these floats - const float d = maxScalar / 127.f; - y[i].d = GGML_FP32_TO_FP16(d); - const float id = ( maxScalar != 0.0f ) ? 
127.f / maxScalar : 0.0f; - const __m256 mul = _mm256_set1_ps( id ); - - // Apply the multiplier - v0 = _mm256_mul_ps( v0, mul ); - v1 = _mm256_mul_ps( v1, mul ); - v2 = _mm256_mul_ps( v2, mul ); - v3 = _mm256_mul_ps( v3, mul ); - - // Round to nearest integer - v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); - v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); - v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); - v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); - - // Convert floats to integers - __m256i i0 = _mm256_cvtps_epi32( v0 ); - __m256i i1 = _mm256_cvtps_epi32( v1 ); - __m256i i2 = _mm256_cvtps_epi32( v2 ); - __m256i i3 = _mm256_cvtps_epi32( v3 ); - -#if defined(__AVX2__) - // Convert int32 to int16 - i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 - i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 - // Convert int16 to int8 - i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 - - // We got our precious signed bytes, but the order is now wrong - // These AVX2 pack instructions process 16-byte pieces independently - // The following instruction is fixing the order - const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); - i0 = _mm256_permutevar8x32_epi32( i0, perm ); - - _mm256_storeu_si256((__m256i *)y[i].qs, i0); -#else - // Since we don't have in AVX some necessary functions, - // we split the registers in half and call AVX2 analogs from SSE - __m128i ni0 = _mm256_castsi256_si128( i0 ); - __m128i ni1 = _mm256_extractf128_si256( i0, 1); - __m128i ni2 = _mm256_castsi256_si128( i1 ); - __m128i ni3 = _mm256_extractf128_si256( i1, 1); - __m128i ni4 = _mm256_castsi256_si128( i2 ); - __m128i ni5 = _mm256_extractf128_si256( i2, 1); - __m128i ni6 = _mm256_castsi256_si128( i3 ); - __m128i ni7 = _mm256_extractf128_si256( i3, 1); - - // Convert int32 to int16 - ni0 = _mm_packs_epi32( ni0, ni1 ); - ni2 = _mm_packs_epi32( ni2, ni3 ); - ni4 = _mm_packs_epi32( ni4, ni5 ); - ni6 = _mm_packs_epi32( ni6, ni7 ); - // Convert int16 to int8 - ni0 = _mm_packs_epi16( ni0, ni2 ); - ni4 = _mm_packs_epi16( ni4, ni6 ); - - _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0); - _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); -#endif - } -#elif defined(__riscv_v) - - size_t vl = QK8_0; - - for (int i = 0; i < nb; i++) { - // load elements - vfloat32m8_t v_x = __riscv_vle32_v_f32m8(x+i*QK8_0, vl); - - vfloat32m8_t vfabs = __riscv_vfabs_v_f32m8(v_x, vl); - vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0f, vl); - vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m8_f32m1(vfabs, tmp, vl); - float amax = __riscv_vfmv_f_s_f32m1_f32(vmax); - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 
1.0f/d : 0.0f; - - y[i].d = GGML_FP32_TO_FP16(d); - - vfloat32m8_t x0 = __riscv_vfmul_vf_f32m8(v_x, id, vl); - - // convert to integer - vint16m4_t vi = __riscv_vfncvt_x_f_w_i16m4(x0, vl); - vint8m2_t vs = __riscv_vncvt_x_x_w_i8m2(vi, vl); - - // store result - __riscv_vse8_v_i8m2(y[i].qs , vs, vl); - } - -#elif defined(__POWER9_VECTOR__) - for (int i = 0; i < nb; i++) { - vector float srcv [8]; - vector float asrcv[8]; - vector float amaxv[8]; - vector signed int vi[8]; - - for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j); - for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]); - - for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]); - for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]); - for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]); - - const float amax = MAX(MAX(vec_extract(amaxv[0], 0), - vec_extract(amaxv[0], 1)), - MAX(vec_extract(amaxv[0], 2), - vec_extract(amaxv[0], 3))); - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 1.0f/d : 0.0f; - const vector float vid = vec_splats(id); - - y[i].d = GGML_FP32_TO_FP16(d); - - for (int j = 0; j < 8; j++) { - const vector float v = vec_round(vec_mul(srcv[j], vid)); - vi[j] = vec_cts(v, 0); - } - vec_xst(vec_pack(vec_pack(vi[0], vi[1]), vec_pack(vi[2], vi[3])), 0, &y[i].qs[0]); - vec_xst(vec_pack(vec_pack(vi[4], vi[5]), vec_pack(vi[6], vi[7])), 16, &y[i].qs[0]); - } - -#elif defined(__loongarch_asx) - for (int i = 0; i < nb; i++) { - __m256 v0 = (__m256)__lasx_xvld( x , 0); - __m256 v1 = (__m256)__lasx_xvld( x , 32); - __m256 v2 = (__m256)__lasx_xvld( x , 64); - __m256 v3 = (__m256)__lasx_xvld( x , 96); - x += 32; - - // Compute max(abs(e)) for the block - const __m256 sign_bit = __lasx_xvreplfr2vr_s( -0.0f ); - __m256 max_abs = (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v0 ); - max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v1 ) ); - max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v2 ) ); - max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v3 ) ); - - __m128 max4 = __lsx_vfmax_s( lasx_extractf128( max_abs, 1 ), lasx_extractf128( max_abs , 0) ); - max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) ); - __m128 tmp = max4; - max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vinsgr2vr_w(tmp, __lsx_vpickve2gr_w( max4, 1 ), 0 )); - const float max_scalar = ((v4f32)max4)[0]; - - // Quantize these floats - const float d = max_scalar / 127.f; - y[i].d = GGML_FP32_TO_FP16(d); - const float id = ( max_scalar != 0.0f ) ? 
127.f / max_scalar : 0.0f; - const __m256 mul = (__m256)__lasx_xvreplfr2vr_s( id ); - - // Apply the multiplier - v0 = __lasx_xvfmul_s( v0, mul ); - v1 = __lasx_xvfmul_s( v1, mul ); - v2 = __lasx_xvfmul_s( v2, mul ); - v3 = __lasx_xvfmul_s( v3, mul ); - - // Round to nearest integer - __m256i i0 = __lasx_xvftintrne_w_s( v0 ); - __m256i i1 = __lasx_xvftintrne_w_s( v1 ); - __m256i i2 = __lasx_xvftintrne_w_s( v2 ); - __m256i i3 = __lasx_xvftintrne_w_s( v3 ); - - __m128i ni0 = lasx_extracti128( i0, 0 ); - __m128i ni1 = lasx_extracti128( i0, 1); - __m128i ni2 = lasx_extracti128( i1, 0); - __m128i ni3 = lasx_extracti128( i1, 1); - __m128i ni4 = lasx_extracti128( i2, 0); - __m128i ni5 = lasx_extracti128( i2, 1); - __m128i ni6 = lasx_extracti128( i3, 0); - __m128i ni7 = lasx_extracti128( i3, 1); - - // Convert int32 to int16 - ni0 = lsx_packs_w( ni0, ni1 ); - ni2 = lsx_packs_w( ni2, ni3 ); - ni4 = lsx_packs_w( ni4, ni5 ); - ni6 = lsx_packs_w( ni6, ni7 ); - // Convert int16 to int8 - ni0 = lsx_packs_h( ni0, ni2 ); - ni4 = lsx_packs_h( ni4, ni6 ); - - __lsx_vst(ni0, (__m128i *)(y[i].qs + 0), 0); - __lsx_vst(ni4, (__m128i *)(y[i].qs + 16), 0); - - } -#elif defined(__VXE__) || defined(__VXE2__) - for (int i = 0; i < nb; i++) { - __vector float srcv [8]; - __vector float asrcv[8]; - __vector float amaxv[8]; - - for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j); - for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]); - for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]); - for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]); - for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]); - - const float amax = MAX(MAX(vec_extract(amaxv[0], 0), - vec_extract(amaxv[0], 1)), - MAX(vec_extract(amaxv[0], 2), - vec_extract(amaxv[0], 3))); - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 1.0f / d : 0.0f; - - y[i].d = GGML_FP32_TO_FP16(d); - - for (int j = 0; j < 8; j++) { - const __vector float v = vec_mul(srcv[j], vec_splats(id)); - const __vector int32_t vi = vec_signed(v); - - y[i].qs[4*j + 0] = vec_extract(vi, 0); - y[i].qs[4*j + 1] = vec_extract(vi, 1); - y[i].qs[4*j + 2] = vec_extract(vi, 2); - y[i].qs[4*j + 3] = vec_extract(vi, 3); - } - } -#else - GGML_UNUSED(nb); - // scalar - quantize_row_q8_0_ref(x, y, k); -#endif -} - -void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { - assert(k % QK8_1 == 0); - const int nb = k / QK8_1; - - block_q8_1 * GGML_RESTRICT y = vy; - -#if defined(__ARM_NEON) - for (int i = 0; i < nb; i++) { - float32x4_t srcv [8]; - float32x4_t asrcv[8]; - float32x4_t amaxv[8]; - - for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i*32 + 4*j); - for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]); - - for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]); - for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]); - for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]); - - const float amax = vmaxvq_f32(amaxv[0]); - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 
1.0f/d : 0.0f; - - y[i].d = GGML_FP32_TO_FP16(d); - - int32x4_t accv = vdupq_n_s32(0); - - for (int j = 0; j < 8; j++) { - const float32x4_t v = vmulq_n_f32(srcv[j], id); - const int32x4_t vi = vcvtnq_s32_f32(v); - - y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); - y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); - y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); - - accv = vaddq_s32(accv, vi); - } - - y[i].s = GGML_FP32_TO_FP16(d * vaddvq_s32(accv)); - } -#elif defined __wasm_simd128__ - for (int i = 0; i < nb; i++) { - v128_t srcv [8]; - v128_t asrcv[8]; - v128_t amaxv[8]; - - for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j); - for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]); - - for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]); - for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]); - for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]); - - const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0), - wasm_f32x4_extract_lane(amaxv[0], 1)), - MAX(wasm_f32x4_extract_lane(amaxv[0], 2), - wasm_f32x4_extract_lane(amaxv[0], 3))); - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = GGML_FP32_TO_FP16(d); - - v128_t accv = wasm_i32x4_splat(0); - - for (int j = 0; j < 8; j++) { - const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id)); - const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v); - - y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0); - y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1); - y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2); - y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3); - - accv = wasm_i32x4_add(accv, vi); - } - - y[i].s = GGML_FP32_TO_FP16( - d * (wasm_i32x4_extract_lane(accv, 0) + - wasm_i32x4_extract_lane(accv, 1) + - wasm_i32x4_extract_lane(accv, 2) + - wasm_i32x4_extract_lane(accv, 3))); - } -#elif defined(__AVX2__) || defined(__AVX__) - for (int i = 0; i < nb; i++) { - // Load elements into 4 AVX vectors - __m256 v0 = _mm256_loadu_ps( x ); - __m256 v1 = _mm256_loadu_ps( x + 8 ); - __m256 v2 = _mm256_loadu_ps( x + 16 ); - __m256 v3 = _mm256_loadu_ps( x + 24 ); - x += 32; - - // Compute max(abs(e)) for the block - const __m256 signBit = _mm256_set1_ps( -0.0f ); - __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); - - __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); - max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); - max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); - const float max_scalar = _mm_cvtss_f32( max4 ); - - // Quantize these floats - const float d = max_scalar / 127.f; - y[i].d = GGML_FP32_TO_FP16(d); - const float id = ( max_scalar != 0.0f ) ? 
127.f / max_scalar : 0.0f; - const __m256 mul = _mm256_set1_ps( id ); - - // Apply the multiplier - v0 = _mm256_mul_ps( v0, mul ); - v1 = _mm256_mul_ps( v1, mul ); - v2 = _mm256_mul_ps( v2, mul ); - v3 = _mm256_mul_ps( v3, mul ); - - // Round to nearest integer - v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); - v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); - v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); - v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); - - // Convert floats to integers - __m256i i0 = _mm256_cvtps_epi32( v0 ); - __m256i i1 = _mm256_cvtps_epi32( v1 ); - __m256i i2 = _mm256_cvtps_epi32( v2 ); - __m256i i3 = _mm256_cvtps_epi32( v3 ); - -#if defined(__AVX2__) - // Compute the sum of the quants and set y[i].s - y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3)))); - - // Convert int32 to int16 - i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 - i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 - // Convert int16 to int8 - i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 - - // We got our precious signed bytes, but the order is now wrong - // These AVX2 pack instructions process 16-byte pieces independently - // The following instruction is fixing the order - const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); - i0 = _mm256_permutevar8x32_epi32( i0, perm ); - - _mm256_storeu_si256((__m256i *)y[i].qs, i0); -#else - // Since we don't have in AVX some necessary functions, - // we split the registers in half and call AVX2 analogs from SSE - __m128i ni0 = _mm256_castsi256_si128( i0 ); - __m128i ni1 = _mm256_extractf128_si256( i0, 1); - __m128i ni2 = _mm256_castsi256_si128( i1 ); - __m128i ni3 = _mm256_extractf128_si256( i1, 1); - __m128i ni4 = _mm256_castsi256_si128( i2 ); - __m128i ni5 = _mm256_extractf128_si256( i2, 1); - __m128i ni6 = _mm256_castsi256_si128( i3 ); - __m128i ni7 = _mm256_extractf128_si256( i3, 1); - - // Compute the sum of the quants and set y[i].s - const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3)); - const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7)); - y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_4(_mm_add_epi32(s0, s1))); - - // Convert int32 to int16 - ni0 = _mm_packs_epi32( ni0, ni1 ); - ni2 = _mm_packs_epi32( ni2, ni3 ); - ni4 = _mm_packs_epi32( ni4, ni5 ); - ni6 = _mm_packs_epi32( ni6, ni7 ); - // Convert int16 to int8 - ni0 = _mm_packs_epi16( ni0, ni2 ); - ni4 = _mm_packs_epi16( ni4, ni6 ); - - _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0); - _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); -#endif - } -#elif defined(__riscv_v) - - size_t vl = QK8_1; - - for (int i = 0; i < nb; i++) { - // load elements - vfloat32m8_t v_x = __riscv_vle32_v_f32m8(x+i*QK8_1, vl); - - vfloat32m8_t vfabs = __riscv_vfabs_v_f32m8(v_x, vl); - vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0, vl); - vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m8_f32m1(vfabs, tmp, vl); - float amax = __riscv_vfmv_f_s_f32m1_f32(vmax); - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 
1.0f/d : 0.0f; - - y[i].d = GGML_FP32_TO_FP16(d); - - vfloat32m8_t x0 = __riscv_vfmul_vf_f32m8(v_x, id, vl); - - // convert to integer - vint16m4_t vi = __riscv_vfncvt_x_f_w_i16m4(x0, vl); - vint8m2_t vs = __riscv_vncvt_x_x_w_i8m2(vi, vl); - - // store result - __riscv_vse8_v_i8m2(y[i].qs , vs, vl); - - // compute sum for y[i].s - vint16m1_t tmp2 = __riscv_vmv_v_x_i16m1(0, vl); - vint16m1_t vwrs = __riscv_vwredsum_vs_i8m2_i16m1(vs, tmp2, vl); - - // set y[i].s - int sum = __riscv_vmv_x_s_i16m1_i16(vwrs); - y[i].s = GGML_FP32_TO_FP16(sum*d); - } - -#elif defined(__POWER9_VECTOR__) - for (int i = 0; i < nb; i++) { - vector float srcv [8]; - vector float asrcv[8]; - vector float amaxv[8]; - vector signed int vi[8]; - - for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j); - for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]); - - for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]); - for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]); - for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]); - - const float amax = MAX(MAX(vec_extract(amaxv[0], 0), - vec_extract(amaxv[0], 1)), - MAX(vec_extract(amaxv[0], 2), - vec_extract(amaxv[0], 3))); - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 1.0f/d : 0.0f; - const vector float vid = vec_splats(id); - - y[i].d = GGML_FP32_TO_FP16(d); - - vector int accv = vec_splats(0); - - for (int j = 0; j < 8; j++) { - const vector float v = vec_round(vec_mul(srcv[j], vid)); - vi[j] = vec_cts(v, 0); - - accv = vec_add(accv, vi[j]); - } - vec_xst(vec_pack(vec_pack(vi[0], vi[1]), vec_pack(vi[2], vi[3])), 0, &y[i].qs[0]); - vec_xst(vec_pack(vec_pack(vi[4], vi[5]), vec_pack(vi[6], vi[7])), 16, &y[i].qs[0]); - - accv = vec_add(accv, vec_sld(accv, accv, 4)); - accv = vec_add(accv, vec_sld(accv, accv, 8)); - y[i].s = GGML_FP32_TO_FP16(d * vec_extract(accv, 0)); - } - -#elif defined(__loongarch_asx) - for (int i = 0; i < nb; i++) { - __m256 v0 = (__m256)__lasx_xvld( x , 0 ); - __m256 v1 = (__m256)__lasx_xvld( x , 32 ); - __m256 v2 = (__m256)__lasx_xvld( x , 64 ); - __m256 v3 = (__m256)__lasx_xvld( x , 96 ); - x += 32; - - // Compute max(abs(e)) for the block - const __m256 sign_bit = __lasx_xvreplfr2vr_s( -0.0f ); - __m256 max_abs = (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v0 ); - max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v1 ) ); - max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v2 ) ); - max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v3 ) ); - - __m128 max4 = __lsx_vfmax_s( lasx_extractf128( max_abs, 1 ), lasx_extractf128( max_abs, 0) ); - max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) ); - __m128 tmp = max4; - max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vextrins_w((__m128i)tmp, (__m128i)max4, 0x10 )); - const float max_scalar = ((v4f32)max4)[0]; - - // Quantize these floats - const float d = max_scalar / 127.f; - y[i].d = GGML_FP32_TO_FP16(d); - const float id = ( max_scalar != 0.0f ) ? 
127.f / max_scalar : 0.0f; - const __m256 mul = __lasx_xvreplfr2vr_s( id ); - - // Apply the multiplier - v0 = __lasx_xvfmul_s( v0, mul ); - v1 = __lasx_xvfmul_s( v1, mul ); - v2 = __lasx_xvfmul_s( v2, mul ); - v3 = __lasx_xvfmul_s( v3, mul ); - - // Round to nearest integer - __m256i i0 = __lasx_xvftintrne_w_s( v0 ); - __m256i i1 = __lasx_xvftintrne_w_s( v1 ); - __m256i i2 = __lasx_xvftintrne_w_s( v2 ); - __m256i i3 = __lasx_xvftintrne_w_s( v3 ); - - __m128i ni0 = lasx_extracti128(i0, 0); - __m128i ni1 = lasx_extracti128( i0, 1); - __m128i ni2 = lasx_extracti128( i1, 0); - __m128i ni3 = lasx_extracti128( i1, 1); - __m128i ni4 = lasx_extracti128( i2, 0 ); - __m128i ni5 = lasx_extracti128( i2, 1); - __m128i ni6 = lasx_extracti128( i3, 0); - __m128i ni7 = lasx_extracti128( i3, 1); - - // Compute the sum of the quants and set y[i].s - const __m128i s0 = __lsx_vadd_w(__lsx_vadd_w(ni0, ni1), __lsx_vadd_w(ni2, ni3)); - const __m128i s1 = __lsx_vadd_w(__lsx_vadd_w(ni4, ni5), __lsx_vadd_w(ni6, ni7)); - y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_4(__lsx_vadd_w(s0, s1))); - - // Convert int32 to int16 - ni0 = lsx_packs_w( ni0, ni1 ); - ni2 = lsx_packs_w( ni2, ni3 ); - ni4 = lsx_packs_w( ni4, ni5 ); - ni6 = lsx_packs_w( ni6, ni7 ); - // Convert int16 to int8 - ni0 = lsx_packs_h( ni0, ni2 ); - ni4 = lsx_packs_h( ni4, ni6 ); - - __lsx_vst(ni0, (__m128i *)(y[i].qs + 0), 0); - __lsx_vst(ni4, (__m128i *)(y[i].qs + 16), 0); - } -#elif defined(__VXE__) || defined(__VXE2__) - for (int i = 0; i < nb; i++) { - __vector float srcv [8]; - __vector float asrcv[8]; - __vector float amaxv[8]; - - for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j); - for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]); - for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]); - for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]); - for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]); - - const float amax = MAX(MAX(vec_extract(amaxv[0], 0), - vec_extract(amaxv[0], 1)), - MAX(vec_extract(amaxv[0], 2), - vec_extract(amaxv[0], 3))); - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 
1.0f / d : 0.0f; - - y[i].d = GGML_FP32_TO_FP16(d); - - __vector int32_t acc = vec_splats(0); - - for (int j = 0; j < 8; j++) { - const __vector float v = vec_mul(srcv[j], vec_splats(id)); - const __vector int32_t vi = vec_signed(v); - - y[i].qs[4*j + 0] = vec_extract(vi, 0); - y[i].qs[4*j + 1] = vec_extract(vi, 1); - y[i].qs[4*j + 2] = vec_extract(vi, 2); - y[i].qs[4*j + 3] = vec_extract(vi, 3); - - acc = vec_add(acc, vi); - } - - y[i].s = GGML_FP32_TO_FP16(d * (acc[0] + acc[1] + acc[2] + acc[3])); - } -#else - GGML_UNUSED(nb); - // scalar - quantize_row_q8_1_ref(x, y, k); -#endif -} - -// -// 2-6 bit quantization in super-blocks -// - -// -// ===================== Helper functions -// -static inline int nearest_int(float fval) { - assert(fabsf(fval) <= 4194303.f); - float val = fval + 12582912.f; - int i; memcpy(&i, &val, sizeof(int)); - return (i & 0x007fffff) - 0x00400000; -} - -static float make_qx_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, int rmse_type, - const float * GGML_RESTRICT qw) { - float max = 0; - float amax = 0; - for (int i = 0; i < n; ++i) { - float ax = fabsf(x[i]); - if (ax > amax) { amax = ax; max = x[i]; } - } - if (amax < GROUP_MAX_EPS) { // all zero - for (int i = 0; i < n; ++i) { - L[i] = 0; - } - return 0.f; - } - float iscale = -nmax / max; - if (rmse_type == 0) { - for (int i = 0; i < n; ++i) { - int l = nearest_int(iscale * x[i]); - L[i] = nmax + MAX(-nmax, MIN(nmax-1, l)); - } - return 1/iscale; - } - bool return_early = false; - if (rmse_type < 0) { - rmse_type = -rmse_type; - return_early = true; - } - float sumlx = 0; - float suml2 = 0; -#ifdef HAVE_BUGGY_APPLE_LINKER - // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7 - for (volatile int i = 0; i < n; ++i) { -#else - for (int i = 0; i < n; ++i) { -#endif - int l = nearest_int(iscale * x[i]); - l = MAX(-nmax, MIN(nmax-1, l)); - L[i] = l + nmax; - float w = qw ? qw[i] : rmse_type == 1 ? x[i] * x[i] : rmse_type == 2 ? 1 : rmse_type == 3 ? fabsf(x[i]) : sqrtf(fabsf(x[i])); - sumlx += w*x[i]*l; - suml2 += w*l*l; - } - float scale = suml2 ? sumlx/suml2 : 0.0f; - if (return_early) return suml2 > 0 ? 0.5f*(scale + 1/iscale) : 1/iscale; - float best = scale * sumlx; - for (int is = -9; is <= 9; ++is) { - if (is == 0) { - continue; - } - iscale = -(nmax + 0.1f*is) / max; - sumlx = suml2 = 0; - for (int i = 0; i < n; ++i) { - int l = nearest_int(iscale * x[i]); - l = MAX(-nmax, MIN(nmax-1, l)); - float w = qw ? qw[i] : rmse_type == 1 ? x[i] * x[i] : rmse_type == 2 ? 1 : rmse_type == 3 ? 
fabsf(x[i]) : sqrtf(fabsf(x[i])); - sumlx += w*x[i]*l; - suml2 += w*l*l; - } - if (suml2 > 0 && sumlx*sumlx > best*suml2) { - for (int i = 0; i < n; ++i) { - int l = nearest_int(iscale * x[i]); - L[i] = nmax + MAX(-nmax, MIN(nmax-1, l)); - } - scale = sumlx/suml2; best = scale*sumlx; - } - } - return scale; -} - -static float make_q3_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, bool do_rmse) { - float max = 0; - float amax = 0; - for (int i = 0; i < n; ++i) { - float ax = fabsf(x[i]); - if (ax > amax) { amax = ax; max = x[i]; } - } - if (amax < GROUP_MAX_EPS) { // all zero - for (int i = 0; i < n; ++i) { L[i] = 0; } - return 0.f; - } - float iscale = -nmax / max; - if (do_rmse) { - float sumlx = 0; - float suml2 = 0; - for (int i = 0; i < n; ++i) { - int l = nearest_int(iscale * x[i]); - l = MAX(-nmax, MIN(nmax-1, l)); - L[i] = l; - float w = x[i]*x[i]; - sumlx += w*x[i]*l; - suml2 += w*l*l; - } - for (int itry = 0; itry < 5; ++itry) { - int n_changed = 0; - for (int i = 0; i < n; ++i) { - float w = x[i]*x[i]; - float slx = sumlx - w*x[i]*L[i]; - if (slx > 0) { - float sl2 = suml2 - w*L[i]*L[i]; - int new_l = nearest_int(x[i] * sl2 / slx); - new_l = MAX(-nmax, MIN(nmax-1, new_l)); - if (new_l != L[i]) { - slx += w*x[i]*new_l; - sl2 += w*new_l*new_l; - if (sl2 > 0 && slx*slx*suml2 > sumlx*sumlx*sl2) { - L[i] = new_l; sumlx = slx; suml2 = sl2; - ++n_changed; - } - } - } - } - if (!n_changed) { - break; - } - } - for (int i = 0; i < n; ++i) { - L[i] += nmax; - } - return sumlx / suml2; - } - for (int i = 0; i < n; ++i) { - int l = nearest_int(iscale * x[i]); - l = MAX(-nmax, MIN(nmax-1, l)); - L[i] = l + nmax; - } - return 1/iscale; -} - -static float make_qkx1_quants(int n, int nmax, const float * GGML_RESTRICT x, uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, - int ntry, float alpha) { - float min = x[0]; - float max = x[0]; - for (int i = 1; i < n; ++i) { - if (x[i] < min) min = x[i]; - if (x[i] > max) max = x[i]; - } - if (max == min) { - for (int i = 0; i < n; ++i) L[i] = 0; - *the_min = 0; - return 0.f; - } - if (min > 0) min = 0; - float iscale = nmax/(max - min); - float scale = 1/iscale; - for (int itry = 0; itry < ntry; ++itry) { - float sumlx = 0; int suml2 = 0; - bool did_change = false; - for (int i = 0; i < n; ++i) { - int l = nearest_int(iscale*(x[i] - min)); - l = MAX(0, MIN(nmax, l)); - if (l != L[i]) { - L[i] = l; - did_change = true; - } - sumlx += (x[i] - min)*l; - suml2 += l*l; - } - scale = sumlx/suml2; - float sum = 0; - for (int i = 0; i < n; ++i) { - sum += x[i] - scale*L[i]; - } - min = alpha*min + (1 - alpha)*sum/n; - if (min > 0) min = 0; - iscale = 1/scale; - if (!did_change) break; - } - *the_min = -min; - return scale; -} - -static float make_qkx2_quants(int n, int nmax, const float * GGML_RESTRICT x, const float * GGML_RESTRICT weights, - uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, uint8_t * GGML_RESTRICT Laux, - float rmin, float rdelta, int nstep, bool use_mad) { - float min = x[0]; - float max = x[0]; - float sum_w = weights[0]; - float sum_x = sum_w * x[0]; -#ifdef HAVE_BUGGY_APPLE_LINKER - // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7 - for (volatile int i = 1; i < n; ++i) { -#else - for (int i = 1; i < n; ++i) { -#endif - if (x[i] < min) min = x[i]; - if (x[i] > max) max = x[i]; - float w = weights[i]; - sum_w += w; - sum_x += w * x[i]; - } - if (min > 0) min = 0; - if (max == min) { - for (int i = 0; i < n; ++i) L[i] = 0; - *the_min = -min; - return 0.f; - } - 
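The `nstep` search that follows re-fits `(scale, min)` for every candidate `iscale` by solving the weighted least-squares problem min over (scale, min) of sum_i w_i*(scale*l_i + min - x_i)^2 in closed form via its 2x2 normal equations. A minimal scalar sketch of that step, with an illustrative helper name (`ls_fit_scale_min` is not a ggml symbol):

```c
#include <stdint.h>

// Closed-form weighted least-squares fit of scale and min, as performed per
// candidate iscale by make_qkx2_quants. Returns 0 when the 2x2 system is
// degenerate (D <= 0), in which case the caller keeps its previous best fit.
static int ls_fit_scale_min(int n, const float * x, const float * w,
                            const uint8_t * l, float * scale, float * min) {
    float sum_w = 0, sum_x = 0, sum_l = 0, sum_l2 = 0, sum_xl = 0;
    for (int i = 0; i < n; ++i) {
        sum_w  += w[i];
        sum_x  += w[i]*x[i];
        sum_l  += w[i]*l[i];
        sum_l2 += w[i]*l[i]*l[i];
        sum_xl += w[i]*l[i]*x[i];
    }
    const float D = sum_w*sum_l2 - sum_l*sum_l; // determinant of the normal equations
    if (D <= 0) {
        return 0;
    }
    *scale = (sum_w*sum_xl - sum_x*sum_l)/D;
    *min   = (sum_l2*sum_x - sum_l*sum_xl)/D;
    return 1;
}
```

As in the loop below, a positive fitted minimum is then clamped to zero and the scale re-derived as `sum_xl/sum_l2`, since the k-quant formats store the minimum negated.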
float iscale = nmax/(max - min); - float scale = 1/iscale; - float best_mad = 0; - for (int i = 0; i < n; ++i) { - int l = nearest_int(iscale*(x[i] - min)); - L[i] = MAX(0, MIN(nmax, l)); - float diff = scale * L[i] + min - x[i]; - diff = use_mad ? fabsf(diff) : diff * diff; - float w = weights[i]; - best_mad += w * diff; - } - if (nstep < 1) { - *the_min = -min; - return scale; - } - for (int is = 0; is <= nstep; ++is) { - iscale = (rmin + rdelta*is + nmax)/(max - min); - float sum_l = 0, sum_l2 = 0, sum_xl = 0; - for (int i = 0; i < n; ++i) { - int l = nearest_int(iscale*(x[i] - min)); - l = MAX(0, MIN(nmax, l)); - Laux[i] = l; - float w = weights[i]; - sum_l += w*l; - sum_l2 += w*l*l; - sum_xl += w*l*x[i]; - } - float D = sum_w * sum_l2 - sum_l * sum_l; - if (D > 0) { - float this_scale = (sum_w * sum_xl - sum_x * sum_l)/D; - float this_min = (sum_l2 * sum_x - sum_l * sum_xl)/D; - if (this_min > 0) { - this_min = 0; - this_scale = sum_xl / sum_l2; - } - float mad = 0; - for (int i = 0; i < n; ++i) { - float diff = this_scale * Laux[i] + this_min - x[i]; - diff = use_mad ? fabsf(diff) : diff * diff; - float w = weights[i]; - mad += w * diff; - } - if (mad < best_mad) { - for (int i = 0; i < n; ++i) { - L[i] = Laux[i]; - } - best_mad = mad; - scale = this_scale; - min = this_min; - } - } - } - *the_min = -min; - return scale; -} - -static inline void get_scale_min_k4(int j, const uint8_t * GGML_RESTRICT q, uint8_t * GGML_RESTRICT d, uint8_t * GGML_RESTRICT m) { - if (j < 4) { - *d = q[j] & 63; *m = q[j + 4] & 63; - } else { - *d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4); - *m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4); - } -} - -//========================- 2-bit (de)-quantization - -void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { - quantize_row_q2_K_ref(x, vy, k); -} - -//========================= 3-bit (de)-quantization - -void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { - quantize_row_q3_K_ref(x, vy, k); -} - -// ====================== 4-bit (de)-quantization - -void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { - assert(k % QK_K == 0); - block_q4_K * GGML_RESTRICT y = vy; - quantize_row_q4_K_ref(x, y, k); -} - -// ====================== 5-bit (de)-quantization - -void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { - assert(k % QK_K == 0); - block_q5_K * GGML_RESTRICT y = vy; - quantize_row_q5_K_ref(x, y, k); -} - -// ====================== 6-bit (de)-quantization - -void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { - assert(k % QK_K == 0); - block_q6_K * GGML_RESTRICT y = vy; - quantize_row_q6_K_ref(x, y, k); -} - -// ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs) - -void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { - assert(k % QK_K == 0); - block_tq1_0 * GGML_RESTRICT y = vy; - quantize_row_tq1_0_ref(x, y, k); -} - -void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { - assert(k % QK_K == 0); - block_tq2_0 * GGML_RESTRICT y = vy; - quantize_row_tq2_0_ref(x, y, k); -} - -static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; - -//===================================== Q8_K ============================================== - -void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { 
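The `__wasm_simd128__` path below implements, per super-block of `QK_K` values, the same scheme as `quantize_row_q8_K_ref`: pick the signed extreme, fold its sign into `iscale = -127/max`, quantize, and record 16-element partial sums for later dot products. A hedged scalar sketch, assuming the usual `block_q8_K` layout of one `float d`, `QK_K` int8 quants, and `QK_K/16` int16 `bsums` (the helper name is illustrative):

```c
#include <math.h>
#include <stdint.h>
#include <string.h>

// Scalar sketch of Q8_K quantization for one super-block of 256 values.
static void q8_K_block_sketch(const float * x, float * d,
                              int8_t qs[256], int16_t bsums[16]) {
    float amax = 0.0f, max = 0.0f; // largest magnitude, keeping its sign in max
    for (int j = 0; j < 256; ++j) {
        const float ax = fabsf(x[j]);
        if (ax > amax) { amax = ax; max = x[j]; }
    }
    if (amax == 0.0f) { // all-zero block: zero scale and quants
        *d = 0.0f;
        memset(qs, 0, 256);
        memset(bsums, 0, 16*sizeof(int16_t));
        return;
    }
    const float iscale = -127.f/max; // sign of the extreme folded into the scale
    for (int j = 0; j < 256; ++j) {
        int v = (int) lrintf(iscale*x[j]);    // round to nearest, like nearest_int above
        qs[j] = (int8_t) (v > 127 ? 127 : v); // defensive clamp
    }
    for (int j = 0; j < 16; ++j) { // partial sums over groups of 16 quants
        int sum = 0;
        for (int k = 0; k < 16; ++k) {
            sum += qs[16*j + k];
        }
        bsums[j] = (int16_t) sum;
    }
    *d = 1/iscale;
}
```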
-#ifdef __wasm_simd128__ - assert(k % QK_K == 0); - const int64_t nb = k / QK_K; - block_q8_K * GGML_RESTRICT yc = y; // Cast to proper type - - for (int i = 0; i < nb; i++) { - const float * x_block = x + i * QK_K; - - v128_t min_vec = wasm_v128_load(x_block); - v128_t max_vec = min_vec; - - for (int j = 4; j < QK_K; j += 4) { - v128_t x_vec = wasm_v128_load(x_block + j); - max_vec = wasm_f32x4_pmax(max_vec, x_vec); - min_vec = wasm_f32x4_pmin(min_vec, x_vec); - } - max_vec = wasm_f32x4_pmax(max_vec, wasm_i32x4_shuffle(max_vec, max_vec, 2, 3, 0, 1)); - max_vec = wasm_f32x4_pmax(max_vec, wasm_i32x4_shuffle(max_vec, max_vec, 1, 0, 3, 2)); - min_vec = wasm_f32x4_pmin(min_vec, wasm_i32x4_shuffle(min_vec, min_vec, 2, 3, 0, 1)); - min_vec = wasm_f32x4_pmin(min_vec, wasm_i32x4_shuffle(min_vec, min_vec, 1, 0, 3, 2)); - float max = wasm_f32x4_extract_lane(max_vec, 0); - float min = wasm_f32x4_extract_lane(min_vec, 0); - float amax = -min > max ? min : max; - - if (amax == 0.0f) { - yc[i].d = 0.0f; - const v128_t zero = wasm_i8x16_splat(0); - for (int j = 0; j < QK_K; j += 16) { - wasm_v128_store(yc[i].qs + j, zero); - } - continue; - } - - const float iscale = -127.0f / amax; - const v128_t scale_vec = wasm_f32x4_splat(iscale); - - // Process 16 elements per iteration - for (int j = 0, jb = 0; j < QK_K; j += 16, jb++) { - // Load and quantize 16 floats - v128_t x0 = wasm_v128_load(x_block + j); - v128_t x1 = wasm_v128_load(x_block + j + 4); - v128_t x2 = wasm_v128_load(x_block + j + 8); - v128_t x3 = wasm_v128_load(x_block + j + 12); - - v128_t q0 = wasm_f32x4_nearest(wasm_f32x4_mul(x0, scale_vec)); - v128_t q1 = wasm_f32x4_nearest(wasm_f32x4_mul(x1, scale_vec)); - v128_t q2 = wasm_f32x4_nearest(wasm_f32x4_mul(x2, scale_vec)); - v128_t q3 = wasm_f32x4_nearest(wasm_f32x4_mul(x3, scale_vec)); - - // Convert to i32 with saturation - v128_t i0 = wasm_i32x4_trunc_sat_f32x4(q0); - v128_t i1 = wasm_i32x4_trunc_sat_f32x4(q1); - v128_t i2 = wasm_i32x4_trunc_sat_f32x4(q2); - v128_t i3 = wasm_i32x4_trunc_sat_f32x4(q3); - - // Pack into 16 i8 values - v128_t i8 = wasm_i8x16_narrow_i16x8( - wasm_i16x8_narrow_i32x4(i0, i1), - wasm_i16x8_narrow_i32x4(i2, i3) - ); - wasm_v128_store(yc[i].qs + j, i8); - - // Calculate bsums using SIMD - v128_t sum16 = wasm_i16x8_add( - wasm_i16x8_extend_low_i8x16(i8), - wasm_i16x8_extend_high_i8x16(i8) - ); - v128_t sum32 = wasm_i32x4_add( - wasm_i32x4_extend_low_i16x8(sum16), - wasm_i32x4_extend_high_i16x8(sum16) - ); - sum32 = wasm_i32x4_add(sum32, wasm_i32x4_shuffle(sum32, sum32, 2, 3, 0, 1)); - sum32 = wasm_i32x4_add(sum32, wasm_i32x4_shuffle(sum32, sum32, 1, 0, 3, 2)); - yc[i].bsums[jb] = wasm_i32x4_extract_lane(sum32, 0); - } - - yc[i].d = 1.0f / iscale; - } -#else - quantize_row_q8_K_ref(x, y, k); -#endif -} - -//===================================== Dot products ================================= - -// -// Helper functions -// -#if __AVX__ || __AVX2__ || __AVX512F__ - -// shuffles to pick the required scales in dot products -static inline __m256i get_scale_shuffle_q3k(int i) { - static const uint8_t k_shuffle[128] = { - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, - 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, - 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, - 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, - }; - return _mm256_loadu_si256((const __m256i*)k_shuffle + i); -} 
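Indexing note for these tables: `(const __m256i *)k_shuffle + i` advances in 32-byte units, so each `i` selects one full row. In the q3k table just above, row `i` replicates 16-bit scale entry `2i` eight times in the low 16-byte lane and entry `2i+1` eight times in the high lane, so one byte shuffle of a loaded row broadcasts two consecutive scales across the two lanes. A small generator that reproduces the 128-byte table (illustrative only, not ggml code):

```c
#include <stdint.h>

// Rebuild the q3k scale-shuffle table: 4 rows of 32 bytes, each 16-byte
// half replicating one 16-bit scale index eight times.
static void gen_scale_shuffle_q3k(uint8_t table[128]) {
    for (int row = 0; row < 4; ++row) {        // one row per 32-byte load
        for (int half = 0; half < 2; ++half) { // low/high 16-byte lane
            const int pair = 2*row + half;     // which 16-bit scale to broadcast
            for (int k = 0; k < 8; ++k) {
                table[32*row + 16*half + 2*k + 0] = (uint8_t)(2*pair + 0);
                table[32*row + 16*half + 2*k + 1] = (uint8_t)(2*pair + 1);
            }
        }
    }
}
```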
-static inline __m256i get_scale_shuffle_k4(int i) { - static const uint8_t k_shuffle[256] = { - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, - 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, - 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, - 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, - 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, - 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, - 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, - 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15 - }; - return _mm256_loadu_si256((const __m256i*)k_shuffle + i); -} -static inline __m128i get_scale_shuffle(int i) { - static const uint8_t k_shuffle[128] = { - 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, - 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, - 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, - 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, - 10,10,10,10,10,10,10,10, 11,11,11,11,11,11,11,11, - 12,12,12,12,12,12,12,12, 13,13,13,13,13,13,13,13, - 14,14,14,14,14,14,14,14, 15,15,15,15,15,15,15,15 - }; - return _mm_loadu_si128((const __m128i*)k_shuffle + i); -} -#elif defined(__loongarch_asx) -// shuffles to pick the required scales in dot products -static inline __m256i get_scale_shuffle_q3k(int i) { - static const uint8_t k_shuffle[128] = { - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, - 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, - 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, - 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, - }; - return __lasx_xvld((const __m256i*)k_shuffle + i, 0); -} -static inline __m256i get_scale_shuffle_k4(int i) { - static const uint8_t k_shuffle[256] = { - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, - 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, - 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, - 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, - 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, - 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, - 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, - 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15 - }; - return __lasx_xvld((const __m256i*)k_shuffle + i, 0); -} -static inline __m128i get_scale_shuffle(int i) { - static const uint8_t k_shuffle[128] = { - 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, - 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, - 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, - 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, - 10,10,10,10,10,10,10,10, 11,11,11,11,11,11,11,11, - 12,12,12,12,12,12,12,12, 13,13,13,13,13,13,13,13, - 
14,14,14,14,14,14,14,14, 15,15,15,15,15,15,15,15 - }; - return __lsx_vld((const __m128i*)k_shuffle + i, 0); -} -#endif - -void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - const int qk = QK8_0; - const int nb = n / qk; - - assert(n % qk == 0); -#if defined(__ARM_FEATURE_MATMUL_INT8) - assert((nrc == 2) || (nrc == 1)); -#else - assert(nrc == 1); -#endif - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_q4_0 * GGML_RESTRICT x = vx; - const block_q8_0 * GGML_RESTRICT y = vy; - -#if defined(__ARM_FEATURE_MATMUL_INT8) - if (nrc == 2) { - const block_q4_0 * GGML_RESTRICT vx0 = vx; - const block_q4_0 * GGML_RESTRICT vx1 = (const block_q4_0 *) ((const uint8_t*)vx + bx); - const block_q8_0 * GGML_RESTRICT vy0 = vy; - const block_q8_0 * GGML_RESTRICT vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by); - - float32x4_t sumv0 = vdupq_n_f32(0.0f); - - for (int i = 0; i < nb; i++) { - const block_q4_0 * GGML_RESTRICT b_x0 = &vx0[i]; - const block_q4_0 * GGML_RESTRICT b_x1 = &vx1[i]; - const block_q8_0 * GGML_RESTRICT b_y0 = &vy0[i]; - const block_q8_0 * GGML_RESTRICT b_y1 = &vy1[i]; - - const uint8x16_t m4b = vdupq_n_u8(0x0F); - const int8x16_t s8b = vdupq_n_s8(0x8); - - const uint8x16_t v0_0 = vld1q_u8(b_x0->qs); - const uint8x16_t v0_1 = vld1q_u8(b_x1->qs); - - // 4-bit -> 8-bit - const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); - const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); - const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); - const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); - - // sub 8 - const int8x16_t x0_l = vsubq_s8(v0_0l, s8b); - const int8x16_t x0_h = vsubq_s8(v0_0h, s8b); - const int8x16_t x1_l = vsubq_s8(v0_1l, s8b); - const int8x16_t x1_h = vsubq_s8(v0_1h, s8b); - - // load y - const int8x16_t y0_l = vld1q_s8(b_y0->qs); - const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16); - const int8x16_t y1_l = vld1q_s8(b_y1->qs); - const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16); - - float32_t _scale[4] = { - GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d), - GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d), - GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d), - GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d) - }; - float32x4_t scale = vld1q_f32(_scale); - - int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); - int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); - - int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); - int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); - - int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); - int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); - - int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); - int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); - - sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)), - l1, r1)), l2, r2)), l3, r3))), scale); - } - - float32x4_t sumv1 = vextq_f32 (sumv0, sumv0, 2); - float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1); - - vst1_f32(s, 
vget_low_f32 (sumv2)); - vst1_f32(s + bs, vget_high_f32(sumv2)); - - return; - } -#endif - - int ib = 0; - float sumf = 0; - -#if defined(__ARM_FEATURE_SVE) - svfloat32_t sumv0 = svdup_n_f32(0.0f); - svfloat32_t sumv1 = svdup_n_f32(0.0f); - - const int vector_length = ggml_cpu_get_sve_cnt()*8; - - // VLA Implementation using switch case - switch (vector_length) { - case 128: - { - // predicate for activating higher lanes for 4 float32 elements - const svbool_t ph4 = svptrue_pat_b32(SV_VL4); - - for (; ib + 1 < nb; ib += 2) { - const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0]; - const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1]; - const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; - const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; - - // load x - const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs); - const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs); - - // 4-bit -> 8-bit - const svint8_t qx0l = svreinterpret_s8_u8(svand_n_u8_m(svptrue_b8(), qx0r, 0x0F)); - const svint8_t qx0h = svreinterpret_s8_u8(svlsr_n_u8_m(svptrue_b8(), qx0r, 0x04)); - const svint8_t qx1l = svreinterpret_s8_u8(svand_n_u8_m(svptrue_b8(), qx1r, 0x0F)); - const svint8_t qx1h = svreinterpret_s8_u8(svlsr_n_u8_m(svptrue_b8(), qx1r, 0x04)); - - // sub 8 - const svint8_t qx0ls = svsub_n_s8_x(svptrue_b8(), qx0h, 8); - const svint8_t qx0hs = svsub_n_s8_x(svptrue_b8(), qx0l, 8); - const svint8_t qx1ls = svsub_n_s8_x(svptrue_b8(), qx1h, 8); - const svint8_t qx1hs = svsub_n_s8_x(svptrue_b8(), qx1l, 8); - - // load y - const svint8_t qy0h = svld1_s8(svptrue_b8(), y0->qs); - const svint8_t qy0l = svld1_s8(svptrue_b8(), y0->qs + 16); - const svint8_t qy1h = svld1_s8(svptrue_b8(), y1->qs); - const svint8_t qy1l = svld1_s8(svptrue_b8(), y1->qs + 16); - - // dot product - sumv0 = svmla_n_f32_x(ph4, sumv0, svcvt_f32_s32_x(ph4, svadd_x(ph4, - svdot_s32(svdup_n_s32(0), qx0ls, qy0l), - svdot_s32(svdup_n_s32(0), qx0hs, qy0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = svmla_n_f32_x(ph4, sumv1, svcvt_f32_s32_x(ph4, svadd_x(ph4, - svdot_s32(svdup_n_s32(0), qx1ls, qy1l), - svdot_s32(svdup_n_s32(0), qx1hs, qy1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); - } - - sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); - } break; - case 256: - { - // predicate for activating higher lanes for 16 int8 elements - const svbool_t ph16 = svptrue_pat_b8(SV_VL16); - // predicate for activating lower lanes for 16 int8 elements - const svbool_t pl16 = svnot_b_z(svptrue_b8(), ph16); - - for (; ib + 1 < nb; ib += 2) { - const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0]; - const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1]; - const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; - const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; - - // load x - const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs); - const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs); - - // 4-bit -> 8-bit - const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx0r, 0x0F), 0x04)); - const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx1r, 0x0F), 0x04)); - - // sub 8 - const svint8_t qx0s = svsub_n_s8_x(svptrue_b8(), qx0, 8); - const svint8_t qx1s = svsub_n_s8_x(svptrue_b8(), qx1, 8); - - // load y - const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs); - const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs); - - // dot product - sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), - svdot_s32(svdup_n_s32(0), qx0s, qy0)), 
GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), - svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); - } - - sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); - } break; - case 512: - { - // predicate for activating higher lanes for 32 int8 elements - const svbool_t ph32 = svptrue_pat_b8(SV_VL32); - - // predicate for activating higher lanes for 16 int8 elements - const svbool_t ph16 = svptrue_pat_b8(SV_VL16); - // predicate for activating lower lanes for 16 int8 elements from first 32 int8 activated lanes - const svbool_t pl16 = svnot_b_z(ph32, ph16); - - for (; ib + 1 < nb; ib += 2) { - const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0]; - const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1]; - const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; - const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; - - // load x - const svuint8_t qx0r = svld1rq_u8(ph32, x0->qs); - const svuint8_t qx1r = svld1rq_u8(ph32, x1->qs); - - // 4-bit -> 8-bit - const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx0r, 0x0F), 0x04)); - const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx1r, 0x0F), 0x04)); - - // sub 8 - const svint8_t qx0s = svsub_n_s8_x(ph32, qx0, 8); - const svint8_t qx1s = svsub_n_s8_x(ph32, qx1, 8); - - // load y - const svint8_t qy0 = svld1_s8(ph32, y0->qs); - const svint8_t qy1 = svld1_s8(ph32, y1->qs); - - // dot product - sumv0 = svmla_n_f32_x(ph32, sumv0, svcvt_f32_s32_x(ph32, - svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = svmla_n_f32_x(ph32, sumv1, svcvt_f32_s32_x(ph32, - svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); - } - - sumf = svaddv_f32(ph32, svadd_f32_x(ph32, sumv0, sumv1)); - } break; - default: - assert(false && "Unsupported vector length"); - break; - } - -#elif defined(__ARM_NEON) - float32x4_t sumv0 = vdupq_n_f32(0.0f); - float32x4_t sumv1 = vdupq_n_f32(0.0f); - - for (; ib + 1 < nb; ib += 2) { - const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0]; - const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1]; - const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; - const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; - - const uint8x16_t m4b = vdupq_n_u8(0x0F); - const int8x16_t s8b = vdupq_n_s8(0x8); - - const uint8x16_t v0_0 = vld1q_u8(x0->qs); - const uint8x16_t v0_1 = vld1q_u8(x1->qs); - - // 4-bit -> 8-bit - const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); - const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); - const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); - const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); - - // sub 8 - const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b); - const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b); - const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b); - const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b); - - // load y - const int8x16_t v1_0l = vld1q_s8(y0->qs); - const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); - const int8x16_t v1_1l = vld1q_s8(y1->qs); - const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); - - // dot product into int32x4_t - const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h); - const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), 
GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); - } - - sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); -#elif defined __wasm_simd128__ - v128_t sumv = wasm_f32x4_splat(0.0f); - - const v128_t m4b = wasm_i8x16_splat(0x0F); - const v128_t s8b = wasm_i8x16_splat(0x8); - - for (; ib + 1 < nb; ib += 2) { - const block_q4_0 * GGML_RESTRICT x0 = &x[ib]; - const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1]; - const block_q8_0 * GGML_RESTRICT y0 = &y[ib]; - const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; - - // Load and process x0 - v128_t v0_0 = wasm_v128_load(x0->qs); - v128_t v0_0l = wasm_v128_and(v0_0, m4b); - v128_t v0_0h = wasm_u8x16_shr(v0_0, 4); - v128_t v0_0ls = wasm_i8x16_sub(v0_0l, s8b); - v128_t v0_0hs = wasm_i8x16_sub(v0_0h, s8b); - - // Load y0 vectors - v128_t y0_l = wasm_v128_load(y0->qs); - v128_t y0_h = wasm_v128_load(y0->qs + 16); - - // Extend to i16x8 and compute dot products - v128_t dx0l = wasm_i16x8_extend_low_i8x16(v0_0ls); - v128_t dx0h = wasm_i16x8_extend_high_i8x16(v0_0ls); - v128_t dx0hl = wasm_i16x8_extend_low_i8x16(v0_0hs); - v128_t dx0hh = wasm_i16x8_extend_high_i8x16(v0_0hs); - - v128_t dy0ll = wasm_i16x8_extend_low_i8x16(y0_l); - v128_t dy0lh = wasm_i16x8_extend_high_i8x16(y0_l); - v128_t dy0hl = wasm_i16x8_extend_low_i8x16(y0_h); - v128_t dy0hh = wasm_i16x8_extend_high_i8x16(y0_h); - - v128_t dp0 = wasm_i32x4_add( - wasm_i32x4_add( - wasm_i32x4_dot_i16x8(dx0l, dy0ll), - wasm_i32x4_dot_i16x8(dx0h, dy0lh) - ), - wasm_i32x4_add( - wasm_i32x4_dot_i16x8(dx0hl, dy0hl), - wasm_i32x4_dot_i16x8(dx0hh, dy0hh) - ) - ); - - // Load and process x1 - v128_t v0_1 = wasm_v128_load(x1->qs); - v128_t v0_1l = wasm_v128_and(v0_1, m4b); - v128_t v0_1h = wasm_u8x16_shr(v0_1, 4); - v128_t v0_1ls = wasm_i8x16_sub(v0_1l, s8b); - v128_t v0_1hs = wasm_i8x16_sub(v0_1h, s8b); - - // Load y1 vectors - v128_t y1_l = wasm_v128_load(y1->qs); - v128_t y1_h = wasm_v128_load(y1->qs + 16); - - // Extend to i16x8 and compute dot products - v128_t dx1l = wasm_i16x8_extend_low_i8x16(v0_1ls); - v128_t dx1h = wasm_i16x8_extend_high_i8x16(v0_1ls); - v128_t dx1hl = wasm_i16x8_extend_low_i8x16(v0_1hs); - v128_t dx1hh = wasm_i16x8_extend_high_i8x16(v0_1hs); - - v128_t dy1ll = wasm_i16x8_extend_low_i8x16(y1_l); - v128_t dy1lh = wasm_i16x8_extend_high_i8x16(y1_l); - v128_t dy1hl = wasm_i16x8_extend_low_i8x16(y1_h); - v128_t dy1hh = wasm_i16x8_extend_high_i8x16(y1_h); - - v128_t dp1 = wasm_i32x4_add( - wasm_i32x4_add( - wasm_i32x4_dot_i16x8(dx1l, dy1ll), - wasm_i32x4_dot_i16x8(dx1h, dy1lh) - ), - wasm_i32x4_add( - wasm_i32x4_dot_i16x8(dx1hl, dy1hl), - wasm_i32x4_dot_i16x8(dx1hh, dy1hh) - ) - ); - - // Accumulate results with scaling - float scale0 = GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d); - float scale1 = GGML_FP16_TO_FP32(x1->d) * GGML_FP16_TO_FP32(y1->d); - - sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(dp0), wasm_f32x4_splat(scale0))); - sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(dp1), wasm_f32x4_splat(scale1))); - } - - sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + - wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3); -#elif defined(__AVX2__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - - // Main loop - for (; ib < nb; ++ib) { - /* Compute combined scale for the block */ - const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) ); - - 
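The next statement expands the block with `bytes_from_nibbles_32`, which widens the 16 packed q4_0 bytes into 32 unsigned values in [0, 15]; in the q4_0 layout the low nibbles hold elements 0..15 and the high nibbles elements 16..31, exactly as the scalar fallback at the end of this function pairs `qs[j] & 0x0F` with `y->qs[j]` and `qs[j] >> 4` with `y->qs[j + qk/2]`. A scalar equivalent, as a hedged sketch rather than the intrinsic helper itself:

```c
#include <stdint.h>

// Scalar equivalent of bytes_from_nibbles_32: expand 16 packed bytes into
// 32 values in [0, 15], low nibbles first, matching the q4_0 element order.
static void bytes_from_nibbles_32_scalar(const uint8_t * qs, uint8_t out[32]) {
    for (int j = 0; j < 16; ++j) {
        out[j]      = qs[j] & 0x0F; // elements 0..15
        out[j + 16] = qs[j] >> 4;   // elements 16..31
    }
}
```

The subsequent `_mm256_sub_epi8(qx, off)` then recenters all 32 lanes from [0, 15] to [-8, 7] with a single instruction.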
__m256i qx = bytes_from_nibbles_32(x[ib].qs); - - // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. - const __m256i off = _mm256_set1_epi8( 8 ); - qx = _mm256_sub_epi8( qx, off ); - - __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs); - - const __m256 q = mul_sum_i8_pairs_float(qx, qy); - - /* Multiply q with scale and accumulate */ - acc = _mm256_fmadd_ps( d, q, acc ); - } - - sumf = hsum_float_8(acc); -#elif defined(__AVX__) - __m256 accum = _mm256_setzero_ps(); - for (; ib + 1 < nb; ib += 2) { - const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs); - const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs); - const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs); - const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1); - const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs); - const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1); - - const __m128i q4b_1_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_1), _mm_set1_epi8(8)); - const __m128i q4b_1_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_1, 4)), _mm_set1_epi8(8)); - const __m128i q4b_2_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_2), _mm_set1_epi8(8)); - const __m128i q4b_2_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_2, 4)), _mm_set1_epi8(8)); - - const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0); - const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1); - const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0); - const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1); - const __m128i p_1 = _mm_add_epi16(p16_1_0, p16_1_1); - const __m128i p_2 = _mm_add_epi16(p16_2_0, p16_2_1); - const __m256 p = sum_i16_pairs_float(p_2, p_1); - - const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d); - accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum); - } - - sumf = hsum_float_8(accum); -#elif defined(__SSSE3__) - // set constants - const __m128i lowMask = _mm_set1_epi8(0xF); - const __m128i off = _mm_set1_epi8(8); - - // Initialize accumulator with zeros - __m128 acc_0 = _mm_setzero_ps(); - __m128 acc_1 = _mm_setzero_ps(); - __m128 acc_2 = _mm_setzero_ps(); - __m128 acc_3 = _mm_setzero_ps(); - - for (; ib + 1 < nb; ib += 2) { - _mm_prefetch(&x[ib] + sizeof(block_q4_0), _MM_HINT_T0); - _mm_prefetch(&y[ib] + sizeof(block_q8_0), _MM_HINT_T0); - - // Compute combined scale for the block 0 and 1 - const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) ); - - const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[ib].qs); - - __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1); - __m128i by_0 = _mm_loadu_si128((const __m128i *)y[ib].qs); - bx_0 = _mm_sub_epi8(bx_0, off); - const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); - - __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4)); - __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[ib].qs + 16)); - bx_1 = _mm_sub_epi8(bx_1, off); - const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1); - - _mm_prefetch(&x[ib] + 2 * sizeof(block_q4_0), _MM_HINT_T0); - _mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0); - - // Compute combined scale for the block 2 and 3 - const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[ib + 1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) ); - - const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs); - - __m128i bx_2 = 
_mm_and_si128(lowMask, tmp_2_3); - __m128i by_2 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs); - bx_2 = _mm_sub_epi8(bx_2, off); - const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2); - - __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4)); - __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[ib + 1].qs + 16)); - bx_3 = _mm_sub_epi8(bx_3, off); - const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3); - - // Convert int32_t to float - __m128 p0 = _mm_cvtepi32_ps(i32_0); - __m128 p1 = _mm_cvtepi32_ps(i32_1); - __m128 p2 = _mm_cvtepi32_ps(i32_2); - __m128 p3 = _mm_cvtepi32_ps(i32_3); - - // Apply the scale - __m128 p0_d = _mm_mul_ps( d_0_1, p0 ); - __m128 p1_d = _mm_mul_ps( d_0_1, p1 ); - __m128 p2_d = _mm_mul_ps( d_2_3, p2 ); - __m128 p3_d = _mm_mul_ps( d_2_3, p3 ); - - // Acummulate - acc_0 = _mm_add_ps(p0_d, acc_0); - acc_1 = _mm_add_ps(p1_d, acc_1); - acc_2 = _mm_add_ps(p2_d, acc_2); - acc_3 = _mm_add_ps(p3_d, acc_3); - } - - sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3); -#elif defined(__riscv_v) - size_t vl = qk / 2; - - for (; ib < nb; ++ib) { - // load elements - vuint8m1_t tx = __riscv_vle8_v_u8m1(x[ib].qs, vl); - - vint8m1_t y0 = __riscv_vle8_v_i8m1(y[ib].qs, vl); - vint8m1_t y1 = __riscv_vle8_v_i8m1(y[ib].qs+16, vl); - - // mask and store lower part of x, and then upper part - vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl); - vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl); - - vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a); - vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l); - - // subtract offset - vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 8, vl); - vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 8, vl); - - vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl); - vint16m2_t vec_mul2 = __riscv_vwmacc_vv_i16m2(vec_mul1, v1, y1, vl); - - vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); - vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl); - - int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); - - sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d); - } - -#elif defined(__POWER9_VECTOR__) - const vector signed char lowMask = vec_splats((signed char)0xF); - const vector signed int v0 = vec_splats((int32_t)0); - const vector unsigned char v4 = vec_splats((unsigned char)0x4); - const vector signed char v8 = vec_splats((signed char)0x8); - - vector float vsumf0 = vec_splats(0.0f); - -#pragma GCC unroll 8 - for (; ib < nb; ++ib) { - __builtin_prefetch(x[ib].qs, 0, 1); - __builtin_prefetch(y[ib].qs, 0, 1); - - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d)); - vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d)); - vector float vd = vec_mul(vxd, vyd); - - vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs); - vector signed char q8y0 = vec_xl( 0, y[ib].qs); - vector signed char q8y1 = vec_xl(16, y[ib].qs); - - vector signed char q4x0 = vec_and(qxs, lowMask); - vector signed char q4x1 = vec_sr(qxs, v4); - - q4x0 = vec_sub(q4x0, v8); - q4x1 = vec_sub(q4x1, v8); - - vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0)); - vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1)); - - vector signed int vsumi0 = v0; - - vsumi0 = vec_sum4s(qv0, vsumi0); - vsumi0 = vec_sum4s(qv1, vsumi0); - - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); - } - - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - - sumf = vec_extract(vsumf0, 0); - -#elif defined(__loongarch_asx) - // Initialize accumulator 
with zeros - __m256 acc = (__m256)__lasx_xvldi(0); - - // Main loop - for (; ib < nb; ++ib) { - /* Compute combined scale for the block */ - const __m256 d = __lasx_xvreplfr2vr_s( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) ); - - __m256i qx = bytes_from_nibbles_32(x[ib].qs); - - // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. - const __m256i off = __lasx_xvreplgr2vr_b( 8 ); - qx = __lasx_xvsub_b( qx, off ); - - __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0); - - const __m256 q = mul_sum_i8_pairs_float(qx, qy); - - /* Multiply q with scale and accumulate */ - acc = __lasx_xvfmadd_s( d, q, acc ); - } - - sumf = hsum_float_8(acc); - -#elif defined(__loongarch_sx) - // set constants - const __m128i low_mask = __lsx_vreplgr2vr_b(0xF); - const __m128i off = __lsx_vreplgr2vr_b(8); - - // Initialize accumulator with zeros - __m128 acc_0 = (__m128)__lsx_vldi(0); - __m128 acc_1 = (__m128)__lsx_vldi(0); - __m128 acc_2 = (__m128)__lsx_vldi(0); - __m128 acc_3 = (__m128)__lsx_vldi(0); - - for (; ib + 1 < nb; ib += 2) { - - // Compute combined scale for the block 0 and 1 - const __m128 d_0_1 = (__m128)__lsx_vreplgr2vr_w( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) ); - - const __m128i tmp_0_1 = __lsx_vld((const __m128i *)x[ib].qs, 0); - - __m128i bx_0 = __lsx_vand_v(low_mask, tmp_0_1); - __m128i by_0 = __lsx_vld((const __m128i *)y[ib].qs, 0); - bx_0 = __lsx_vsub_b(bx_0, off); - const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); - - __m128i bx_1 = __lsx_vand_v(low_mask, __lsx_vsrli_d(tmp_0_1, 4)); - __m128i by_1 = __lsx_vld((const __m128i *)(y[ib].qs + 16), 0); - bx_1 = __lsx_vsub_b(bx_1, off); - const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1); - - //_mm_prefetch(&x[ib] + 2 * sizeof(block_q4_0), _MM_HINT_T0); - //_mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0); - - // Compute combined scale for the block 2 and 3 - const __m128 d_2_3 = (__m128)__lsx_vreplgr2vr_w( GGML_FP16_TO_FP32(x[ib + 1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) ); - - const __m128i tmp_2_3 = __lsx_vld((const __m128i *)x[ib + 1].qs, 0); - - __m128i bx_2 = __lsx_vand_v(low_mask, tmp_2_3); - __m128i by_2 = __lsx_vld((const __m128i *)y[ib + 1].qs, 0); - bx_2 = __lsx_vsub_b(bx_2, off); - const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2); - - __m128i bx_3 = __lsx_vand_v(low_mask, __lsx_vsrli_d(tmp_2_3, 4)); - __m128i by_3 = __lsx_vld((const __m128i *)(y[ib + 1].qs + 16), 0); - bx_3 = __lsx_vsub_b(bx_3, off); - const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3); - - // Convert int32_t to float - __m128 p0 = __lsx_vffint_s_w(i32_0); - __m128 p1 = __lsx_vffint_s_w(i32_1); - __m128 p2 = __lsx_vffint_s_w(i32_2); - __m128 p3 = __lsx_vffint_s_w(i32_3); - - // Apply the scale - __m128 p0_d = __lsx_vfmul_s( d_0_1, p0 ); - __m128 p1_d = __lsx_vfmul_s( d_0_1, p1 ); - __m128 p2_d = __lsx_vfmul_s( d_2_3, p2 ); - __m128 p3_d = __lsx_vfmul_s( d_2_3, p3 ); - - // Acummulate - acc_0 = __lsx_vfadd_s(p0_d, acc_0); - acc_1 = __lsx_vfadd_s(p1_d, acc_1); - acc_2 = __lsx_vfadd_s(p2_d, acc_2); - acc_3 = __lsx_vfadd_s(p3_d, acc_3); - } - - sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3); -#elif defined(__VXE__) || defined(__VXE2__) - __vector float acc = vec_splats(0.0f); - - const __vector uint8_t v_m = vec_splats((const uint8_t)0x0F); - const __vector int8_t v_s = vec_splats( (const int8_t)0x08); - - for (; ib < nb; ++ib) { - const __vector uint8_t v_x = vec_xl(0, x[ib].qs); - const __vector int8_t v_xl = (const __vector int8_t)(v_x & v_m); - const __vector int8_t v_xh 
= (const __vector int8_t)(v_x >> 4); - - const __vector int8_t v_xls = vec_sub(v_xl, v_s); - const __vector int8_t v_xhs = vec_sub(v_xh, v_s); - - const __vector int8_t v_yl = vec_xl(0 , y[ib].qs); - const __vector int8_t v_yh = vec_xl(QK8_0/2, y[ib].qs); - - const __vector int16_t v_xylso = vec_mulo(v_xls, v_yl); - const __vector int16_t v_xylse = vec_mule(v_xls, v_yl); - const __vector int16_t v_xyhso = vec_mulo(v_xhs, v_yh); - const __vector int16_t v_xyhse = vec_mule(v_xhs, v_yh); - - __vector int16_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_); - - const __vector float v_xy = vec_float(vec_unpackh(v_xy_)); - const __vector float v_d = vec_splats(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); - - acc = vec_madd(v_xy, v_d, acc); - } - - sumf = acc[0] + acc[1] + acc[2] + acc[3]; -#endif - for (; ib < nb; ++ib) { - int sumi0 = 0; - int sumi1 = 0; - - for (int j = 0; j < qk/2; ++j) { - const int v0 = (x[ib].qs[j] & 0x0F) - 8; - const int v1 = (x[ib].qs[j] >> 4) - 8; - - sumi0 += (v0 * y[ib].qs[j]); - sumi1 += (v1 * y[ib].qs[j + qk/2]); - } - - int sumi = sumi0 + sumi1; - sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d); - } - - *s = sumf; -} - -void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - const int qk = QK8_1; - const int nb = n / qk; - - assert(n % qk == 0); -#if defined(__ARM_FEATURE_MATMUL_INT8) - assert((nrc == 2) || (nrc == 1)); -#else - assert(nrc == 1); -#endif - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_q4_1 * GGML_RESTRICT x = vx; - const block_q8_1 * GGML_RESTRICT y = vy; - -#if defined(__ARM_FEATURE_MATMUL_INT8) - if (nrc == 2) { - const block_q4_1 * GGML_RESTRICT vx0 = vx; - const block_q4_1 * GGML_RESTRICT vx1 = (const block_q4_1 *) ((const uint8_t*)vx + bx); - const block_q8_1 * GGML_RESTRICT vy0 = vy; - const block_q8_1 * GGML_RESTRICT vy1 = (const block_q8_1 *) ((const uint8_t*)vy + by); - - float32x4_t sumv0 = vdupq_n_f32(0.0f); - float32x4_t summs0 = vdupq_n_f32(0.0f); - - for (int i = 0; i < nb; i++) { - const block_q4_1 * GGML_RESTRICT b_x0 = &vx0[i]; - const block_q4_1 * GGML_RESTRICT b_x1 = &vx1[i]; - const block_q8_1 * GGML_RESTRICT b_y0 = &vy0[i]; - const block_q8_1 * GGML_RESTRICT b_y1 = &vy1[i]; - - float32_t summs_t[4] = { - GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s), - GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s), - GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s), - GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s) - }; - summs0 = vaddq_f32(summs0, vld1q_f32(summs_t)); - - const uint8x16_t m4b = vdupq_n_u8(0x0F); - - const uint8x16_t v0_0 = vld1q_u8(b_x0->qs); - const uint8x16_t v0_1 = vld1q_u8(b_x1->qs); - - // 4-bit -> 8-bit - const int8x16_t x0_l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); - const int8x16_t x0_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); - const int8x16_t x1_l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); - const int8x16_t x1_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); - - // load y - const int8x16_t y0_l = vld1q_s8(b_y0->qs); - const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16); - const int8x16_t y1_l = vld1q_s8(b_y1->qs); - const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16); - - // mmla into int32x4_t - float32_t _scale[4] = { - GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d), - GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d), - GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d), - 
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d) - }; - float32x4_t scale = vld1q_f32(_scale); - - int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); - int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); - - int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); - int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); - - int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); - int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); - - int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); - int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); - sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)), - l1, r1)), l2, r2)), l3, r3))), scale); - } - - float32x4_t sumv1 = vextq_f32 (sumv0, sumv0, 2); - float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1); - - sumv2 = vaddq_f32(sumv2, summs0); - - vst1_f32(s, vget_low_f32 (sumv2)); - vst1_f32(s + bs, vget_high_f32(sumv2)); - - return; - } -#endif - - int ib = 0; - float sumf = 0; - - // TODO: add WASM SIMD -#if defined(__ARM_NEON) - float32x4_t sumv0 = vdupq_n_f32(0.0f); - float32x4_t sumv1 = vdupq_n_f32(0.0f); - - float summs = 0; - - for (; ib + 1 < nb; ib += 2) { - const block_q4_1 * GGML_RESTRICT x0 = &x[ib + 0]; - const block_q4_1 * GGML_RESTRICT x1 = &x[ib + 1]; - const block_q8_1 * GGML_RESTRICT y0 = &y[ib + 0]; - const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1]; - - summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s) + GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s); - - const uint8x16_t m4b = vdupq_n_u8(0x0F); - - const uint8x16_t v0_0 = vld1q_u8(x0->qs); - const uint8x16_t v0_1 = vld1q_u8(x1->qs); - - // 4-bit -> 8-bit - const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); - const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); - const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); - const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); - - // load y - const int8x16_t v1_0l = vld1q_s8(y0->qs); - const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); - const int8x16_t v1_1l = vld1q_s8(y1->qs); - const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); - - // dot product into int32x4_t - const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h); - const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); - } - - sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs; -#elif defined(__AVX2__) || defined(__AVX__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - - float summs = 0; - - // Main loop - for (; ib < nb; ++ib) { - const float d0 = GGML_FP16_TO_FP32(x[ib].d); - const float d1 = GGML_FP16_TO_FP32(y[ib].d); - - summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s); - - const __m256 d0v = _mm256_set1_ps( d0 ); - const __m256 d1v = _mm256_set1_ps( d1 ); - - // Compute combined scales - const __m256 d0d1 = _mm256_mul_ps( 
d0v, d1v ); - - // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes - const __m256i qx = bytes_from_nibbles_32(x[ib].qs); - const __m256i qy = _mm256_loadu_si256( (const __m256i *)y[ib].qs ); - - const __m256 xy = mul_sum_us8_pairs_float(qx, qy); - - // Accumulate d0*d1*x*y -#if defined(__AVX2__) - acc = _mm256_fmadd_ps( d0d1, xy, acc ); -#else - acc = _mm256_add_ps( _mm256_mul_ps( d0d1, xy ), acc ); -#endif - } - - sumf = hsum_float_8(acc) + summs; -#elif defined(__riscv_v) - size_t vl = qk / 2; - - for (; ib < nb; ++ib) { - // load elements - vuint8m1_t tx = __riscv_vle8_v_u8m1(x[ib].qs, vl); - - vint8m1_t y0 = __riscv_vle8_v_i8m1(y[ib].qs, vl); - vint8m1_t y1 = __riscv_vle8_v_i8m1(y[ib].qs+16, vl); - - // mask and store lower part of x, and then upper part - vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl); - vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl); - - vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a); - vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l); - - vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl); - vint16m2_t vec_mul2 = __riscv_vwmacc_vv_i16m2(vec_mul1, v1, y1, vl); - - vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); - vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl); - - int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); - - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); - } - -#elif defined(__POWER9_VECTOR__) - const vector signed char lowMask = vec_splats((signed char)0xF); - const vector signed int v0 = vec_splats((int32_t)0); - const vector unsigned char v4 = vec_splats((unsigned char)0x4); - - vector float vsumf0 = vec_splats(0.0f); - -#pragma GCC unroll 4 - for (; ib < nb; ++ib) { - __builtin_prefetch(x[ib].qs, 0, 1); - __builtin_prefetch(y[ib].qs, 0, 1); - - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d)); - vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d)); - vector float vd = vec_mul(vxd, vyd); - - vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[ib].m)); - vector float vys = {GGML_FP16_TO_FP32(y[ib].s), 0.0f, 0.0f, 0.0f}; - vsumf0 = vec_madd(vxmin, vys, vsumf0); - - vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs); - vector signed char q8y0 = vec_xl( 0, y[ib].qs); - vector signed char q8y1 = vec_xl(16, y[ib].qs); - - vector unsigned char q4x0 = (vector unsigned char)vec_and(qxs, lowMask); - vector unsigned char q4x1 = (vector unsigned char)vec_sr(qxs, v4); - - vector signed int vsumi0 = v0; - - vsumi0 = vec_msum(q8y0, q4x0, vsumi0); - vsumi0 = vec_msum(q8y1, q4x1, vsumi0); - - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); - } - - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - - sumf = vec_extract(vsumf0, 0); - -#elif defined(__loongarch_asx) - // Initialize accumulator with zeros - __m256 acc = (__m256)__lasx_xvldi(0); - - float summs = 0; - - // Main loop - for (; ib < nb; ++ib) { - const float d0 = GGML_FP16_TO_FP32(x[ib].d); - const float d1 = GGML_FP16_TO_FP32(y[ib].d); - - summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s); - - const __m256 d0v = __lasx_xvreplfr2vr_s( d0 ); - const __m256 d1v = __lasx_xvreplfr2vr_s( d1 ); - - // Compute combined scales - const __m256 d0d1 = __lasx_xvfmul_s( d0v, d1v ); - - // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes - const __m256i qx = bytes_from_nibbles_32(x[ib].qs); - const __m256i qy = __lasx_xvld( (const __m256i *)y[ib].qs, 0); - - 
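Because q4_1 stores unsigned nibbles plus an explicit per-block minimum `m`, the block product factors as `d_x*d_y * sum_j q4[j]*q8[j] + m_x*s_y`, where `s_y = d_y * sum_j q8[j]` was precomputed during q8_1 quantization; this is why `mul_sum_us8_pairs_float` below needs no per-lane offset subtraction, unlike the q4_0 path. A scalar sketch of one block (the helper name is illustrative):

```c
#include <stdint.h>

// One q4_1 x q8_1 block: dx*dy * sum(q4*q8) + mx*sy, where sy = dy*sum(q8)
// is carried in the q8_1 block so the +m offset folds into one multiply-add.
static float q4_1_q8_1_block_sketch(const uint8_t * q4, const int8_t * q8,
                                    float dx, float mx, float dy, float sy) {
    int sumi = 0;
    for (int j = 0; j < 16; ++j) {
        sumi += (q4[j] & 0x0F) * q8[j];      // low nibbles vs first 16 int8
        sumi += (q4[j] >>   4) * q8[j + 16]; // high nibbles vs last 16 int8
    }
    return dx*dy*(float)sumi + mx*sy;
}
```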
const __m256 xy = mul_sum_us8_pairs_float(qx, qy); - - // Accumulate d0*d1*x*y - acc = __lasx_xvfmadd_s( d0d1, xy, acc ); - } - - sumf = hsum_float_8(acc) + summs; -#elif defined(__VXE__) || defined(__VXE2__) - float summs = 0; - float32x4_t acc = vec_splats(0.0f); - - const uint8x16_t v_m = vec_splat_u8(0x0F); - -#pragma GCC unroll 4 - for (; ib < nb; ++ib) { - __builtin_prefetch(x[ib].qs, 0, 1); - __builtin_prefetch(y[ib].qs, 0, 1); - - summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s); - - const uint8x16_t v_x = vec_xl(0, x[ib].qs); - const int8x16_t v_xl = (const int8x16_t)(v_x & v_m); - const int8x16_t v_xh = (const int8x16_t)(v_x >> 4); - - const int8x16_t v_yl = vec_xl(0 , y[ib].qs); - const int8x16_t v_yh = vec_xl(QK8_1/2, y[ib].qs); - - const int32x4_t v_xy_ = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh); - const float32x4_t v_xy = vec_float(v_xy_); - - const float32x4_t v_d = vec_splats(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); - - acc = vec_madd(v_xy, v_d, acc); - } - - sumf = acc[0] + acc[1] + acc[2] + acc[3] + summs; -#endif - for (; ib < nb; ++ib) { - int sumi0 = 0; - int sumi1 = 0; - - for (int j = 0; j < qk/2; ++j) { - const int v0 = (x[ib].qs[j] & 0x0F); - const int v1 = (x[ib].qs[j] >> 4); - - sumi0 += (v0 * y[ib].qs[j]); - sumi1 += (v1 * y[ib].qs[j + qk/2]); - } - - int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); - } - - *s = sumf; -} - -void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - const int qk = QK8_0; - const int nb = n / qk; - - int ib = 0; - float sumf = 0; - - assert(n % qk == 0); - assert(qk == QK5_0); - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_q5_0 * GGML_RESTRICT x = vx; - const block_q8_0 * GGML_RESTRICT y = vy; - -#if defined(__ARM_NEON) - float32x4_t sumv0 = vdupq_n_f32(0.0f); - float32x4_t sumv1 = vdupq_n_f32(0.0f); - - uint32_t qh0; - uint32_t qh1; - - uint64_t tmp0[4]; - uint64_t tmp1[4]; - - for (; ib + 1 < nb; ib += 2) { - const block_q5_0 * GGML_RESTRICT x0 = &x[ib]; - const block_q5_0 * GGML_RESTRICT x1 = &x[ib + 1]; - const block_q8_0 * GGML_RESTRICT y0 = &y[ib]; - const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; - - const uint8x16_t m4b = vdupq_n_u8(0x0F); - - // extract the 5th bit via lookup table ((!b) << 4) - memcpy(&qh0, x0->qh, sizeof(qh0)); - memcpy(&qh1, x1->qh, sizeof(qh1)); - - tmp0[0] = table_b2b_1[(qh0 >> 0) & 0xFF]; - tmp0[1] = table_b2b_1[(qh0 >> 8) & 0xFF]; - tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF]; - tmp0[3] = table_b2b_1[(qh0 >> 24) ]; - - tmp1[0] = table_b2b_1[(qh1 >> 0) & 0xFF]; - tmp1[1] = table_b2b_1[(qh1 >> 8) & 0xFF]; - tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF]; - tmp1[3] = table_b2b_1[(qh1 >> 24) ]; - - const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0)); - const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2)); - const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0)); - const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2)); - - const uint8x16_t v0_0 = vld1q_u8(x0->qs); - const uint8x16_t v0_1 = vld1q_u8(x1->qs); - - // 4-bit -> 8-bit - int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); - int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); - int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); - int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 
4)); - - // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero) - const int8x16_t v0_0lf = vsubq_s8(v0_0l, qhl0); - const int8x16_t v0_0hf = vsubq_s8(v0_0h, qhh0); - const int8x16_t v0_1lf = vsubq_s8(v0_1l, qhl1); - const int8x16_t v0_1hf = vsubq_s8(v0_1h, qhh1); - - // load y - const int8x16_t v1_0l = vld1q_s8(y0->qs); - const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); - const int8x16_t v1_1l = vld1q_s8(y1->qs); - const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( - ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), - ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( - ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), - ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); - } - - sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); -#elif defined __wasm_simd128__ - v128_t sumv = wasm_f32x4_splat(0.0f); - - uint32_t qh_; - uint64_t tmp[4]; - - // TODO: check if unrolling this is better - for (; ib < nb; ++ib) { - const block_q5_0 * GGML_RESTRICT x0 = &x[ib]; - const block_q8_0 * GGML_RESTRICT y0 = &y[ib]; - - const v128_t m4b = wasm_i8x16_splat(0x0F); - - // extract the 5th bit - memcpy(&qh_, x0->qh, sizeof(qh_)); - - tmp[0] = table_b2b_1[(qh_ >> 0) & 0xFF]; - tmp[1] = table_b2b_1[(qh_ >> 8) & 0xFF]; - tmp[2] = table_b2b_1[(qh_ >> 16) & 0xFF]; - tmp[3] = table_b2b_1[(qh_ >> 24) ]; - - const v128_t qhl = wasm_v128_load(tmp + 0); - const v128_t qhh = wasm_v128_load(tmp + 2); - - const v128_t v0 = wasm_v128_load(x0->qs); - - // 4-bit -> 8-bit - const v128_t v0l = wasm_v128_and (v0, m4b); - const v128_t v0h = wasm_u8x16_shr(v0, 4); - - // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero) - const v128_t v0lf = wasm_i8x16_sub(v0l, qhl); - const v128_t v0hf = wasm_i8x16_sub(v0h, qhh); - - // load y - const v128_t v1l = wasm_v128_load(y0->qs); - const v128_t v1h = wasm_v128_load(y0->qs + 16); - - // int8x16 -> int16x8 - const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf); - const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf); - const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf); - const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf); - - const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l); - const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l); - const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); - const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); - - // dot product - sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4( - wasm_i32x4_add( - wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll), - wasm_i32x4_dot_i16x8(v0lfh, v1lh)), - wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), - wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), - wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d)))); - } - - sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + - wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3); -#elif defined(__AVX2__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - - // Main loop - for (; ib < nb; ++ib) { - /* Compute combined scale for the block */ - const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); - - __m256i qx = bytes_from_nibbles_32(x[ib].qs); - __m256i bxhi = bytes_from_bits_32(x[ib].qh); - bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0)); - qx = _mm256_or_si256(qx, bxhi); - - __m256i qy = 
_mm256_loadu_si256((const __m256i *)y[ib].qs); - - const __m256 q = mul_sum_i8_pairs_float(qx, qy); - - /* Multiply q with scale and accumulate */ - acc = _mm256_fmadd_ps(d, q, acc); - } - - sumf = hsum_float_8(acc); -#elif defined(__AVX__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - __m128i mask = _mm_set1_epi8((char)0xF0); - - // Main loop - for (; ib < nb; ++ib) { - /* Compute combined scale for the block */ - const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); - - __m256i bx_0 = bytes_from_nibbles_32(x[ib].qs); - const __m256i bxhi = bytes_from_bits_32(x[ib].qh); - __m128i bxhil = _mm256_castsi256_si128(bxhi); - __m128i bxhih = _mm256_extractf128_si256(bxhi, 1); - bxhil = _mm_andnot_si128(bxhil, mask); - bxhih = _mm_andnot_si128(bxhih, mask); - __m128i bxl = _mm256_castsi256_si128(bx_0); - __m128i bxh = _mm256_extractf128_si256(bx_0, 1); - bxl = _mm_or_si128(bxl, bxhil); - bxh = _mm_or_si128(bxh, bxhih); - bx_0 = MM256_SET_M128I(bxh, bxl); - - const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[ib].qs); - - const __m256 q = mul_sum_i8_pairs_float(bx_0, by_0); - - /* Multiply q with scale and accumulate */ - acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc); - } - - sumf = hsum_float_8(acc); -#elif defined(__riscv_v) - size_t vl; - size_t vlenb = __riscv_vlenb(); - - for (; ib < nb; ++ib) { - vl = qk / 2; - vuint8m1_t v0 = __riscv_vle8_v_u8m1(x[ib].qs, vl); - vint8m1_t v0l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(v0, 0x0F, vl)); - vint8m1_t v0h = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(v0, 4, vl)); - vint8m2_t v0c; - if (vlenb == 16) { - v0c = __riscv_vcreate_v_i8m1_i8m2(v0l, v0h); - } else { - v0l = __riscv_vslideup_vx_i8m1(v0l, v0h, 16, 32); - v0c = __riscv_vlmul_ext_v_i8m1_i8m2(v0l); - } - - vl = qk; - vbool4_t qh = __riscv_vlm_v_b4(x[ib].qh, vl); - qh = __riscv_vmnand_mm_b4(qh, qh, vl); - vint8m2_t v0f = __riscv_vsub_vx_i8m2_mu(qh, v0c, v0c, 0x10, vl); - vint8m2_t v1 = __riscv_vle8_v_i8m2(y[ib].qs, vl); - vint16m4_t mul = __riscv_vwmul_vv_i16m4(v0f, v1, vl); - vint32m1_t zero = __riscv_vmv_v_x_i32m1(0, vl); - vint32m1_t sum = __riscv_vwredsum_vs_i16m4_i32m1(mul, zero, vl); - int32_t sumi = __riscv_vmv_x_s_i32m1_i32(sum); - - sumf += (GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)) * sumi; - } - -#elif defined(__POWER9_VECTOR__) - const vector signed char lowMask = vec_splats((signed char)0xF); - const vector unsigned char v4 = vec_splats((unsigned char)4); - - vector float vsumf0 = vec_splats(0.0f); - -#pragma GCC unroll 4 - for (; ib < nb; ++ib) { - __builtin_prefetch(x[ib].qs, 0, 1); - __builtin_prefetch(y[ib].qs, 0, 1); - - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d)); - vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d)); - vector float vd = vec_mul(vxd, vyd); - - vector signed long long aux64x2_0 = {(uint64_t)(table_b2b_1[x[ib].qh[0]]), (uint64_t)(table_b2b_1[x[ib].qh[1]])}; - vector signed long long aux64x2_1 = {(uint64_t)(table_b2b_1[x[ib].qh[2]]), (uint64_t)(table_b2b_1[x[ib].qh[3]])}; - - vector signed char qh0 = (vector signed char)aux64x2_0; - vector signed char qh1 = (vector signed char)aux64x2_1; - - vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs); - - vector signed char q5x0 = vec_sub(vec_and (qxs, lowMask), qh0); - vector signed char q5x1 = vec_sub(vec_sr(qxs, v4), qh1); - - vector signed char q8y0 = vec_xl( 0, y[ib].qs); - vector signed char q8y1 = vec_xl( 16, y[ib].qs); - - vector signed short qv0 = 
vec_add(vec_mule(q5x0, q8y0), vec_mulo(q5x0, q8y0)); - vector signed short qv1 = vec_add(vec_mule(q5x1, q8y1), vec_mulo(q5x1, q8y1)); - - qv0 = vec_add(qv0, qv1); - - vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0)); - - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); - } - - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - - sumf = vec_extract(vsumf0, 0); - -#elif defined(__loongarch_asx) - // Initialize accumulator with zeros - __m256 acc = (__m256)__lasx_xvldi(0); - - // Main loop - for (; ib < nb; ++ib) { - /* Compute combined scale for the block */ - const __m256 d = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); //FIXME - - __m256i qx = bytes_from_nibbles_32(x[ib].qs); - __m256i bxhi = bytes_from_bits_32(x[ib].qh); - bxhi = __lasx_xvandn_v(bxhi, __lasx_xvreplgr2vr_b((char)0xF0)); - qx = __lasx_xvor_v(qx, bxhi); - - __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0); - - const __m256 q = mul_sum_i8_pairs_float(qx, qy); - - /* Multiply q with scale and accumulate */ - acc = __lasx_xvfmadd_s(d, q, acc); - } - - sumf = hsum_float_8(acc); -#endif - for (; ib < nb; ++ib) { - uint32_t qh; - memcpy(&qh, x[ib].qh, sizeof(qh)); - - int sumi0 = 0; - int sumi1 = 0; - - for (int j = 0; j < qk/2; ++j) { - const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; - const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); - - const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16); - const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16); - - sumi0 += (x0 * y[ib].qs[j]); - sumi1 += (x1 * y[ib].qs[j + qk/2]); - } - - int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi; - } - - *s = sumf; -} - -void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - const int qk = QK8_1; - const int nb = n / qk; - - int ib = 0; - float sumf = 0; - - assert(n % qk == 0); - assert(qk == QK5_1); - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_q5_1 * GGML_RESTRICT x = vx; - const block_q8_1 * GGML_RESTRICT y = vy; - -#if defined(__ARM_NEON) - float32x4_t sumv0 = vdupq_n_f32(0.0f); - float32x4_t sumv1 = vdupq_n_f32(0.0f); - - float summs0 = 0.0f; - float summs1 = 0.0f; - - uint32_t qh0; - uint32_t qh1; - - uint64_t tmp0[4]; - uint64_t tmp1[4]; - - for (; ib + 1 < nb; ib += 2) { - const block_q5_1 * GGML_RESTRICT x0 = &x[ib]; - const block_q5_1 * GGML_RESTRICT x1 = &x[ib + 1]; - const block_q8_1 * GGML_RESTRICT y0 = &y[ib]; - const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1]; - - const uint8x16_t m4b = vdupq_n_u8(0x0F); - - summs0 += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s); - summs1 += GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s); - - // extract the 5th bit via lookup table ((b) << 4) - memcpy(&qh0, x0->qh, sizeof(qh0)); - memcpy(&qh1, x1->qh, sizeof(qh1)); - - tmp0[0] = table_b2b_0[(qh0 >> 0) & 0xFF]; - tmp0[1] = table_b2b_0[(qh0 >> 8) & 0xFF]; - tmp0[2] = table_b2b_0[(qh0 >> 16) & 0xFF]; - tmp0[3] = table_b2b_0[(qh0 >> 24) ]; - - tmp1[0] = table_b2b_0[(qh1 >> 0) & 0xFF]; - tmp1[1] = table_b2b_0[(qh1 >> 8) & 0xFF]; - tmp1[2] = table_b2b_0[(qh1 >> 16) & 0xFF]; - tmp1[3] = table_b2b_0[(qh1 >> 24) ]; - - const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0)); - const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2)); - const int8x16_t qhl1 = 
vld1q_s8((const int8_t *)(tmp1 + 0)); - const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2)); - - const uint8x16_t v0_0 = vld1q_u8(x0->qs); - const uint8x16_t v0_1 = vld1q_u8(x1->qs); - - // 4-bit -> 8-bit - const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); - const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); - const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); - const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); - - // add high bit - const int8x16_t v0_0lf = vorrq_s8(v0_0l, qhl0); - const int8x16_t v0_0hf = vorrq_s8(v0_0h, qhh0); - const int8x16_t v0_1lf = vorrq_s8(v0_1l, qhl1); - const int8x16_t v0_1hf = vorrq_s8(v0_1h, qhh1); - - // load y - const int8x16_t v1_0l = vld1q_s8(y0->qs); - const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); - const int8x16_t v1_1l = vld1q_s8(y1->qs); - const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( - ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), - ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( - ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), - ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); - } - - sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1; -#elif defined __wasm_simd128__ - v128_t sumv = wasm_f32x4_splat(0.0f); - - float summs = 0.0f; - - uint32_t qh_; - uint64_t tmp[4]; - - // TODO: check if unrolling this is better - for (; ib < nb; ++ib) { - const block_q5_1 * GGML_RESTRICT x0 = &x[ib]; - const block_q8_1 * GGML_RESTRICT y0 = &y[ib]; - - summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s); - - const v128_t m4b = wasm_i8x16_splat(0x0F); - - // extract the 5th bit - memcpy(&qh_, x0->qh, sizeof(qh_)); - - tmp[0] = table_b2b_0[(qh_ >> 0) & 0xFF]; - tmp[1] = table_b2b_0[(qh_ >> 8) & 0xFF]; - tmp[2] = table_b2b_0[(qh_ >> 16) & 0xFF]; - tmp[3] = table_b2b_0[(qh_ >> 24) ]; - - const v128_t qhl = wasm_v128_load(tmp + 0); - const v128_t qhh = wasm_v128_load(tmp + 2); - - const v128_t v0 = wasm_v128_load(x0->qs); - - // 4-bit -> 8-bit - const v128_t v0l = wasm_v128_and (v0, m4b); - const v128_t v0h = wasm_u8x16_shr(v0, 4); - - // add high bit - const v128_t v0lf = wasm_v128_or(v0l, qhl); - const v128_t v0hf = wasm_v128_or(v0h, qhh); - - // load y - const v128_t v1l = wasm_v128_load(y0->qs); - const v128_t v1h = wasm_v128_load(y0->qs + 16); - - // int8x16 -> int16x8 - const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf); - const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf); - const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf); - const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf); - - const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l); - const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l); - const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); - const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); - - // dot product - sumv = wasm_f32x4_add(sumv, - wasm_f32x4_mul(wasm_f32x4_convert_i32x4(wasm_i32x4_add( - wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll), - wasm_i32x4_dot_i16x8(v0lfh, v1lh)), - wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), - wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), - wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d)))); - } - - sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + - wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs; -#elif 
defined(__AVX2__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - - float summs = 0.0f; - - // Main loop - for (; ib < nb; ++ib) { - const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d)); - - summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s); - - __m256i qx = bytes_from_nibbles_32(x[ib].qs); - __m256i bxhi = bytes_from_bits_32(x[ib].qh); - bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10)); - qx = _mm256_or_si256(qx, bxhi); - - const __m256 dy = _mm256_set1_ps(GGML_FP16_TO_FP32(y[ib].d)); - const __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs); - - const __m256 q = mul_sum_us8_pairs_float(qx, qy); - - acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc); - } - - sumf = hsum_float_8(acc) + summs; -#elif defined(__AVX__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - __m128i mask = _mm_set1_epi8(0x10); - - float summs = 0.0f; - - // Main loop - for (; ib < nb; ++ib) { - const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d)); - - summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s); - - __m256i bx_0 = bytes_from_nibbles_32(x[ib].qs); - const __m256i bxhi = bytes_from_bits_32(x[ib].qh); - __m128i bxhil = _mm256_castsi256_si128(bxhi); - __m128i bxhih = _mm256_extractf128_si256(bxhi, 1); - bxhil = _mm_and_si128(bxhil, mask); - bxhih = _mm_and_si128(bxhih, mask); - __m128i bxl = _mm256_castsi256_si128(bx_0); - __m128i bxh = _mm256_extractf128_si256(bx_0, 1); - bxl = _mm_or_si128(bxl, bxhil); - bxh = _mm_or_si128(bxh, bxhih); - bx_0 = MM256_SET_M128I(bxh, bxl); - - const __m256 dy = _mm256_set1_ps(GGML_FP16_TO_FP32(y[ib].d)); - const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[ib].qs); - - const __m256 q = mul_sum_us8_pairs_float(bx_0, by_0); - - acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc); - } - - sumf = hsum_float_8(acc) + summs; -#elif defined(__riscv_v) - size_t vl; - size_t vlenb = __riscv_vlenb(); - - for (; ib < nb; ++ib) { - vl = qk / 2; - vuint8m1_t v0 = __riscv_vle8_v_u8m1(x[ib].qs, vl); - vint8m1_t v0l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(v0, 0x0F, vl)); - vint8m1_t v0h = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(v0, 4, vl)); - vint8m2_t v0c; - if (vlenb == 16) { - v0c = __riscv_vcreate_v_i8m1_i8m2(v0l, v0h); - } else { - v0l = __riscv_vslideup_vx_i8m1(v0l, v0h, 16, 32); - v0c = __riscv_vlmul_ext_v_i8m1_i8m2(v0l); - } - - vl = qk; - vbool4_t qh = __riscv_vlm_v_b4(x[ib].qh, vl); - vint8m2_t v0f = __riscv_vor_vx_i8m2_mu(qh, v0c, v0c, 0x10, vl); - vint8m2_t v1 = __riscv_vle8_v_i8m2(y[ib].qs, vl); - vint16m4_t mul = __riscv_vwmul_vv_i16m4(v0f, v1, vl); - vint32m1_t zero = __riscv_vmv_v_x_i32m1(0, vl); - vint32m1_t sum = __riscv_vwredsum_vs_i16m4_i32m1(mul, zero, vl); - int32_t sumi = __riscv_vmv_x_s_i32m1_i32(sum); - - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); - } - -#elif defined(__POWER9_VECTOR__) - const vector signed char lowMask = vec_splats((signed char)0xF); - const vector signed int v0 = vec_splats((int32_t)0); - const vector unsigned char v4 = vec_splats((unsigned char)0x4); - - vector float vsumf0 = vec_splats(0.0f); - -#pragma GCC unroll 4 - for (; ib < nb; ++ib) { - __builtin_prefetch(x[ib].qs, 0, 1); - __builtin_prefetch(y[ib].qs, 0, 1); - - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d)); - vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d)); - vector float vd = vec_mul(vxd, vyd); - - vector float vxmin = 
vec_splats(GGML_FP16_TO_FP32(x[ib].m)); - vector float vys = {GGML_FP16_TO_FP32(y[ib].s), 0.f, 0.f, 0.f}; - vsumf0 = vec_madd(vxmin, vys, vsumf0); - - vector unsigned long long aux64x2_0 = {(uint64_t)(table_b2b_0[x[ib].qh[0]]), (uint64_t)(table_b2b_0[x[ib].qh[1]])}; - vector unsigned long long aux64x2_1 = {(uint64_t)(table_b2b_0[x[ib].qh[2]]), (uint64_t)(table_b2b_0[x[ib].qh[3]])}; - - vector signed char qh0 = (vector signed char)aux64x2_0; - vector signed char qh1 = (vector signed char)aux64x2_1; - - vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs); - - vector unsigned char q5x0 = (vector unsigned char)vec_or(vec_and(qxs, lowMask), qh0); - vector unsigned char q5x1 = (vector unsigned char)vec_or(vec_sr(qxs, v4), qh1); - - vector signed char q8y0 = vec_xl( 0, y[ib].qs); - vector signed char q8y1 = vec_xl( 16, y[ib].qs); - - vector signed int vsumi0 = v0; - - vsumi0 = vec_msum(q8y0, q5x0, vsumi0); - vsumi0 = vec_msum(q8y1, q5x1, vsumi0); - - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); - } - - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - - sumf = vec_extract(vsumf0, 0); - -#elif defined(__loongarch_asx) - // Initialize accumulator with zeros - __m256 acc = (__m256)__lasx_xvldi(0); - - float summs = 0.0f; - - // Main loop - for (; ib < nb; ++ib) { - const __m256 dx = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ib].d)); - - summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s); - - __m256i qx = bytes_from_nibbles_32(x[ib].qs); - __m256i bxhi = bytes_from_bits_32(x[ib].qh); - bxhi = __lasx_xvand_v(bxhi, __lasx_xvreplgr2vr_b(0x10)); - qx = __lasx_xvor_v(qx, bxhi); - - const __m256 dy = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(y[ib].d)); - const __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0); - - const __m256 q = mul_sum_us8_pairs_float(qx, qy); - - acc = __lasx_xvfmadd_s(q, __lasx_xvfmul_s(dx, dy), acc); - } - - sumf = hsum_float_8(acc) + summs; -#endif - for (; ib < nb; ++ib) { - uint32_t qh; - memcpy(&qh, x[ib].qh, sizeof(qh)); - - int sumi0 = 0; - int sumi1 = 0; - - for (int j = 0; j < qk/2; ++j) { - const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; - const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; - - const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0; - const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1; - - sumi0 += (x0 * y[ib].qs[j]); - sumi1 += (x1 * y[ib].qs[j + qk/2]); - } - - int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); - } - - *s = sumf; -} - -void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - const int qk = QK8_0; - const int nb = n / qk; - - assert(n % qk == 0); -#if defined(__ARM_FEATURE_MATMUL_INT8) - assert((nrc == 2) || (nrc == 1)); -#else - assert(nrc == 1); -#endif - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_q8_0 * GGML_RESTRICT x = vx; - const block_q8_0 * GGML_RESTRICT y = vy; - -#if defined(__ARM_FEATURE_MATMUL_INT8) - if (nrc == 2) { - const block_q8_0 * GGML_RESTRICT vx0 = vx; - const block_q8_0 * GGML_RESTRICT vx1 = (const block_q8_0 *) ((const uint8_t*)vx + bx); - const block_q8_0 * GGML_RESTRICT vy0 = vy; - const block_q8_0 * GGML_RESTRICT vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by); - - float32x4_t sumv0 = vdupq_n_f32(0.0f); - - for (int i = 0; i < nb; i++) { - const block_q8_0 * GGML_RESTRICT b_x0 = &vx0[i]; - 
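// For reference: this nrc == 2 path computes a 2x2 tile of block dot products
// per iteration. vzip1q_s64/vzip2q_s64 pair up the 64-bit halves of the two
// x rows and the two y rows so that each vmmlaq_s32 (the int8 matrix
// multiply-accumulate) produces all four pairings at once; the int32 lanes
// come out in the order
//     { x0.y0, x0.y1, x1.y0, x1.y1 }
// which matches the order of the per-pair scales in `_scale` below. The
// vextq/vzip1q shuffle after the loop then stores one column of the tile to
// s and the other column to s + bs.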
const block_q8_0 * GGML_RESTRICT b_y0 = &vy0[i]; - - const block_q8_0 * GGML_RESTRICT b_x1 = &vx1[i]; - const block_q8_0 * GGML_RESTRICT b_y1 = &vy1[i]; - - const int8x16_t x0_l = vld1q_s8(b_x0->qs); - const int8x16_t x0_h = vld1q_s8(b_x0->qs + 16); - const int8x16_t x1_l = vld1q_s8(b_x1->qs); - const int8x16_t x1_h = vld1q_s8(b_x1->qs + 16); - - // load y - const int8x16_t y0_l = vld1q_s8(b_y0->qs); - const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16); - const int8x16_t y1_l = vld1q_s8(b_y1->qs); - const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16); - - float32_t _scale[4] = { - GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d), - GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d), - GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d), - GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d) - }; - float32x4_t scale = vld1q_f32(_scale); - - int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); - int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); - - int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); - int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); - - int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); - int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); - - int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); - int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); - - sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)), - l1, r1)), l2, r2)), l3, r3))), scale); - } - - float32x4_t sumv1 = vextq_f32 (sumv0, sumv0, 2); - float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1); - - vst1_f32(s, vget_low_f32 (sumv2)); - vst1_f32(s + bs, vget_high_f32(sumv2)); - - return; - } -#endif - - int ib = 0; - float sumf = 0; - -#if defined(__ARM_FEATURE_SVE) - svfloat32_t sumv0 = svdup_n_f32(0.0f); - svfloat32_t sumv1 = svdup_n_f32(0.0f); - - const int vector_length = ggml_cpu_get_sve_cnt()*8; - - //VLA Implementation for SVE - switch (vector_length) { - case 128: - { - // predicate for activating lanes for 16 Int8 elements - const svbool_t ph16 = svptrue_pat_b8 (SV_VL16); - const svbool_t pl16 = svptrue_pat_b32(SV_VL4); - - for (; ib + 1 < nb; ib += 2) { - const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0]; - const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1]; - const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; - const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; - - // load x - const svint8_t qx0_0 = svld1_s8(ph16, x0->qs); - const svint8_t qx0_1 = svld1_s8(ph16, x0->qs+16); - const svint8_t qx1_0 = svld1_s8(ph16, x1->qs); - const svint8_t qx1_1 = svld1_s8(ph16, x1->qs+16); - - // load y - const svint8_t qy0_0 = svld1_s8(ph16, y0->qs); - const svint8_t qy0_1 = svld1_s8(ph16, y0->qs+16); - const svint8_t qy1_0 = svld1_s8(ph16, y1->qs); - const svint8_t qy1_1 = svld1_s8(ph16, y1->qs+16); - - sumv0 = svmla_n_f32_x(pl16, sumv0, svcvt_f32_s32_x(pl16, svadd_x(pl16, - svdot_s32(svdup_n_s32(0), qx0_0, qy0_0), - svdot_s32(svdup_n_s32(0), qx0_1, qy0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = svmla_n_f32_x(pl16, sumv1, svcvt_f32_s32_x(pl16, svadd_x(pl16, - svdot_s32(svdup_n_s32(0), qx1_0, qy1_0), - svdot_s32(svdup_n_s32(0), 
qx1_1, qy1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); - } - - sumf = svaddv_f32(pl16, svadd_f32_x(pl16, sumv0, sumv1)); - } break; - case 256: - { - //printf("sve256"); - for (; ib + 1 < nb; ib += 2) { - const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0]; - const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1]; - const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; - const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; - - // load x - const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs); - const svint8_t qx1 = svld1_s8(svptrue_b8(), x1->qs); - - // load y - const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs); - const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs); - - sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), - svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), - svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); - } - - sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); - } break; - case 512: - { - // predicate for activating high 256 bit - const svbool_t ph32 = svptrue_pat_b8(SV_VL32); - // predicate for activating low 256 bit - const svbool_t pl32 = svnot_b_z(svptrue_b8(), ph32); - - // predicate for activating high lanes for 8 float32 elements - const svbool_t ph8 = svptrue_pat_b32(SV_VL8); - // predicate for activating low lanes for 8 float32 elements - const svbool_t pl8 = svnot_b_z(svptrue_b32(), ph8); - - svfloat32_t sumv00 = svdup_n_f32(0.0f); - - for (; ib + 1 < nb; ib += 2) { - const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0]; - const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1]; - const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; - const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; - - //load 32 int8_t in first half of vector and put another 32 int8_t in second vector lower bits - // and add them to make one 64 element vector - // load x - const svint8_t qx_32 = svld1_s8(ph32, x0->qs); - svint8_t qx_64 = svld1_s8(pl32, x0->qs + 2); - - qx_64 = svadd_s8_x(svptrue_b8(), qx_32, qx_64); - - // load y - const svint8_t qy_32 = svld1_s8(ph32, y0->qs); - svint8_t qy_64 = svld1_s8(pl32, y0->qs + 2); - - qy_64 = svadd_s8_x(svptrue_b8(), qy_32, qy_64); - - // scale creation - const float32_t deq1 = GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d); - const float32_t deq2 = GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d); - - // duplicate deq1 in first half of vector and deq2 in second half of vector - const svfloat32_t temp = svdup_f32_m(svdup_f32_z(ph8, deq1), pl8, deq2); - - const svfloat32_t sumvt = svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx_64, qy_64)); - - sumv00 = svmla_f32_m(svptrue_b32(), sumv00, sumvt, temp); - } - - sumf = svaddv_f32(svptrue_b32(), sumv00); - break; - } - default: - assert(false && "Unsupported vector length"); - break; - } -#elif defined(__ARM_NEON) - float32x4_t sumv0 = vdupq_n_f32(0.0f); - float32x4_t sumv1 = vdupq_n_f32(0.0f); - - for (; ib + 1 < nb; ib += 2) { - const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0]; - const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1]; - const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; - const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; - - const int8x16_t x0_0 = vld1q_s8(x0->qs); - const int8x16_t x0_1 = vld1q_s8(x0->qs + 16); - const int8x16_t x1_0 = vld1q_s8(x1->qs); - const int8x16_t x1_1 = vld1q_s8(x1->qs + 16); - - // load y - const int8x16_t y0_0 = vld1q_s8(y0->qs); - const int8x16_t y0_1 = 
vld1q_s8(y0->qs + 16); - const int8x16_t y1_0 = vld1q_s8(y1->qs); - const int8x16_t y1_1 = vld1q_s8(y1->qs + 16); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( - ggml_vdotq_s32(vdupq_n_s32(0), x0_0, y0_0), - ggml_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( - ggml_vdotq_s32(vdupq_n_s32(0), x1_0, y1_0), - ggml_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); - } - - sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); -#elif defined __wasm_simd128__ - v128_t sumv = wasm_f32x4_splat(0.0f); - - for (; ib < nb; ++ib) { - const block_q8_0 * GGML_RESTRICT x0 = &x[ib]; - const block_q8_0 * GGML_RESTRICT y0 = &y[ib]; - - const v128_t x0_0 = wasm_v128_load(x0->qs); - const v128_t x0_1 = wasm_v128_load(x0->qs + 16); - const v128_t y0_0 = wasm_v128_load(y0->qs); - const v128_t y0_1 = wasm_v128_load(y0->qs + 16); - - // Extend 8-bit to 16-bit - const v128_t x0_0l = wasm_i16x8_extend_low_i8x16(x0_0); - const v128_t x0_0h = wasm_i16x8_extend_high_i8x16(x0_0); - const v128_t x0_1l = wasm_i16x8_extend_low_i8x16(x0_1); - const v128_t x0_1h = wasm_i16x8_extend_high_i8x16(x0_1); - - const v128_t y0_0l = wasm_i16x8_extend_low_i8x16(y0_0); - const v128_t y0_0h = wasm_i16x8_extend_high_i8x16(y0_0); - const v128_t y0_1l = wasm_i16x8_extend_low_i8x16(y0_1); - const v128_t y0_1h = wasm_i16x8_extend_high_i8x16(y0_1); - - // Compute dot products - const v128_t dx0_0 = wasm_i32x4_dot_i16x8(x0_0l, y0_0l); - const v128_t dx0_1 = wasm_i32x4_dot_i16x8(x0_0h, y0_0h); - const v128_t dx1_0 = wasm_i32x4_dot_i16x8(x0_1l, y0_1l); - const v128_t dx1_1 = wasm_i32x4_dot_i16x8(x0_1h, y0_1h); - - // Sum all dot products - const v128_t sum_dots = wasm_i32x4_add(wasm_i32x4_add(dx0_0, dx0_1), wasm_i32x4_add(dx1_0, dx1_1)); - - // Convert to float and accumulate - const float scale = GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d); - sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(sum_dots), wasm_f32x4_splat(scale))); - } - - sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + - wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3); -#elif defined(__AVX2__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - - // Main loop - for (; ib < nb; ++ib) { - // Compute combined scale for the block - const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); - __m256i qx = _mm256_loadu_si256((const __m256i *)x[ib].qs); - __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs); - - const __m256 q = mul_sum_i8_pairs_float(qx, qy); - - // Multiply q with scale and accumulate - acc = _mm256_fmadd_ps( d, q, acc ); - } - - sumf = hsum_float_8(acc); -#elif defined(__AVX__) - __m256 accum = _mm256_setzero_ps(); - - for (; ib + 1 < nb; ib += 2) { - const __m128i qx_1_0 = _mm_loadu_si128((const __m128i *)x[ib].qs); - const __m128i qx_1_1 = _mm_loadu_si128((const __m128i *)x[ib].qs + 1); - const __m128i qx_2_0 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs); - const __m128i qx_2_1 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs + 1); - const __m128i qy_1_0 = _mm_loadu_si128((const __m128i *)y[ib].qs); - const __m128i qy_1_1 = _mm_loadu_si128((const __m128i *)y[ib].qs + 1); - const __m128i qy_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs); - const __m128i qy_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1); - - const __m256 p = mul_sum_i8_quad_float(qx_1_0, qx_1_1, qx_2_0, 
qx_2_1, qy_1_0, qy_1_1, qy_2_0, qy_2_1); - const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d); - accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum); - } - - sumf = hsum_float_8(accum); -#elif defined(__riscv_v) - size_t vl = qk; - - for (; ib < nb; ++ib) { - // load elements - vint8m2_t bx_0 = __riscv_vle8_v_i8m2(x[ib].qs, vl); - vint8m2_t by_0 = __riscv_vle8_v_i8m2(y[ib].qs, vl); - - vint16m4_t vw_mul = __riscv_vwmul_vv_i16m4(bx_0, by_0, vl); - - vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl); - vint32m1_t v_sum = __riscv_vwredsum_vs_i16m4_i32m1(vw_mul, v_zero, vl); - - int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum); - - sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)); - } -#elif defined(__POWER9_VECTOR__) - const vector signed int v0 = vec_splats((int32_t)0); - vector float vsumf0 = vec_splats(0.0f); - -#pragma GCC unroll 8 - for (; ib < nb; ++ib) { - __builtin_prefetch(x[ib].qs, 0, 1); - __builtin_prefetch(y[ib].qs, 0, 1); - - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d)); - vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d)); - vector float vd = vec_mul(vxd, vyd); - - vector signed char q8x0 = vec_xl( 0, x[ib].qs); - vector signed char q8x1 = vec_xl(16, x[ib].qs); - vector signed char q8y0 = vec_xl( 0, y[ib].qs); - vector signed char q8y1 = vec_xl(16, y[ib].qs); - - vector signed short qv0 = vec_mule(q8x0, q8y0); - vector signed short qv1 = vec_mulo(q8x0, q8y0); - vector signed short qv2 = vec_mule(q8x1, q8y1); - vector signed short qv3 = vec_mulo(q8x1, q8y1); - - vector signed int vsumi0 = v0; - vector signed int vsumi1 = v0; - - vsumi0 = vec_sum4s(qv0, vsumi0); - vsumi1 = vec_sum4s(qv1, vsumi1); - vsumi0 = vec_sum4s(qv2, vsumi0); - vsumi1 = vec_sum4s(qv3, vsumi1); - - vsumi0 = vec_add(vsumi0, vsumi1); - - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); - } - - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - - sumf = vec_extract(vsumf0, 0); - -#elif defined(__loongarch_asx) - // Initialize accumulator with zeros - __m256 acc = (__m256)__lasx_xvldi(0); - - // Main loop - for (; ib < nb; ++ib) { - // Compute combined scale for the block - const __m256 d = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); - __m256i qx = __lasx_xvld((const __m256i *)x[ib].qs, 0); - __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0); - - const __m256 q = mul_sum_i8_pairs_float(qx, qy); - - // Multiply q with scale and accumulate - acc = __lasx_xvfmadd_s( d, q, acc ); - } - - sumf = hsum_float_8(acc); -#elif defined(__VXE__) || defined(__VXE2__) - __vector float acc = vec_splats(0.0f); - -#pragma GCC unroll 8 - for (; ib < nb; ++ib) { - __builtin_prefetch(x[ib].qs, 0, 1); - __builtin_prefetch(y[ib].qs, 0, 1); - - const int8x16_t v_xl = vec_xl(0 , x[ib].qs); - const int8x16_t v_xh = vec_xl(QK8_0/2, x[ib].qs); - const int8x16_t v_yl = vec_xl(0 , y[ib].qs); - const int8x16_t v_yh = vec_xl(QK8_0/2, y[ib].qs); - - const int32x4_t v_xy_ = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh); - const float32x4_t v_xy = vec_float(v_xy_); - const float32x4_t v_d = vec_splats(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); - - acc = vec_madd(v_xy, v_d, acc); - } - - sumf = acc[0] + acc[1] + acc[2] + acc[3]; -#endif - for (; ib < nb; ++ib) { - int sumi = 0; - - for (int j = 0; j < qk; j++) { - sumi += x[ib].qs[j]*y[ib].qs[j]; - } - - sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)); - } - - *s = 
sumf; -} - -void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_tq1_0 * GGML_RESTRICT x = vx; - const block_q8_K * GGML_RESTRICT y = vy; - - const int nb = n / QK_K; - -#if defined(__ARM_NEON) - float sumf = 0.0f; - - uint8_t k_shift[16] = {1, 1, 1, 1, 3, 3, 3, 3, 9, 9, 9, 9, 27, 27, 27, 27}; - - const uint8x16_t shift = vld1q_u8(k_shift); - - for (int i = 0; i < nb; ++i) { -#if defined(__ARM_FEATURE_DOTPROD) - int32x4_t sumi0 = vdupq_n_s32(0); - int32x4_t sumi1 = vdupq_n_s32(0); -#else - int16x8_t sumi0 = vdupq_n_s16(0); - int16x8_t sumi1 = vdupq_n_s16(0); -#endif - - // first 32 bytes of 5 elements - { - uint8x16_t qx0 = vld1q_u8(x[i].qs + 0); - uint8x16_t qx1 = vld1q_u8(x[i].qs + 16); - uint8x16_t qx2 = vmulq_u8(qx0, vdupq_n_u8(3)); - uint8x16_t qx3 = vmulq_u8(qx1, vdupq_n_u8(3)); - uint8x16_t qx4 = vmulq_u8(qx0, vdupq_n_u8(9)); - uint8x16_t qx5 = vmulq_u8(qx1, vdupq_n_u8(9)); - uint8x16_t qx6 = vmulq_u8(qx0, vdupq_n_u8(27)); - uint8x16_t qx7 = vmulq_u8(qx1, vdupq_n_u8(27)); - uint8x16_t qx8 = vmulq_u8(qx0, vdupq_n_u8(81)); - uint8x16_t qx9 = vmulq_u8(qx1, vdupq_n_u8(81)); - - // multiply by 3 and keep the 2 bits above 8 bits - int8x16_t sqx0 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx0, vshrq_n_u8(qx0, 1)), 6)); - int8x16_t sqx1 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx1, vshrq_n_u8(qx1, 1)), 6)); - int8x16_t sqx2 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx2, vshrq_n_u8(qx2, 1)), 6)); - int8x16_t sqx3 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx3, vshrq_n_u8(qx3, 1)), 6)); - int8x16_t sqx4 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx4, vshrq_n_u8(qx4, 1)), 6)); - int8x16_t sqx5 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx5, vshrq_n_u8(qx5, 1)), 6)); - int8x16_t sqx6 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx6, vshrq_n_u8(qx6, 1)), 6)); - int8x16_t sqx7 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx7, vshrq_n_u8(qx7, 1)), 6)); - int8x16_t sqx8 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx8, vshrq_n_u8(qx8, 1)), 6)); - int8x16_t sqx9 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx9, vshrq_n_u8(qx9, 1)), 6)); - - const int8x16_t qy0 = vld1q_s8(y[i].qs + 0); - const int8x16_t qy1 = vld1q_s8(y[i].qs + 16); - const int8x16_t qy2 = vld1q_s8(y[i].qs + 32); - const int8x16_t qy3 = vld1q_s8(y[i].qs + 48); - const int8x16_t qy4 = vld1q_s8(y[i].qs + 64); - const int8x16_t qy5 = vld1q_s8(y[i].qs + 80); - const int8x16_t qy6 = vld1q_s8(y[i].qs + 96); - const int8x16_t qy7 = vld1q_s8(y[i].qs + 112); - const int8x16_t qy8 = vld1q_s8(y[i].qs + 128); - const int8x16_t qy9 = vld1q_s8(y[i].qs + 144); - -#if defined(__ARM_FEATURE_DOTPROD) - sumi0 = vdotq_s32(sumi0, sqx0, qy0); - sumi1 = vdotq_s32(sumi1, sqx1, qy1); - sumi0 = vdotq_s32(sumi0, sqx2, qy2); - sumi1 = vdotq_s32(sumi1, sqx3, qy3); - sumi0 = vdotq_s32(sumi0, sqx4, qy4); - sumi1 = vdotq_s32(sumi1, sqx5, qy5); - sumi0 = vdotq_s32(sumi0, sqx6, qy6); - sumi1 = vdotq_s32(sumi1, sqx7, qy7); - sumi0 = vdotq_s32(sumi0, sqx8, qy8); - sumi1 = vdotq_s32(sumi1, sqx9, qy9); -#else - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx0), vget_low_s8(qy0)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx0), vget_high_s8(qy0)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx1), vget_low_s8(qy1)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx1), vget_high_s8(qy1)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx2), vget_low_s8(qy2)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx2), 
vget_high_s8(qy2)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx3), vget_low_s8(qy3)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx3), vget_high_s8(qy3)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx4), vget_low_s8(qy4)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx4), vget_high_s8(qy4)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx5), vget_low_s8(qy5)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx5), vget_high_s8(qy5)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx6), vget_low_s8(qy6)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx6), vget_high_s8(qy6)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx7), vget_low_s8(qy7)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx7), vget_high_s8(qy7)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx8), vget_low_s8(qy8)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx8), vget_high_s8(qy8)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx9), vget_low_s8(qy9)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx9), vget_high_s8(qy9)); -#endif - } - - // last 16 bytes of 5-element, along with the 4 bytes of 4 elements - { - uint8x16_t qx0 = vld1q_u8(x[i].qs + 32); - uint8x16_t qx1 = vmulq_u8(qx0, vdupq_n_u8(3)); - uint8x16_t qx2 = vmulq_u8(qx0, vdupq_n_u8(9)); - uint8x16_t qx3 = vmulq_u8(qx0, vdupq_n_u8(27)); - uint8x16_t qx4 = vmulq_u8(qx0, vdupq_n_u8(81)); - uint32_t qh; - memcpy(&qh, x[i].qh, sizeof(qh)); // potentially unaligned - uint8x16_t qx5 = vreinterpretq_u8_u32(vdupq_n_u32(qh)); - qx5 = vmulq_u8(qx5, shift); - - // multiply by 3 and keep the 2 bits above 8 bits - int8x16_t sqx0 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx0, vshrq_n_u8(qx0, 1)), 6)); - int8x16_t sqx1 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx1, vshrq_n_u8(qx1, 1)), 6)); - int8x16_t sqx2 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx2, vshrq_n_u8(qx2, 1)), 6)); - int8x16_t sqx3 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx3, vshrq_n_u8(qx3, 1)), 6)); - int8x16_t sqx4 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx4, vshrq_n_u8(qx4, 1)), 6)); - int8x16_t sqx5 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx5, vshrq_n_u8(qx5, 1)), 6)); - - const int8x16_t qy0 = vld1q_s8(y[i].qs + 160); - const int8x16_t qy1 = vld1q_s8(y[i].qs + 176); - const int8x16_t qy2 = vld1q_s8(y[i].qs + 192); - const int8x16_t qy3 = vld1q_s8(y[i].qs + 208); - const int8x16_t qy4 = vld1q_s8(y[i].qs + 224); - const int8x16_t qy5 = vld1q_s8(y[i].qs + 240); - -#if defined(__ARM_FEATURE_DOTPROD) - sumi0 = vdotq_s32(sumi0, sqx0, qy0); - sumi1 = vdotq_s32(sumi1, sqx1, qy1); - sumi0 = vdotq_s32(sumi0, sqx2, qy2); - sumi1 = vdotq_s32(sumi1, sqx3, qy3); - sumi0 = vdotq_s32(sumi0, sqx4, qy4); - sumi1 = vdotq_s32(sumi1, sqx5, qy5); -#else - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx0), vget_low_s8(qy0)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx0), vget_high_s8(qy0)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx1), vget_low_s8(qy1)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx1), vget_high_s8(qy1)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx2), vget_low_s8(qy2)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx2), vget_high_s8(qy2)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx3), vget_low_s8(qy3)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx3), vget_high_s8(qy3)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx4), vget_low_s8(qy4)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx4), vget_high_s8(qy4)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx5), vget_low_s8(qy5)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx5), vget_high_s8(qy5)); -#endif - } - - const int16x8_t ysum0 = vld1q_s16(y[i].bsums); - const int16x8_t ysum1 = vld1q_s16(y[i].bsums + 8); - - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - 
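// For reference: tq1_0 packs five ternary digits {0,1,2} into one byte,
// scaled so that multiplying the byte by 3^k rotates digit k into the top
// bits (hence the pow3 multiplies here and in the scalar fallback below).
// The extraction relies on the identity, exact for q in [0,255]:
//     ((uint16_t) q * 3) >> 8  ==  ((q + (q >> 1)) >> 1) >> 6
// so the vhaddq_u8/vshrq_n_u8 sequence above computes (3*q) >> 8 without
// leaving 8 bits. The -1 offset that maps {0,1,2} onto {-1,0,1} is not
// applied per element; instead sum(y) is subtracted once per block via
// y[i].bsums just below.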
-#if defined(__ARM_FEATURE_DOTPROD) - sumi0 = vaddq_s32(sumi0, sumi1); - sumi0 = vsubq_s32(sumi0, vpaddlq_s16(vaddq_s16(ysum0, ysum1))); - - sumf += d * (float) vaddvq_s32(sumi0); -#else - sumi0 = vaddq_s16(sumi0, sumi1); - sumi0 = vsubq_s16(sumi0, vaddq_s16(ysum0, ysum1)); - - sumf += d * (float) vaddlvq_s16(sumi0); -#endif - } - - *s = sumf; - -#elif defined(__AVX2__) - __m256 sumf = _mm256_setzero_ps(); - - for (int i = 0; i < nb; ++i) { - // 16-bit sums - __m256i sumi0 = _mm256_setzero_si256(); - __m256i sumi1 = _mm256_setzero_si256(); - __m256i sumi2 = _mm256_setzero_si256(); - - // first 32 bytes of 5 elements - { - __m256i qx0 = _mm256_loadu_si256((const __m256i *) (x[i].qs)); - // 8-bit multiplies with shifts, masks and adds - __m256i qx1 = _mm256_add_epi8(qx0, _mm256_add_epi8(qx0, qx0)); // 1 * 3 - __m256i qx2 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx0, 3), _mm256_set1_epi8(-8)), qx0); // 1 * 9 - __m256i qx3 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx1, 3), _mm256_set1_epi8(-8)), qx1); // 3 * 9 - __m256i qx4 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx2, 3), _mm256_set1_epi8(-8)), qx2); // 9 * 9 - - // TODO: can _mm256_mulhi_epu16 be faster even if 16-bits? - - // Cancel the +1 from avg so that it behaves like a halving add - qx0 = _mm256_subs_epu8(qx0, _mm256_set1_epi8(1)); - qx1 = _mm256_subs_epu8(qx1, _mm256_set1_epi8(1)); - qx2 = _mm256_subs_epu8(qx2, _mm256_set1_epi8(1)); - qx3 = _mm256_subs_epu8(qx3, _mm256_set1_epi8(1)); - qx4 = _mm256_subs_epu8(qx4, _mm256_set1_epi8(1)); - // Multiply by 3 and get the top 2 bits - qx0 = _mm256_avg_epu8(qx0, _mm256_avg_epu8(qx0, _mm256_setzero_si256())); - qx1 = _mm256_avg_epu8(qx1, _mm256_avg_epu8(qx1, _mm256_setzero_si256())); - qx2 = _mm256_avg_epu8(qx2, _mm256_avg_epu8(qx2, _mm256_setzero_si256())); - qx3 = _mm256_avg_epu8(qx3, _mm256_avg_epu8(qx3, _mm256_setzero_si256())); - qx4 = _mm256_avg_epu8(qx4, _mm256_avg_epu8(qx4, _mm256_setzero_si256())); - qx0 = _mm256_and_si256(_mm256_srli_epi16(qx0, 6), _mm256_set1_epi8(3)); - qx1 = _mm256_and_si256(_mm256_srli_epi16(qx1, 6), _mm256_set1_epi8(3)); - qx2 = _mm256_and_si256(_mm256_srli_epi16(qx2, 6), _mm256_set1_epi8(3)); - qx3 = _mm256_and_si256(_mm256_srli_epi16(qx3, 6), _mm256_set1_epi8(3)); - qx4 = _mm256_and_si256(_mm256_srli_epi16(qx4, 6), _mm256_set1_epi8(3)); - - const __m256i qy0 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 0)); - const __m256i qy1 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 32)); - const __m256i qy2 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 64)); - const __m256i qy3 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 96)); - const __m256i qy4 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 128)); - - qx0 = _mm256_maddubs_epi16(qx0, qy0); - qx1 = _mm256_maddubs_epi16(qx1, qy1); - qx2 = _mm256_maddubs_epi16(qx2, qy2); - qx3 = _mm256_maddubs_epi16(qx3, qy3); - qx4 = _mm256_maddubs_epi16(qx4, qy4); - - sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(qx0, qx1)); - sumi1 = _mm256_add_epi16(sumi1, _mm256_add_epi16(qx2, qx3)); - sumi2 = _mm256_add_epi16(sumi2, qx4); - } - - // last 16 bytes of 5-element, along with the 4 bytes of 4 elements - { - __m128i qx0 = _mm_loadu_si128((const __m128i *) (x[i].qs + 32)); - uint32_t qh; - memcpy(&qh, x[i].qh, sizeof(qh)); // potentially unaligned - __m256i qx5_l = _mm256_cvtepu8_epi16(_mm_set1_epi32(qh)); - __m128i qx1 = _mm_add_epi8(qx0, _mm_add_epi8(qx0, qx0)); // 1 * 3 - __m128i qx2 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx0, 3), _mm_set1_epi8(-8)), qx0); // 1 * 9 - 
__m128i qx3 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx1, 3), _mm_set1_epi8(-8)), qx1); // 3 * 9 - __m128i qx4 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx2, 3), _mm_set1_epi8(-8)), qx2); // 9 * 9 - __m256i qx01 = MM256_SET_M128I(qx1, qx0); - __m256i qx23 = MM256_SET_M128I(qx3, qx2); - - // avx2 does not have 8-bit multiplies, so 16-bit it is. - qx5_l = _mm256_mullo_epi16(qx5_l, _mm256_set_epi16(27, 27, 27, 27, 9, 9, 9, 9, 3, 3, 3, 3, 1, 1, 1, 1)); - qx5_l = _mm256_and_si256(qx5_l, _mm256_set1_epi16(0xFF)); - __m128i qx5 = _mm_packus_epi16(_mm256_castsi256_si128(qx5_l), _mm256_extracti128_si256(qx5_l, 1)); - - __m256i qx45 = MM256_SET_M128I(qx5, qx4); - - // Cancel the +1 from avg so that it behaves like a halving add - qx01 = _mm256_subs_epu8(qx01, _mm256_set1_epi8(1)); - qx23 = _mm256_subs_epu8(qx23, _mm256_set1_epi8(1)); - qx45 = _mm256_subs_epu8(qx45, _mm256_set1_epi8(1)); - // Multiply by 3 and get the top 2 bits - qx01 = _mm256_avg_epu8(qx01, _mm256_avg_epu8(qx01, _mm256_setzero_si256())); - qx23 = _mm256_avg_epu8(qx23, _mm256_avg_epu8(qx23, _mm256_setzero_si256())); - qx45 = _mm256_avg_epu8(qx45, _mm256_avg_epu8(qx45, _mm256_setzero_si256())); - qx01 = _mm256_and_si256(_mm256_srli_epi16(qx01, 6), _mm256_set1_epi8(3)); - qx23 = _mm256_and_si256(_mm256_srli_epi16(qx23, 6), _mm256_set1_epi8(3)); - qx45 = _mm256_and_si256(_mm256_srli_epi16(qx45, 6), _mm256_set1_epi8(3)); - - const __m256i qy01 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 160)); - const __m256i qy23 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 192)); - const __m256i qy45 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 224)); - - qx01 = _mm256_maddubs_epi16(qx01, qy01); - qx23 = _mm256_maddubs_epi16(qx23, qy23); - qx45 = _mm256_maddubs_epi16(qx45, qy45); - - sumi0 = _mm256_add_epi16(sumi0, qx01); - sumi1 = _mm256_add_epi16(sumi1, qx23); - sumi2 = _mm256_add_epi16(sumi2, qx45); - } - - const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums); - const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d)); - - sumi0 = _mm256_sub_epi16(sumi0, ysum); - sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(sumi1, sumi2)); - sumi0 = _mm256_madd_epi16(sumi0, _mm256_set1_epi16(1)); - - sumf = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(sumi0), d), sumf); - } - - *s = hsum_float_8(sumf); - -#else - const uint8_t pow3[6] = {1, 3, 9, 27, 81, 243}; - - float sumf = 0.0f; - - for (int i = 0; i < nb; ++i) { - int sum = 0; - - for (size_t j = 0; j < sizeof(x->qs) - sizeof(x->qs) % 32; j += 32) { - for (size_t l = 0; l < 5; ++l) { - for (size_t m = 0; m < 32; ++m) { - uint8_t q = x[i].qs[j + m] * pow3[l]; - uint16_t xi = ((uint16_t) q * 3) >> 8; - sum += (xi - 1) * y[i].qs[j*5 + l*32 + m]; - } - } - } - for (size_t j = sizeof(x->qs) - sizeof(x->qs) % 32; j < sizeof(x->qs); j += 16) { - for (size_t l = 0; l < 5; ++l) { - for (size_t m = 0; m < 16; ++m) { - uint8_t q = x[i].qs[j + m] * pow3[l]; - uint16_t xi = ((uint16_t) q * 3) >> 8; - sum += (xi - 1) * y[i].qs[j*5 + l*16 + m]; - } - } - } - - for (size_t l = 0; l < 4; ++l) { - for (size_t j = 0; j < sizeof(x->qh); ++j) { - uint8_t q = x[i].qh[j] * pow3[l]; - uint16_t xi = ((uint16_t) q * 3) >> 8; - sum += (xi - 1) * y[i].qs[sizeof(x->qs)*5 + l*sizeof(x->qh) + j]; - } - } - - sumf += (float) sum * (GGML_FP16_TO_FP32(x[i].d) * y[i].d); - } - - *s = sumf; -#endif -} - -void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - assert(nrc == 1); - 
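// For reference: tq2_0 packs four 2-bit values per byte (only 0..2 are used,
// never 3), so the 256 elements of a block fit in 64 bytes, and value l of
// byte k is (qs[k] >> (2*l)) & 3, as in the scalar fallback below. Like
// tq1_0, the stored values are offset by +1 relative to the signed digits,
// and the correction uses y[i].bsums once per block:
//     sum_i (q_i - 1) * y_i  ==  sum_i q_i * y_i  -  sum_i y_i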
UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_tq2_0 * GGML_RESTRICT x = vx; - const block_q8_K * GGML_RESTRICT y = vy; - - const int nb = n / QK_K; - -#if defined(__ARM_NEON) - float sumf = 0.0f; - - const uint8x16_t m3 = vdupq_n_u8(3); - - for (int i = 0; i < nb; ++i) { -#if defined(__ARM_FEATURE_DOTPROD) - int32x4_t sumi0 = vdupq_n_s32(0); - int32x4_t sumi1 = vdupq_n_s32(0); -#else - int16x8_t sumi0 = vdupq_n_s16(0); - int16x8_t sumi1 = vdupq_n_s16(0); -#endif - - for (size_t j = 0; j < sizeof(x->qs); j += 32) { - uint8x16_t qx0 = vld1q_u8(x[i].qs + j); - uint8x16_t qx1 = vld1q_u8(x[i].qs + j + 16); - uint8x16_t qx2 = vshrq_n_u8(qx0, 2); - uint8x16_t qx3 = vshrq_n_u8(qx1, 2); - uint8x16_t qx4 = vshrq_n_u8(qx0, 4); - uint8x16_t qx5 = vshrq_n_u8(qx1, 4); - uint8x16_t qx6 = vshrq_n_u8(qx0, 6); - uint8x16_t qx7 = vshrq_n_u8(qx1, 6); - - int8x16_t sqx0 = vreinterpretq_s8_u8(vandq_u8(qx0, m3)); - int8x16_t sqx1 = vreinterpretq_s8_u8(vandq_u8(qx1, m3)); - int8x16_t sqx2 = vreinterpretq_s8_u8(vandq_u8(qx2, m3)); - int8x16_t sqx3 = vreinterpretq_s8_u8(vandq_u8(qx3, m3)); - int8x16_t sqx4 = vreinterpretq_s8_u8(vandq_u8(qx4, m3)); - int8x16_t sqx5 = vreinterpretq_s8_u8(vandq_u8(qx5, m3)); - int8x16_t sqx6 = vreinterpretq_s8_u8(vandq_u8(qx6, m3)); - int8x16_t sqx7 = vreinterpretq_s8_u8(vandq_u8(qx7, m3)); - - const int8x16_t qy0 = vld1q_s8(y[i].qs + j*4 + 0); - const int8x16_t qy1 = vld1q_s8(y[i].qs + j*4 + 16); - const int8x16_t qy2 = vld1q_s8(y[i].qs + j*4 + 32); - const int8x16_t qy3 = vld1q_s8(y[i].qs + j*4 + 48); - const int8x16_t qy4 = vld1q_s8(y[i].qs + j*4 + 64); - const int8x16_t qy5 = vld1q_s8(y[i].qs + j*4 + 80); - const int8x16_t qy6 = vld1q_s8(y[i].qs + j*4 + 96); - const int8x16_t qy7 = vld1q_s8(y[i].qs + j*4 + 112); - -#if defined(__ARM_FEATURE_DOTPROD) - sumi0 = vdotq_s32(sumi0, sqx0, qy0); - sumi1 = vdotq_s32(sumi1, sqx1, qy1); - sumi0 = vdotq_s32(sumi0, sqx2, qy2); - sumi1 = vdotq_s32(sumi1, sqx3, qy3); - sumi0 = vdotq_s32(sumi0, sqx4, qy4); - sumi1 = vdotq_s32(sumi1, sqx5, qy5); - sumi0 = vdotq_s32(sumi0, sqx6, qy6); - sumi1 = vdotq_s32(sumi1, sqx7, qy7); -#else - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx0), vget_low_s8(qy0)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx0), vget_high_s8(qy0)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx1), vget_low_s8(qy1)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx1), vget_high_s8(qy1)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx2), vget_low_s8(qy2)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx2), vget_high_s8(qy2)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx3), vget_low_s8(qy3)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx3), vget_high_s8(qy3)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx4), vget_low_s8(qy4)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx4), vget_high_s8(qy4)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx5), vget_low_s8(qy5)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx5), vget_high_s8(qy5)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx6), vget_low_s8(qy6)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx6), vget_high_s8(qy6)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx7), vget_low_s8(qy7)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx7), vget_high_s8(qy7)); -#endif - } - - const int16x8_t ysum0 = vld1q_s16(y[i].bsums); - const int16x8_t ysum1 = vld1q_s16(y[i].bsums + 8); - - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - -#if defined(__ARM_FEATURE_DOTPROD) - sumi0 = vaddq_s32(sumi0, sumi1); - sumi0 = vsubq_s32(sumi0, vpaddlq_s16(vaddq_s16(ysum0, ysum1))); - - sumf += d * (float) vaddvq_s32(sumi0); -#else - sumi0 = 
vaddq_s16(sumi0, sumi1); - sumi0 = vsubq_s16(sumi0, vaddq_s16(ysum0, ysum1)); - - sumf += d * (float) vaddlvq_s16(sumi0); -#endif - } - - *s = sumf; - -#elif defined(__AVX2__) - __m256 sumf = _mm256_setzero_ps(); - - for (int i = 0; i < nb; ++i) { - // 16-bit sums, because 256*127 still fits - __m256i sumi0 = _mm256_setzero_si256(); - __m256i sumi1 = _mm256_setzero_si256(); - - for (size_t j = 0; j < sizeof(x->qs); j += 32) { - __m256i qx0 = _mm256_loadu_si256((const __m256i *) (x[i].qs + j)); - __m256i qx1 = _mm256_srli_epi16(qx0, 2); - __m256i qx2 = _mm256_srli_epi16(qx0, 4); - __m256i qx3 = _mm256_srli_epi16(qx0, 6); - - // 0, 1, 2 (should not be 3) - qx0 = _mm256_and_si256(qx0, _mm256_set1_epi8(3)); - qx1 = _mm256_and_si256(qx1, _mm256_set1_epi8(3)); - qx2 = _mm256_and_si256(qx2, _mm256_set1_epi8(3)); - qx3 = _mm256_and_si256(qx3, _mm256_set1_epi8(3)); - - const __m256i qy0 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 0)); - const __m256i qy1 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 32)); - const __m256i qy2 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 64)); - const __m256i qy3 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 96)); - - qx0 = _mm256_maddubs_epi16(qx0, qy0); - qx1 = _mm256_maddubs_epi16(qx1, qy1); - qx2 = _mm256_maddubs_epi16(qx2, qy2); - qx3 = _mm256_maddubs_epi16(qx3, qy3); - - sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(qx0, qx1)); - sumi1 = _mm256_add_epi16(sumi1, _mm256_add_epi16(qx2, qx3)); - } - - const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums); - const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d)); - - sumi0 = _mm256_add_epi16(sumi0, sumi1); - sumi0 = _mm256_sub_epi16(sumi0, ysum); - sumi0 = _mm256_madd_epi16(sumi0, _mm256_set1_epi16(1)); - - sumf = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(sumi0), d), sumf); - } - - *s = hsum_float_8(sumf); - -#else - float sumf = 0.0f; - - for (int i = 0; i < nb; ++i) { - int32_t sumi = 0; - - for (size_t j = 0; j < sizeof(x->qs); j += 32) { - for (size_t l = 0; l < 4; ++l) { - for (size_t k = 0; k < 32; ++k) { - sumi += y[i].qs[j*4 + l*32 + k] * (((x[i].qs[j + k] >> (l*2)) & 3) - 1); - } - } - } - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - - sumf += (float) sumi * d; - } - - *s = sumf; -#endif -} - -void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_q2_K * GGML_RESTRICT x = vx; - const block_q8_K * GGML_RESTRICT y = vy; - - const int nb = n / QK_K; - -#ifdef __ARM_FEATURE_SVE - const int vector_length = svcntb()*8; - const svuint8_t m3s = svdup_n_u8(0x3); - const svuint32_t m4s = svdup_n_u32(0xF); - const svint32_t vzero_sv = svdup_n_s32(0); - svfloat32_t acc_sum = svdup_n_f32(0); - svbool_t pred_s32 = svptrue_pat_b32(SV_VL4); - - switch (vector_length) { - case 128: - for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - svfloat32_t d_broad = svdup_n_f32((float32_t)d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - svfloat32_t dmin_broad = svdup_n_f32((float32_t)dmin); - - const uint8_t * GGML_RESTRICT q2 = x[i].qs; - const int8_t * GGML_RESTRICT q8_sv = y[i].qs; - const uint8_t * GGML_RESTRICT sc = x[i].scales; - - svuint32_t mins_and_scales_sve = svld1ub_u32(svptrue_b32(), sc); - const svint32_t mins_sv_1 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), 
mins_and_scales_sve, 4)); - - mins_and_scales_sve = svld1ub_u32(svptrue_b32(), sc+4); - const svint32_t mins_sv_2 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), mins_and_scales_sve, 4)); - - svint32_t q8sums_sv_1 = svld1sh_s32(svptrue_b32(), y[i].bsums); - svint32_t q8sums_sv_2 = svld1sh_s32(svptrue_b32(), y[i].bsums+4); - - const svint32_t s0 = svadd_s32_x(svptrue_b32(), svmul_s32_x(svptrue_b32(), mins_sv_1, q8sums_sv_1), svmul_s32_x(svptrue_b32(), mins_sv_2, q8sums_sv_2)); - - mins_and_scales_sve = svld1ub_u32(svptrue_b32(), sc+8); - const svint32_t mins_sv_3 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), mins_and_scales_sve, 4)); - - mins_and_scales_sve = svld1ub_u32(svptrue_b32(), sc+12); - const svint32_t mins_sv_4 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), mins_and_scales_sve, 4)); - - q8sums_sv_1 = svld1sh_s32(svptrue_b32(), y[i].bsums+8); - q8sums_sv_2 = svld1sh_s32(svptrue_b32(), y[i].bsums+12); - - svint32_t s1 = svadd_s32_x(svptrue_b32(), svmul_s32_x(svptrue_b32(), mins_sv_3, q8sums_sv_1), svmul_s32_x(svptrue_b32(), mins_sv_4, q8sums_sv_2)); - - svfloat32_t temp = svcvt_f32_s32_x(svptrue_b32(), svadd_s32_x(svptrue_b32(), s0, s1)); - - acc_sum = svmla_f32_m(svptrue_b32(), acc_sum, temp, dmin_broad); - - svint32_t sumi1 = svdup_n_s32(0); - - { - const svuint8_t q2bits_1 = svld1_u8(svptrue_b8(), q2); - svint8_t q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), q2bits_1, m3s)); - svint8_t q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - const svint32_t scales_sv = svreinterpret_s32_u32(svand_u32_m(svptrue_b32(), svld1ub_u32(svptrue_b32(), sc), m4s)); - - sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv, 0)); - - const svuint8_t q2bits_3 = svld1_u8(svptrue_b8(), q2+16); - q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), q2bits_3, m3s)); - q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - - sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv, 1)); - - q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_1, 2), m3s)); - q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - - sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv, 2)); - - q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_3, 2), m3s)); - q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - - sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv, 3)); - - - const svint32_t scales_sv_1 = svreinterpret_s32_u32(svand_u32_m(svptrue_b32(), svld1ub_u32(svptrue_b32(), sc+4), m4s)); - - q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_1, 4), m3s)); - q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - - sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_1, 0)); - - q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_3, 4), m3s)); - q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - - sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_1, 1)); - - q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_1, 6), m3s)); - q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - - sumi1 = 
svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_1, 2)); - - q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_3, 6), m3s)); - q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - - sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_1, 3)); - - //------------------------------- - - q2 += 32; - const svint32_t scales_sv_2 = svreinterpret_s32_u32(svand_u32_m(svptrue_b32(), svld1ub_u32(svptrue_b32(), sc+8), m4s)); - const svuint8_t q2bits_2 = svld1_u8(svptrue_b8(), q2); - - q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), q2bits_2, m3s)); - q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - - sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_2, 0)); - - const svuint8_t q2bits_4 = svld1_u8(svptrue_b8(), q2+16); - q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), q2bits_4, m3s)); - q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - - sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_2, 1)); - - - q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_2, 2), m3s)); - q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - - sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_2, 2)); - - q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_4, 2), m3s)); - q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - - sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_2, 3)); - - - const svint32_t scales_sv_3 = svreinterpret_s32_u32(svand_u32_m(svptrue_b32(), svld1ub_u32(svptrue_b32(), sc+12), m4s)); - - q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_2, 4), m3s)); - q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - - sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_3, 0)); - - q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_4, 4), m3s)); - q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - - sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_3, 1)); - - - - q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_2, 6), m3s)); - q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - - sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_3, 2)); - - q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_4, 6), m3s)); - q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - - sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_3, 3)); - } - acc_sum = svmla_f32_m(svptrue_b32(), acc_sum, svcvt_f32_s32_x(svptrue_b32(), sumi1), d_broad); - } - *s = svaddv_f32(svptrue_b32(), acc_sum); - break; - - case 256: - case 512: - for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - svfloat32_t d_broad = svdup_n_f32((float32_t)d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - svfloat32_t dmin_broad = svdup_n_f32((float32_t)dmin); - - 
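// The 128-bit and 256/512-bit SVE paths above (and every other SIMD branch
// below) rely on the same layout fact: each byte of x[i].qs packs four 2-bit
// quants, and the shift amounts 0/2/4/6 peel them off one bit-plane at a time
// before the dot products. A minimal scalar sketch of that unpacking follows;
// unpack_q2_bitplanes is a hypothetical helper for illustration only, not
// part of this patch:
#include <stdint.h>

static inline void unpack_q2_bitplanes(const uint8_t * qs, int n,
                                       uint8_t * p0, uint8_t * p1,
                                       uint8_t * p2, uint8_t * p3) {
    for (int k = 0; k < n; ++k) {
        const uint8_t b = qs[k];
        p0[k] = (b >> 0) & 3; // bits 0..1, matches the unshifted svand with m3s
        p1[k] = (b >> 2) & 3; // bits 2..3, matches svlsr_n_u8_x(..., 2)
        p2[k] = (b >> 4) & 3; // bits 4..5, matches svlsr_n_u8_x(..., 4)
        p3[k] = (b >> 6) & 3; // bits 6..7, matches svlsr_n_u8_x(..., 6)
    }
}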
const uint8_t * GGML_RESTRICT q2 = x[i].qs; - const int8_t * GGML_RESTRICT q8_sv = y[i].qs; - const uint8_t * GGML_RESTRICT sc = x[i].scales; - - const svuint32_t mins_and_scales_sve = svld1ub_u32(svptrue_pat_b32(SV_VL8), sc); sc += 8; - const svint32_t scales_sv = svreinterpret_s32_u32(svand_u32_m(svptrue_pat_b32(SV_VL8), mins_and_scales_sve, m4s)); - const svint32_t mins_sv_1 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_pat_b32(SV_VL8), mins_and_scales_sve, 4)); - svint32_t q8sums_sv_1 = svld1sh_s32(svptrue_pat_b32(SV_VL8), y[i].bsums); - - const svuint32_t mins_and_scales_sve_1 = svld1ub_u32(svptrue_pat_b32(SV_VL8), sc); - const svint32_t scales_sv_1 = svreinterpret_s32_u32(svand_u32_m(svptrue_pat_b32(SV_VL8), mins_and_scales_sve_1, m4s)); - const svint32_t mins_sv_2 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_pat_b32(SV_VL8), mins_and_scales_sve_1, 4)); - - svint32_t q8sums_sv_2 = svld1sh_s32(svptrue_pat_b32(SV_VL8), y[i].bsums+8); - - svfloat32_t temp = svcvt_f32_s32_x(svptrue_pat_b32(SV_VL8), svadd_s32_x(svptrue_pat_b32(SV_VL8), svmul_s32_x(svptrue_pat_b32(SV_VL8), mins_sv_1, q8sums_sv_1), svmul_s32_x(svptrue_pat_b32(SV_VL8), mins_sv_2, q8sums_sv_2))); - - acc_sum = svmla_f32_m(svptrue_pat_b32(SV_VL8), acc_sum, temp, dmin_broad); - - svint32_t sumi1 = svdup_n_s32(0); - - { - const svuint8_t q2bits_1 = svld1_u8(svptrue_pat_b8(SV_VL32), q2); - svint8_t q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), q2bits_1, m3s)); - svint8_t q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; - - svint32_t scale_1 = svsel(pred_s32, svdup_lane_s32(scales_sv, 0), svdup_lane_s32(scales_sv, 1)); - sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_1); - - q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_1, 2), m3s)); - q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; - - svint32_t scale_2 = svsel(pred_s32, svdup_lane_s32(scales_sv, 2), svdup_lane_s32(scales_sv, 3)); - sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(svdup_n_s32(0), q2bytes_sv, q8bytes_sv), scale_2); - - q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_1, 4), m3s)); - q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; - - scale_1 = svsel(pred_s32, svdup_lane_s32(scales_sv, 4), svdup_lane_s32(scales_sv, 5)); - sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_1); - - q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_1, 6), m3s)); - q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; - - scale_2 = svsel(pred_s32, svdup_lane_s32(scales_sv, 6), svdup_lane_s32(scales_sv, 7)); - sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_2); - - q2 += 32; - - const svuint8_t q2bits_2 = svld1_u8(svptrue_pat_b8(SV_VL32), q2); - q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), q2bits_2, m3s)); - q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; - - scale_1 = svsel(pred_s32, svdup_lane_s32(scales_sv_1, 0), svdup_lane_s32(scales_sv_1, 1)); - sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_1); - - q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_2, 2), 
m3s)); - q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; - - scale_2 = svsel(pred_s32, svdup_lane_s32(scales_sv_1, 2), svdup_lane_s32(scales_sv_1, 3)); - sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_2); - - q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_2, 4), m3s)); - q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; - - scale_1 = svsel(pred_s32, svdup_lane_s32(scales_sv_1, 4), svdup_lane_s32(scales_sv_1, 5)); - sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_1); - - q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_2, 6), m3s)); - q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; - - scale_2 = svsel(pred_s32, svdup_lane_s32(scales_sv_1, 6), svdup_lane_s32(scales_sv_1, 7)); - sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_2); - } - acc_sum = svmla_f32_m(svptrue_pat_b32(SV_VL8), acc_sum, svcvt_f32_s32_x(svptrue_pat_b32(SV_VL8), sumi1), d_broad); - } - *s = svaddv_f32(svptrue_pat_b32(SV_VL8), acc_sum); - break; - - default: - assert(false && "Unsupported vector length"); - break; - } - -#elif __ARM_NEON - const uint8x16_t m3 = vdupq_n_u8(0x3); - const uint8x16_t m4 = vdupq_n_u8(0xF); - - const int32x4_t vzero = vdupq_n_s32(0); - - ggml_int8x16x2_t q2bytes; - uint8_t aux[16]; - - float sum = 0; - - for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - - const uint8_t * GGML_RESTRICT q2 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - const uint8_t * GGML_RESTRICT sc = x[i].scales; - - const uint8x16_t mins_and_scales = vld1q_u8(sc); - const uint8x16_t scales = vandq_u8(mins_and_scales, m4); - vst1q_u8(aux, scales); - - const uint8x16_t mins = vshrq_n_u8(mins_and_scales, 4); - const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums); - const ggml_int16x8x2_t mins16 = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))}}; - const int32x4_t s0 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[0]), vget_low_s16 (q8sums.val[0])), - vmull_s16(vget_high_s16(mins16.val[0]), vget_high_s16(q8sums.val[0]))); - const int32x4_t s1 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[1]), vget_low_s16 (q8sums.val[1])), - vmull_s16(vget_high_s16(mins16.val[1]), vget_high_s16(q8sums.val[1]))); - sum += dmin * vaddvq_s32(vaddq_s32(s0, s1)); - - int isum = 0; - int is = 0; - -// We use this macro instead of a function call because for some reason -// the code runs 2-3% slower, even if the function is declared inline -#define MULTIPLY_ACCUM_WITH_SCALE(index)\ - isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\ - isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)]; - -#define SHIFT_MULTIPLY_ACCUM_WITH_SCALE(shift, index)\ - q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;\ - q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[0], (shift)), m3));\ - q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[1], (shift)), m3));\ - MULTIPLY_ACCUM_WITH_SCALE((index)); - - for (int j = 0; j < QK_K/128; ++j) { - const ggml_uint8x16x2_t q2bits = ggml_vld1q_u8_x2(q2); q2 += 32; - - ggml_int8x16x2_t q8bytes = 
ggml_vld1q_s8_x2(q8); q8 += 32; - q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[0], m3)); - q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[1], m3)); - - MULTIPLY_ACCUM_WITH_SCALE(0); - - SHIFT_MULTIPLY_ACCUM_WITH_SCALE(2, 2); - SHIFT_MULTIPLY_ACCUM_WITH_SCALE(4, 4); - SHIFT_MULTIPLY_ACCUM_WITH_SCALE(6, 6); - - is += 8; - } - - sum += d * isum; - } - - *s = sum; - -#elif defined __AVX2__ - - const __m256i m3 = _mm256_set1_epi8(3); - const __m128i m4 = _mm_set1_epi8(0xF); - - __m256 acc = _mm256_setzero_ps(); - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - - const uint8_t * GGML_RESTRICT q2 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales); - const __m128i scales8 = _mm_and_si128(mins_and_scales, m4); - const __m128i mins8 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4); - const __m256i mins = _mm256_cvtepi8_epi16(mins8); - const __m256i prod = _mm256_madd_epi16(mins, _mm256_loadu_si256((const __m256i*)y[i].bsums)); - - acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(prod), acc); - - const __m256i all_scales = _mm256_cvtepi8_epi16(scales8); - const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0); - const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1); - const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)}; - - __m256i sumi = _mm256_setzero_si256(); - - for (int j = 0; j < QK_K/128; ++j) { - - const __m256i q2bits = _mm256_loadu_si256((const __m256i*)q2); q2 += 32; - - const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - - const __m256i q2_0 = _mm256_and_si256(q2bits, m3); - const __m256i q2_1 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 2), m3); - const __m256i q2_2 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 4), m3); - const __m256i q2_3 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 6), m3); - - __m256i p0 = _mm256_maddubs_epi16(q2_0, q8_0); - __m256i p1 = _mm256_maddubs_epi16(q2_1, q8_1); - __m256i p2 = _mm256_maddubs_epi16(q2_2, q8_2); - __m256i p3 = _mm256_maddubs_epi16(q2_3, q8_3); - - p0 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(0)), p0); - p1 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(1)), p1); - p2 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(2)), p2); - p3 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(3)), p3); - - p0 = _mm256_add_epi32(p0, p1); - p2 = _mm256_add_epi32(p2, p3); - - sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p0, p2)); - } - - acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc); - - } - - *s = hsum_float_8(acc); - -#elif defined __AVX__ - - const __m128i m3 = _mm_set1_epi8(0x3); - const __m128i m4 = _mm_set1_epi8(0xF); - const __m128i m2 = _mm_set1_epi8(0x2); - - __m256 acc = _mm256_setzero_ps(); - - for (int i = 0; i < nb; ++i) { - - const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - - const uint8_t * GGML_RESTRICT q2 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - // load mins and scales from 
block_q2_K.scales[QK_K/16] - const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales); - const __m128i scales16 = _mm_and_si128(mins_and_scales, m4); - const __m128i mins16 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4); - const __m128i mins_0 = _mm_cvtepi8_epi16(mins16); - const __m128i mins_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(mins16, mins16)); - - // summs = y[i].bsums * (x[i].scales >> 4) in 16bits*8*2 to 32bits*4*2 - const __m128i summs_0 = _mm_madd_epi16(mins_0, _mm_loadu_si128((const __m128i*)&y[i].bsums[0])); - const __m128i summs_1 = _mm_madd_epi16(mins_1, _mm_loadu_si128((const __m128i*)&y[i].bsums[8])); - - // sumf += -dmin * summs in 32bits*8 - acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(MM256_SET_M128I(summs_1, summs_0))), acc); - - const __m128i scales_0 = _mm_cvtepi8_epi16(scales16); - const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales16, scales16)); - const __m128i scales[2] = { scales_0, scales_1 }; - - __m128i sumi_0 = _mm_setzero_si128(); - __m128i sumi_1 = _mm_setzero_si128(); - - for (int j = 0; j < QK_K/128; ++j) { - - // load Q8 quants int8*16*8 from block_q8_K.qs[QK_K] - const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - - // load 2bits*16*8 from block_q2_K.qs[QK_K/4] - __m128i q2bits = _mm_loadu_si128((const __m128i*)q2); q2 += 16; - const __m128i q2_0 = _mm_and_si128(q2bits, m3); - const __m128i q2_2 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3); - const __m128i q2_4 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3); - const __m128i q2_6 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3); - q2bits = _mm_loadu_si128((const __m128i*)q2); q2 += 16; - const __m128i q2_1 = _mm_and_si128(q2bits, m3); - const __m128i q2_3 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3); - const __m128i q2_5 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3); - const __m128i q2_7 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3); - - // isuml = q8[l] * ((q2[l] >> shift) & 3) in 8bits*16*8 to 16bits*8*8 - __m128i p0 = _mm_maddubs_epi16(q2_0, q8_0); - __m128i p1 = _mm_maddubs_epi16(q2_1, q8_1); - __m128i p2 = _mm_maddubs_epi16(q2_2, q8_2); - __m128i p3 = _mm_maddubs_epi16(q2_3, q8_3); - __m128i p4 = _mm_maddubs_epi16(q2_4, q8_4); - __m128i p5 = _mm_maddubs_epi16(q2_5, q8_5); - __m128i p6 = _mm_maddubs_epi16(q2_6, q8_6); - __m128i p7 = _mm_maddubs_epi16(q2_7, q8_7); - - // isum += (x[i].scales[is++] & 0xF) * isuml in 16bits*8*8 to 32bits*4*8 - __m128i shuffle = _mm_set1_epi16(0x0100); - p0 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p0); - shuffle = _mm_add_epi16(shuffle, m2); - p1 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p1); - shuffle = _mm_add_epi16(shuffle, m2); - p2 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p2); - shuffle = _mm_add_epi16(shuffle, m2); - p3 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p3); - shuffle = _mm_add_epi16(shuffle, m2); - p4 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p4); - shuffle = _mm_add_epi16(shuffle, m2); - p5 = 
_mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p5); - shuffle = _mm_add_epi16(shuffle, m2); - p6 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p6); - shuffle = _mm_add_epi16(shuffle, m2); - p7 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p7); - - p0 = _mm_add_epi32(p0, p1); - p2 = _mm_add_epi32(p2, p3); - p4 = _mm_add_epi32(p4, p5); - p6 = _mm_add_epi32(p6, p7); - - // isum in 32bits*4*2 - sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p0, p2)); - sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p4, p6)); - } - - // sumf += dall * isum - dmin * summs in 32bits - __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); - acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dall), _mm256_cvtepi32_ps(sumi)), acc); - } - - *s = hsum_float_8(acc); - -#elif defined __wasm_simd128__ - float sumf = 0; - - for (int i = 0; i < nb; ++i) { - const uint8_t * q2 = x[i].qs; - const int8_t * q8 = y[i].qs; - const uint8_t * sc = x[i].scales; - - // Vectorized summs calculation - v128_t summs_vec = wasm_i32x4_splat(0); - { - v128_t sc_vec = wasm_v128_load(sc); - v128_t sc_upper = wasm_u8x16_shr(sc_vec, 4); - - v128_t sc_low = wasm_u16x8_extend_low_u8x16(sc_upper); - v128_t sc_high = wasm_u16x8_extend_high_u8x16(sc_upper); - - v128_t bsums1 = wasm_v128_load(&y[i].bsums[0]); - v128_t bsums2 = wasm_v128_load(&y[i].bsums[8]); - - summs_vec = wasm_i32x4_add( - wasm_i32x4_add(wasm_i32x4_dot_i16x8(sc_low, bsums1), - wasm_i32x4_dot_i16x8(sc_high, bsums2)), - summs_vec - ); - - summs_vec = wasm_i32x4_add(summs_vec, wasm_i32x4_shuffle(summs_vec, summs_vec, 2, 3, 0, 1)); - summs_vec = wasm_i32x4_add(summs_vec, wasm_i32x4_shuffle(summs_vec, summs_vec, 1, 0, 3, 2)); - } - int32_t summs = wasm_i32x4_extract_lane(summs_vec, 0); - - // Vectorized isum calculation - int32_t isum = 0; - const uint8_t * sc_ptr = sc; - const int k_iters = QK_K/128; - - for (int k = 0; k < k_iters; ++k) { - v128_t isum_vec = wasm_i32x4_splat(0); - int shift = 0; - - for (int j = 0; j < 4; ++j) { - const int d0 = (sc_ptr[0] & 0xF); - const int d1 = (sc_ptr[1] & 0xF); - sc_ptr += 2; - - // Process first 16 elements - v128_t q2_0 = wasm_v128_load(q2); - v128_t q8_0 = wasm_v128_load(q8); - v128_t q2_shift_0 = wasm_u8x16_shr(q2_0, shift); - v128_t q2_bits_0 = wasm_v128_and(q2_shift_0, wasm_i8x16_splat(0x03)); - - // Process next 16 elements - v128_t q2_1 = wasm_v128_load(q2 + 16); - v128_t q8_1 = wasm_v128_load(q8 + 16); - v128_t q2_shift_1 = wasm_u8x16_shr(q2_1, shift); - v128_t q2_bits_1 = wasm_v128_and(q2_shift_1, wasm_i8x16_splat(0x03)); - - // Calculate dot products - v128_t p0 = wasm_i32x4_dot_i16x8( - wasm_i16x8_extend_low_i8x16(q8_0), - wasm_i16x8_extend_low_i8x16(q2_bits_0) - ); - v128_t p1 = wasm_i32x4_dot_i16x8( - wasm_i16x8_extend_high_i8x16(q8_0), - wasm_i16x8_extend_high_i8x16(q2_bits_0) - ); - v128_t p2 = wasm_i32x4_dot_i16x8( - wasm_i16x8_extend_low_i8x16(q8_1), - wasm_i16x8_extend_low_i8x16(q2_bits_1) - ); - v128_t p3 = wasm_i32x4_dot_i16x8( - wasm_i16x8_extend_high_i8x16(q8_1), - wasm_i16x8_extend_high_i8x16(q2_bits_1) - ); - - // Accumulate scaled results - v128_t scaled = wasm_i32x4_add( - wasm_i32x4_mul(wasm_i32x4_add(p0, p1), wasm_i32x4_splat(d0)), - wasm_i32x4_mul(wasm_i32x4_add(p2, p3), wasm_i32x4_splat(d1)) - ); - - isum_vec = wasm_i32x4_add(isum_vec, scaled); - q8 += 32; - shift += 2; - } - q2 += 32; - - // Horizontal sum of isum_vec - isum_vec = wasm_i32x4_add(isum_vec, wasm_i32x4_shuffle(isum_vec, isum_vec, 2, 3, 0, 1)); - isum_vec = wasm_i32x4_add(isum_vec, wasm_i32x4_shuffle(isum_vec, isum_vec, 1, 0, 3, 2)); 
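// The wasm_i32x4_shuffle/add pairs above implement a two-step butterfly
// reduction: the first shuffle swaps the two 64-bit halves, the second swaps
// adjacent 32-bit lanes, so after both adds every lane holds the full sum and
// wasm_i32x4_extract_lane(..., 0) reads it out. Scalar equivalent of the same
// reduction (sketch only, hypothetical helper):
static inline int32_t hsum_i32x4_scalar(const int32_t v[4]) {
    // step 1 gives {v0+v2, v1+v3, v2+v0, v3+v1};
    // step 2 leaves v0+v1+v2+v3 in every lane
    return (v[0] + v[2]) + (v[1] + v[3]);
}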
- isum += wasm_i32x4_extract_lane(isum_vec, 0); - } - - const float dall = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; - sumf += dall * isum - dmin * summs; - } - - *s = sumf; - -#elif defined __riscv_xtheadvector - - float sumf = 0; - uint8_t atmp[16]; - - for (int i = 0; i < nb; ++i) { - const uint8_t * q2 = x[i].qs; - const int8_t * q8 = y[i].qs; - const uint8_t * sc = x[i].scales; - const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - uint8_t *patmp = atmp; - int vsums; - int tmp; - __asm__ __volatile__( - "th.vsetvli zero, %[vl16], e8, m1\n\t" - "th.vmv.v.x v8, zero\n\t" - "th.vlb.v v1, (%[sc])\n\t" - "th.vand.vi v0, v1, 0xF\n\t" - "th.vsrl.vi v1, v1, 4\n\t" - "th.vsb.v v0, (%[scale])\n\t" - "th.vwaddu.vx v16, v1, zero\n\t" - "th.vsetvli zero, %[vl16], e16, m2\n\t" - "th.vlh.v v2, (%[bsums])\n\t" - "th.vwmul.vv v4, v16, v2\n\t" - "th.vsetvli zero, %[vl16], e32, m4\n\t" - "th.vredsum.vs v8, v4, v8\n\t" - "th.vmv.x.s %[vsums], v8" - : [tmp] "=&r" (tmp), [vsums] "=&r" (vsums) - : [sc] "r" (sc), [scale] "r" (atmp), [bsums] "r" (y[i].bsums) - , [vl16] "r" (16) - : "memory" - , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" - , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" - , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); - sumf += dmin * vsums; - int isum = 0; - - for (int j = 0; j < QK_K/128; ++j) { - __asm__ __volatile__( - "th.vsetvli zero, %[vl32], e8, m2\n\t" - "th.vlb.v v0, (%[q2])\n\t" - "th.vsrl.vi v2, v0, 2\n\t" - "th.vsrl.vi v4, v0, 4\n\t" - "th.vsrl.vi v6, v0, 6\n\t" - "th.vand.vi v0, v0, 0x3\n\t" - "th.vand.vi v2, v2, 0x3\n\t" - "th.vand.vi v4, v4, 0x3\n\t" - "th.vsetvli zero, %[vl128], e8, m8\n\t" - "th.vlb.v v8, (%[q8])\n\t" - "th.vsetvli zero, %[vl64], e8, m4\n\t" - "th.vwmul.vv v16, v0, v8\n\t" - "th.vwmul.vv v24, v4, v12\n\t" - "th.vsetvli zero, %[vl16], e16, m2\n\t" - "th.vmv.v.x v0, zero\n\t" - "th.vwredsum.vs v10, v16, v0\n\t" - "th.vwredsum.vs v9, v18, v0\n\t" - "th.vwredsum.vs v8, v20, v0\n\t" - "th.vwredsum.vs v7, v22, v0\n\t" - "th.vwredsum.vs v11, v24, v0\n\t" - "th.vwredsum.vs v12, v26, v0\n\t" - "th.vwredsum.vs v13, v28, v0\n\t" - "th.vwredsum.vs v14, v30, v0\n\t" - "li %[tmp], 4\n\t" - "th.vsetvli zero, %[tmp], e32, m1\n\t" - "th.vslideup.vi v10, v9, 1\n\t" - "th.vslideup.vi v8, v7, 1\n\t" - "th.vslideup.vi v11, v12, 1\n\t" - "th.vslideup.vi v13, v14, 1\n\t" - "th.vslideup.vi v10, v8, 2\n\t" - "th.vslideup.vi v11, v13, 2\n\t" - "li %[tmp], 8\n\t" - "th.vsetvli zero, %[tmp], e32, m2\n\t" - "th.vlbu.v v12, (%[scale])\n\t" - "th.vmul.vv v10, v10, v12\n\t" - "th.vredsum.vs v0, v10, v0\n\t" - "th.vmv.x.s %[tmp], v0\n\t" - "add %[isum], %[isum], %[tmp]" - : [tmp] "=&r" (tmp), [isum] "+&r" (isum) - : [q2] "r" (q2), [scale] "r" (patmp), [q8] "r" (q8) - , [vl16] "r" (16), [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128) - : "memory" - , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" - , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" - , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); - q2 += 32; q8 += 128; patmp += 8; - } - - sumf += dall * isum; - } - - *s = sumf; - -#elif defined __riscv_v - - float sumf = 0; - uint8_t atmp[16]; - - const int vector_length = __riscv_vlenb() * 8; - uint8_t temp_01[32] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1 }; - - switch (vector_length) { - case 256: - for (int i = 0; i < nb; ++i) { - const uint8_t * q2 = x[i].qs; - const int8_t * q8 = y[i].qs; - const uint8_t * sc = x[i].scales; - - const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - - size_t vl = 16; - - vuint8m1_t scales = __riscv_vle8_v_u8m1(sc, vl); - vuint8m1_t aux = __riscv_vand_vx_u8m1(scales, 0x0F, vl); - - vint16m1_t q8sums = __riscv_vle16_v_i16m1(y[i].bsums, vl); - - vuint8mf2_t scales_2 = __riscv_vle8_v_u8mf2(sc, vl); - vuint8mf2_t mins8 = __riscv_vsrl_vx_u8mf2(scales_2, 0x4, vl); - vint16m1_t mins = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(mins8, vl)); - vint32m2_t prod = __riscv_vwmul_vv_i32m2(q8sums, mins, vl); - vint32m1_t vsums = __riscv_vredsum_vs_i32m2_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl); - - sumf += dmin * __riscv_vmv_x_s_i32m1_i32(vsums); - - vl = 32; - - vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); - vuint8m1_t v_b = __riscv_vle8_v_u8m1(temp_01, vl); - - uint8_t is = 0; - int isum = 0; - - for (int j = 0; j < QK_K / 128; ++j) { - // load Q2 - vuint8m1_t q2_x = __riscv_vle8_v_u8m1(q2, vl); - - vuint8m1_t q2_0 = __riscv_vand_vx_u8m1(q2_x, 0x03, vl); - vuint8m1_t q2_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x2, vl), 0x03, vl); - vuint8m1_t q2_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x4, vl), 0x03, vl); - vuint8m1_t q2_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x6, vl), 0x03, vl); - - // duplicate scale elements for product - vuint8m1_t sc0 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 0 + is, vl), vl); - vuint8m1_t sc1 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 2 + is, vl), vl); - vuint8m1_t sc2 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 4 + is, vl), vl); - vuint8m1_t sc3 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 6 + is, vl), vl); - - vint16m2_t p0 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_0, sc0, vl)); - vint16m2_t p1 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_1, sc1, vl)); - vint16m2_t p2 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_2, sc2, vl)); - vint16m2_t p3 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_3, sc3, vl)); - - // load Q8 - vint8m1_t q8_0 = __riscv_vle8_v_i8m1(q8, vl); - vint8m1_t q8_1 = __riscv_vle8_v_i8m1(q8 + 32, vl); - vint8m1_t q8_2 = __riscv_vle8_v_i8m1(q8 + 64, vl); - vint8m1_t q8_3 = __riscv_vle8_v_i8m1(q8 + 96, vl); - - vint32m4_t s0 = __riscv_vwmul_vv_i32m4(p0, __riscv_vwcvt_x_x_v_i16m2(q8_0, vl), vl); - vint32m4_t s1 = __riscv_vwmul_vv_i32m4(p1, __riscv_vwcvt_x_x_v_i16m2(q8_1, vl), vl); - vint32m4_t s2 = __riscv_vwmul_vv_i32m4(p2, __riscv_vwcvt_x_x_v_i16m2(q8_2, vl), vl); - vint32m4_t s3 = __riscv_vwmul_vv_i32m4(p3, __riscv_vwcvt_x_x_v_i16m2(q8_3, vl), vl); - - vint32m1_t isum0 = __riscv_vredsum_vs_i32m4_i32m1(__riscv_vadd_vv_i32m4(s0, s1, vl), vzero, vl); - vint32m1_t isum1 = __riscv_vredsum_vs_i32m4_i32m1(__riscv_vadd_vv_i32m4(s2, s3, vl), isum0, vl); - - isum += __riscv_vmv_x_s_i32m1_i32(isum1); - - q2 += 32; - q8 += 128; - is = 8; - } - - sumf += dall * isum; - } - break; - case 128: - for (int i = 0; i < nb; ++i) { - const uint8_t * q2 = x[i].qs; - const int8_t * q8 = y[i].qs; - const uint8_t * sc = x[i].scales; - const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - uint8_t *patmp = atmp; - int vsums; - int tmp; - __asm__ __volatile__( - "vsetivli zero, 16, e8, 
m1\n\t" - "vmv.v.x v8, zero\n\t" - "vle8.v v1, (%[sc])\n\t" - "vand.vi v0, v1, 0xF\n\t" - "vsrl.vi v1, v1, 4\n\t" - "vse8.v v0, (%[scale])\n\t" - "vsetivli zero, 16, e16, m2\n\t" - "vle16.v v2, (%[bsums])\n\t" - "vzext.vf2 v0, v1\n\t" - "vwmul.vv v4, v0, v2\n\t" - "vsetivli zero, 16, e32, m4\n\t" - "vredsum.vs v8, v4, v8\n\t" - "vmv.x.s %[vsums], v8" - : [tmp] "=&r" (tmp), [vsums] "=&r" (vsums) - : [sc] "r" (sc), [scale] "r" (atmp), [bsums] "r" (y[i].bsums) - : "memory" - , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" - , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" - , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); - sumf += dmin * vsums; - int isum = 0; - - for (int j = 0; j < QK_K/128; ++j) { - __asm__ __volatile__( - "vsetvli zero, %[vl32], e8, m2\n\t" - "vle8.v v0, (%[q2])\n\t" - "vsrl.vi v2, v0, 2\n\t" - "vsrl.vi v4, v0, 4\n\t" - "vsrl.vi v6, v0, 6\n\t" - "vand.vi v0, v0, 0x3\n\t" - "vand.vi v2, v2, 0x3\n\t" - "vand.vi v4, v4, 0x3\n\t" - "vsetvli zero, %[vl128], e8, m8\n\t" - "vle8.v v8, (%[q8])\n\t" - "vsetvli zero, %[vl64], e8, m4\n\t" - "vwmul.vv v16, v0, v8\n\t" - "vwmul.vv v24, v4, v12\n\t" - "vsetivli zero, 16, e16, m2\n\t" - "vmv.v.x v0, zero\n\t" - "vwredsum.vs v10, v16, v0\n\t" - "vwredsum.vs v9, v18, v0\n\t" - "vwredsum.vs v8, v20, v0\n\t" - "vwredsum.vs v7, v22, v0\n\t" - "vwredsum.vs v11, v24, v0\n\t" - "vwredsum.vs v12, v26, v0\n\t" - "vwredsum.vs v13, v28, v0\n\t" - "vwredsum.vs v14, v30, v0\n\t" - "vsetivli zero, 4, e32, m1\n\t" - "vslideup.vi v10, v9, 1\n\t" - "vslideup.vi v8, v7, 1\n\t" - "vslideup.vi v11, v12, 1\n\t" - "vslideup.vi v13, v14, 1\n\t" - "vslideup.vi v10, v8, 2\n\t" - "vslideup.vi v11, v13, 2\n\t" - "vsetivli zero, 8, e32, m2\n\t" - "vle8.v v15, (%[scale])\n\t" - "vzext.vf4 v12, v15\n\t" - "vmul.vv v10, v10, v12\n\t" - "vredsum.vs v0, v10, v0\n\t" - "vmv.x.s %[tmp], v0\n\t" - "add %[isum], %[isum], %[tmp]" - : [tmp] "=&r" (tmp), [isum] "+&r" (isum) - : [q2] "r" (q2), [scale] "r" (patmp), [q8] "r" (q8) - , [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128) - : "memory" - , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" - , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" - , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); - q2 += 32; q8 += 128; patmp += 8; - } - - sumf += dall * isum; - } - break; - default: - assert(false && "Unsupported vector length"); - break; - } - - *s = sumf; - -#elif defined(__POWER9_VECTOR__) - const vector signed char lowMask = vec_splats((signed char)0x3); - const vector signed char lowScaleMask = vec_splats((signed char)0xF); - const vector int v0 = vec_splats((int32_t)0); - const vector unsigned char v2 = vec_splats((unsigned char)0x2); - const vector unsigned char v6 = vec_splats((unsigned char)0x6); - const vector unsigned char v4 = vec_splats((unsigned char)0x4); - - vector float vsumf0 = vec_splats(0.0f); - vector float vsumf1 = vec_splats(0.0f); - vector float vsumf2 = vec_splats(0.0f); - vector float vsumf3 = vec_splats(0.0f); - - for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); - vector float vyd = vec_splats(y[i].d); - vector float vd = vec_mul(vxd, vyd); - - vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin)); - vector float vdmin = vec_mul(vxmin, vyd); - - vector signed short q8ysums0 = vec_xl( 0, y[i].bsums); - vector signed short q8ysums1 = vec_xl(16, y[i].bsums); - - vector signed char q2xmins = (vector 
signed char)vec_xl( 0, x[i].scales); - vector signed char vscales = vec_and(q2xmins, lowScaleMask); - - q2xmins = vec_sr(q2xmins, v4); - vector signed short q2xmins0 = vec_unpackh(q2xmins); - vector signed short q2xmins1 = vec_unpackl(q2xmins); - - vector signed int prod0 = vec_mule(q2xmins0, q8ysums0); - vector signed int prod1 = vec_mulo(q2xmins0, q8ysums0); - vector signed int prod2 = vec_mule(q2xmins1, q8ysums1); - vector signed int prod3 = vec_mulo(q2xmins1, q8ysums1); - - vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0); - vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1); - vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2); - vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3); - - vector signed int vsumi0 = v0; - vector signed int vsumi1 = v0; - vector signed int vsumi2 = v0; - vector signed int vsumi3 = v0; - vector signed int vsumi4 = v0; - vector signed int vsumi5 = v0; - vector signed int vsumi6 = v0; - vector signed int vsumi7 = v0; - - const uint8_t * GGML_RESTRICT q2 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - for (int j = 0; j < QK_K/128; ++j) { - __builtin_prefetch(q2, 0, 1); - __builtin_prefetch(q8, 0, 1); - - vector signed char qxs0 = (vector signed char)vec_xl( 0, q2); - vector signed char qxs1 = (vector signed char)vec_xl(16, q2); - q2 += 32; - - vector unsigned char q2x00 = (vector unsigned char)vec_and(qxs0, lowMask); - vector unsigned char q2x01 = (vector unsigned char)vec_and(vec_sr(qxs0, v2), lowMask); - vector unsigned char q2x02 = (vector unsigned char)vec_and(vec_sr(qxs0, v4), lowMask); - vector unsigned char q2x03 = (vector unsigned char)vec_and(vec_sr(qxs0, v6), lowMask); - vector unsigned char q2x10 = (vector unsigned char)vec_and(qxs1, lowMask); - vector unsigned char q2x11 = (vector unsigned char)vec_and(vec_sr(qxs1, v2), lowMask); - vector unsigned char q2x12 = (vector unsigned char)vec_and(vec_sr(qxs1, v4), lowMask); - vector unsigned char q2x13 = (vector unsigned char)vec_and(vec_sr(qxs1, v6), lowMask); - - vector signed char q8y00 = vec_xl( 0, q8); - vector signed char q8y10 = vec_xl( 16, q8); - vector signed char q8y01 = vec_xl( 32, q8); - vector signed char q8y11 = vec_xl( 48, q8); - vector signed char q8y02 = vec_xl( 64, q8); - vector signed char q8y12 = vec_xl( 80, q8); - vector signed char q8y03 = vec_xl( 96, q8); - vector signed char q8y13 = vec_xl(112, q8); - q8 += 128; - - vector signed int qv0 = vec_msum(q8y00, q2x00, v0); - vector signed int qv1 = vec_msum(q8y01, q2x01, v0); - vector signed int qv2 = vec_msum(q8y02, q2x02, v0); - vector signed int qv3 = vec_msum(q8y03, q2x03, v0); - vector signed int qv4 = vec_msum(q8y10, q2x10, v0); - vector signed int qv5 = vec_msum(q8y11, q2x11, v0); - vector signed int qv6 = vec_msum(q8y12, q2x12, v0); - vector signed int qv7 = vec_msum(q8y13, q2x13, v0); - - vector signed short vscales_07 = vec_unpackh(vscales); - vector signed int vscales_03 = vec_unpackh(vscales_07); - vector signed int vscales_47 = vec_unpackl(vscales_07); - vector signed int vs0 = vec_splat(vscales_03, 0); - vector signed int vs1 = vec_splat(vscales_03, 1); - vector signed int vs2 = vec_splat(vscales_03, 2); - vector signed int vs3 = vec_splat(vscales_03, 3); - vector signed int vs4 = vec_splat(vscales_47, 0); - vector signed int vs5 = vec_splat(vscales_47, 1); - vector signed int vs6 = vec_splat(vscales_47, 2); - vector signed int vs7 = vec_splat(vscales_47, 3); - vscales = vec_sld(vscales, vscales, 8); - - vsumi0 = vec_add(vec_mul(qv0, vs0), vsumi0); - vsumi1 = vec_add(vec_mul(qv1, vs2), vsumi1); - vsumi2 
= vec_add(vec_mul(qv2, vs4), vsumi2); - vsumi3 = vec_add(vec_mul(qv3, vs6), vsumi3); - vsumi4 = vec_add(vec_mul(qv4, vs1), vsumi4); - vsumi5 = vec_add(vec_mul(qv5, vs3), vsumi5); - vsumi6 = vec_add(vec_mul(qv6, vs5), vsumi6); - vsumi7 = vec_add(vec_mul(qv7, vs7), vsumi7); - } - - vsumi0 = vec_add(vsumi0, vsumi4); - vsumi1 = vec_add(vsumi1, vsumi5); - vsumi2 = vec_add(vsumi2, vsumi6); - vsumi3 = vec_add(vsumi3, vsumi7); - - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); - vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); - vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); - vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); - } - - vsumf0 = vec_add(vsumf0, vsumf2); - vsumf1 = vec_add(vsumf1, vsumf3); - - vsumf0 = vec_add(vsumf0, vsumf1); - - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - - *s = vec_extract(vsumf0, 0); - -#elif defined __loongarch_asx - - __m256 acc = (__m256)__lasx_xvldi(0); - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - - const uint8_t * GGML_RESTRICT q2 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - const __m128i mins_and_scales128 = __lsx_vld((const __m128i*)x[i].scales, 0); - const __m128i scales128 = __lsx_vandi_b(mins_and_scales128, 0xf); - const __m256i mins = lasx_ext8_16(__lsx_vsrli_b(mins_and_scales128, 4)); - const __m256i prod = lasx_madd_h(mins, __lasx_xvld((const __m256i*)y[i].bsums, 0)); - - acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(dmin), __lasx_xvffint_s_w(prod), acc); - - const v16i8 shuffle_mask = {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15}; - const __m256i scales_shuffled = lasx_ext8_16(__lsx_vshuf_b(scales128, scales128, (__m128i)shuffle_mask)); - - __m256i sumi = __lasx_xvldi(0); - - for (int j = 0; j < QK_K/128; ++j) { - - const __m256i q2bits = __lasx_xvld((const __m256i*)q2, 0); q2 += 32; - - const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - const __m256i q8_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - const __m256i q8_3 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - - const __m256i q2_0 = __lasx_xvandi_b(q2bits, 3); - const __m256i q2_1 = __lasx_xvandi_b(__lasx_xvsrli_b(q2bits, 2), 3); - const __m256i q2_2 = __lasx_xvandi_b(__lasx_xvsrli_b(q2bits, 4), 3); - const __m256i q2_3 = __lasx_xvsrli_b(q2bits, 6); - - __m256i p0 = lasx_madd_h_b(q2_0, q8_0); - __m256i p1 = lasx_madd_h_b(q2_1, q8_1); - __m256i p2 = lasx_madd_h_b(q2_2, q8_2); - __m256i p3 = lasx_madd_h_b(q2_3, q8_3); - - p0 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 0), p0); - p1 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 1), p1); - p2 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 2), p2); - p3 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 3), p3); - - p0 = __lasx_xvadd_w(p0, p1); - p2 = __lasx_xvadd_w(p2, p3); - - sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p0, p2)); - } - - acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), acc); - - } - - *s = hsum_float_8(acc); - -#else - - float sumf = 0; - - for (int i = 0; i < nb; ++i) { - - const uint8_t * q2 = x[i].qs; - const int8_t * q8 = y[i].qs; - const uint8_t * sc = x[i].scales; - - int summs = 0; - for (int j = 0; j < 16; ++j) { - summs += y[i].bsums[j] * (sc[j] >> 4); - } - - const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * 
GGML_FP16_TO_FP32(x[i].dmin); - - int isum = 0; - int is = 0; - int d; - for (int k = 0; k < QK_K/128; ++k) { - int shift = 0; - for (int j = 0; j < 4; ++j) { - d = sc[is++] & 0xF; - int isuml = 0; - for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3); - isum += d * isuml; - d = sc[is++] & 0xF; - isuml = 0; - for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3); - isum += d * isuml; - shift += 2; - q8 += 32; - } - q2 += 32; - } - sumf += dall * isum - dmin * summs; - } - *s = sumf; -#endif -} - -void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - assert(n % QK_K == 0); - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const uint32_t kmask1 = 0x03030303; - const uint32_t kmask2 = 0x0f0f0f0f; - - const block_q3_K * GGML_RESTRICT x = vx; - const block_q8_K * GGML_RESTRICT y = vy; - - const int nb = n / QK_K; - -#if defined(__ARM_FEATURE_SVE) - - uint32_t aux[3]; - uint32_t utmp[4]; - - const int8_t m32 = 32; - const int vector_length = svcntb()*8; - const svuint8_t m3b_sv = svdup_n_u8(0x3); - const svint32_t vzero_sv = svdup_n_s32(0); - - const svuint8_t m0_sv = svdup_n_u8(1); - const svuint8_t m1_sv = svlsl_n_u8_x(svptrue_b8(), m0_sv, 1); - const svuint8_t m2_sv = svlsl_n_u8_x(svptrue_b8(), m0_sv, 2); - const svuint8_t m3_sv = svlsl_n_u8_x(svptrue_b8(), m0_sv, 3); - - float sum = 0; - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - - const uint8_t * GGML_RESTRICT q3_sv = x[i].qs; - const uint8_t * GGML_RESTRICT qh_sv = x[i].hmask; - const int8_t * GGML_RESTRICT q8_sv = y[i].qs; - - // Set up scales - memcpy(aux, x[i].scales, 12); - utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); - utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4); - utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4); - utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4); - - int8_t * scale = (int8_t *)utmp; - - for (int j = 0; j < 16; ++j) scale[j] -= m32; - - switch (vector_length) { - case 128: - { - svuint8_t qhbits_sv_1 = svld1_u8(svptrue_b8(), qh_sv); - svuint8_t qhbits_sv_2 = svld1_u8(svptrue_b8(), qh_sv+16); - svuint8_t q3h_sv; - - svint32_t sumi1_1 = svdup_n_s32(0); - svint8_t q3bytes_sv; - - for (int j = 0; j < QK_K/128; ++j) { - - const svuint8_t q3bits_sv = svld1_u8(svptrue_b8(), q3_sv); q3_sv += 16; - const svuint8_t q3bits_sv_1 = svld1_u8(svptrue_b8(), q3_sv); q3_sv += 16; - svint8_t q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - svint8_t q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - - q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m0_sv, qhbits_sv_1), 2); - q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), q3bits_sv, m3b_sv)), svreinterpret_s8_u8(q3h_sv)); - - sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[0])); - - q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m0_sv, qhbits_sv_2), 2); - q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), q3bits_sv_1, m3b_sv)), svreinterpret_s8_u8(q3h_sv)); - - sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[1])); - - q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); 
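// The aux/utmp bit surgery above expands block_q3_K's 12 packed scale bytes
// into sixteen 6-bit scales and recenters them by -32 so they become signed.
// The same expansion as a standalone sketch (hypothetical helper; the masks
// are kmask2 = 0x0f0f0f0f and kmask1 = 0x03030303 as defined at the top of
// the function):
#include <stdint.h>
#include <string.h>

static void q3k_unpack_scales(const uint8_t sc[12], int8_t scale[16]) {
    uint32_t aux[3], utmp[4];
    memcpy(aux, sc, 12);
    // low 4 bits of each scale come from aux[0..1], high 2 bits from aux[2]
    utmp[0] = (aux[0]      & 0x0f0f0f0f) | (((aux[2] >> 0) & 0x03030303) << 4);
    utmp[1] = (aux[1]      & 0x0f0f0f0f) | (((aux[2] >> 2) & 0x03030303) << 4);
    utmp[2] = ((aux[0] >> 4) & 0x0f0f0f0f) | (((aux[2] >> 4) & 0x03030303) << 4);
    utmp[3] = ((aux[1] >> 4) & 0x0f0f0f0f) | (((aux[2] >> 6) & 0x03030303) << 4);
    memcpy(scale, utmp, 16);
    for (int j = 0; j < 16; ++j) scale[j] -= 32; // 6-bit unsigned -> signed
}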
q8_sv += 16; - - q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m1_sv, qhbits_sv_1), 1); - q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv, 2), m3b_sv)), svreinterpret_s8_u8(q3h_sv)); - - sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[2])); - - q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m1_sv, qhbits_sv_2), 1); - q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv_1, 2), m3b_sv)), svreinterpret_s8_u8(q3h_sv)); - - sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[3])); - - - scale += 4; - q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - - q3h_sv = svbic_u8_x(svptrue_b8(), m2_sv, qhbits_sv_1); - q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv, 4), m3b_sv)), svreinterpret_s8_u8(q3h_sv)); - - sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[0])); - - q3h_sv = svbic_u8_x(svptrue_b8(), m2_sv, qhbits_sv_2); - q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv_1, 4), m3b_sv)), svreinterpret_s8_u8(q3h_sv)); - - sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[1])); - - - q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - - q3h_sv = svlsr_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m3_sv, qhbits_sv_1), 1); - q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv, 6), m3b_sv)), svreinterpret_s8_u8(q3h_sv)); - - sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[2])); - - q3h_sv = svlsr_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m3_sv, qhbits_sv_2), 1); - q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv_1, 6), m3b_sv)), svreinterpret_s8_u8(q3h_sv)); - - sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[3])); - - if (j == 0) { - qhbits_sv_1 = svlsr_n_u8_x(svptrue_b8(), qhbits_sv_1, 4); - qhbits_sv_2 = svlsr_n_u8_x(svptrue_b8(), qhbits_sv_2, 4); - } - - scale += 4; - } - - sum += d * (svaddv_s32(svptrue_b32(), sumi1_1)); - } break; - case 256: - case 512: - { - svuint8_t qhbits_sv = svld1_u8(svptrue_pat_b8(SV_VL32), qh_sv); - svuint8_t q3h_sv; - - svint32_t sumi1_1 = svdup_n_s32(0); - svint8_t q3bytes_sv; - - for (int j = 0; j < QK_K/128; ++j) { - - const svuint8_t q3bits_sv = svld1_u8(svptrue_pat_b8(SV_VL32), q3_sv); q3_sv += 32; - svint8_t q8bytes_1_sv_1 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; - svint8_t q8bytes_1_sv_2 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; - - q3h_sv = svlsl_n_u8_x(svptrue_pat_b8(SV_VL32), svbic_u8_x(svptrue_pat_b8(SV_VL32), m0_sv, qhbits_sv), 2); - q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), q3bits_sv, m3b_sv)), svreinterpret_s8_u8(q3h_sv)); - - - svint32_t scale_1 
= svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[0]), svdup_n_s32((int32_t)scale[1])); - sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), scale_1); - - q3h_sv = svlsl_n_u8_x(svptrue_pat_b8(SV_VL32), svbic_u8_x(svptrue_pat_b8(SV_VL32), m1_sv, qhbits_sv), 1); - q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q3bits_sv, 2), m3b_sv)), svreinterpret_s8_u8(q3h_sv)); - - scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[2]), svdup_n_s32((int32_t)scale[3])); - sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), scale_1); - - scale += 4; - q8bytes_1_sv_1 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; - q8bytes_1_sv_2 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; - - q3h_sv = svbic_u8_x(svptrue_pat_b8(SV_VL32), m2_sv, qhbits_sv); - q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q3bits_sv, 4), m3b_sv)), svreinterpret_s8_u8(q3h_sv)); - - scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[0]), svdup_n_s32((int32_t)scale[1])); - sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), scale_1); - - q3h_sv = svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), svbic_u8_x(svptrue_pat_b8(SV_VL32), m3_sv, qhbits_sv), 1); - q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q3bits_sv, 6), m3b_sv)), svreinterpret_s8_u8(q3h_sv)); - - scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[2]), svdup_n_s32((int32_t)scale[3])); - sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), scale_1); - - if (j == 0) { - qhbits_sv = svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), qhbits_sv, 4); - } - - scale += 4; - } - - sum += d * (svaddv_s32(svptrue_pat_b32(SV_VL8), sumi1_1)); - } break; - default: - assert(false && "Unsupported vector length"); - break; - } - } - *s = sum; - -#elif __ARM_NEON - - uint32_t aux[3]; - uint32_t utmp[4]; - - const uint8x16_t m3b = vdupq_n_u8(0x3); - const int32x4_t vzero = vdupq_n_s32(0); - - const uint8x16_t m0 = vdupq_n_u8(1); - const uint8x16_t m1 = vshlq_n_u8(m0, 1); - const uint8x16_t m2 = vshlq_n_u8(m0, 2); - const uint8x16_t m3 = vshlq_n_u8(m0, 3); - const int8_t m32 = 32; - - ggml_int8x16x4_t q3bytes; - - float sum = 0; - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - - const uint8_t * GGML_RESTRICT q3 = x[i].qs; - const uint8_t * GGML_RESTRICT qh = x[i].hmask; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); - - ggml_uint8x16x4_t q3h; - - int32_t isum = 0; - - // Set up scales - memcpy(aux, x[i].scales, 12); - utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); - utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4); - utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4); - utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4); - - int8_t * scale = (int8_t *)utmp; - for (int j = 0; j < 16; ++j) scale[j] -= m32; - - for (int j = 0; j < QK_K/128; ++j) { - - const ggml_uint8x16x2_t q3bits = ggml_vld1q_u8_x2(q3); q3 += 32; - const ggml_int8x16x4_t q8bytes_1 = ggml_vld1q_s8_x4(q8); q8 += 64; - 
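// In block_q3_K each quant is split across two arrays: its 2 low bits live in
// x[i].qs and its high bit in x[i].hmask, and the logical value is
// (low2 | hbit << 2) - 4. The vbicq_u8/vshlq_n_u8 sequence just below realizes
// the identical result as low2 - 4*(1 - hbit): q3h is 4 exactly when the
// hmask bit is clear, so the 3-bit quant is never materialized. One-lane
// scalar sketch (hypothetical helper, for illustration only):
static inline int8_t q3k_dequant_lane(uint8_t low2, int hbit) {
    // identical to (int8_t)((low2 | hbit << 2) - 4)
    return (int8_t)(low2 - (hbit ? 0 : 4));
}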
const ggml_int8x16x4_t q8bytes_2 = ggml_vld1q_s8_x4(q8); q8 += 64; - - q3h.val[0] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[0]), 2); - q3h.val[1] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[1]), 2); - q3h.val[2] = vshlq_n_u8(vbicq_u8(m1, qhbits.val[0]), 1); - q3h.val[3] = vshlq_n_u8(vbicq_u8(m1, qhbits.val[1]), 1); - - q3bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q3bits.val[0], m3b)), vreinterpretq_s8_u8(q3h.val[0])); - q3bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q3bits.val[1], m3b)), vreinterpretq_s8_u8(q3h.val[1])); - q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 2), m3b)), vreinterpretq_s8_u8(q3h.val[2])); - q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 2), m3b)), vreinterpretq_s8_u8(q3h.val[3])); - - isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0]; - isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1]; - isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2]; - isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3]; - - scale += 4; - - q3h.val[0] = vbicq_u8(m2, qhbits.val[0]); - q3h.val[1] = vbicq_u8(m2, qhbits.val[1]); - q3h.val[2] = vshrq_n_u8(vbicq_u8(m3, qhbits.val[0]), 1); - q3h.val[3] = vshrq_n_u8(vbicq_u8(m3, qhbits.val[1]), 1); - - q3bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 4), m3b)), vreinterpretq_s8_u8(q3h.val[0])); - q3bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 4), m3b)), vreinterpretq_s8_u8(q3h.val[1])); - q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 6), m3b)), vreinterpretq_s8_u8(q3h.val[2])); - q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 6), m3b)), vreinterpretq_s8_u8(q3h.val[3])); - - isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0]; - isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1]; - isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2]; - isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3]; - - scale += 4; - - if (j == 0) { - qhbits.val[0] = vshrq_n_u8(qhbits.val[0], 4); - qhbits.val[1] = vshrq_n_u8(qhbits.val[1], 4); - } - - } - sum += d * isum; - - } - - *s = sum; - -#elif defined __AVX2__ - - const __m256i m3 = _mm256_set1_epi8(3); - const __m256i mone = _mm256_set1_epi8(1); - const __m128i m32 = _mm_set1_epi8(32); - - __m256 acc = _mm256_setzero_ps(); - - uint32_t aux[3]; - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - - const uint8_t * GGML_RESTRICT q3 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - // Set up scales - memcpy(aux, x[i].scales, 12); - __m128i scales128 = _mm_set_epi32( - ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4), - ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4), - (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4), - (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4)); - scales128 = _mm_sub_epi8(scales128, m32); - const __m256i all_scales = _mm256_cvtepi8_epi16(scales128); - const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0); - const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1); - const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)}; - - // high bit - const __m256i hbits = 
_mm256_loadu_si256((const __m256i*)x[i].hmask); - - // integer accumulator - __m256i sumi = _mm256_setzero_si256(); - - int bit = 0; - int is = 0; - - for (int j = 0; j < QK_K/128; ++j) { - // load low 2 bits - const __m256i q3bits = _mm256_loadu_si256((const __m256i*)q3); q3 += 32; - - // prepare low and high bits - const __m256i q3l_0 = _mm256_and_si256(q3bits, m3); - const __m256i q3h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2); - ++bit; - - const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 2), m3); - const __m256i q3h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2); - ++bit; - - const __m256i q3l_2 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 4), m3); - const __m256i q3h_2 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2); - ++bit; - - const __m256i q3l_3 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 6), m3); - const __m256i q3h_3 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2); - ++bit; - - // load Q8 quants - const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - - // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16, - // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set, - // and 2 if the high bit was set) - __m256i q8s_0 = _mm256_maddubs_epi16(q3h_0, q8_0); - __m256i q8s_1 = _mm256_maddubs_epi16(q3h_1, q8_1); - __m256i q8s_2 = _mm256_maddubs_epi16(q3h_2, q8_2); - __m256i q8s_3 = _mm256_maddubs_epi16(q3h_3, q8_3); - - __m256i p16_0 = _mm256_maddubs_epi16(q3l_0, q8_0); - __m256i p16_1 = _mm256_maddubs_epi16(q3l_1, q8_1); - __m256i p16_2 = _mm256_maddubs_epi16(q3l_2, q8_2); - __m256i p16_3 = _mm256_maddubs_epi16(q3l_3, q8_3); - - p16_0 = _mm256_sub_epi16(p16_0, q8s_0); - p16_1 = _mm256_sub_epi16(p16_1, q8s_1); - p16_2 = _mm256_sub_epi16(p16_2, q8s_2); - p16_3 = _mm256_sub_epi16(p16_3, q8s_3); - - // multiply with scales - p16_0 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 0)), p16_0); - p16_1 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 1)), p16_1); - p16_2 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 2)), p16_2); - p16_3 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 3)), p16_3); - - // accumulate - p16_0 = _mm256_add_epi32(p16_0, p16_1); - p16_2 = _mm256_add_epi32(p16_2, p16_3); - sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_2)); - - } - - // multiply with block scale and accumulate - acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc); - - } - - *s = hsum_float_8(acc); - -#elif defined __AVX__ - - const __m128i m3 = _mm_set1_epi8(3); - const __m128i mone = _mm_set1_epi8(1); - const __m128i m32 = _mm_set1_epi8(32); - const __m128i m2 = _mm_set1_epi8(2); - - __m256 acc = _mm256_setzero_ps(); - - const uint32_t *aux; - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - - const uint8_t * GGML_RESTRICT q3 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - // Set up scales - aux 
= (const uint32_t *)x[i].scales; - __m128i scales128 = _mm_set_epi32( - ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4), - ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4), - (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4), - (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4)); - scales128 = _mm_sub_epi8(scales128, m32); - const __m128i scales_0 = _mm_cvtepi8_epi16(scales128); - const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales128, scales128)); - const __m128i scales[2] = { scales_0, scales_1 }; - - // high bit *128*2 from block_q3_K.hmask[QK_K/8] - const __m128i hbits_0 = _mm_loadu_si128((const __m128i*)&x[i].hmask[0]); - const __m128i hbits_1 = _mm_loadu_si128((const __m128i*)&x[i].hmask[16]); - - // integer accumulator - __m128i sumi_0 = _mm_setzero_si128(); - __m128i sumi_1 = _mm_setzero_si128(); - - for (int j = 0; j < QK_K/128; ++j) { - // load low 2 bits *64*2 from block_q3_K.qs[QK_K/4] - const __m128i q3bits_0 = _mm_loadu_si128((const __m128i*)q3); q3 += 16; - const __m128i q3bits_1 = _mm_loadu_si128((const __m128i*)q3); q3 += 16; - - // prepare low and high bits - const int bit = j << 2; - - const __m128i q3l_0 = _mm_and_si128(q3bits_0, m3); - const __m128i q3l_1 = _mm_and_si128(q3bits_1, m3); - const __m128i q3h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit)), bit), 2); - const __m128i q3h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit)), bit), 2); - - const __m128i q3l_2 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 2), m3); - const __m128i q3l_3 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 2), m3); - const __m128i q3h_2 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+1)), bit+1), 2); - const __m128i q3h_3 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+1)), bit+1), 2); - - const __m128i q3l_4 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 4), m3); - const __m128i q3l_5 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 4), m3); - const __m128i q3h_4 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+2)), bit+2), 2); - const __m128i q3h_5 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+2)), bit+2), 2); - - const __m128i q3l_6 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 6), m3); - const __m128i q3l_7 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 6), m3); - const __m128i q3h_6 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+3)), bit+3), 2); - const __m128i q3h_7 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+3)), bit+3), 2); - - // load Q8 quants from block_q8_K.qs[QK_K] - const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - - // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16, - // and then subtract. 
The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set, - // and 2 if the high bit was set) - __m128i q8s_0 = _mm_maddubs_epi16(q3h_0, q8_0); - __m128i q8s_1 = _mm_maddubs_epi16(q3h_1, q8_1); - __m128i q8s_2 = _mm_maddubs_epi16(q3h_2, q8_2); - __m128i q8s_3 = _mm_maddubs_epi16(q3h_3, q8_3); - __m128i q8s_4 = _mm_maddubs_epi16(q3h_4, q8_4); - __m128i q8s_5 = _mm_maddubs_epi16(q3h_5, q8_5); - __m128i q8s_6 = _mm_maddubs_epi16(q3h_6, q8_6); - __m128i q8s_7 = _mm_maddubs_epi16(q3h_7, q8_7); - - __m128i p16_0 = _mm_maddubs_epi16(q3l_0, q8_0); - __m128i p16_1 = _mm_maddubs_epi16(q3l_1, q8_1); - __m128i p16_2 = _mm_maddubs_epi16(q3l_2, q8_2); - __m128i p16_3 = _mm_maddubs_epi16(q3l_3, q8_3); - __m128i p16_4 = _mm_maddubs_epi16(q3l_4, q8_4); - __m128i p16_5 = _mm_maddubs_epi16(q3l_5, q8_5); - __m128i p16_6 = _mm_maddubs_epi16(q3l_6, q8_6); - __m128i p16_7 = _mm_maddubs_epi16(q3l_7, q8_7); - - p16_0 = _mm_sub_epi16(p16_0, q8s_0); - p16_1 = _mm_sub_epi16(p16_1, q8s_1); - p16_2 = _mm_sub_epi16(p16_2, q8s_2); - p16_3 = _mm_sub_epi16(p16_3, q8s_3); - p16_4 = _mm_sub_epi16(p16_4, q8s_4); - p16_5 = _mm_sub_epi16(p16_5, q8s_5); - p16_6 = _mm_sub_epi16(p16_6, q8s_6); - p16_7 = _mm_sub_epi16(p16_7, q8s_7); - - // multiply with scales - __m128i shuffle = _mm_set1_epi16(0x0100); - p16_0 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_0); - shuffle = _mm_add_epi16(shuffle, m2); - p16_1 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_1); - shuffle = _mm_add_epi16(shuffle, m2); - p16_2 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_2); - shuffle = _mm_add_epi16(shuffle, m2); - p16_3 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_3); - shuffle = _mm_add_epi16(shuffle, m2); - p16_4 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_4); - shuffle = _mm_add_epi16(shuffle, m2); - p16_5 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_5); - shuffle = _mm_add_epi16(shuffle, m2); - p16_6 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_6); - shuffle = _mm_add_epi16(shuffle, m2); - p16_7 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_7); - - // accumulate - p16_0 = _mm_add_epi32(p16_0, p16_1); - p16_2 = _mm_add_epi32(p16_2, p16_3); - p16_4 = _mm_add_epi32(p16_4, p16_5); - p16_6 = _mm_add_epi32(p16_6, p16_7); - sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2)); - sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_4, p16_6)); - - } - - // multiply with block scale and accumulate - __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); - acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc); - - } - - *s = hsum_float_8(acc); - -#elif defined __wasm_simd128__ - int8_t aux8[QK_K]; - float sums[8] = {0}; - uint32_t auxs[4]; - - float sumf = 0; - for (int i = 0; i < nb; ++i) { - const uint8_t * GGML_RESTRICT q3 = x[i].qs; - const uint8_t * GGML_RESTRICT hm = x[i].hmask; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - // Process blocks with SIMD - int8_t * a = aux8; - uint8_t m = 1; - for (int j = 0; j < QK_K; j += 128) { - for (int shift = 0; shift <= 6; shift += 2) { - v128_t v_m = wasm_i8x16_splat(m); - for (int l = 0; l < 32; l += 16) { - v128_t v_q3 = wasm_v128_load(q3 + l); - v128_t v_shift = wasm_i8x16_shr(v_q3, shift); - v128_t v_low2 = wasm_v128_and(v_shift, wasm_i8x16_splat(0x03)); - - v128_t v_hm = wasm_v128_load(hm + l); - v128_t v_mask = wasm_v128_and(v_hm, v_m); - v_mask = wasm_i8x16_ne(v_mask, wasm_i8x16_splat(0)); - - v_low2 = wasm_i8x16_sub(v_low2, 
wasm_v128_and(wasm_i8x16_splat(4), wasm_v128_not(v_mask))); - wasm_v128_store(a + l, v_low2); - } - a += 32; - m <<= 1; - } - q3 += 32; - } - - // Extract scales - memcpy(auxs, x[i].scales, 12); - uint32_t tmp = auxs[2]; - auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); - auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); - auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4); - auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); - const int8_t * scales = (const int8_t *)auxs; - - // SIMD dot product with register accumulators - v128_t v_acc0 = wasm_i32x4_splat(0); - v128_t v_acc1 = wasm_i32x4_splat(0); - a = aux8; - for (int j = 0; j < QK_K/16; ++j) { - const v128_t v_scale = wasm_i16x8_splat(scales[j] - 32); - - // Process 16 elements per iteration - for (int k = 0; k < 2; ++k) { - const v128_t v_q8 = wasm_i16x8_load8x8(q8); - const v128_t v_a = wasm_i16x8_load8x8(a); - - v128_t v_prod = wasm_i16x8_mul(v_q8, v_a); - v_prod = wasm_i16x8_mul(v_prod, v_scale); - - v_acc0 = wasm_i32x4_add(v_acc0, wasm_i32x4_extend_low_i16x8(v_prod)); - v_acc1 = wasm_i32x4_add(v_acc1, wasm_i32x4_extend_high_i16x8(v_prod)); - - q8 += 8; - a += 8; - } - } - - // Accumulate results - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const v128_t v_d = wasm_f32x4_splat(d); - v128_t v_sum = wasm_f32x4_add( - wasm_f32x4_mul(wasm_f32x4_convert_i32x4(v_acc0), v_d), - wasm_f32x4_mul(wasm_f32x4_convert_i32x4(v_acc1), v_d) - ); - - // Accumulate into sums vector - wasm_v128_store(sums, wasm_f32x4_add(wasm_v128_load(sums), v_sum)); - } - - // Horizontal sum - v128_t v_sum = wasm_f32x4_add(wasm_v128_load(sums), wasm_v128_load(sums + 4)); - sumf = wasm_f32x4_extract_lane(v_sum, 0) + - wasm_f32x4_extract_lane(v_sum, 1) + - wasm_f32x4_extract_lane(v_sum, 2) + - wasm_f32x4_extract_lane(v_sum, 3); - - *s = sumf; - -#elif defined __riscv_xtheadvector - - uint32_t utmp[4]; - float sumf = 0; - - for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict qh = x[i].hmask; - const int8_t * restrict q8 = y[i].qs; - - int8_t * scale = (int8_t *)utmp; - int tmp; - __asm__ __volatile__( - "li %[tmp], 12\n\t" - "th.vsetvli zero, %[tmp], e8, m1\n\t" - "th.vlb.v v0, (%[s6b])\n\t" - "th.vmv.v.v v2, v0\n\t" - "li %[tmp], 2\n\t" - "th.vsetvli zero, %[tmp], e64, m1\n\t" - "th.vmv.v.x v9, %[sh]\n\t"\ - "th.vslidedown.vi v1, v0, 1\n\t" - "th.vslide1up.vx v8, v9, zero\n\t" // {0, 0, 4, 4} - "th.vslideup.vi v0, v2, 1\n\t" // {aux[0], aux[1], aux[0], aux[1]} - "li %[tmp], 4\n\t" - "th.vsetvli zero, %[tmp], e32, m1\n\t" - "th.vid.v v9\n\t" - "th.vmv.x.s %[tmp], v1\n\t" - "th.vsll.vi v9, v9, 1\n\t" // {0, 2, 4, 6} - "th.vmv.v.x v1, %[tmp]\n\t" // {aux[2], aux[2], aux[2], aux[2]} - "th.vsrl.vv v4, v1, v9\n\t" - "th.vsrl.vv v2, v0, v8\n\t" - "th.vand.vx v5, v4, %[kmask1]\n\t" - "th.vand.vx v3, v2, %[kmask2]\n\t" - "th.vsll.vi v6, v5, 4\n\t" - "th.vor.vv v7, v6, v3\n\t" - "li %[tmp], 16\n\t" - "th.vsetvli zero, %[tmp], e8, m1\n\t" - "th.vsub.vx v0, v7, %[c]\n\t" - "th.vsb.v v0, (%[scale])" - : [tmp] "=&r" (tmp) - : [sh] "r" (0x0000000400000004), [s6b] "r" (x[i].scales), [c] "r" (32) - , [scale] "r" (scale), [kmask1] "r" (kmask1), [kmask2] "r" (kmask2) - : "memory" - , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" - , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" - , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); - - uint8_t m = 1; - int isum = 0; - for (int j = 0; j < QK_K; j += 
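// The 12 scale bytes of a q3_K block pack 16 six-bit scales; this is a scalar
// sketch of the kmask1/kmask2 unpacking used above (assumes <string.h>; for
// q3_K, kmask1 = 0x03030303 and kmask2 = 0x0f0f0f0f, inlined here as literals).
static void q3k_unpack_scales(const uint8_t * packed12, int8_t * scale16) {
    uint32_t aux[4];
    memcpy(aux, packed12, 12);
    const uint32_t tmp = aux[2];
    aux[2] = ((aux[0] >> 4) & 0x0f0f0f0f) | (((tmp >> 4) & 0x03030303) << 4);
    aux[3] = ((aux[1] >> 4) & 0x0f0f0f0f) | (((tmp >> 6) & 0x03030303) << 4);
    aux[0] = (aux[0] & 0x0f0f0f0f) | (((tmp >> 0) & 0x03030303) << 4);
    aux[1] = (aux[1] & 0x0f0f0f0f) | (((tmp >> 2) & 0x03030303) << 4);
    memcpy(scale16, aux, 16);
    for (int j = 0; j < 16; ++j) scale16[j] -= 32; // scales carry a +32 bias
}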
128) { - __asm__ __volatile__( - // fixme: use v0p7 mask layout directly - "th.vsetvli zero, %[vl32], e8, m2\n\t" - "th.vlb.v v8, (%[q3])\n\t" - "th.vsrl.vi v10, v8, 2\n\t" - "th.vsrl.vi v12, v8, 4\n\t" - "th.vsrl.vi v14, v8, 6\n\t" - "th.vand.vi v8, v8, 3\n\t" - "th.vand.vi v10, v10, 3\n\t" - "th.vand.vi v12, v12, 3\n\t" - "th.vlb.v v2, (%[qh])\n\t" - "th.vand.vx v4, v2, %[m]\n\t" - "slli %[m], %[m], 1\n\t" - "th.vmseq.vx v0, v4, zero\n\t" - "th.vadd.vi v8, v8, -4, v0.t\n\t" - "th.vand.vx v4, v2, %[m]\n\t" - "slli %[m], %[m], 1\n\t" - "th.vmseq.vx v0, v4, zero\n\t" - "th.vadd.vi v10, v10, -4, v0.t\n\t" - "th.vand.vx v4, v2, %[m]\n\t" - "slli %[m], %[m], 1\n\t" - "th.vmseq.vx v0, v4, zero\n\t" - "th.vadd.vi v12, v12, -4, v0.t\n\t" - "th.vand.vx v4, v2, %[m]\n\t" - "slli %[m], %[m], 1\n\t" - "th.vmseq.vx v0, v4, zero\n\t" - "th.vadd.vi v14, v14, -4, v0.t\n\t" - "th.vsetvli zero, %[vl128], e8, m8\n\t" - "th.vlb.v v0, (%[q8])\n\t" - "th.vsetvli zero, %[vl64], e8, m4\n\t" - "th.vwmul.vv v16, v0, v8\n\t" - "th.vwmul.vv v24, v4, v12\n\t" - "li %[tmp], 16\n\t" - "th.vsetvli zero, %[tmp], e16, m2\n\t" - "th.vmv.v.x v0, zero\n\t" - "th.vwredsum.vs v10, v16, v0\n\t" - "th.vwredsum.vs v9, v18, v0\n\t" - "th.vwredsum.vs v8, v20, v0\n\t" - "th.vwredsum.vs v7, v22, v0\n\t" - "th.vwredsum.vs v11, v24, v0\n\t" - "th.vwredsum.vs v12, v26, v0\n\t" - "th.vwredsum.vs v13, v28, v0\n\t" - "th.vwredsum.vs v14, v30, v0\n\t" - "li %[tmp], 4\n\t" - "th.vsetvli zero, %[tmp], e32, m1\n\t" - "th.vslideup.vi v10, v9, 1\n\t" - "th.vslideup.vi v8, v7, 1\n\t" - "th.vslideup.vi v11, v12, 1\n\t" - "th.vslideup.vi v13, v14, 1\n\t" - "th.vslideup.vi v10, v8, 2\n\t" - "th.vslideup.vi v11, v13, 2\n\t" - "li %[tmp], 8\n\t" - "th.vsetvli zero, %[tmp], e32, m2\n\t" - "th.vlb.v v12, (%[scale])\n\t" - "th.vmul.vv v10, v10, v12\n\t" - "th.vredsum.vs v0, v10, v0\n\t" - "th.vmv.x.s %[tmp], v0\n\t" - "add %[isum], %[isum], %[tmp]" - : [tmp] "=&r" (tmp), [m] "+&r" (m), [isum] "+&r" (isum) - : [vl128] "r" (128), [vl64] "r" (64), [vl32] "r" (32) - , [q3] "r" (q3), [qh] "r" (qh), [scale] "r" (scale), [q8] "r" (q8) - : "memory" - , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" - , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" - , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); - q3 += 32; q8 += 128; scale += 8; - } - - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - sumf += d * isum; - } - - *s = sumf; - -#elif defined __riscv_v - - uint32_t utmp[4]; - float sumf = 0; - uint32_t aux[3]; - const int vector_length = __riscv_vlenb() * 8; - - switch (vector_length) { - case 256: - for (int i = 0; i < nb; ++i) { - - const uint8_t * GGML_RESTRICT q3 = x[i].qs; - const uint8_t * GGML_RESTRICT qh = x[i].hmask; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - memcpy(aux, x[i].scales, 12); - utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); - utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4); - utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4); - utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4); - - int8_t * scale = (int8_t *)utmp; - for (int j = 0; j < 16; ++j) scale[j] -= 32; - - - size_t vl = 32; - uint8_t m = 1; - - vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); - vuint8m1_t vqh = __riscv_vle8_v_u8m1(qh, vl); - - int sum_t = 0; - - for (int j = 0; j < QK_K; j += 128) { - - vl = 32; - - // load Q3 - vuint8m1_t q3_x = __riscv_vle8_v_u8m1(q3, vl); - - vint8m1_t q3_0 = 
__riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q3_x, 0x03, vl)); - vint8m1_t q3_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x2, vl), 0x03 , vl)); - vint8m1_t q3_2 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x4, vl), 0x03 , vl)); - vint8m1_t q3_3 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x6, vl), 0x03 , vl)); - - // compute mask for subtraction - vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl); - vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl); - vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_mu(vmask_0, q3_0, q3_0, 0x4, vl); - m <<= 1; - - vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl); - vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl); - vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_mu(vmask_1, q3_1, q3_1, 0x4, vl); - m <<= 1; - - vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl); - vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl); - vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_mu(vmask_2, q3_2, q3_2, 0x4, vl); - m <<= 1; - - vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl); - vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl); - vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_mu(vmask_3, q3_3, q3_3, 0x4, vl); - m <<= 1; - - // load Q8 and take product with Q3 - vint16m2_t a0 = __riscv_vwmul_vv_i16m2(q3_m0, __riscv_vle8_v_i8m1(q8, vl), vl); - vint16m2_t a1 = __riscv_vwmul_vv_i16m2(q3_m1, __riscv_vle8_v_i8m1(q8+32, vl), vl); - vint16m2_t a2 = __riscv_vwmul_vv_i16m2(q3_m2, __riscv_vle8_v_i8m1(q8+64, vl), vl); - vint16m2_t a3 = __riscv_vwmul_vv_i16m2(q3_m3, __riscv_vle8_v_i8m1(q8+96, vl), vl); - - vl = 16; - - // retrieve lane to multiply with scale - vint32m2_t aux0_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 0), (scale[0]), vl); - vint32m2_t aux0_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 1), (scale[1]), vl); - vint32m2_t aux1_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 0), (scale[2]), vl); - vint32m2_t aux1_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 1), (scale[3]), vl); - vint32m2_t aux2_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 0), (scale[4]), vl); - vint32m2_t aux2_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 1), (scale[5]), vl); - vint32m2_t aux3_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 0), (scale[6]), vl); - vint32m2_t aux3_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 1), (scale[7]), vl); - - vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux0_0, aux0_1, vl), vzero, vl); - vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux1_0, aux1_1, vl), isum0, vl); - vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux2_0, aux2_1, vl), isum1, vl); - vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux3_0, aux3_1, vl), isum2, vl); - - sum_t += __riscv_vmv_x_s_i32m1_i32(isum3); - - q3 += 32; q8 += 128; scale += 8; - - } - - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - - sumf += d*sum_t; - - } - break; - case 128: - for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict qh = x[i].hmask; - const int8_t * restrict q8 = y[i].qs; - - int8_t * scale = (int8_t *)utmp; - int tmp; - __asm__ __volatile__( - "vsetivli zero, 12, e8, m1\n\t" - "vle8.v v0, (%[s6b])\n\t" - "vmv1r.v v2, v0\n\t" - "vsetivli zero, 2, e64, m1\n\t" - "vmv.v.x v9, %[sh]\n\t"\ - "vslidedown.vi v1, v0, 1\n\t" - "vslide1up.vx v8, 
v9, zero\n\t" // {0, 0, 4, 4} - "vslideup.vi v0, v2, 1\n\t" // {aux[0], aux[1], aux[0], aux[1]} - "vsetivli zero, 4, e32, m1\n\t" - "vid.v v9\n\t" - "vmv.x.s %[tmp], v1\n\t" - "vsll.vi v9, v9, 1\n\t" // {0, 2, 4, 6} - "vmv.v.x v1, %[tmp]\n\t" // {aux[2], aux[2], aux[2], aux[2]} - "vsrl.vv v4, v1, v9\n\t" - "vsrl.vv v2, v0, v8\n\t" - "vand.vx v5, v4, %[kmask1]\n\t" - "vand.vx v3, v2, %[kmask2]\n\t" - "vsll.vi v6, v5, 4\n\t" - "vor.vv v7, v6, v3\n\t" - "vsetivli zero, 16, e8, m1\n\t" - "vsub.vx v0, v7, %[c]\n\t" - "vse8.v v0, (%[scale])" - : [tmp] "=&r" (tmp) - : [sh] "r" (0x0000000400000004), [s6b] "r" (x[i].scales), [c] "r" (32) - , [scale] "r" (scale), [kmask1] "r" (kmask1), [kmask2] "r" (kmask2) - : "memory" - , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" - , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" - , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); - - uint8_t m = 1; - int isum = 0; - for (int j = 0; j < QK_K; j += 128) { - __asm__ __volatile__( - "vsetvli zero, %[vl32], e8, m2, ta, mu\n\t" - "vle8.v v8, (%[q3])\n\t" - "vsrl.vi v10, v8, 2\n\t" - "vsrl.vi v12, v8, 4\n\t" - "vsrl.vi v14, v8, 6\n\t" - "vand.vi v8, v8, 3\n\t" - "vand.vi v10, v10, 3\n\t" - "vand.vi v12, v12, 3\n\t" - "vle8.v v2, (%[qh])\n\t" - "vand.vx v4, v2, %[m]\n\t" - "slli %[m], %[m], 1\n\t" - "vmseq.vx v0, v4, zero\n\t" - "vadd.vi v8, v8, -4, v0.t\n\t" - "vand.vx v4, v2, %[m]\n\t" - "slli %[m], %[m], 1\n\t" - "vmseq.vx v0, v4, zero\n\t" - "vadd.vi v10, v10, -4, v0.t\n\t" - "vand.vx v4, v2, %[m]\n\t" - "slli %[m], %[m], 1\n\t" - "vmseq.vx v0, v4, zero\n\t" - "vadd.vi v12, v12, -4, v0.t\n\t" - "vand.vx v4, v2, %[m]\n\t" - "slli %[m], %[m], 1\n\t" - "vmseq.vx v0, v4, zero\n\t" - "vadd.vi v14, v14, -4, v0.t\n\t" - "vsetvli zero, %[vl128], e8, m8\n\t" - "vle8.v v0, (%[q8])\n\t" - "vsetvli zero, %[vl64], e8, m4\n\t" - "vwmul.vv v16, v0, v8\n\t" - "vwmul.vv v24, v4, v12\n\t" - "vsetivli zero, 16, e16, m2\n\t" - "vmv.v.x v0, zero\n\t" - "vwredsum.vs v10, v16, v0\n\t" - "vwredsum.vs v9, v18, v0\n\t" - "vwredsum.vs v8, v20, v0\n\t" - "vwredsum.vs v7, v22, v0\n\t" - "vwredsum.vs v11, v24, v0\n\t" - "vwredsum.vs v12, v26, v0\n\t" - "vwredsum.vs v13, v28, v0\n\t" - "vwredsum.vs v14, v30, v0\n\t" - "vsetivli zero, 4, e32, m1\n\t" - "vslideup.vi v10, v9, 1\n\t" - "vslideup.vi v8, v7, 1\n\t" - "vslideup.vi v11, v12, 1\n\t" - "vslideup.vi v13, v14, 1\n\t" - "vslideup.vi v10, v8, 2\n\t" - "vslideup.vi v11, v13, 2\n\t" - "vsetivli zero, 8, e32, m2\n\t" - "vle8.v v15, (%[scale])\n\t" - "vsext.vf4 v12, v15\n\t" - "vmul.vv v10, v10, v12\n\t" - "vredsum.vs v0, v10, v0\n\t" - "vmv.x.s %[tmp], v0\n\t" - "add %[isum], %[isum], %[tmp]" - : [tmp] "=&r" (tmp), [m] "+&r" (m), [isum] "+&r" (isum) - : [vl128] "r" (128), [vl64] "r" (64), [vl32] "r" (32) - , [q3] "r" (q3), [qh] "r" (qh), [scale] "r" (scale), [q8] "r" (q8) - : "memory" - , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" - , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" - , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); - q3 += 32; q8 += 128; scale += 8; - } - - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - sumf += d * isum; - } - break; - default: - assert(false && "Unsupported vector length"); - break; - } - - *s = sumf; - -#elif defined(__POWER9_VECTOR__) - const vector signed char lowMask = vec_splats((signed char)0x3); - const vector signed char lowMask1 = vec_splats((int8_t)0xf); - const vector signed char 
lowMask2 = vec_splats((int8_t)0x30); - const vector int v0 = vec_splats((int32_t)0); - const vector signed char v1 = vec_splats((signed char)0x1); - const vector unsigned char v2 = vec_splats((unsigned char)0x2); - const vector unsigned char v3 = vec_splats((unsigned char)0x3); - const vector unsigned char v4 = vec_splats((unsigned char)0x4); - const vector unsigned char v6 = vec_splats((unsigned char)0x6); - const vector signed char off = vec_splats((signed char)0x20); - - vector float vsumf0 = vec_splats(0.0f); - vector float vsumf1 = vec_splats(0.0f); - vector float vsumf2 = vec_splats(0.0f); - vector float vsumf3 = vec_splats(0.0f); - - for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); - vector float vyd = vec_splats(y[i].d); - vector float vd = vec_mul(vxd, vyd); - - UNUSED(kmask1); - UNUSED(kmask2); - - vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8); - vector signed char u1 = vec_and(u0, lowMask1); - vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4); - vector signed char u3 = (vector signed char)vec_mergeh((vector signed int)u2, (vector signed int)vec_sr(u2, v2)); - vector signed char u30 = vec_sl(vec_and(u3, lowMask), v4); - vector signed char u31 = vec_and(u3, lowMask2); - - u1 = vec_or(u1, u30); - u2 = vec_or(vec_sr(u0, v4), u31); - - vector signed char vscales = (vector signed char)vec_mergeh((vector signed long long)u1, (vector signed long long)u2); - vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].hmask); - vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].hmask); - - vscales = vec_sub(vscales, off); - - vector signed int vsumi0 = v0; - vector signed int vsumi1 = v0; - vector signed int vsumi2 = v0; - vector signed int vsumi3 = v0; - vector signed int vsumi4 = v0; - vector signed int vsumi5 = v0; - vector signed int vsumi6 = v0; - vector signed int vsumi7 = v0; - - const uint8_t * GGML_RESTRICT q3 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - for (int j = 0; j < QK_K/128; ++j) { - __builtin_prefetch(q3, 0, 1); - __builtin_prefetch(q8, 0, 1); - - vector signed char qxs0 = (vector signed char)vec_xl( 0, q3); - vector signed char qxs1 = (vector signed char)vec_xl(16, q3); - q3 += 32; - - //the low 2 bits - vector signed char qxs00 = vec_and(qxs0, lowMask); - vector signed char qxs01 = vec_and(vec_sr(qxs0, v2), lowMask); - vector signed char qxs02 = vec_and(vec_sr(qxs0, v4), lowMask); - vector signed char qxs03 = vec_and(vec_sr(qxs0, v6), lowMask); - vector signed char qxs10 = vec_and(qxs1, lowMask); - vector signed char qxs11 = vec_and(vec_sr(qxs1, v2), lowMask); - vector signed char qxs12 = vec_and(vec_sr(qxs1, v4), lowMask); - vector signed char qxs13 = vec_and(vec_sr(qxs1, v6), lowMask); - - //the 3rd bit - vector signed char qxh00 = vec_sl(vec_andc(v1, qxhs0), v2); - vector signed char qxh01 = vec_sl(vec_andc(v1, vec_sr(qxhs0, (vector unsigned char)v1)), v2); - vector signed char qxh02 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v2)), v2); - vector signed char qxh03 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v3)), v2); - vector signed char qxh10 = vec_sl(vec_andc(v1, qxhs1), v2); - vector signed char qxh11 = vec_sl(vec_andc(v1, vec_sr(qxhs1, (vector unsigned char)v1)), v2); - vector signed char qxh12 = vec_sl(vec_andc(v1, vec_sr(qxhs1, v2)), v2); - vector signed char qxh13 = vec_sl(vec_andc(v1, vec_sr(qxhs1, v3)), v2); - qxhs0 = vec_sr(qxhs0, v4); - qxhs1 = vec_sr(qxhs1, v4); - - vector signed char q3x00 = vec_sub(qxs00, qxh00); - vector signed char q3x01 = 
vec_sub(qxs01, qxh01); - vector signed char q3x02 = vec_sub(qxs02, qxh02); - vector signed char q3x03 = vec_sub(qxs03, qxh03); - vector signed char q3x10 = vec_sub(qxs10, qxh10); - vector signed char q3x11 = vec_sub(qxs11, qxh11); - vector signed char q3x12 = vec_sub(qxs12, qxh12); - vector signed char q3x13 = vec_sub(qxs13, qxh13); - - vector signed char q8y00 = vec_xl( 0, q8); - vector signed char q8y10 = vec_xl( 16, q8); - vector signed char q8y01 = vec_xl( 32, q8); - vector signed char q8y11 = vec_xl( 48, q8); - vector signed char q8y02 = vec_xl( 64, q8); - vector signed char q8y12 = vec_xl( 80, q8); - vector signed char q8y03 = vec_xl( 96, q8); - vector signed char q8y13 = vec_xl(112, q8); - q8 += 128; - - vector signed short vscales_h = vec_unpackh(vscales); - vector signed short vs0 = vec_splat(vscales_h, 0); - vector signed short vs1 = vec_splat(vscales_h, 1); - vector signed short vs2 = vec_splat(vscales_h, 2); - vector signed short vs3 = vec_splat(vscales_h, 3); - vector signed short vs4 = vec_splat(vscales_h, 4); - vector signed short vs5 = vec_splat(vscales_h, 5); - vector signed short vs6 = vec_splat(vscales_h, 6); - vector signed short vs7 = vec_splat(vscales_h, 7); - vscales = vec_sld(vscales, vscales, 8); - - vector signed short qv00 = vec_add(vec_mule(q3x00, q8y00), vec_mulo(q3x00, q8y00)); - vector signed short qv01 = vec_add(vec_mule(q3x01, q8y01), vec_mulo(q3x01, q8y01)); - vector signed short qv02 = vec_add(vec_mule(q3x02, q8y02), vec_mulo(q3x02, q8y02)); - vector signed short qv03 = vec_add(vec_mule(q3x03, q8y03), vec_mulo(q3x03, q8y03)); - vector signed short qv10 = vec_add(vec_mule(q3x10, q8y10), vec_mulo(q3x10, q8y10)); - vector signed short qv11 = vec_add(vec_mule(q3x11, q8y11), vec_mulo(q3x11, q8y11)); - vector signed short qv12 = vec_add(vec_mule(q3x12, q8y12), vec_mulo(q3x12, q8y12)); - vector signed short qv13 = vec_add(vec_mule(q3x13, q8y13), vec_mulo(q3x13, q8y13)); - - vsumi0 = vec_msum(qv00, vs0, vsumi0); - vsumi1 = vec_msum(qv01, vs2, vsumi1); - vsumi2 = vec_msum(qv02, vs4, vsumi2); - vsumi3 = vec_msum(qv03, vs6, vsumi3); - vsumi4 = vec_msum(qv10, vs1, vsumi4); - vsumi5 = vec_msum(qv11, vs3, vsumi5); - vsumi6 = vec_msum(qv12, vs5, vsumi6); - vsumi7 = vec_msum(qv13, vs7, vsumi7); - } - - vsumi0 = vec_add(vsumi0, vsumi4); - vsumi1 = vec_add(vsumi1, vsumi5); - vsumi2 = vec_add(vsumi2, vsumi6); - vsumi3 = vec_add(vsumi3, vsumi7); - - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); - vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); - vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); - vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); - } - - vsumf0 = vec_add(vsumf0, vsumf2); - vsumf1 = vec_add(vsumf1, vsumf3); - - vsumf0 = vec_add(vsumf0, vsumf1); - - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - - *s = vec_extract(vsumf0, 0); - -#elif defined __loongarch_asx - - const __m128i m32 = __lsx_vreplgr2vr_b(32); - - __m256 acc = (__m256)__lasx_xvldi(0); - - uint32_t aux[3]; - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * GGML_RESTRICT q3 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - // Set up scales - memcpy(aux, x[i].scales, 12); - __m128i scales128 = lsx_set_w( - ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4), - ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4), - (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4), - (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4)); - scales128 = 
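// What the vec_mule/vec_mulo + vec_msum sequence above computes per 16-byte
// chunk, in scalar form (a model, not the actual POWER9 code path): a plain
// int8 dot product scaled by the block's 6-bit scale.
static int32_t p9_msum_model(const int8_t * q3x, const int8_t * q8y, int16_t sc) {
    int32_t acc = 0;
    for (int k = 0; k < 16; ++k) {
        acc += (int32_t)q3x[k] * q8y[k]; // mule/mulo split even/odd lanes
    }
    return sc * acc; // vec_msum folds the int16 pairs into the int32 lanes
}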
__lsx_vsub_b(scales128, m32); - - const v16i8 shuffle_mask = {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15}; - const __m256i scales_shuffled = lasx_ext8_16(__lsx_vshuf_b(scales128, scales128, (__m128i)shuffle_mask)); - - // high bit - const __m256i hbits = __lasx_xvld((const __m256i*)x[i].hmask, 0); - - // integer accumulator - __m256i sumi = __lasx_xvldi(0); - - for (int j = 0; j < QK_K/128; ++j) { - // load low 2 bits - const __m256i q3bits = __lasx_xvld((const __m256i*)q3, 0); q3 += 32; - - // prepare low and high bits - const __m256i q3l_0 = __lasx_xvandi_b(q3bits, 3); - const __m256i q3l_1 = __lasx_xvandi_b(__lasx_xvsrli_b(q3bits, 2), 3); - const __m256i q3l_2 = __lasx_xvandi_b(__lasx_xvsrli_b(q3bits, 4), 3); - const __m256i q3l_3 = __lasx_xvsrli_b(q3bits, 6); - const __m256i q3h_0 = __lasx_xvslli_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 4 * j + 0), 0), 2); - const __m256i q3h_1 = __lasx_xvslli_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 4 * j + 1), 0), 2); - const __m256i q3h_2 = __lasx_xvslli_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 4 * j + 2), 0), 2); - const __m256i q3h_3 = __lasx_xvslli_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 4 * j + 3), 0), 2); - const __m256i q3_0 = __lasx_xvor_v(q3h_0, q3l_0); - const __m256i q3_1 = __lasx_xvor_v(q3h_1, q3l_1); - const __m256i q3_2 = __lasx_xvor_v(q3h_2, q3l_2); - const __m256i q3_3 = __lasx_xvor_v(q3h_3, q3l_3); - - // load Q8 quants - const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - const __m256i q8_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - const __m256i q8_3 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - - __m256i p16_0 = lasx_madd_h_b(q8_0, q3_0); - __m256i p16_1 = lasx_madd_h_b(q8_1, q3_1); - __m256i p16_2 = lasx_madd_h_b(q8_2, q3_2); - __m256i p16_3 = lasx_madd_h_b(q8_3, q3_3); - - // multiply with scales - p16_0 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 0), p16_0); - p16_1 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 1), p16_1); - p16_2 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 2), p16_2); - p16_3 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 3), p16_3); - - // accumulate - p16_0 = __lasx_xvadd_w(p16_0, p16_1); - p16_2 = __lasx_xvadd_w(p16_2, p16_3); - sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_2)); - } - // multiply with block scale and accumulate - acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), acc); - } - - *s = hsum_float_8(acc); -#elif defined(__VXE__) || defined(__VXE2__) - uint32_t aux[3]; - uint32_t utmp[4]; - - const int32x4_t v_z = vec_splat_s32(0); - const uint8x16_t v_3m = vec_splat_u8(0x03); - - const uint8x16_t v_0c = vec_splat_u8(1); - const uint8x16_t v_1c = vec_sl(v_0c, 1); - const uint8x16_t v_2c = vec_sl(v_0c, 2); - const uint8x16_t v_3c = vec_sl(v_0c, 3); - - uint8x16_t q3h[4]; - uint8x16_t q3b[2]; - int8x16_t q3bytes[4]; - int8x16_t q8bytes[4]; - uint8x16_t qhbits[2]; - - float sum = 0; - - for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - - const uint8_t * restrict x0l = x[i].qs; - const uint8_t * restrict x0h = x[i].hmask; - const int8_t * restrict y0 = y[i].qs; - - qhbits[0] = vec_xl(0 , x0h); - qhbits[1] = vec_xl(16, x0h); - - int32_t isum = 0; - - memcpy(aux, x[i].scales, 12); - utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); - utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4); - utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 
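// The LASX path above forms the signed weight directly instead of subtracting:
// where the high bit is clear, xvseqi yields 0xFF, shifting left by 2 gives
// 0xFC (== -4), and OR-ing in the 2 low bits equals adding them, since -4 has
// zeros in bits 0..1. A scalar model of that identity (hypothetical helper):
static inline int8_t q3k_decode_or(uint8_t low2, int hbit_set) {
    const uint8_t corr = hbit_set ? 0x00 : 0xfc;
    return (int8_t)(corr | (low2 & 3)); // same result as (low2 & 3) - 4*!hbit_set
}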
2) & kmask1) << 4);
-        utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
-
-        int8_t * scale = (int8_t *)utmp;
-        for (int j = 0; j < 16; ++j) scale[j] -= 32;
-
-        for (int j = 0; j < QK_K/128; ++j) {
-            int32x4_t isum0, isum1, isum2, isum3;
-
-            q3b[0] = vec_xl(0 , x0l);
-            q3b[1] = vec_xl(16, x0l);
-            x0l += 32;
-
-            q8bytes[0] = vec_xl(0  , y0);
-            q8bytes[1] = vec_xl(16 , y0);
-            q8bytes[2] = vec_xl(32 , y0);
-            q8bytes[3] = vec_xl(48 , y0);
-            q8bytes[4] = vec_xl(64 , y0);
-            q8bytes[5] = vec_xl(80 , y0);
-            q8bytes[6] = vec_xl(96 , y0);
-            q8bytes[7] = vec_xl(112, y0);
-            y0 += 128;
-
-            q3h[0] = vec_sl(vec_andc(v_0c, qhbits[0]), 2);
-            q3h[1] = vec_sl(vec_andc(v_0c, qhbits[1]), 2);
-            q3h[2] = vec_sl(vec_andc(v_1c, qhbits[0]), 1);
-            q3h[3] = vec_sl(vec_andc(v_1c, qhbits[1]), 1);
-
-            q3bytes[0] = vec_sub((int8x16_t)vec_and(q3b[0], v_3m), (int8x16_t)q3h[0]);
-            q3bytes[1] = vec_sub((int8x16_t)vec_and(q3b[1], v_3m), (int8x16_t)q3h[1]);
-            q3bytes[2] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 2), v_3m), (int8x16_t)q3h[2]);
-            q3bytes[3] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 2), v_3m), (int8x16_t)q3h[3]);
-
-            isum0 = ggml_vec_dot(v_z, q3bytes[0], q8bytes[0]);
-            isum1 = ggml_vec_dot(v_z, q3bytes[1], q8bytes[1]);
-            isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[2]);
-            isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[3]);
-
-            isum += (isum0[0] + isum0[1] + isum0[2] + isum0[3]) * scale[0];
-            isum += (isum1[0] + isum1[1] + isum1[2] + isum1[3]) * scale[1];
-            isum += (isum2[0] + isum2[1] + isum2[2] + isum2[3]) * scale[2];
-            isum += (isum3[0] + isum3[1] + isum3[2] + isum3[3]) * scale[3];
-
-            scale += 4;
-
-            q3h[0] = vec_andc(v_2c, qhbits[0]);
-            q3h[1] = vec_andc(v_2c, qhbits[1]);
-            q3h[2] = vec_sr(vec_andc(v_3c, qhbits[0]), 1);
-            q3h[3] = vec_sr(vec_andc(v_3c, qhbits[1]), 1);
-
-            q3bytes[0] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 4), v_3m), (int8x16_t)q3h[0]);
-            q3bytes[1] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 4), v_3m), (int8x16_t)q3h[1]);
-            q3bytes[2] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 6), v_3m), (int8x16_t)q3h[2]);
-            q3bytes[3] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 6), v_3m), (int8x16_t)q3h[3]);
-
-            isum0 = ggml_vec_dot(v_z, q3bytes[0], q8bytes[4]);
-            isum1 = ggml_vec_dot(v_z, q3bytes[1], q8bytes[5]);
-            isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[6]);
-            isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[7]);
-
-            isum += (isum0[0] + isum0[1] + isum0[2] + isum0[3]) * scale[0];
-            isum += (isum1[0] + isum1[1] + isum1[2] + isum1[3]) * scale[1];
-            isum += (isum2[0] + isum2[1] + isum2[2] + isum2[3]) * scale[2];
-            isum += (isum3[0] + isum3[1] + isum3[2] + isum3[3]) * scale[3];
-
-            scale += 4;
-
-            if (j == 0) {
-                qhbits[0] = vec_sr(qhbits[0], 4);
-                qhbits[1] = vec_sr(qhbits[1], 4);
-            }
-        }
-
-        sum += d * isum;
-    }
-
-    *s = sum;
-#else
-    // scalar version
-    // This function is written like this so the compiler can manage to vectorize most of it
-    // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
-    // manually vectorized version above. Every other version I tried would run at least 4 times slower.
-    // The ideal situation would be if we could just write the code once, and the compiler would
-    // automatically produce the best possible set of machine instructions, instead of us having to manually
-    // write vectorized versions for AVX, ARM_NEON, etc.
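// Block layouts implied by the accesses in the scalar code below, for QK_K = 256
// (a sketch; see ggml's block definitions for the authoritative declarations):
//
//   typedef struct {
//       uint8_t   hmask[QK_K/8]; // 1 high bit per weight
//       uint8_t   qs[QK_K/4];    // 2 low bits per weight
//       uint8_t   scales[12];    // 16 packed 6-bit scales
//       ggml_half d;             // super-block scale
//   } block_q3_K;
//
//   typedef struct {
//       float   d;               // delta
//       int8_t  qs[QK_K];        // quants
//       int16_t bsums[QK_K/16];  // sums of each group of 16 quants
//   } block_q8_K;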
-
-    int8_t  aux8[QK_K];
-    int16_t aux16[8];
-    float   sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    uint32_t auxs[4];
-    const int8_t * scales = (const int8_t*)auxs;
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
-        const uint8_t * GGML_RESTRICT hm = x[i].hmask;
-        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        uint8_t m = 1;
-        for (int j = 0; j < QK_K; j += 128) {
-            for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            q3 += 32;
-        }
-        a = aux8;
-
-        memcpy(auxs, x[i].scales, 12);
-        uint32_t tmp = auxs[2];
-        auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
-        auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
-        auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
-        auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
-        for (int j = 0; j < QK_K/16; ++j) {
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
-
-#endif
-
-}
-
-void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
-    assert(n % QK_K == 0);
-#ifdef __ARM_FEATURE_MATMUL_INT8
-    assert((nrc == 2) || (nrc == 1));
-#else
-    assert(nrc == 1);
-#endif
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_q4_K * GGML_RESTRICT x = vx;
-    const block_q8_K * GGML_RESTRICT y = vy;
-
-    const int nb = n / QK_K;
-
-    static const uint32_t kmask1 = 0x3f3f3f3f;
-    static const uint32_t kmask2 = 0x0f0f0f0f;
-    static const uint32_t kmask3 = 0x03030303;
-
-    uint32_t utmp[4];
-
-#if defined(__ARM_FEATURE_MATMUL_INT8)
-    if (nrc == 2) {
-        const block_q4_K * GGML_RESTRICT x0 = x;
-        const block_q4_K * GGML_RESTRICT x1 = (const block_q4_K *) ((const uint8_t *)vx + bx);
-        const block_q8_K * GGML_RESTRICT y0 = y;
-        const block_q8_K * GGML_RESTRICT y1 = (const block_q8_K *) ((const uint8_t *)vy + by);
-
-        const uint8x16_t m4b = vdupq_n_u8(0x0f);
-
-        float32x4_t vfsum = vdupq_n_f32(0.0f);
-
-        for (int i = 0; i < nb; ++i, ++x0, ++x1, ++y0, ++y1) {
-            const uint8_t * GGML_RESTRICT qx0 = x0->qs;
-            const uint8_t * GGML_RESTRICT qx1 = x1->qs;
-            const  int8_t * GGML_RESTRICT qy0 = y0->qs;
-            const  int8_t * GGML_RESTRICT qy1 = y1->qs;
-
-            // decode scales and mins
-            int8_t x0_scales[8], x1_scales[8];
-            int16x8_t x0_mins, x1_mins;
-            {
-                uint32_t scales_mins[3];
-                memcpy(scales_mins, x0->scales, 12);
-                const uint32_t mins_0_3 = scales_mins[1] & kmask1;
-                const uint32_t mins_4_7 = ((scales_mins[2] >> 4) & kmask2) | (((scales_mins[1] >> 6) & kmask3) << 4);
-                const uint32x2_t mins
= {mins_0_3, mins_4_7}; - x0_mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins))); - uint32_t scales[2]; - scales[0] = scales_mins[0] & kmask1; // scales 0~3 - scales[1] = (scales_mins[2] & kmask2) | (((scales_mins[0] >> 6) & kmask3) << 4); // scales 4~7 - memcpy(x0_scales, scales, 8); - } - { - uint32_t scales_mins[3]; - memcpy(scales_mins, x1->scales, 12); - const uint32_t mins_0_3 = scales_mins[1] & kmask1; - const uint32_t mins_4_7 = ((scales_mins[2] >> 4) & kmask2) | (((scales_mins[1] >> 6) & kmask3) << 4); - const uint32x2_t mins = {mins_0_3, mins_4_7}; - x1_mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins))); - uint32_t scales[2]; - scales[0] = scales_mins[0] & kmask1; // scales 0~3 - scales[1] = (scales_mins[2] & kmask2) | (((scales_mins[0] >> 6) & kmask3) << 4); // scales 4~7 - memcpy(x1_scales, scales, 8); - } - - int32x4_t visum = {0}; - - // process 64 data points per iteration, totally 256 data points - for (int j = 0; j < QK_K / 64; ++j, qx0 += 32, qx1 += 32, qy0 += 64, qy1 += 64) { - const int8x16x4_t vy0 = vld1q_s8_x4(qy0); - const int8x16x4_t vy1 = vld1q_s8_x4(qy1); - - int8x16_t vx0[4], vx1[4]; - { - const uint8x16x2_t vv = vld1q_u8_x2(qx0); - vx0[0] = vreinterpretq_s8_u8(vandq_u8(vv.val[0], m4b)); - vx0[1] = vreinterpretq_s8_u8(vandq_u8(vv.val[1], m4b)); - vx0[2] = vreinterpretq_s8_u8(vshrq_n_u8(vv.val[0], 4)); - vx0[3] = vreinterpretq_s8_u8(vshrq_n_u8(vv.val[1], 4)); - } - { - const uint8x16x2_t vv = vld1q_u8_x2(qx1); - vx1[0] = vreinterpretq_s8_u8(vandq_u8(vv.val[0], m4b)); - vx1[1] = vreinterpretq_s8_u8(vandq_u8(vv.val[1], m4b)); - vx1[2] = vreinterpretq_s8_u8(vshrq_n_u8(vv.val[0], 4)); - vx1[3] = vreinterpretq_s8_u8(vshrq_n_u8(vv.val[1], 4)); - } - - // process 32 data points (share same block scale) per iteration - for (int k = 0; k < 2; ++k) { - const int blk = j * 2 + k; - const int32x4_t block_scale = { - x0_scales[blk], - x0_scales[blk], - x1_scales[blk], - x1_scales[blk], - }; - - int32x4_t vr = {0}; - for (int l = 0; l < 2; ++l) { - const int idx = k * 2 + l; - const int64x2_t vx0_s64 = vreinterpretq_s64_s8(vx0[idx]); - const int64x2_t vx1_s64 = vreinterpretq_s64_s8(vx1[idx]); - const int64x2_t vy0_s64 = vreinterpretq_s64_s8(vy0.val[idx]); - const int64x2_t vy1_s64 = vreinterpretq_s64_s8(vy1.val[idx]); - const int8x16_t vx_l = vreinterpretq_s8_s64(vzip1q_s64(vx0_s64, vx1_s64)); - const int8x16_t vx_h = vreinterpretq_s8_s64(vzip2q_s64(vx0_s64, vx1_s64)); - const int8x16_t vy_l = vreinterpretq_s8_s64(vzip1q_s64(vy0_s64, vy1_s64)); - const int8x16_t vy_h = vreinterpretq_s8_s64(vzip2q_s64(vy0_s64, vy1_s64)); - vr = vmmlaq_s32(vr, vx_l, vy_l); - vr = vmmlaq_s32(vr, vx_h, vy_h); - } - // apply block scale, will NOT overflow - // block_scale * sum_256(int4*int8) <= 2^(8+8+4+8) = 28 bits - visum = vmlaq_s32(visum, vr, block_scale); - } - } - - // adjust bias, apply superblock scale - { - int32_t bias[4]; - // no obvious uplift from sve sdot-16, just use neon mul add - const int16x8_t y0_sums = vpaddq_s16(vld1q_s16(y0->bsums), vld1q_s16(y0->bsums+8)); - const int16x8_t y1_sums = vpaddq_s16(vld1q_s16(y1->bsums), vld1q_s16(y1->bsums+8)); - bias[0] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y0_sums), vget_low_s16(x0_mins)), - vmull_s16(vget_high_s16(y0_sums), vget_high_s16(x0_mins)))); - bias[1] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y1_sums), vget_low_s16(x0_mins)), - vmull_s16(vget_high_s16(y1_sums), vget_high_s16(x0_mins)))); - bias[2] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y0_sums), vget_low_s16(x1_mins)), - 
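// The kmask1/kmask2/kmask3 arithmetic above undoes q4_K's packing of 8 six-bit
// scales and 8 six-bit mins into 12 bytes. Scalar sketch of the same decode
// (hypothetical helper; j indexes the 8 groups of 32 weights):
static void q4k_get_scale_min(int j, const uint8_t * q, uint8_t * sc, uint8_t * mn) {
    if (j < 4) {
        *sc = q[j] & 63;
        *mn = q[j + 4] & 63;
    } else {
        *sc = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4);
        *mn = (q[j + 4] >>  4) | ((q[j    ] >> 6) << 4);
    }
}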
vmull_s16(vget_high_s16(y0_sums), vget_high_s16(x1_mins)))); - bias[3] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y1_sums), vget_low_s16(x1_mins)), - vmull_s16(vget_high_s16(y1_sums), vget_high_s16(x1_mins)))); - const float32x4_t dmins = { - GGML_FP16_TO_FP32(x0->dmin) * y0->d, - GGML_FP16_TO_FP32(x0->dmin) * y1->d, - GGML_FP16_TO_FP32(x1->dmin) * y0->d, - GGML_FP16_TO_FP32(x1->dmin) * y1->d, - }; - vfsum = vmlsq_f32(vfsum, vcvtq_f32_s32(vld1q_s32(bias)), dmins); - - const float32x4_t superblock_scale = { - GGML_FP16_TO_FP32(x0->d) * y0->d, - GGML_FP16_TO_FP32(x0->d) * y1->d, - GGML_FP16_TO_FP32(x1->d) * y0->d, - GGML_FP16_TO_FP32(x1->d) * y1->d, - }; - vfsum = vmlaq_f32(vfsum, vcvtq_f32_s32(visum), superblock_scale); - } - } - - // vfsum = ABCD -> ACBD - // AC -> s, BD -> (s+bs) - vfsum = vzip1q_f32(vfsum, vextq_f32(vfsum, vfsum, 2)); - vst1_f32(s, vget_low_f32 (vfsum)); - vst1_f32(s + bs, vget_high_f32(vfsum)); - - return; - } -#endif - -#ifdef __ARM_FEATURE_SVE - float sumf = 0; - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - - const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8)); - - memcpy(utmp, x[i].scales, K_SCALE_SIZE); - - uint32x2_t mins8 = { 0 }; - mins8 = vset_lane_u32(utmp[1] & kmask1, mins8, 0); - mins8 = vset_lane_u32(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), mins8, 1); - - utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); - utmp[0] &= kmask1; - - const int16x8_t mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins8))); - const int32x4_t prod = vaddq_s32(vmull_s16(vget_low_s16 (q8sums), vget_low_s16 (mins)), - vmull_s16(vget_high_s16(q8sums), vget_high_s16(mins))); - sumf -= dmin * vaddvq_s32(prod); - - const uint8_t * scales = (const uint8_t *)utmp; - - const uint8_t * GGML_RESTRICT q4 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - const int vector_length = ggml_cpu_get_sve_cnt()*8; - const svuint8_t m4b = svdup_n_u8(0xf); - const svint32_t mzero = svdup_n_s32(0); - svint32_t sumi1 = svdup_n_s32(0); - svint32_t sumi1_1 = svdup_n_s32(0); - svint32_t sumi1_2 = svdup_n_s32(0); - svint32_t sumi2 = svdup_n_s32(0); - svint32_t sumi2_1 = svdup_n_s32(0); - svint32_t sumi2_2 = svdup_n_s32(0); - switch (vector_length) { - case 128: - { - for (int j = 0; j < QK_K/64; ++j) { - svint8_t q4bytes = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svld1_u8(svptrue_b8(), q4), m4b)); - svint8_t q8bytes = svld1_s8(svptrue_b8(), q8); q8 += 16; - sumi1_1 = svmla_n_s32_x(svptrue_b32(), sumi1_1, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+0]); - q4bytes = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svld1_u8(svptrue_b8(), q4+16), m4b)); - q8bytes = svld1_s8(svptrue_b8(), q8); q8 += 16; - sumi1_2 = svmla_n_s32_x(svptrue_b32(), sumi1_2, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+0]); - - q4bytes = svreinterpret_s8_u8(svlsr_n_u8_x(svptrue_b8(), svld1_u8(svptrue_b8(), q4), 4)); - q8bytes = svld1_s8(svptrue_b8(), q8); q8 += 16; - sumi2_1 = svmla_n_s32_x(svptrue_b32(), sumi2_1, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+1]); - q4bytes = svreinterpret_s8_u8(svlsr_n_u8_x(svptrue_b8(), svld1_u8(svptrue_b8(), q4+16), 4)); - q8bytes = svld1_s8(svptrue_b8(), q8); q8 += 16; - sumi2_2 = svmla_n_s32_x(svptrue_b32(), sumi2_2, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+1]); - q4 += 32; - } - sumi1 = svadd_s32_x(svptrue_b32(), sumi1_1, sumi1_2); - sumi2 = svadd_s32_x(svptrue_b32(), sumi2_1, sumi2_2); - sumf 
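// Scalar model of the vmmlaq_s32 step in the nrc == 2 path above: SMMLA treats
// each 16-byte register as a 2x8 int8 matrix and accumulates a 2x2 int32 tile,
// r += a * b^T, which is why the rows of x0/x1 and y0/y1 are zipped together.
static void smmla_model(int32_t r[2][2], const int8_t a[16], const int8_t b[16]) {
    for (int i = 0; i < 2; ++i)
        for (int j = 0; j < 2; ++j)
            for (int k = 0; k < 8; ++k)
                r[i][j] += (int32_t)a[8*i + k] * b[8*j + k];
}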
+= d * (svaddv_s32(svptrue_b32(), svadd_s32_x(svptrue_b32(), sumi1, sumi2))); - } break; - case 256: - case 512: - { - for (int j = 0; j < QK_K/64; ++j) { - const svuint8_t q4bits = svld1_u8(svptrue_pat_b8(SV_VL32), q4); q4 += 32; - svint8_t q4bytes = svreinterpret_s8_u8(svand_u8_x(svptrue_pat_b8(SV_VL32), q4bits, m4b)); - svint8_t q8bytes = svld1_s8(svptrue_pat_b8(SV_VL32), q8); q8 += 32; - sumi1 = svmla_n_s32_x(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+0]); - - q4bytes = svreinterpret_s8_u8(svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q4bits, 4)); - q8bytes = svld1_s8(svptrue_pat_b8(SV_VL32), q8); q8 += 32; - sumi2 = svmla_n_s32_x(svptrue_pat_b32(SV_VL8), sumi2, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+1]); - } - sumf += d * (svaddv_s32(svptrue_pat_b32(SV_VL8), svadd_s32_x(svptrue_pat_b32(SV_VL8), sumi1, sumi2))); - } break; - default: - assert(false && "Unsupported vector length"); - break; - } - } - *s = sumf; -#elif defined __ARM_NEON - const uint8x16_t m4b = vdupq_n_u8(0xf); - const int32x4_t mzero = vdupq_n_s32(0); - - ggml_int8x16x2_t q4bytes; - ggml_int8x16x2_t q8bytes; - - float sumf = 0; - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - - const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8)); - - memcpy(utmp, x[i].scales, 12); - - uint32x2_t mins8 = { 0 }; - mins8 = vset_lane_u32(utmp[1] & kmask1, mins8, 0); - mins8 = vset_lane_u32(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), mins8, 1); - - utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); - utmp[0] &= kmask1; - - const int16x8_t mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins8))); - const int32x4_t prod = vaddq_s32(vmull_s16(vget_low_s16 (q8sums), vget_low_s16 (mins)), - vmull_s16(vget_high_s16(q8sums), vget_high_s16(mins))); - sumf -= dmin * vaddvq_s32(prod); - - const uint8_t * scales = (const uint8_t *)utmp; - - const uint8_t * GGML_RESTRICT q4 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - int32_t sumi1 = 0; - int32_t sumi2 = 0; - - for (int j = 0; j < QK_K/64; ++j) { - const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4); q4 += 32; - - q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32; - q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b)); - q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b)); - - const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]); - sumi1 += vaddvq_s32(p1) * scales[2*j+0]; - - q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32; - q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4)); - q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4)); - - const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]); - - sumi2 += vaddvq_s32(p2) * scales[2*j+1]; - } - - sumf += d * (sumi1 + sumi2); - - } - - *s = sumf; - -#elif defined __wasm_simd128__ - const uint8_t * scales = (const uint8_t*)&utmp[0]; - float sumf = 0; - - for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); // Corrected sign - - const uint8_t * GGML_RESTRICT q4 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - // Process scales and mins - memcpy(utmp, x[i].scales, 12); - utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); - const 
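// Scalar model of the sdot/svdot step used by the SVE and NEON paths above:
// every 32-bit lane accumulates the dot product of its 4 int8 pairs (shown
// here for one 128-bit vector; a sketch, not the intrinsic itself).
static void sdot_model(int32_t acc[4], const int8_t a[16], const int8_t b[16]) {
    for (int i = 0; i < 4; ++i)
        for (int k = 0; k < 4; ++k)
            acc[i] += (int32_t)a[4*i + k] * b[4*i + k];
}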
uint32_t uaux = utmp[1] & kmask1; - utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); - utmp[2] = uaux; - utmp[0] &= kmask1; - - // Sum mins * q8sums - int32_t sumi = 0; - const int16_t * GGML_RESTRICT q8sums = y[i].bsums; - const uint8_t * m = (const uint8_t *)&utmp[2]; - for (int j = 0; j < 16; j += 2) { - sumi += (q8sums[j] + q8sums[j+1]) * m[j/2]; - } - sumf -= dmin * sumi; - - int32_t sumi1 = 0; - int32_t sumi2 = 0; - - for (int j = 0; j < QK_K/64; ++j) { - // Load 64 4-bit weights (32 bytes) - const v128_t q4x0 = wasm_v128_load(q4); - const v128_t q4x1 = wasm_v128_load(q4 + 16); - q4 += 32; - - // Split into low/high nibbles - const v128_t q4l0 = wasm_v128_and(q4x0, wasm_i8x16_splat(0x0F)); - const v128_t q4h0 = wasm_u8x16_shr(q4x0, 4); - const v128_t q4l1 = wasm_v128_and(q4x1, wasm_i8x16_splat(0x0F)); - const v128_t q4h1 = wasm_u8x16_shr(q4x1, 4); - - // Load 64 8-bit values (64 bytes) - const v128_t q8x0 = wasm_v128_load(q8); - const v128_t q8x1 = wasm_v128_load(q8 + 16); - const v128_t q8x2 = wasm_v128_load(q8 + 32); - const v128_t q8x3 = wasm_v128_load(q8 + 48); - q8 += 64; - - // Low nibble products - v128_t vacc1 = wasm_i32x4_dot_i16x8( - wasm_i16x8_extend_low_i8x16(q4l0), - wasm_i16x8_extend_low_i8x16(q8x0) - ); - vacc1 = wasm_i32x4_add(vacc1, wasm_i32x4_dot_i16x8( - wasm_i16x8_extend_high_i8x16(q4l0), - wasm_i16x8_extend_high_i8x16(q8x0) - )); - vacc1 = wasm_i32x4_add(vacc1, wasm_i32x4_dot_i16x8( - wasm_i16x8_extend_low_i8x16(q4l1), - wasm_i16x8_extend_low_i8x16(q8x1) - )); - vacc1 = wasm_i32x4_add(vacc1, wasm_i32x4_dot_i16x8( - wasm_i16x8_extend_high_i8x16(q4l1), - wasm_i16x8_extend_high_i8x16(q8x1) - )); - - // High nibble products - v128_t vacc2 = wasm_i32x4_dot_i16x8( - wasm_i16x8_extend_low_i8x16(q4h0), - wasm_i16x8_extend_low_i8x16(q8x2) - ); - vacc2 = wasm_i32x4_add(vacc2, wasm_i32x4_dot_i16x8( - wasm_i16x8_extend_high_i8x16(q4h0), - wasm_i16x8_extend_high_i8x16(q8x2) - )); - vacc2 = wasm_i32x4_add(vacc2, wasm_i32x4_dot_i16x8( - wasm_i16x8_extend_low_i8x16(q4h1), - wasm_i16x8_extend_low_i8x16(q8x3) - )); - vacc2 = wasm_i32x4_add(vacc2, wasm_i32x4_dot_i16x8( - wasm_i16x8_extend_high_i8x16(q4h1), - wasm_i16x8_extend_high_i8x16(q8x3) - )); - - // Accumulate scaled results - int32_t vacc1_sum = wasm_i32x4_extract_lane(vacc1, 0) + wasm_i32x4_extract_lane(vacc1, 1) + - wasm_i32x4_extract_lane(vacc1, 2) + wasm_i32x4_extract_lane(vacc1, 3); - sumi1 += vacc1_sum * scales[2*j]; - - int32_t vacc2_sum = wasm_i32x4_extract_lane(vacc2, 0) + wasm_i32x4_extract_lane(vacc2, 1) + - wasm_i32x4_extract_lane(vacc2, 2) + wasm_i32x4_extract_lane(vacc2, 3); - sumi2 += vacc2_sum * scales[2*j+1]; - } - - sumf += d * (sumi1 + sumi2); - } - - *s = sumf; - -#elif defined __AVX2__ - - const __m256i m4 = _mm256_set1_epi8(0xF); - - __m256 acc = _mm256_setzero_ps(); - __m128 acc_m = _mm_setzero_ps(); - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - - memcpy(utmp, x[i].scales, 12); - utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); - const uint32_t uaux = utmp[1] & kmask1; - utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); - utmp[2] = uaux; - utmp[0] &= kmask1; - - const uint8_t * GGML_RESTRICT q4 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0])); - - const __m256i q8sums = _mm256_loadu_si256((const __m256i*)y[i].bsums); - const 
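// Why bsums lets the mins be handled up front: each q4_K weight decodes as
// d*sc*q - dmin*mn per group of 32, so the whole dot product splits into
//   sumf = d * sum_j sc[j]*dot(q4_j, q8_j)  -  dmin * sum_j mn[j]*sum(q8_j),
// and sum(q8_j) comes precomputed in bsums. Scalar model of the bias term
// (hypothetical helper; bsums holds 16-wide sums, mins apply per 32):
static int32_t q4k_min_bias(const uint8_t * mn8, const int16_t * bsums) {
    int32_t sumi = 0;
    for (int j = 0; j < 8; ++j) {
        sumi += (int32_t)mn8[j] * (bsums[2*j] + bsums[2*j + 1]);
    }
    return sumi; // caller subtracts dmin * sumi from the accumulator
}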
__m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1)); - const __m128i prod = _mm_madd_epi16(_mm256_extracti128_si256(mins_and_scales, 1), q8s); - acc_m = _mm_fmadd_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod), acc_m); - - const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0); - const __m256i scales = MM256_SET_M128I(sc128, sc128); - - __m256i sumi = _mm256_setzero_si256(); - - for (int j = 0; j < QK_K/64; ++j) { - - const __m256i scale_l = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+0)); - const __m256i scale_h = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+1)); - - const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4); q4 += 32; - const __m256i q4l = _mm256_and_si256(q4bits, m4); - const __m256i q4h = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), m4); - - const __m256i q8l = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - __m256i p16l = _mm256_maddubs_epi16(q4l, q8l); - p16l = _mm256_madd_epi16(scale_l, p16l); - - const __m256i q8h = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - __m256i p16h = _mm256_maddubs_epi16(q4h, q8h); - p16h = _mm256_madd_epi16(scale_h, p16h); - const __m256i sumj = _mm256_add_epi32(p16l, p16h); - - sumi = _mm256_add_epi32(sumi, sumj); - } - - __m256 vd = _mm256_set1_ps(d); - acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi), acc); - - } - - acc_m = _mm_add_ps(acc_m, _mm_movehl_ps(acc_m, acc_m)); - acc_m = _mm_add_ss(acc_m, _mm_movehdup_ps(acc_m)); - - *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m); - -#elif defined __AVX__ - - const __m128i m4 = _mm_set1_epi8(0xF); - const __m128i m2 = _mm_set1_epi8(0x2); - - __m256 acc = _mm256_setzero_ps(); - __m128 acc_m = _mm_setzero_ps(); - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - - const uint8_t * GGML_RESTRICT q4 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - memcpy(utmp, x[i].scales, 12); - utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); - const uint32_t uaux = utmp[1] & kmask1; - utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); - utmp[2] = uaux; - utmp[0] &= kmask1; - - const __m128i utmps = _mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]); - const __m128i scales = _mm_cvtepu8_epi16(utmps); - const __m128i mins = _mm_cvtepu8_epi16(_mm_unpackhi_epi64(utmps, utmps)); - - const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)&y[i].bsums[0]); - const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)&y[i].bsums[8]); - const __m128i q8s = _mm_hadd_epi16(q8sums_0, q8sums_1); - const __m128i prod = _mm_madd_epi16(mins, q8s); - acc_m = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod)), acc_m); - - __m128i sumi_0 = _mm_setzero_si128(); - __m128i sumi_1 = _mm_setzero_si128(); - - __m128i shuffle = _mm_set1_epi16(0x0100); - for (int j = 0; j < QK_K/64; ++j) { - - const __m128i scale_l = _mm_shuffle_epi8(scales, shuffle); - shuffle = _mm_add_epi16(shuffle, m2); - const __m128i scale_h = _mm_shuffle_epi8(scales, shuffle); - shuffle = _mm_add_epi16(shuffle, m2); - - __m128i q4bits = _mm_loadu_si128((const __m128i*)q4); q4 += 16; - const __m128i q4l_0 = _mm_and_si128(q4bits, m4); - const __m128i q4h_0 = _mm_and_si128(_mm_srli_epi16(q4bits, 4), m4); - q4bits = _mm_loadu_si128((const __m128i*)q4); q4 += 16; - const __m128i q4l_1 = _mm_and_si128(q4bits, m4); - const __m128i q4h_1 = _mm_and_si128(_mm_srli_epi16(q4bits, 4), m4); - - const __m128i q8l_0 = 
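// The AVX2/SSE q4_K inner loops split each byte into two unsigned nibbles, so
// maddubs needs no sign fixup (q4 weights are 0..15 before scaling). Scalar
// model of one 32-byte chunk, covering a 64-weight group (hypothetical helper):
static void q4k_split_nibbles(const uint8_t * q4, uint8_t * lo, uint8_t * hi) {
    for (int k = 0; k < 32; ++k) {
        lo[k] = q4[k] & 0x0F; // weights 0..31 of the group
        hi[k] = q4[k] >> 4;   // weights 32..63 of the group
    }
}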
_mm_loadu_si128((const __m128i*)q8); q8 += 16; - __m128i p16l = _mm_maddubs_epi16(q4l_0, q8l_0); - p16l = _mm_madd_epi16(scale_l, p16l); - sumi_0 = _mm_add_epi32(sumi_0, p16l); - const __m128i q8l_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - p16l = _mm_maddubs_epi16(q4l_1, q8l_1); - p16l = _mm_madd_epi16(scale_l, p16l); - sumi_1 = _mm_add_epi32(sumi_1, p16l); - - const __m128i q8h_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - __m128i p16h = _mm_maddubs_epi16(q4h_0, q8h_0); - p16h = _mm_madd_epi16(scale_h, p16h); - sumi_0 = _mm_add_epi32(sumi_0, p16h); - const __m128i q8h_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - p16h = _mm_maddubs_epi16(q4h_1, q8h_1); - p16h = _mm_madd_epi16(scale_h, p16h); - sumi_1 = _mm_add_epi32(sumi_1, p16h); - - } - - __m256 vd = _mm256_set1_ps(d); - __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); - acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc); - - } - - acc_m = _mm_add_ps(acc_m, _mm_movehl_ps(acc_m, acc_m)); - acc_m = _mm_add_ss(acc_m, _mm_movehdup_ps(acc_m)); - - *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m); - -#elif defined __riscv_xtheadvector - - const uint8_t * scales = (const uint8_t*)&utmp[0]; - const uint8_t * mins = (const uint8_t*)&utmp[2]; - - float sumf = 0; - - for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - - int tmp, tmp2, sumi; - __asm__ __volatile__( - "li %[t1], 12\n\t" - "th.vsetvli zero, %[t1], e8, m1\n\t" - "th.vlb.v v1, (%[s6b])\n\t" // {aux[0], aux[1], aux[2]} - "li %[t1], 4\n\t" - "th.vsetvli zero, %[t1], e32, m1\n\t" - "th.vslidedown.vi v2, v1, 2\n\t" - "th.vmv.v.v v3, v2\n\t" - "th.vslideup.vi v2, v3, 1\n\t" // {aux[2], aux[2]} - "li %[t1], 2\n\t" - "th.vsetvli zero, %[t1], e32, m1\n\t" - "th.vmv.v.i v4, 4\n\t" - "th.vand.vx v8, v1, %[kmask1]\n\t" - "th.vslide1up.vx v5, v4, zero\n\t" // {0, 4} - "th.vsrl.vi v6, v1, 6\n\t" - "th.vsrl.vv v7, v2, v5\n\t" - "th.vand.vx v0, v6, %[kmask3]\n\t" - "th.vand.vx v2, v7, %[kmask2]\n\t" - "th.vsll.vi v6, v0, 4\n\t" - "li %[t2], 8\n\t" - "addi %[t1], %[utmp], 4\n\t" - "th.vor.vv v1, v6, v2\n\t" - "th.vssw.v v8, (%[utmp]), %[t2]\n\t" - "th.vssw.v v1, (%[t1]), %[t2]\n\t" - "th.vsetvli zero, zero, e32, m2\n\t" // vl == 8 - "th.vlw.v v2, (%[bsums])\n\t" - "th.vsetvli zero, %[t2], e16, m1\n\t" - "th.vnsrl.vi v0, v2, 0\n\t" - "th.vnsrl.vi v1, v2, 16\n\t" - "th.vadd.vv v2, v0, v1\n\t" - "th.vlbu.v v4, (%[mins])\n\t" - "th.vwmul.vv v6, v4, v2\n\t" - "th.vmv.v.x v0, zero\n\t" - "th.vsetvli zero, %[t2], e32, m2\n\t" - "th.vredsum.vs v0, v6, v0\n\t" - "th.vmv.x.s %[sumi], v0" - : [t1] "=&r" (tmp), [t2] "=&r" (tmp2), [sumi] "=&r" (sumi) - : [bsums] "r" (y[i].bsums), [mins] "r" (mins), [utmp] "r" (utmp) - , [s6b] "r" (x[i].scales), [kmask1] "r" (kmask1) - , [kmask2] "r" (kmask2), [kmask3] "r" (kmask3) - : "memory" - , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" - , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" - , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); - sumf -= dmin * sumi; - - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - - sumi = 0; - const uint8_t * scale = scales; - - for (int j = 0; j < QK_K/128; ++j) { - int vl128 = 128, vl64 = 64, vl32 = 32; - __asm__ __volatile__( - "th.vsetvli zero, %[vl128], e8, m8\n\t" - "th.vlb.v v8, (%[q8])\n\t" - "th.vsetvli zero, %[vl64], e8, m4\n\t" - "th.vlb.v v0, (%[q4])\n\t" - "th.vsrl.vi v4, v0, 4\n\t" - 
"th.vand.vi v0, v0, 0xF\n\t" - "th.vsetvli zero, %[vl32], e8, m2\n\t" - "th.vwmul.vv v28, v6, v14\n\t" - "th.vwmul.vv v20, v4, v10\n\t" - "th.vwmul.vv v24, v2, v12\n\t" - "th.vwmul.vv v16, v0, v8\n\t" - "li %[tmp], 4\n\t" - "th.vsetvli zero, %[tmp], e32, m1\n\t" - "th.vlbu.v v1, (%[scale])\n\t" - "th.vmv.v.x v0, zero\n\t" - "th.vsetvli zero, %[vl32], e16, m4\n\t" - "th.vwredsum.vs v6, v24, v0\n\t" - "th.vwredsum.vs v7, v28, v0\n\t" - "th.vwredsum.vs v4, v16, v0\n\t" - "th.vwredsum.vs v5, v20, v0\n\t" - "th.vsetvli zero, %[tmp], e32, m1\n\t" - "th.vslideup.vi v6, v7, 1\n\t" - "th.vslideup.vi v4, v5, 1\n\t" - "th.vslideup.vi v4, v6, 2\n\t" - "th.vmul.vv v8, v4, v1\n\t" - "th.vredsum.vs v0, v8, v0\n\t" - "th.vmv.x.s %[tmp], v0\n\t" - "add %[sumi], %[sumi], %[tmp]" - : [tmp] "=&r" (tmp), [sumi] "+&r" (sumi) - : [vl128] "r" (vl128), [vl64] "r" (vl64), [vl32] "r" (vl32) - , [q4] "r" (q4), [q8] "r" (q8), [scale] "r" (scale) - : "memory" - , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" - , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" - , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); - - q4 += 64; q8 += 128; scale += 4; - } - - sumf += d * sumi; - - } - - *s = sumf; - -#elif defined __riscv_v - - const uint8_t * scales = (const uint8_t*)&utmp[0]; - const uint8_t * mins = (const uint8_t*)&utmp[2]; - - float sumf = 0; - const int vector_length = __riscv_vlenb() * 8; - - switch (vector_length) { - case 256: - for (int i = 0; i < nb; ++i) { - - size_t vl = 8; - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - - vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl); - vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl); - vint16mf2_t q8sums = __riscv_vadd_vv_i16mf2(q8sums_0, q8sums_1, vl); - - memcpy(utmp, x[i].scales, 12); - utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); - const uint32_t uaux = utmp[1] & kmask1; - utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); - utmp[2] = uaux; - utmp[0] &= kmask1; - - vuint8mf4_t mins8 = __riscv_vle8_v_u8mf4(mins, vl); - vint16mf2_t v_mins = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vzext_vf2_u16mf2(mins8, vl)); - vint32m1_t prod = __riscv_vwmul_vv_i32m1(q8sums, v_mins, vl); - - vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl); - sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi); - - const uint8_t * GGML_RESTRICT q4 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - vl = 32; - - int32_t sum_1 = 0; - int32_t sum_2 = 0; - - vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1); - - for (int j = 0; j < QK_K/64; ++j) { - // load Q4 - vuint8m1_t q4_x = __riscv_vle8_v_u8m1(q4, vl); - - // load Q8 and multiply it with lower Q4 nibble - vint8m1_t q8_0 = __riscv_vle8_v_i8m1(q8, vl); - vint8m1_t q4_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q4_x, 0x0F, vl)); - vint16m2_t qv_0 = __riscv_vwmul_vv_i16m2(q4_0, q8_0, vl); - vint16m1_t vs_0 = __riscv_vredsum_vs_i16m2_i16m1(qv_0, vzero, vl); - - sum_1 += __riscv_vmv_x_s_i16m1_i16(vs_0) * scales[2*j+0]; - - // load Q8 and multiply it with upper Q4 nibble - vint8m1_t q8_1 = __riscv_vle8_v_i8m1(q8+32, vl); - vint8m1_t q4_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q4_x, 0x04, vl)); - vint16m2_t qv_1 = __riscv_vwmul_vv_i16m2(q4_1, q8_1, vl); - vint16m1_t vs_1 = __riscv_vredsum_vs_i16m2_i16m1(qv_1, vzero, vl); - - sum_2 += __riscv_vmv_x_s_i16m1_i16(vs_1) 
* scales[2*j+1]; - - q4 += 32; q8 += 64; - - } - - sumf += d*(sum_1 + sum_2); - - } - break; - case 128: - for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - - int tmp, tmp2, sumi; - __asm__ __volatile__( - "vsetivli zero, 12, e8, m1\n\t" - "vle8.v v1, (%[s6b])\n\t" // {aux[0], aux[1], aux[2]} - "vsetivli zero, 4, e32, m1\n\t" - "vslidedown.vi v2, v1, 2\n\t" - "vmv1r.v v3, v2\n\t" - "vslideup.vi v2, v3, 1\n\t" // {aux[2], aux[2]} - "vsetivli zero, 2, e32, m1\n\t" - "vmv.v.i v4, 4\n\t" - "vand.vx v8, v1, %[kmask1]\n\t" - "vslide1up.vx v5, v4, zero\n\t" // {0, 4} - "vsrl.vi v6, v1, 6\n\t" - "vsrl.vv v7, v2, v5\n\t" - "vand.vx v0, v6, %[kmask3]\n\t" - "vand.vx v2, v7, %[kmask2]\n\t" - "vsll.vi v6, v0, 4\n\t" - "li %[t2], 8\n\t" - "addi %[t1], %[utmp], 4\n\t" - "vor.vv v1, v6, v2\n\t" - "vsse32.v v8, (%[utmp]), %[t2]\n\t" - "vsse32.v v1, (%[t1]), %[t2]\n\t" - "vsetivli zero, 8, e16, m1\n\t" - "vle32.v v2, (%[bsums])\n\t" - "vnsrl.wi v0, v2, 0\n\t" - "vnsrl.wi v1, v2, 16\n\t" - "vadd.vv v2, v0, v1\n\t" - "vle8.v v3, (%[mins])\n\t" - "vzext.vf2 v4, v3\n\t" - "vwmul.vv v6, v4, v2\n\t" - "vmv.v.x v0, zero\n\t" - "vsetivli zero, 8, e32, m2\n\t" - "vredsum.vs v0, v6, v0\n\t" - "vmv.x.s %[sumi], v0" - : [t1] "=&r" (tmp), [t2] "=&r" (tmp2), [sumi] "=&r" (sumi) - : [bsums] "r" (y[i].bsums), [mins] "r" (mins), [utmp] "r" (utmp) - , [s6b] "r" (x[i].scales), [kmask1] "r" (kmask1) - , [kmask2] "r" (kmask2), [kmask3] "r" (kmask3) - : "memory" - , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" - , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" - , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); - sumf -= dmin * sumi; - - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - - sumi = 0; - const uint8_t * scale = scales; - - for (int j = 0; j < QK_K/128; ++j) { - int vl128 = 128, vl64 = 64, vl32 = 32; - __asm__ __volatile__( - "vsetvli zero, %[vl128], e8, m8\n\t" - "vle8.v v8, (%[q8])\n\t" - "vsetvli zero, %[vl64], e8, m4\n\t" - "vle8.v v0, (%[q4])\n\t" - "vsrl.vi v4, v0, 4\n\t" - "vand.vi v0, v0, 0xF\n\t" - "vsetvli zero, %[vl32], e8, m2\n\t" - "vwmul.vv v28, v6, v14\n\t" - "vwmul.vv v20, v4, v10\n\t" - "vwmul.vv v24, v2, v12\n\t" - "vwmul.vv v16, v0, v8\n\t" - "vsetivli zero, 4, e32, m1\n\t" - "vle8.v v2, (%[scale])\n\t" - "vmv.v.x v0, zero\n\t" - "vzext.vf4 v1, v2\n\t" - "vsetvli zero, %[vl32], e16, m4\n\t" - "vwredsum.vs v6, v24, v0\n\t" - "vwredsum.vs v7, v28, v0\n\t" - "vwredsum.vs v4, v16, v0\n\t" - "vwredsum.vs v5, v20, v0\n\t" - "vsetivli zero, 4, e32, m1\n\t" - "vslideup.vi v6, v7, 1\n\t" - "vslideup.vi v4, v5, 1\n\t" - "vslideup.vi v4, v6, 2\n\t" - "vmul.vv v8, v4, v1\n\t" - "vredsum.vs v0, v8, v0\n\t" - "vmv.x.s %[tmp], v0\n\t" - "add %[sumi], %[sumi], %[tmp]" - : [tmp] "=&r" (tmp), [sumi] "+&r" (sumi) - : [vl128] "r" (vl128), [vl64] "r" (vl64), [vl32] "r" (vl32) - , [q4] "r" (q4), [q8] "r" (q8), [scale] "r" (scale) - : "memory" - , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" - , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" - , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); - - q4 += 64; q8 += 128; scale += 4; - } - - sumf += d * sumi; - } - break; - default: - assert(false && "Unsupported vector length"); - break; - } - - *s = sumf; - -#elif defined(__POWER9_VECTOR__) - const vector signed char lowMask = 
vec_splats((signed char)0xF); - const vector signed char lowMask1 = vec_splats((int8_t)0x3f); - const vector signed char lowMask2 = vec_splats((int8_t)0x30); - const vector int v0 = vec_splats((int32_t)0); - const vector unsigned char v2 = vec_splats((uint8_t)2); - const vector unsigned char v4 = vec_splats((unsigned char)0x4); - - vector float vsumf0 = vec_splats(0.0f); - vector float vsumf1 = vec_splats(0.0f); - vector float vsumf2 = vec_splats(0.0f); - vector float vsumf3 = vec_splats(0.0f); - - for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); - vector float vyd = vec_splats(y[i].d); - vector float vd = vec_mul(vxd, vyd); - - vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin)); - vector float vdmin = vec_mul(vxmin, vyd); - - vector signed short q8ysums0 = vec_xl( 0, y[i].bsums); - vector signed short q8ysums1 = vec_xl(16, y[i].bsums); - - UNUSED(kmask1); - UNUSED(kmask2); - UNUSED(kmask3); - UNUSED(utmp); - - vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8); - vector signed char u1 = vec_and(vec_sr(u0, v2), lowMask2); - vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4); - vector signed char u3 = vec_sr(u2, v4); - - vector signed char u30 = u1; - vector signed char u31 = (vector signed char)vec_mergeh((vector signed int)vec_and(u2, lowMask), (vector signed int)u3); - - u1 = vec_and(u0, lowMask1); - u2 = vec_or(u30, u31); - - vector signed char utmps = (vector signed char)vec_mergeh((vector signed int)u1, (vector signed int)u2); - - vector signed short vscales = vec_unpackh(utmps); - vector signed short q4xmins = vec_unpackl(utmps); - vector signed short q4xmins0 = vec_mergeh(q4xmins, q4xmins); - vector signed short q4xmins1 = vec_mergel(q4xmins, q4xmins); - - vector signed int prod0 = vec_mule(q4xmins0, q8ysums0); - vector signed int prod1 = vec_mule(q4xmins1, q8ysums1); - vector signed int prod2 = vec_mulo(q4xmins0, q8ysums0); - vector signed int prod3 = vec_mulo(q4xmins1, q8ysums1); - - vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0); - vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1); - vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2); - vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3); - - vector signed int vsumi0 = v0; - vector signed int vsumi1 = v0; - vector signed int vsumi2 = v0; - vector signed int vsumi3 = v0; - - const uint8_t * GGML_RESTRICT q4 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - for (int j = 0; j < QK_K/64; j+=2) { - __builtin_prefetch(q4, 0, 1); - __builtin_prefetch(q8, 0, 1); - - vector signed char qxs0 = (vector signed char)vec_xl( 0, q4); - vector signed char qxs1 = (vector signed char)vec_xl(16, q4); - vector signed char qxs2 = (vector signed char)vec_xl(32, q4); - vector signed char qxs3 = (vector signed char)vec_xl(48, q4); - q4 += 64; - - vector unsigned char q4x00 = (vector unsigned char)vec_and(qxs0, lowMask); - vector unsigned char q4x01 = (vector unsigned char)vec_sr(qxs0, v4); - vector unsigned char q4x10 = (vector unsigned char)vec_and(qxs1, lowMask); - vector unsigned char q4x11 = (vector unsigned char)vec_sr(qxs1, v4); - vector unsigned char q4x20 = (vector unsigned char)vec_and(qxs2, lowMask); - vector unsigned char q4x21 = (vector unsigned char)vec_sr(qxs2, v4); - vector unsigned char q4x30 = (vector unsigned char)vec_and(qxs3, lowMask); - vector unsigned char q4x31 = (vector unsigned char)vec_sr(qxs3, v4); - - vector signed char q8y00 = vec_xl( 0, q8); - vector signed char q8y10 = vec_xl( 16, q8); - vector signed char 
q8y01 = vec_xl( 32, q8); - vector signed char q8y11 = vec_xl( 48, q8); - vector signed char q8y20 = vec_xl( 64, q8); - vector signed char q8y30 = vec_xl( 80, q8); - vector signed char q8y21 = vec_xl( 96, q8); - vector signed char q8y31 = vec_xl(112, q8); - q8 += 128; - - vector signed int qv00 = vec_msum(q8y00, q4x00, v0); - vector signed int qv01 = vec_msum(q8y01, q4x01, v0); - vector signed int qv10 = vec_msum(q8y10, q4x10, v0); - vector signed int qv11 = vec_msum(q8y11, q4x11, v0); - vector signed int qv20 = vec_msum(q8y20, q4x20, v0); - vector signed int qv21 = vec_msum(q8y21, q4x21, v0); - vector signed int qv30 = vec_msum(q8y30, q4x30, v0); - vector signed int qv31 = vec_msum(q8y31, q4x31, v0); - - vector signed int vscales_h = vec_unpackh(vscales); - vector signed int vs0 = vec_splat(vscales_h, 0); - vector signed int vs1 = vec_splat(vscales_h, 1); - vector signed int vs2 = vec_splat(vscales_h, 2); - vector signed int vs3 = vec_splat(vscales_h, 3); - vscales = vec_sld(vscales, vscales, 8); - - vsumi0 = vec_add(vec_mul(qv00, vs0), vsumi0); - vsumi1 = vec_add(vec_mul(qv01, vs1), vsumi1); - vsumi2 = vec_add(vec_mul(qv20, vs2), vsumi2); - vsumi3 = vec_add(vec_mul(qv21, vs3), vsumi3); - - vsumi0 = vec_add(vec_mul(qv10, vs0), vsumi0); - vsumi1 = vec_add(vec_mul(qv11, vs1), vsumi1); - vsumi2 = vec_add(vec_mul(qv30, vs2), vsumi2); - vsumi3 = vec_add(vec_mul(qv31, vs3), vsumi3); - } - - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); - vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); - vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); - vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); - } - - vsumf0 = vec_add(vsumf0, vsumf2); - vsumf1 = vec_add(vsumf1, vsumf3); - - vsumf0 = vec_add(vsumf0, vsumf1); - - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - - *s = vec_extract(vsumf0, 0); - -#elif defined __loongarch_asx - - __m256 acc = (__m256)__lasx_xvldi(0); - __m128 acc_m = (__m128)__lsx_vldi(0); - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - - memcpy(utmp, x[i].scales, 12); - utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); - const uint32_t uaux = utmp[1] & kmask1; - utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); - utmp[2] = uaux; - utmp[0] &= kmask1; - - const uint8_t * GGML_RESTRICT q4 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - const __m128i mins_and_scales128 = lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0]); - const __m128i mins128 = __lsx_vexth_h_b(mins_and_scales128); - const __m128i scales128 = __lsx_vsllwil_h_b(mins_and_scales128, 0); - - const __m256i q8sums = __lasx_xvld((const __m256i*)y[i].bsums, 0); - const __m128i q8s = lsx_hadd_h(lasx_extracti128(q8sums, 0), lasx_extracti128(q8sums, 1)); - const __m128i prod = lsx_madd_h(mins128, q8s); - acc_m = __lsx_vfmadd_s(__lsx_vreplfr2vr_s(dmin), __lsx_vffint_s_w(prod), acc_m); - - const __m256i scales = lasx_insertf128(scales128, scales128); - - __m256i sumi = __lasx_xvldi(0); - - for (int j = 0; j < QK_K/64; ++j) { - - const __m256i scale_l = lasx_xvrepl128vei_h(scales, 2 * j + 0); - const __m256i scale_h = lasx_xvrepl128vei_h(scales, 2 * j + 1); - - const __m256i q4bits = __lasx_xvld((const __m256i*)q4, 0); q4 += 32; - const __m256i q4l = __lasx_xvandi_b(q4bits, 0xf); - const __m256i q4h = __lasx_xvsrli_b(q4bits, 4); - - const __m256i q8l = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - __m256i p16l = 
lasx_madd_h_b(q4l, q8l); - p16l = lasx_madd_h(scale_l, p16l); - - const __m256i q8h = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - __m256i p16h = lasx_madd_h_b(q4h, q8h); - p16h = lasx_madd_h(scale_h, p16h); - const __m256i sumj = __lasx_xvadd_w(p16l, p16h); - - sumi = __lasx_xvadd_w(sumi, sumj); - } - - __m256 vd = __lasx_xvreplfr2vr_s(d); - acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(sumi), acc); - - } - - acc_m = __lsx_vfadd_s(acc_m, (__m128)__lsx_vpermi_w((__m128i)acc_m, (__m128i)acc_m, 0xee)); - __m128i tmp1 = __lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w((__m128i)acc_m, 1), 0); - acc_m = __lsx_vfadd_s(acc_m, (__m128)tmp1); - - - *s = hsum_float_8(acc) + ((v4f32)acc_m)[0]; -#elif defined(__VXE__) || defined(__VXE2__) - const uint8x16_t v_lm = vec_splat_u8(0x0F); - const int32x4_t v_z = vec_splat_s32(0); - - uint8x16_t v_x[2]; - int8x16_t v_xl[2]; - int8x16_t v_y[2]; - - float sumf = 0; - - for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - - const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums); - const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums); - const int16x8_t v_ysums = vec_padd_s16(v_ysumsl, v_ysumsh); - - memcpy(utmp, x[i].scales, 12); - - uint32x4_t v_mins8 = { 0 }; - v_mins8 = vec_insert(utmp[1] & kmask1, v_mins8, 0); - v_mins8 = vec_insert(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), v_mins8, 1); - - utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); - utmp[0] &= kmask1; - - const int16x8_t v_minsh = (int16x8_t)vec_unpackh((uint8x16_t)v_mins8); - - const int32x4_t v_minso = vec_mulo(v_ysums, v_minsh); - const int32x4_t v_minse = vec_mule(v_ysums, v_minsh); - const int32x4_t v_mins = v_minso + v_minse; - sumf -= dmin * (v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3]); - - const uint8_t * scales = (const uint8_t *)utmp; - const uint8_t * GGML_RESTRICT x0 = x[i].qs; - const int8_t * GGML_RESTRICT y0 = y[i].qs; - - int32_t sumi1 = 0; - int32_t sumi2 = 0; - - for (int j = 0; j < QK_K/64; ++j) { - v_x[0] = vec_xl(0 , x0); - v_x[1] = vec_xl(16, x0); - x0 += 32; - - v_y[0] = vec_xl(0 , y0); - v_y[1] = vec_xl(16, y0); - y0 += 32; - - v_xl[0] = (int8x16_t)vec_and(v_x[0], v_lm); - v_xl[1] = (int8x16_t)vec_and(v_x[1], v_lm); - - const int32x4_t p1 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]); - sumi1 += (p1[0] + p1[1] + p1[2] + p1[3]) * scales[2*j+0]; - - v_y[0] = vec_xl(0 , y0); - v_y[1] = vec_xl(16, y0); - y0 += 32; - - v_xl[0] = (int8x16_t)vec_sr(v_x[0], 4); - v_xl[1] = (int8x16_t)vec_sr(v_x[1], 4); - - const int32x4_t p2 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]); - sumi2 += (p2[0] + p2[1] + p2[2] + p2[3]) * scales[2*j+1]; - } - - sumf += d * (sumi1 + sumi2); - } - - *s = sumf; -#else - - const uint8_t * scales = (const uint8_t*)&utmp[0]; - const uint8_t * mins = (const uint8_t*)&utmp[2]; - - int8_t aux8[QK_K]; - int16_t aux16[8]; - float sums [8]; - int32_t aux32[8]; - memset(sums, 0, 8*sizeof(float)); - - float sumf = 0; - for (int i = 0; i < nb; ++i) { - const uint8_t * GGML_RESTRICT q4 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * GGML_RESTRICT a = aux8; - for (int j = 0; j < QK_K/64; ++j) { - for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); - a += 32; - for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); - a += 32; q4 += 32; - } - memcpy(utmp, x[i].scales, 12); - utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & 
kmask3) << 4); - const uint32_t uaux = utmp[1] & kmask1; - utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); - utmp[2] = uaux; - utmp[0] &= kmask1; - - int sumi = 0; - for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; - a = aux8; - int is = 0; - for (int j = 0; j < QK_K/32; ++j) { - int32_t scale = scales[is++]; - for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; - q8 += 8; a += 8; - for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; - q8 += 8; a += 8; - for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; - q8 += 8; a += 8; - for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; - q8 += 8; a += 8; - } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; - sumf -= dmin * sumi; - } - for (int l = 0; l < 8; ++l) sumf += sums[l]; - *s = sumf; -#endif -} - -void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - assert(n % QK_K == 0); - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_q5_K * GGML_RESTRICT x = vx; - const block_q8_K * GGML_RESTRICT y = vy; - - const int nb = n / QK_K; - - static const uint32_t kmask1 = 0x3f3f3f3f; - static const uint32_t kmask2 = 0x0f0f0f0f; - static const uint32_t kmask3 = 0x03030303; - - uint32_t utmp[4]; - -#ifdef __ARM_NEON - const uint8x16_t m4b = vdupq_n_u8(0xf); - const uint8x16_t mone = vdupq_n_u8(1); - const uint8x16_t mtwo = vdupq_n_u8(2); - const int32x4_t mzero = vdupq_n_s32(0); - - ggml_int8x16x4_t q5bytes; - - float sumf = 0; - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - - const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8)); - - memcpy(utmp, x[i].scales, 12); - utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); - const uint32_t uaux = utmp[1] & kmask1; - utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); - utmp[2] = uaux; - utmp[0] &= kmask1; - - const uint8x8_t mins8 = vld1_u8((const uint8_t*)utmp + 8); - const int16x8_t mins = vreinterpretq_s16_u16(vmovl_u8(mins8)); - const int32x4_t prod = vaddq_s32(vmull_s16(vget_low_s16 (q8sums), vget_low_s16 (mins)), - vmull_s16(vget_high_s16(q8sums), vget_high_s16(mins))); - int32_t sumi_mins = vaddvq_s32(prod); - - const uint8_t * scales = (const uint8_t *)utmp; - - const uint8_t * GGML_RESTRICT q5 = x[i].qs; - const uint8_t * GGML_RESTRICT qh = x[i].qh; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); - - ggml_uint8x16x4_t q5h; - - int32_t sumi = 0; - - for (int j = 0; j < QK_K/64; ++j) { - - const ggml_uint8x16x2_t q5bits = ggml_vld1q_u8_x2(q5); q5 += 32; - const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64; - - q5h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4); - q5h.val[1] = vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4); - q5h.val[2] = vshlq_n_u8(vandq_u8(mtwo, qhbits.val[0]), 3); - q5h.val[3] = vshlq_n_u8(vandq_u8(mtwo, qhbits.val[1]), 3); - qhbits.val[0] = vshrq_n_u8(qhbits.val[0], 2); - qhbits.val[1] = 
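// ---------------------------------------------------------------------------
// [editor's note, not part of the diff] Every backend branch of
// ggml_vec_dot_q4_K_q8_K above repeats the same 12-byte scale/min unpacking
// through kmask1/kmask2/kmask3. A minimal scalar sketch of that step, with
// the masks and shifts copied verbatim from the kernels; the helper name
// unpack_q4K_scales_mins is hypothetical.
#include <stdint.h>
#include <string.h>

static void unpack_q4K_scales_mins(uint32_t utmp[4], const uint8_t packed[12]) {
    const uint32_t kmask1 = 0x3f3f3f3f;
    const uint32_t kmask2 = 0x0f0f0f0f;
    const uint32_t kmask3 = 0x03030303;

    memcpy(utmp, packed, 12);
    utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
    const uint32_t uaux = utmp[1] & kmask1;
    utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
    utmp[2] = uaux;
    utmp[0] &= kmask1;
    // afterwards ((const uint8_t *)utmp)[0..7] hold the eight 6-bit
    // sub-block scales and [8..15] the eight 6-bit sub-block mins
}
// ---------------------------------------------------------------------------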
-void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
-    assert(n % QK_K == 0);
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_q5_K * GGML_RESTRICT x = vx;
-    const block_q8_K * GGML_RESTRICT y = vy;
-
-    const int nb = n / QK_K;
-
-    static const uint32_t kmask1 = 0x3f3f3f3f;
-    static const uint32_t kmask2 = 0x0f0f0f0f;
-    static const uint32_t kmask3 = 0x03030303;
-
-    uint32_t utmp[4];
-
-#ifdef __ARM_NEON
-    const uint8x16_t m4b = vdupq_n_u8(0xf);
-    const uint8x16_t mone = vdupq_n_u8(1);
-    const uint8x16_t mtwo = vdupq_n_u8(2);
-    const int32x4_t mzero = vdupq_n_s32(0);
-
-    ggml_int8x16x4_t q5bytes;
-
-    float sumf = 0;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8));
-
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        const uint8x8_t mins8 = vld1_u8((const uint8_t*)utmp + 8);
-        const int16x8_t mins = vreinterpretq_s16_u16(vmovl_u8(mins8));
-        const int32x4_t prod = vaddq_s32(vmull_s16(vget_low_s16 (q8sums), vget_low_s16 (mins)),
-                                         vmull_s16(vget_high_s16(q8sums), vget_high_s16(mins)));
-        int32_t sumi_mins = vaddvq_s32(prod);
-
-        const uint8_t * scales = (const uint8_t *)utmp;
-
-        const uint8_t * GGML_RESTRICT q5 = x[i].qs;
-        const uint8_t * GGML_RESTRICT qh = x[i].qh;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-
-        ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);
-
-        ggml_uint8x16x4_t q5h;
-
-        int32_t sumi = 0;
-
-        for (int j = 0; j < QK_K/64; ++j) {
-
-            const ggml_uint8x16x2_t q5bits = ggml_vld1q_u8_x2(q5); q5 += 32;
-            const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
-
-            q5h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4);
-            q5h.val[1] = vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4);
-            q5h.val[2] = vshlq_n_u8(vandq_u8(mtwo, qhbits.val[0]), 3);
-            q5h.val[3] = vshlq_n_u8(vandq_u8(mtwo, qhbits.val[1]), 3);
-            qhbits.val[0] = vshrq_n_u8(qhbits.val[0], 2);
-            qhbits.val[1] = vshrq_n_u8(qhbits.val[1], 2);
-
-            q5bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q5bits.val[0], m4b), q5h.val[0]));
-            q5bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q5bits.val[1], m4b), q5h.val[1]));
-            q5bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[0], 4), q5h.val[2]));
-            q5bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[1], 4), q5h.val[3]));
-
-            sumi += vaddvq_s32(ggml_vdotq_s32(ggml_vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]), q5bytes.val[1], q8bytes.val[1])) * *scales++;
-            sumi += vaddvq_s32(ggml_vdotq_s32(ggml_vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]), q5bytes.val[3], q8bytes.val[3])) * *scales++;
-        }
-
-        sumf += d * sumi - dmin * sumi_mins;
-    }
-
-    *s = sumf;
-
-#elif defined __AVX2__
-
-    const __m256i m4 = _mm256_set1_epi8(0xF);
-    const __m128i mzero = _mm_setzero_si128();
-    const __m256i mone = _mm256_set1_epi8(1);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    float summs = 0.f;
-
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q5 = x[i].qs;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]));
-
-        const __m256i q8sums = _mm256_loadu_si256((const __m256i*)y[i].bsums);
-        const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1));
-        const __m128i prod = _mm_madd_epi16(_mm256_extracti128_si256(mins_and_scales, 1), q8s);
-        const __m128i hsum = _mm_hadd_epi32(_mm_hadd_epi32(prod, mzero), mzero);
-        summs += dmin * _mm_extract_epi32(hsum, 0);
-
-        const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
-        const __m256i scales = MM256_SET_M128I(sc128, sc128);
-
-        const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].qh);
-        __m256i hmask = mone;
-
-        __m256i sumi = _mm256_setzero_si256();
-
-        int bit = 0;
-
-        for (int j = 0; j < QK_K/64; ++j) {
-
-            const __m256i scale_0 = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+0));
-            const __m256i scale_1 = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+1));
-
-            const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5); q5 += 32;
-
-            const __m256i q5l_0 = _mm256_and_si256(q5bits, m4);
-            const __m256i q5h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), bit++), 4);
-            const __m256i q5_0 = _mm256_add_epi8(q5l_0, q5h_0);
-            hmask = _mm256_slli_epi16(hmask, 1);
-
-            const __m256i q5l_1 = _mm256_and_si256(_mm256_srli_epi16(q5bits, 4), m4);
-            const __m256i q5h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), bit++), 4);
-            const __m256i q5_1 = _mm256_add_epi8(q5l_1, q5h_1);
-            hmask = _mm256_slli_epi16(hmask, 1);
-
-            const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-
-            __m256i p16_0 = _mm256_maddubs_epi16(q5_0, q8_0);
-            __m256i p16_1 = _mm256_maddubs_epi16(q5_1, q8_1);
-
-            p16_0 = _mm256_madd_epi16(scale_0, p16_0);
-            p16_1 = _mm256_madd_epi16(scale_1, p16_1);
-
-            sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1));
-
-        }
-
-        __m256 vd = _mm256_set1_ps(d);
-        acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi), acc);
-
-    }
-
-    *s = hsum_float_8(acc) + summs;
-
-#elif defined __AVX__
-
-    const __m128i m4 = _mm_set1_epi8(0xF);
-    const __m128i mzero = _mm_setzero_si128();
-    const __m128i mone = _mm_set1_epi8(1);
-    const __m128i m2 = _mm_set1_epi8(2);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    float summs = 0.f;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        const uint8_t * GGML_RESTRICT q5 = x[i].qs;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        const __m128i utmps = _mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]);
-        const __m128i scales = _mm_cvtepu8_epi16(utmps);
-        const __m128i mins = _mm_cvtepu8_epi16(_mm_unpackhi_epi64(utmps, utmps));
-
-        const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)&y[i].bsums[0]);
-        const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)&y[i].bsums[8]);
-        const __m128i q8s = _mm_hadd_epi16(q8sums_0, q8sums_1);
-        const __m128i prod = _mm_madd_epi16(mins, q8s);
-        const __m128i hsum = _mm_hadd_epi32(_mm_hadd_epi32(prod, mzero), mzero);
-        summs += dmin * _mm_extract_epi32(hsum, 0);
-
-        const __m128i hbits_0 = _mm_loadu_si128((const __m128i*)&x[i].qh[0]);
-        const __m128i hbits_1 = _mm_loadu_si128((const __m128i*)&x[i].qh[16]);
-        __m128i hmask = mone;
-
-        __m128i sumi_0 = _mm_setzero_si128();
-        __m128i sumi_1 = _mm_setzero_si128();
-
-        int bit = 0;
-
-        __m128i shuffle = _mm_set1_epi16(0x0100);
-        for (int j = 0; j < QK_K/64; ++j) {
-
-            const __m128i scale_0 = _mm_shuffle_epi8(scales, shuffle);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            const __m128i scale_1 = _mm_shuffle_epi8(scales, shuffle);
-            shuffle = _mm_add_epi16(shuffle, m2);
-
-            const __m128i q5bits_0 = _mm_loadu_si128((const __m128i*)q5); q5 += 16;
-            const __m128i q5bits_1 = _mm_loadu_si128((const __m128i*)q5); q5 += 16;
-
-            __m128i q5l_0 = _mm_and_si128(q5bits_0, m4);
-            __m128i q5l_1 = _mm_and_si128(q5bits_1, m4);
-            __m128i q5h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_0, hmask), bit), 4);
-            __m128i q5h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_1, hmask), bit++), 4);
-            __m128i q5_0 = _mm_add_epi8(q5l_0, q5h_0);
-            __m128i q5_1 = _mm_add_epi8(q5l_1, q5h_1);
-            hmask = _mm_slli_epi16(hmask, 1);
-
-            __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            __m128i p16_0 = _mm_maddubs_epi16(q5_0, q8_0);
-            __m128i p16_1 = _mm_maddubs_epi16(q5_1, q8_1);
-            p16_0 = _mm_madd_epi16(scale_0, p16_0);
-            p16_1 = _mm_madd_epi16(scale_0, p16_1);
-
-            q5l_0 = _mm_and_si128(_mm_srli_epi16(q5bits_0, 4), m4);
-            q5l_1 = _mm_and_si128(_mm_srli_epi16(q5bits_1, 4), m4);
-            q5h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_0, hmask), bit), 4);
-            q5h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_1, hmask), bit++), 4);
-            q5_0 = _mm_add_epi8(q5l_0, q5h_0);
-            q5_1 = _mm_add_epi8(q5l_1, q5h_1);
-            hmask = _mm_slli_epi16(hmask, 1);
-
-            q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            __m128i p16_2 = _mm_maddubs_epi16(q5_0, q8_0);
-            __m128i p16_3 = _mm_maddubs_epi16(q5_1, q8_1);
-            p16_2 = _mm_madd_epi16(scale_1, p16_2);
-            p16_3 = _mm_madd_epi16(scale_1, p16_3);
-
-            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
-            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
-
-        }
-
-        __m256 vd = _mm256_set1_ps(d);
-        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
-        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
-
-    }
-
-    *s = hsum_float_8(acc) + summs;
-
-#elif defined __wasm_simd128__
-    //const uint8_t * scales = (const uint8_t*)&utmp[0];
-    float sumf = 0;
-
-    for (int i = 0; i < nb; ++i) {
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); // Fixed sign
-
-        const uint8_t * GGML_RESTRICT q5 = x[i].qs;
-        const uint8_t * GGML_RESTRICT qh = x[i].qh;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-
-        // Process scales and mins
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        // Sum mins * q8sums
-        int32_t sumi_mins = 0;
-        const int16_t * GGML_RESTRICT q8sums = y[i].bsums;
-        const uint8_t * m = (const uint8_t *)&utmp[2];
-        for (int j = 0; j < 16; j += 2) {
-            sumi_mins += (q8sums[j] + q8sums[j+1]) * m[j/2];
-        }
-        sumf -= dmin * sumi_mins; // Correct subtraction
-
-        v128_t qh0 = wasm_v128_load(qh);
-        v128_t qh1 = wasm_v128_load(qh + 16);
-        const uint8_t * sc = (const uint8_t *)utmp;
-
-        int32_t sumi = 0;
-
-        for (int j = 0; j < QK_K/64; ++j) {
-            const int shift = j * 2;
-            v128_t qh_shift0 = wasm_u8x16_shr(qh0, shift);
-            v128_t qh_shift1 = wasm_u8x16_shr(qh1, shift);
-
-            v128_t qh_low0 = wasm_i8x16_shl(wasm_v128_and(qh_shift0, wasm_i8x16_splat(0x01)), 4);
-            v128_t qh_high0 = wasm_i8x16_shl(wasm_v128_and(qh_shift0, wasm_i8x16_splat(0x02)), 3);
-            v128_t qh_low1 = wasm_i8x16_shl(wasm_v128_and(qh_shift1, wasm_i8x16_splat(0x01)), 4);
-            v128_t qh_high1 = wasm_i8x16_shl(wasm_v128_and(qh_shift1, wasm_i8x16_splat(0x02)), 3);
-
-            v128_t q5_0 = wasm_v128_load(q5);
-            v128_t q5_1 = wasm_v128_load(q5 + 16);
-            q5 += 32;
-
-            v128_t q5l_0 = wasm_v128_or(wasm_v128_and(q5_0, wasm_i8x16_splat(0x0F)), qh_low0);
-            v128_t q5h_0 = wasm_v128_or(wasm_u8x16_shr(q5_0, 4), qh_high0);
-            v128_t q5l_1 = wasm_v128_or(wasm_v128_and(q5_1, wasm_i8x16_splat(0x0F)), qh_low1);
-            v128_t q5h_1 = wasm_v128_or(wasm_u8x16_shr(q5_1, 4), qh_high1);
-
-            v128_t q8_0 = wasm_v128_load(q8);
-            v128_t q8_1 = wasm_v128_load(q8 + 16);
-            v128_t q8_2 = wasm_v128_load(q8 + 32);
-            v128_t q8_3 = wasm_v128_load(q8 + 48);
-            q8 += 64;
-
-            // Process low quants
-            v128_t pl0 = wasm_i32x4_dot_i16x8(
-                wasm_i16x8_extend_low_i8x16(q5l_0),
-                wasm_i16x8_extend_low_i8x16(q8_0)
-            );
-            pl0 = wasm_i32x4_add(pl0, wasm_i32x4_dot_i16x8(
-                wasm_i16x8_extend_high_i8x16(q5l_0),
-                wasm_i16x8_extend_high_i8x16(q8_0)
-            ));
-            v128_t pl1 = wasm_i32x4_dot_i16x8(
-                wasm_i16x8_extend_low_i8x16(q5l_1),
-                wasm_i16x8_extend_low_i8x16(q8_1)
-            );
-            pl1 = wasm_i32x4_add(pl1, wasm_i32x4_dot_i16x8(
-                wasm_i16x8_extend_high_i8x16(q5l_1),
-                wasm_i16x8_extend_high_i8x16(q8_1)
-            ));
-            v128_t sum_low = wasm_i32x4_add(pl0, pl1);
-
-            // Process high quants
-            v128_t ph0 = wasm_i32x4_dot_i16x8(
-                wasm_i16x8_extend_low_i8x16(q5h_0),
-                wasm_i16x8_extend_low_i8x16(q8_2)
-            );
-            ph0 = wasm_i32x4_add(ph0, wasm_i32x4_dot_i16x8(
-                wasm_i16x8_extend_high_i8x16(q5h_0),
-                wasm_i16x8_extend_high_i8x16(q8_2)
-            ));
-            v128_t ph1 = wasm_i32x4_dot_i16x8(
-                wasm_i16x8_extend_low_i8x16(q5h_1),
-                wasm_i16x8_extend_low_i8x16(q8_3)
-            );
-            ph1 = wasm_i32x4_add(ph1, wasm_i32x4_dot_i16x8(
-                wasm_i16x8_extend_high_i8x16(q5h_1),
-                wasm_i16x8_extend_high_i8x16(q8_3)
-            ));
-            v128_t sum_high = wasm_i32x4_add(ph0, ph1);
-
-            // Accumulate with scale factors
-            int32_t sl = wasm_i32x4_extract_lane(sum_low, 0) + wasm_i32x4_extract_lane(sum_low, 1) +
-                         wasm_i32x4_extract_lane(sum_low, 2) + wasm_i32x4_extract_lane(sum_low, 3);
-            int32_t sh = wasm_i32x4_extract_lane(sum_high, 0) + wasm_i32x4_extract_lane(sum_high, 1) +
-                         wasm_i32x4_extract_lane(sum_high, 2) + wasm_i32x4_extract_lane(sum_high, 3);
-
-            sumi += sl * sc[2*j] + sh * sc[2*j+1];
-        }
-
-        sumf += d * sumi;
-    }
-
-    *s = sumf;
-
-#elif defined __riscv_v
-
-    const uint8_t * scales = (const uint8_t*)&utmp[0];
-    const uint8_t * mins = (const uint8_t*)&utmp[2];
-
-    float sumf = 0;
-    float sums = 0.0;
-
-    size_t vl;
-
-    for (int i = 0; i < nb; ++i) {
-
-        vl = 8;
-
-        const uint8_t * GGML_RESTRICT q5 = x[i].qs;
-        const uint8_t * GGML_RESTRICT hm = x[i].qh;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
-
-        vint16m1_t q8sums_0 = __riscv_vlse16_v_i16m1(y[i].bsums, 4, vl);
-        vint16m1_t q8sums_1 = __riscv_vlse16_v_i16m1(y[i].bsums+1, 4, vl);
-        vint16m1_t q8sums = __riscv_vadd_vv_i16m1(q8sums_0, q8sums_1, vl);
-
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        vuint8mf2_t mins8 = __riscv_vle8_v_u8mf2(mins, vl);
-        vint16m1_t v_mins = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(mins8, vl));
-        vint32m2_t prod = __riscv_vwmul_vv_i32m2(q8sums, v_mins, vl);
-
-        vint32m1_t sumi = __riscv_vredsum_vs_i32m2_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
-        sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi);
-
-        vl = 32;
-        int32_t aux32 = 0;
-        int is = 0;
-
-        uint8_t m = 1;
-        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
-        vuint8m2_t vqh = __riscv_vle8_v_u8m2(hm, vl);
-
-        for (int j = 0; j < QK_K/64; ++j) {
-            // load Q5 and Q8
-            vuint8m2_t q5_x = __riscv_vle8_v_u8m2(q5, vl);
-            vint8m2_t q8_y1 = __riscv_vle8_v_i8m2(q8, vl);
-            vint8m2_t q8_y2 = __riscv_vle8_v_i8m2(q8+32, vl);
-
-            // compute mask for addition
-            vint8m2_t q5_a = __riscv_vreinterpret_v_u8m2_i8m2(__riscv_vand_vx_u8m2(q5_x, 0x0F, vl));
-            vuint8m2_t qh_m1 = __riscv_vand_vx_u8m2(vqh, m, vl);
-            vbool4_t vmask_1 = __riscv_vmsne_vx_u8m2_b4(qh_m1, 0, vl);
-            vint8m2_t q5_m1 = __riscv_vadd_vx_i8m2_mu(vmask_1, q5_a, q5_a, 16, vl);
-            m <<= 1;
-
-            vint8m2_t q5_l = __riscv_vreinterpret_v_u8m2_i8m2(__riscv_vsrl_vx_u8m2(q5_x, 0x04, vl));
-            vuint8m2_t qh_m2 = __riscv_vand_vx_u8m2(vqh, m, vl);
-            vbool4_t vmask_2 = __riscv_vmsne_vx_u8m2_b4(qh_m2, 0, vl);
-            vint8m2_t q5_m2 = __riscv_vadd_vx_i8m2_mu(vmask_2, q5_l, q5_l, 16, vl);
-            m <<= 1;
-
-            vint16m4_t v0 = __riscv_vwmul_vv_i16m4(q5_m1, q8_y1, vl);
-            vint16m4_t v1 = __riscv_vwmul_vv_i16m4(q5_m2, q8_y2, vl);
-
-            vint32m8_t vs1 = __riscv_vwmul_vx_i32m8(v0, scales[is++], vl);
-            vint32m8_t vs2 = __riscv_vwmul_vx_i32m8(v1, scales[is++], vl);
-
-            vint32m1_t vacc1 = __riscv_vredsum_vs_i32m8_i32m1(vs1, vzero, vl);
-            vint32m1_t vacc2 = __riscv_vredsum_vs_i32m8_i32m1(vs2, vacc1, vl);
-
-            aux32 += __riscv_vmv_x_s_i32m1_i32(vacc2);
-            q5 += 32; q8 += 64;
-
-        }
-
-        sums += aux32 * d;
-
-    }
-
-    *s = sumf+sums;
-
-#elif defined(__POWER9_VECTOR__)
-    const vector signed char lowMask = vec_splats((signed char)0xF);
-    const vector signed char lowMask1 = vec_splats((int8_t)0x3f);
-    const vector signed char lowMask2 = vec_splats((int8_t)0x30);
-    const vector int v0 = vec_splats((int32_t)0);
-    const vector unsigned char v1 = vec_splats((unsigned char)0x1);
-    const vector unsigned char v2 = vec_splats((unsigned char)0x2);
-    const vector unsigned char v3 = vec_splats((unsigned char)0x3);
-    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
-
-    vector float vsumf0 = vec_splats(0.0f);
-    vector float vsumf1 = vec_splats(0.0f);
-    vector float vsumf2 = vec_splats(0.0f);
-    vector float vsumf3 = vec_splats(0.0f);
-
-    for (int i = 0; i < nb; ++i) {
-        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
-        vector float vyd = vec_splats(y[i].d);
-        vector float vd = vec_mul(vxd, vyd);
-
-        vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
-        vector float vdmin = vec_mul(vxmin, vyd);
-
-        UNUSED(kmask1);
-        UNUSED(kmask2);
-        UNUSED(kmask3);
-        UNUSED(utmp);
-
-        vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8);
-        vector signed char u1 = vec_and(vec_sr(u0, v2), lowMask2);
-        vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4);
-        vector signed char u3 = vec_sr(u2, v4);
-
-        vector signed char u30 = u1;
-        vector signed char u31 = (vector signed char)vec_mergeh((vector signed int)vec_and(u2, lowMask), (vector signed int)u3);
-
-        u1 = vec_and(u0, lowMask1);
-        u2 = vec_or(u30, u31);
-
-        vector signed char utmps = (vector signed char)vec_mergeh((vector signed int)u1, (vector signed int)u2);
-
-        vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
-        vector signed short q8ysums1 = vec_xl(16, y[i].bsums);
-
-        vector signed short vscales = vec_unpackh(utmps);
-
-        vector signed short q5xmins = vec_unpackl(utmps);
-        vector signed short q5xmins0 = vec_mergeh(q5xmins, q5xmins);
-        vector signed short q5xmins1 = vec_mergel(q5xmins, q5xmins);
-
-        vector signed int prod0 = vec_mule(q5xmins0, q8ysums0);
-        vector signed int prod1 = vec_mule(q5xmins1, q8ysums1);
-        vector signed int prod2 = vec_mulo(q5xmins0, q8ysums0);
-        vector signed int prod3 = vec_mulo(q5xmins1, q8ysums1);
-
-        vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0);
-        vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1);
-        vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
-        vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);
-
-        vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].qh);
-        vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].qh);
-
-        vector signed int vsumi0 = v0;
-        vector signed int vsumi1 = v0;
-        vector signed int vsumi2 = v0;
-        vector signed int vsumi3 = v0;
-
-        const uint8_t * GGML_RESTRICT q5 = x[i].qs;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-
-        for (int j = 0; j < QK_K/64; ++j) {
-            __builtin_prefetch(q5, 0, 1);
-            __builtin_prefetch(q8, 0, 1);
-
-            vector signed char qxs0 = (vector signed char)vec_xl( 0, q5);
-            vector signed char qxs1 = (vector signed char)vec_xl(16, q5);
-            q5 += 32;
-
-            vector signed char qxs00 = vec_and(qxs0, lowMask);
-            vector signed char qxs01 = vec_sr(qxs0, v4);
-            vector signed char qxs10 = vec_and(qxs1, lowMask);
-            vector signed char qxs11 = vec_sr(qxs1, v4);
-
-            vector signed char q5h00 = vec_sl(vec_and((vector signed char)v1, qxhs0), v4);
-            vector signed char q5h01 = vec_sl(vec_and((vector signed char)v2, qxhs0), v3);
-            vector signed char q5h10 = vec_sl(vec_and((vector signed char)v1, qxhs1), v4);
-            vector signed char q5h11 = vec_sl(vec_and((vector signed char)v2, qxhs1), v3);
-            qxhs0 = vec_sr(qxhs0, v2);
-            qxhs1 = vec_sr(qxhs1, v2);
-
-            vector unsigned char q5x00 = (vector unsigned char)vec_or(q5h00, qxs00);
-            vector unsigned char q5x01 = (vector unsigned char)vec_or(q5h01, qxs01);
-            vector unsigned char q5x10 = (vector unsigned char)vec_or(q5h10, qxs10);
-            vector unsigned char q5x11 = (vector unsigned char)vec_or(q5h11, qxs11);
-
-            vector signed char q8y00 = vec_xl( 0, q8);
-            vector signed char q8y10 = vec_xl(16, q8);
-            vector signed char q8y01 = vec_xl(32, q8);
-            vector signed char q8y11 = vec_xl(48, q8);
-            q8 += 64;
-
-            vector signed int qv00 = vec_msum(q8y00, q5x00, v0);
-            vector signed int qv01 = vec_msum(q8y01, q5x01, v0);
-            vector signed int qv10 = vec_msum(q8y10, q5x10, v0);
-            vector signed int qv11 = vec_msum(q8y11, q5x11, v0);
-
-            vector signed int vscales_h = vec_unpackh(vscales);
-            vector signed int vs0 = vec_splat(vscales_h, 0);
-            vector signed int vs1 = vec_splat(vscales_h, 1);
-            vscales = vec_sld(vscales, vscales, 12);
-
-            vsumi0 = vec_add(vec_mul(qv00, vs0), vsumi0);
-            vsumi1 = vec_add(vec_mul(qv10, vs0), vsumi1);
-            vsumi2 = vec_add(vec_mul(qv01, vs1), vsumi2);
-            vsumi3 = vec_add(vec_mul(qv11, vs1), vsumi3);
-        }
-
-        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
-        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
-        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
-        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
-    }
-
-    vsumf0 = vec_add(vsumf0, vsumf2);
-    vsumf1 = vec_add(vsumf1, vsumf3);
-
-    vsumf0 = vec_add(vsumf0, vsumf1);
-
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
-
-    *s = vec_extract(vsumf0, 0);
-
-#elif defined __loongarch_asx
-
-    __m256 acc = (__m256)__lasx_xvldi(0);
-    __m128 acc_m = (__m128)__lsx_vldi(0);
-
-    for (int i = 0; i < nb; ++i) {
-
-        const uint8_t * GGML_RESTRICT q5 = x[i].qs;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        const __m128i mins_and_scales128 = lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0]);
-        const __m128i mins128 = __lsx_vexth_h_b(mins_and_scales128);
-        const __m128i scales128 = __lsx_vsllwil_h_b(mins_and_scales128, 0);
-
-        const __m256i q8sums = __lasx_xvld((const __m256i*)y[i].bsums, 0);
-        const __m128i q8s = lsx_hadd_h(lasx_extracti128(q8sums, 0), lasx_extracti128(q8sums, 1));
-        const __m128i prod = lsx_madd_h(mins128, q8s);
-        acc_m = __lsx_vfmadd_s(__lsx_vreplfr2vr_s(dmin), __lsx_vffint_s_w(prod), acc_m);
-
-        const __m256i scales = lasx_insertf128(scales128, scales128);
-
-        const __m256i hbits = __lasx_xvld((const __m256i*)x[i].qh, 0);
-
-        __m256i sumi = __lasx_xvldi(0);
-
-        for (int j = 0; j < QK_K/64; ++j) {
-
-            const __m256i scale_0 = lasx_xvrepl128vei_h(scales, 2 * j + 0);
-            const __m256i scale_1 = lasx_xvrepl128vei_h(scales, 2 * j + 1);
-
-            const __m256i q5bits = __lasx_xvld((const __m256i*)q5, 0); q5 += 32;
-
-            const __m256i q5l_0 = __lasx_xvandi_b(q5bits, 0xf);
-            const __m256i q5l_1 = __lasx_xvsrli_b(q5bits, 4);
-            const __m256i q5h_0 = __lasx_xvnori_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 2 * j + 0), 0), 0xef);
-            const __m256i q5h_1 = __lasx_xvnori_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 2 * j + 1), 0), 0xef);
-            const __m256i q5_0 = __lasx_xvor_v(q5l_0, q5h_0);
-            const __m256i q5_1 = __lasx_xvor_v(q5l_1, q5h_1);
-
-            const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
-            const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
-
-            __m256i p16_0 = lasx_madd_h_b(q5_0, q8_0);
-            __m256i p16_1 = lasx_madd_h_b(q5_1, q8_1);
-
-            p16_0 = lasx_madd_h(scale_0, p16_0);
-            p16_1 = lasx_madd_h(scale_1, p16_1);
-
-            sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_1));
-
-        }
-
-        __m256 vd = __lasx_xvreplfr2vr_s(d);
-        acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(sumi), acc);
-
-    }
-
-    acc_m = __lsx_vfadd_s(acc_m, (__m128)__lsx_vbsrl_v(acc_m, 8));
-    acc_m = __lsx_vfadd_s(acc_m, (__m128)__lsx_vbsrl_v(acc_m, 4));
-
-    *s = hsum_float_8(acc) + ((v4f32)acc_m)[0];
-#elif defined(__VXE__) || defined(__VXE2__)
-    const uint8x16_t v_lm = vec_splat_u8(0x0F);
-    const uint8x16_t v_1m = vec_splat_u8(0x01);
-    const uint8x16_t v_2m = vec_splat_u8(0x02);
-
-    const int32x4_t v_z = vec_splat_s32(0);
-
-    const uchar8x16_t v_minsm = {
-        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
-        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
-    };
-
-    int8x16_t q5b[4];
-    uint8x16_t q5h[4];
-
-    uint8x16_t v_xl[2];
-    uint8x16_t v_xh[2];
-    int8x16_t v_y[4];
-
-    float sumf = 0;
-
-    for (int i = 0; i < nb; ++i) {
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums);
-        const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums);
-        const int16x8_t v_ysums = vec_padd_s16(v_ysumsl, v_ysumsh);
-
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        const uint8x16_t v_mins16 = vec_xl(0, (const uint8_t *)utmp);
-        const uint8x16_t v_mins8 = vec_perm(v_mins16, v_mins16, v_minsm);
-        const int16x8_t v_minsh = (int16x8_t)vec_unpackh(v_mins8);
-
-        const int32x4_t v_minsho = vec_mulo(v_ysums, v_minsh);
-        const int32x4_t v_minshe = vec_mule(v_ysums, v_minsh);
-        const int32x4_t v_mins = vec_add(v_minsho, v_minshe);
-        const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3];
-
-        const uint8_t * scales = (const uint8_t *)utmp;
-        const uint8_t * GGML_RESTRICT x0l = x[i].qs;
-        const uint8_t * GGML_RESTRICT x0h = x[i].qh;
-        const int8_t * GGML_RESTRICT y0 = y[i].qs;
-
-        v_xh[0] = vec_xl(0 , x0h);
-        v_xh[1] = vec_xl(16, x0h);
-
-        int32_t sumi = 0;
-        for (int j = 0; j < QK_K/64; ++j) {
-            v_xl[0] = vec_xl(0 , x0l);
-            v_xl[1] = vec_xl(16, x0l);
-            x0l += 32;
-
-            v_y[0] = vec_xl(0 , y0);
-            v_y[1] = vec_xl(16, y0);
-            v_y[2] = vec_xl(32, y0);
-            v_y[3] = vec_xl(48, y0);
-            y0 += 64;
-
-            q5h[0] = vec_sl(vec_and(v_1m, v_xh[0]), 4);
-            q5h[1] = vec_sl(vec_and(v_1m, v_xh[1]), 4);
-            q5h[2] = vec_sl(vec_and(v_2m, v_xh[0]), 3);
-            q5h[3] = vec_sl(vec_and(v_2m, v_xh[1]), 3);
-            v_xh[0] = vec_sr(v_xh[0], 2);
-            v_xh[1] = vec_sr(v_xh[1], 2);
-
-            q5b[0] = (int8x16_t)vec_or(vec_and(v_xl[0], v_lm), q5h[0]);
-            q5b[1] = (int8x16_t)vec_or(vec_and(v_xl[1], v_lm), q5h[1]);
-            q5b[2] = (int8x16_t)vec_or(vec_sr(v_xl[0], 4), q5h[2]);
-            q5b[3] = (int8x16_t)vec_or(vec_sr(v_xl[1], 4), q5h[3]);
-
-            int32x4_t sumi0 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[0], v_y[0]), q5b[1], v_y[1]);
-            int32x4_t sumi1 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[2], v_y[2]), q5b[3], v_y[3]);
-
-            sumi += (sumi0[0] + sumi0[1] + sumi0[2] + sumi0[3]) * *scales++;
-            sumi += (sumi1[0] + sumi1[1] + sumi1[2] + sumi1[3]) * *scales++;
-        }
-
-        sumf += d * sumi - dmin * mins;
-    }
-
-    *s = sumf;
-#else
-
-    const uint8_t * scales = (const uint8_t*)&utmp[0];
-    const uint8_t * mins = (const uint8_t*)&utmp[2];
-
-    int8_t aux8[QK_K];
-    int16_t aux16[8];
-    float sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
-        const uint8_t * GGML_RESTRICT hm = x[i].qh;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        uint8_t m = 1;
-        for (int j = 0; j < QK_K/64; ++j) {
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
-            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
-            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
-            a += 32; m <<= 1;
-            q4 += 32;
-        }
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        int sumi = 0;
-        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/32; ++j) {
-            int32_t scale = scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
-        sumf -= dmin * sumi;
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
-#endif
-}
-
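// ---------------------------------------------------------------------------
// [editor's note, not part of the diff] As in the scalar fallback above, each
// q5_K path folds the per-sub-block minimums into one bias term using the
// precomputed bsums of the q8_K operand, instead of subtracting the min from
// every element. A scalar sketch under the same layout; the helper name
// q5K_min_bias is hypothetical.
#include <stdint.h>

static float q5K_min_bias(const int16_t bsums[16], const uint8_t mins[8],
                          float dmin) {
    // bsums[j] is the sum of 16 consecutive q8 values; mins[j/2] is the
    // 6-bit min of the matching 32-element sub-block, so the correction
    // collapses to a single small integer dot product per super-block
    int32_t sumi = 0;
    for (int j = 0; j < 16; ++j) {
        sumi += bsums[j] * mins[j/2];
    }
    return dmin * (float) sumi; // the kernels subtract this from sumf
}
// ---------------------------------------------------------------------------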
-void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
-    assert(n % QK_K == 0);
-#ifdef __ARM_FEATURE_MATMUL_INT8
-    assert((nrc == 2) || (nrc == 1));
-#else
-    assert(nrc == 1);
-#endif
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_q6_K * GGML_RESTRICT x = vx;
-    const block_q8_K * GGML_RESTRICT y = vy;
-
-    const int nb = n / QK_K;
-
-#if defined(__ARM_FEATURE_MATMUL_INT8)
-    if (nrc == 2) {
-        const block_q6_K * GGML_RESTRICT x0 = x;
-        const block_q6_K * GGML_RESTRICT x1 = (const block_q6_K *) ((const uint8_t *)vx + bx);
-        const block_q8_K * GGML_RESTRICT y0 = y;
-        const block_q8_K * GGML_RESTRICT y1 = (const block_q8_K *) ((const uint8_t *)vy + by);
-
-        float32x4_t vfsum = vdupq_n_f32(0.0f);
-
-        for (int i = 0; i < nb; ++i, ++x0, ++x1, ++y0, ++y1) {
-            const uint8_t * GGML_RESTRICT ql0 = x0->ql;
-            const uint8_t * GGML_RESTRICT ql1 = x1->ql;
-            const uint8_t * GGML_RESTRICT qh0 = x0->qh;
-            const uint8_t * GGML_RESTRICT qh1 = x1->qh;
-            const int8_t * GGML_RESTRICT qy0 = y0->qs;
-            const int8_t * GGML_RESTRICT qy1 = y1->qs;
-
-            const uint8x16_t mone = vdupq_n_u8(0x30);
-            const uint8x16_t m4b = vdupq_n_u8(0x0f);
-
-            int32x4_t visum = vdupq_n_s32(0);
-
-            // process 8 blocks per iteration, totally 16 blocks
-            for (int j = 0; j < 2; ++j, qh0 += 32, ql0 += 64, qh1 += 32, ql1 += 64) {
-                int8x16_t vx0[8], vx1[8];
-
-                // de-quantize vx0[8]
-                {
-                    const uint8x16x2_t qh_bits = vld1q_u8_x2(qh0);
-                    const uint8x16x4_t ql_bits = vld1q_u8_x4(ql0);
-
-                    uint8x16_t q6h_0 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[0], 4));
-                    uint8x16_t q6h_1 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[1], 4));
-                    uint8x16_t q6h_2 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[0], 2));
-                    uint8x16_t q6h_3 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[1], 2));
-
-                    vx0[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[0], m4b), q6h_0));
-                    vx0[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[1], m4b), q6h_1));
-                    vx0[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[2], m4b), q6h_2));
-                    vx0[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[3], m4b), q6h_3));
-
-                    q6h_0 = vandq_u8(mone, qh_bits.val[0]);
-                    q6h_1 = vandq_u8(mone, qh_bits.val[1]);
-                    q6h_2 = vandq_u8(mone, vshrq_n_u8(qh_bits.val[0], 2));
-                    q6h_3 = vandq_u8(mone, vshrq_n_u8(qh_bits.val[1], 2));
-
-                    vx0[4] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[0], 4), q6h_0));
-                    vx0[5] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[1], 4), q6h_1));
-                    vx0[6] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[2], 4), q6h_2));
-                    vx0[7] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[3], 4), q6h_3));
-                }
-
-                // de-quantize vx1[8]
-                {
-                    const uint8x16x2_t qh_bits = vld1q_u8_x2(qh1);
-                    const uint8x16x4_t ql_bits = vld1q_u8_x4(ql1);
-
-                    uint8x16_t q6h_0 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[0], 4));
-                    uint8x16_t q6h_1 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[1], 4));
-                    uint8x16_t q6h_2 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[0], 2));
-                    uint8x16_t q6h_3 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[1], 2));
-
-                    vx1[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[0], m4b), q6h_0));
-                    vx1[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[1], m4b), q6h_1));
-                    vx1[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[2], m4b), q6h_2));
-                    vx1[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[3], m4b), q6h_3));
-
-                    q6h_0 = vandq_u8(mone, qh_bits.val[0]);
-                    q6h_1 = vandq_u8(mone, qh_bits.val[1]);
-                    q6h_2 = vandq_u8(mone, vshrq_n_u8(qh_bits.val[0], 2));
-                    q6h_3 = vandq_u8(mone, vshrq_n_u8(qh_bits.val[1], 2));
-
-                    vx1[4] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[0], 4), q6h_0));
-                    vx1[5] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[1], 4), q6h_1));
-                    vx1[6] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[2], 4), q6h_2));
-                    vx1[7] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[3], 4), q6h_3));
-                }
-
-                // process 16 elements (one block with same scale) per iteration
-                // - vx = concat(ql, qh) - 32
-                // - r1,r2,r3,r4 = smmla(vx, vy)
-                for (int k = 0; k < 8; ++k) {
-                    const int blk = j * 8 + k;
-
-                    const int8x16_t vy0 = vld1q_s8(qy0);
-                    const int8x16_t vy1 = vld1q_s8(qy1);
-                    qy0 += 16;
-                    qy1 += 16;
-
-                    const int32x4_t block_scale = {
-                        x0->scales[blk],
-                        x0->scales[blk],
-                        x1->scales[blk],
-                        x1->scales[blk],
-                    };
-
-                    // calculate four results at once with outer product
-                    const int8x16_t vx_l = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(vx0[k]), vreinterpretq_s64_s8(vx1[k])));
-                    const int8x16_t vx_h = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(vx0[k]), vreinterpretq_s64_s8(vx1[k])));
-                    const int8x16_t vy_l = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(vy0), vreinterpretq_s64_s8(vy1)));
-                    const int8x16_t vy_h = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(vy0), vreinterpretq_s64_s8(vy1)));
-                    int32x4_t vr = vdupq_n_s32(0);
-                    vr = vmmlaq_s32(vr, vx_l, vy_l);
-                    vr = vmmlaq_s32(vr, vx_h, vy_h);
-
-                    // apply block scale, will NOT overflow
-                    // block_scale * sum_256(int6*int8) <= 2^(8+8+6+8) = 30 bits
-                    visum = vmlaq_s32(visum, vr, block_scale);
-                }
-            }
-
-            // adjust bias, apply superblock scale
-            {
-                int32_t bias[4];
-#ifdef __ARM_FEATURE_SVE
-                const svbool_t pg16_8 = svptrue_pat_b16(SV_VL8);
-                const svbool_t pg8_8 = svptrue_pat_b8(SV_VL8);
-                const svint16_t y0_q8sums_0 = svld1_s16(pg16_8, y0->bsums);
-                const svint16_t y0_q8sums_1 = svld1_s16(pg16_8, y0->bsums + 8);
-                const svint16_t y1_q8sums_0 = svld1_s16(pg16_8, y1->bsums);
-                const svint16_t y1_q8sums_1 = svld1_s16(pg16_8, y1->bsums + 8);
-                const svint16_t x0_q6scales_0 = svunpklo_s16(svld1_s8(pg8_8, x0->scales));
-                const svint16_t x0_q6scales_1 = svunpklo_s16(svld1_s8(pg8_8, x0->scales + 8));
-                const svint16_t x1_q6scales_0 = svunpklo_s16(svld1_s8(pg8_8, x1->scales));
-                const svint16_t x1_q6scales_1 = svunpklo_s16(svld1_s8(pg8_8, x1->scales + 8));
-                const svint64_t zero = svdup_n_s64(0);
-                bias[0] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y0_q8sums_0, x0_q6scales_0),
-                                                                svdot_s64(zero, y0_q8sums_1, x0_q6scales_1)));
-                bias[1] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y1_q8sums_0, x0_q6scales_0),
-                                                                svdot_s64(zero, y1_q8sums_1, x0_q6scales_1)));
-                bias[2] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y0_q8sums_0, x1_q6scales_0),
-                                                                svdot_s64(zero, y0_q8sums_1, x1_q6scales_1)));
-                bias[3] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y1_q8sums_0, x1_q6scales_0),
-                                                                svdot_s64(zero, y1_q8sums_1, x1_q6scales_1)));
-#else
-                // NEON doesn't support int16 dot product, fallback to separated mul and add
-                const int16x8x2_t q8sums0 = vld1q_s16_x2(y0->bsums);
-                const int16x8x2_t q8sums1 = vld1q_s16_x2(y1->bsums);
-
-                int8x16_t scales_s8 = vld1q_s8(x0->scales);
-                const int16x8x2_t q6scales0 = {{vmovl_s8(vget_low_s8(scales_s8)), vmovl_s8(vget_high_s8(scales_s8))}};
-                scales_s8 = vld1q_s8(x1->scales);
-                const int16x8x2_t q6scales1 = {{vmovl_s8(vget_low_s8(scales_s8)), vmovl_s8(vget_high_s8(scales_s8))}};
-
-                int32x4_t prod;
-                prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums0.val[0]), vget_low_s16 (q6scales0.val[0])),
-                                           vmull_s16(vget_high_s16(q8sums0.val[0]), vget_high_s16(q6scales0.val[0]))),
-                                 vaddq_s32(vmull_s16(vget_low_s16 (q8sums0.val[1]), vget_low_s16 (q6scales0.val[1])),
-                                           vmull_s16(vget_high_s16(q8sums0.val[1]), vget_high_s16(q6scales0.val[1]))));
-                bias[0] = vaddvq_s32(prod);
-                prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums1.val[0]), vget_low_s16 (q6scales0.val[0])),
-                                           vmull_s16(vget_high_s16(q8sums1.val[0]), vget_high_s16(q6scales0.val[0]))),
-                                 vaddq_s32(vmull_s16(vget_low_s16 (q8sums1.val[1]), vget_low_s16 (q6scales0.val[1])),
-                                           vmull_s16(vget_high_s16(q8sums1.val[1]), vget_high_s16(q6scales0.val[1]))));
-                bias[1] = vaddvq_s32(prod);
-                prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums0.val[0]), vget_low_s16 (q6scales1.val[0])),
-                                           vmull_s16(vget_high_s16(q8sums0.val[0]), vget_high_s16(q6scales1.val[0]))),
-                                 vaddq_s32(vmull_s16(vget_low_s16 (q8sums0.val[1]), vget_low_s16 (q6scales1.val[1])),
-                                           vmull_s16(vget_high_s16(q8sums0.val[1]), vget_high_s16(q6scales1.val[1]))));
-                bias[2] = vaddvq_s32(prod);
-                prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums1.val[0]), vget_low_s16 (q6scales1.val[0])),
-                                           vmull_s16(vget_high_s16(q8sums1.val[0]), vget_high_s16(q6scales1.val[0]))),
-                                 vaddq_s32(vmull_s16(vget_low_s16 (q8sums1.val[1]), vget_low_s16 (q6scales1.val[1])),
-                                           vmull_s16(vget_high_s16(q8sums1.val[1]), vget_high_s16(q6scales1.val[1]))));
-                bias[3] = vaddvq_s32(prod);
-
-#endif
-                const int32x4_t vibias = vmulq_n_s32(vld1q_s32(bias), 32);
-
-                const float32x4_t superblock_scale = {
-                    GGML_FP16_TO_FP32(x0->d) * y0->d,
-                    GGML_FP16_TO_FP32(x0->d) * y1->d,
-                    GGML_FP16_TO_FP32(x1->d) * y0->d,
-                    GGML_FP16_TO_FP32(x1->d) * y1->d,
-                };
-
-                visum = vsubq_s32(visum, vibias);
-                vfsum = vmlaq_f32(vfsum, vcvtq_f32_s32(visum), superblock_scale);
-            }
-        }
-
-        // vfsum = ABCD -> ACBD
-        // AC -> s, BD -> (s+bs)
-        vfsum = vzip1q_f32(vfsum, vextq_f32(vfsum, vfsum, 2));
-        vst1_f32(s, vget_low_f32 (vfsum));
-        vst1_f32(s + bs, vget_high_f32(vfsum));
-
-        return;
-    }
-#endif
-
-#ifdef __ARM_FEATURE_SVE
-    const int vector_length = ggml_cpu_get_sve_cnt()*8;
-    float sum = 0;
-    svuint8_t m4b = svdup_n_u8(0xf);
-    svint32_t vzero = svdup_n_s32(0);
-    svuint8_t mone = svdup_n_u8(0x30);
-    svint8_t q6bytes_1, q6bytes_2, q6bytes_3, q6bytes_4;
-    svuint8_t q6h_1, q6h_2, q6h_3, q6h_4;
-
-    for (int i = 0; i < nb; ++i) {
-        const float d_all = GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * GGML_RESTRICT q6 = x[i].ql;
-        const uint8_t * GGML_RESTRICT qh = x[i].qh;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-
-        const int8_t * GGML_RESTRICT scale = x[i].scales;
-
-        const svbool_t pg16_8 = svptrue_pat_b16(SV_VL8);
-        const svint16_t q8sums_1 = svld1_s16(pg16_8, y[i].bsums);
-        const svint16_t q8sums_2 = svld1_s16(pg16_8, y[i].bsums + 8);
-        const svint16_t q6scales_1 = svunpklo_s16(svld1_s8(svptrue_pat_b8(SV_VL8), scale));
-        const svint16_t q6scales_2 = svunpklo_s16(svld1_s8(svptrue_pat_b8(SV_VL8), scale + 8));
-        const svint64_t prod = svdup_n_s64(0);
-        int32_t isum_mins = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(prod, q8sums_1, q6scales_1),
-                                                                  svdot_s64(prod, q8sums_2, q6scales_2)));
-        int32_t isum = 0;
-
-        switch (vector_length) {
-            case 128:
-                {
-                    const svbool_t pg32_4 = svptrue_pat_b32(SV_VL4);
-                    const svbool_t pg8_16 = svptrue_pat_b8(SV_VL16);
-                    svint32_t isum_tmp = svdup_n_s32(0);
-                    for (int j = 0; j < QK_K/128; ++j) {
-                        svuint8_t qhbits_1 = svld1_u8(pg8_16, qh);
-                        svuint8_t qhbits_2 = svld1_u8(pg8_16, qh+16);
-                        qh += 32;
-                        svuint8_t q6bits_1 = svld1_u8(pg8_16, q6);
-                        svuint8_t q6bits_2 = svld1_u8(pg8_16, q6+16);
-                        svuint8_t q6bits_3 = svld1_u8(pg8_16, q6+32);
-                        svuint8_t q6bits_4 = svld1_u8(pg8_16, q6+48);
-                        q6 += 64;
-                        svint8_t q8bytes_1 = svld1_s8(pg8_16, q8);
-                        svint8_t q8bytes_2 = svld1_s8(pg8_16, q8+16);
-                        svint8_t q8bytes_3 = svld1_s8(pg8_16, q8+32);
-                        svint8_t q8bytes_4 = svld1_s8(pg8_16, q8+48);
-                        q8 += 64;
-
-                        q6h_1 = svand_u8_x(pg16_8, mone, svlsl_n_u8_x(pg16_8, qhbits_1, 4));
-                        q6h_2 = svand_u8_x(pg16_8, mone, svlsl_n_u8_x(pg16_8, qhbits_2, 4));
-                        q6h_3 = svand_u8_x(pg16_8, mone, svlsl_n_u8_x(pg16_8, qhbits_1, 2));
-                        q6h_4 = svand_u8_x(pg16_8, mone, svlsl_n_u8_x(pg16_8, qhbits_2, 2));
-                        q6bytes_1 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svand_u8_x(pg8_16, q6bits_1, m4b), q6h_1));
-                        q6bytes_2 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svand_u8_x(pg8_16, q6bits_2, m4b), q6h_2));
-                        q6bytes_3 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svand_u8_x(pg8_16, q6bits_3, m4b), q6h_3));
-                        q6bytes_4 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svand_u8_x(pg8_16, q6bits_4, m4b), q6h_4));
-                        isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_1, q8bytes_1), scale[0]);
-                        isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_2, q8bytes_2), scale[1]);
-                        isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_3, q8bytes_3), scale[2]);
-                        isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_4, q8bytes_4), scale[3]);
-
-                        scale += 4;
-                        q8bytes_1 = svld1_s8(pg8_16, q8);
-                        q8bytes_2 = svld1_s8(pg8_16, q8+16);
-                        q8bytes_3 = svld1_s8(pg8_16, q8+32);
-                        q8bytes_4 = svld1_s8(pg8_16, q8+48);
-                        q8 += 64;
-
-                        q6h_1 = svand_u8_x(pg16_8, mone, qhbits_1);
-                        q6h_2 = svand_u8_x(pg16_8, mone, qhbits_2);
-                        q6h_3 = svand_u8_x(pg16_8, mone, svlsr_n_u8_x(pg16_8, qhbits_1, 2));
-                        q6h_4 = svand_u8_x(pg16_8, mone, svlsr_n_u8_x(pg16_8, qhbits_2, 2));
-                        q6bytes_1 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svlsr_n_u8_x(pg8_16, q6bits_1, 4), q6h_1));
-                        q6bytes_2 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svlsr_n_u8_x(pg8_16, q6bits_2, 4), q6h_2));
-                        q6bytes_3 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svlsr_n_u8_x(pg8_16, q6bits_3, 4), q6h_3));
-                        q6bytes_4 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svlsr_n_u8_x(pg8_16, q6bits_4, 4), q6h_4));
-                        isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_1, q8bytes_1), scale[0]);
-                        isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_2, q8bytes_2), scale[1]);
-                        isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_3, q8bytes_3), scale[2]);
-                        isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_4, q8bytes_4), scale[3]);
-                        scale += 4;
-                    }
-                    isum += svaddv_s32(pg32_4, isum_tmp);
-                    sum += d_all * y[i].d * (isum - 32 * isum_mins);
-                }
-                break;
-            case 256:
-            case 512:
-                {
-                    const svbool_t pg8_2 = svptrue_pat_b8(SV_VL2);
-                    const svbool_t pg32_8 = svptrue_pat_b32(SV_VL8);
-                    const svbool_t pg8_32 = svptrue_pat_b8(SV_VL32);
-                    svint32_t isum_tmp = svdup_n_s32(0);
-                    for (int j = 0; j < QK_K/128; j++) {
-                        svuint8_t qhbits_1 = svld1_u8(pg8_32, qh);
-                        qh += 32;
-                        svuint8_t q6bits_1 = svld1_u8(pg8_32, q6);
-                        svuint8_t q6bits_2 = svld1_u8(pg8_32, q6+32);
-                        q6 += 64;
-                        svint8_t q8bytes_1 = svld1_s8(pg8_32, q8);
-                        svint8_t q8bytes_2 = svld1_s8(pg8_32, q8+32);
-                        svint8_t q8bytes_3 = svld1_s8(pg8_32, q8+64);
-                        svint8_t q8bytes_4 = svld1_s8(pg8_32, q8+96);
-                        q8 += 128;
-                        q6h_1 = svand_u8_x(pg8_32, mone, svlsl_n_u8_x(pg8_32, qhbits_1, 4));
-                        q6h_2 = svand_u8_x(pg8_32, mone, svlsl_n_u8_x(pg8_32, qhbits_1, 2));
-                        q6h_3 = svand_u8_x(pg8_32, mone, qhbits_1);
-                        q6h_4 = svand_u8_x(pg8_32, mone, svlsr_n_u8_x(pg8_32, qhbits_1, 2));
-                        q6bytes_1 = svreinterpret_s8_u8(svorr_u8_x(pg8_32, svand_u8_x(pg8_32, q6bits_1, m4b), q6h_1));
-                        q6bytes_2 = svreinterpret_s8_u8(svorr_u8_x(pg8_32, svand_u8_x(pg8_32, q6bits_2, m4b), q6h_2));
-                        q6bytes_3 = svreinterpret_s8_u8(svorr_u8_x(pg8_32, svlsr_n_u8_x(pg8_32, q6bits_1, 4), q6h_3));
-                        q6bytes_4 = svreinterpret_s8_u8(svorr_u8_x(pg8_32, svlsr_n_u8_x(pg8_32, q6bits_2, 4), q6h_4));
-
-                        svint8_t scale_lane_1_tmp = svld1_s8(pg8_2, scale);
-                        scale_lane_1_tmp = svzip1_s8(scale_lane_1_tmp, scale_lane_1_tmp);
-                        scale_lane_1_tmp = svzip1_s8(scale_lane_1_tmp, scale_lane_1_tmp);
-                        svint8_t scale_lane_2_tmp = svld1_s8(pg8_2, scale+2);
-                        scale_lane_2_tmp = svzip1_s8(scale_lane_2_tmp, scale_lane_2_tmp);
-                        scale_lane_2_tmp = svzip1_s8(scale_lane_2_tmp, scale_lane_2_tmp);
-                        svint8_t scale_lane_3_tmp = svld1_s8(pg8_2, scale+4);
-                        scale_lane_3_tmp = svzip1_s8(scale_lane_3_tmp, scale_lane_3_tmp);
-                        scale_lane_3_tmp = svzip1_s8(scale_lane_3_tmp, scale_lane_3_tmp);
-                        svint8_t scale_lane_4_tmp = svld1_s8(pg8_2, scale+6);
-                        scale_lane_4_tmp = svzip1_s8(scale_lane_4_tmp, scale_lane_4_tmp);
-                        scale_lane_4_tmp = svzip1_s8(scale_lane_4_tmp, scale_lane_4_tmp);
-                        svint32_t scale_lane_1 = svunpklo_s32(svunpklo_s16(scale_lane_1_tmp));
-                        svint32_t scale_lane_2 = svunpklo_s32(svunpklo_s16(scale_lane_2_tmp));
-                        svint32_t scale_lane_3 = svunpklo_s32(svunpklo_s16(scale_lane_3_tmp));
-                        svint32_t scale_lane_4 = svunpklo_s32(svunpklo_s16(scale_lane_4_tmp));
-
-                        isum_tmp = svmla_s32_x(pg32_8, isum_tmp, svdot_s32(vzero, q6bytes_1, q8bytes_1), scale_lane_1);
-                        isum_tmp = svmla_s32_x(pg32_8, isum_tmp, svdot_s32(vzero, q6bytes_2, q8bytes_2), scale_lane_2);
-                        isum_tmp = svmla_s32_x(pg32_8, isum_tmp, svdot_s32(vzero, q6bytes_3, q8bytes_3), scale_lane_3);
-                        isum_tmp = svmla_s32_x(pg32_8, isum_tmp, svdot_s32(vzero, q6bytes_4, q8bytes_4), scale_lane_4);
-                        scale += 8;
-                    }
-                    isum += svaddv_s32(pg32_8, isum_tmp);
-                    sum += d_all * y[i].d * (isum - 32 * isum_mins);
-                }
-                break;
-            default:
-                assert(false && "Unsupported vector length");
-                break;
-        }
-    }
-
-    *s = sum;
-
-#elif __ARM_NEON
-    float sum = 0;
-
-    const uint8x16_t m4b = vdupq_n_u8(0xF);
-    const int32x4_t vzero = vdupq_n_s32(0);
-    //const int8x16_t m32s = vdupq_n_s8(32);
-
-    const uint8x16_t mone = vdupq_n_u8(3);
-
-    ggml_int8x16x4_t q6bytes;
-    ggml_uint8x16x4_t q6h;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d_all = GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * GGML_RESTRICT q6 = x[i].ql;
-        const uint8_t * GGML_RESTRICT qh = x[i].qh;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-
-        const int8_t * GGML_RESTRICT scale = x[i].scales;
-
-        const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
-        const int8x16_t scales = vld1q_s8(scale);
-        const ggml_int16x8x2_t q6scales = {{vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))}};
-
-        const int32x4_t prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[0]), vget_low_s16 (q6scales.val[0])),
-                                                   vmull_s16(vget_high_s16(q8sums.val[0]), vget_high_s16(q6scales.val[0]))),
-                                         vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[1]), vget_low_s16 (q6scales.val[1])),
-                                                   vmull_s16(vget_high_s16(q8sums.val[1]), vget_high_s16(q6scales.val[1]))));
-        int32_t isum_mins = vaddvq_s32(prod);
-
-        int32_t isum = 0;
-
-        for (int j = 0; j < QK_K/128; ++j) {
-
-            ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); qh += 32;
-            ggml_uint8x16x4_t q6bits = ggml_vld1q_u8_x4(q6); q6 += 64;
-            ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
-
-            q6h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4);
-            q6h.val[1] = vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4);
-            uint8x16_t shifted = vshrq_n_u8(qhbits.val[0], 2);
-            q6h.val[2] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-            shifted = vshrq_n_u8(qhbits.val[1], 2);
-            q6h.val[3] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-
-            //q6bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[0], m4b), q6h.val[0])), m32s);
-            //q6bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[1], m4b), q6h.val[1])), m32s);
-            //q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2])), m32s);
-            //q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3])), m32s);
-            q6bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[0], m4b), q6h.val[0]));
-            q6bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[1], m4b), q6h.val[1]));
-            q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2]));
-            q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3]));
-
-            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
-                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
-                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
-                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
-
-            scale += 4;
-
-            q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
-
-            shifted = vshrq_n_u8(qhbits.val[0], 4);
-            q6h.val[0] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-            shifted = vshrq_n_u8(qhbits.val[1], 4);
-            q6h.val[1] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-            shifted = vshrq_n_u8(qhbits.val[0], 6);
-            q6h.val[2] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-            shifted = vshrq_n_u8(qhbits.val[1], 6);
-            q6h.val[3] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-
-            //q6bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[0])), m32s);
-            //q6bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[1])), m32s);
-            //q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2])), m32s);
-            //q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3])), m32s);
-            q6bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[0]));
-            q6bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[1]));
-            q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2]));
-            q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3]));
-
-            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
-                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
-                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
-                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
-            scale += 4;
-        }
-        //sum += isum * d_all * y[i].d;
-        sum += d_all * y[i].d * (isum - 32 * isum_mins);
-
-    }
-    *s = sum;
-
-#elif defined __AVX2__
-
-    const __m256i m4 = _mm256_set1_epi8(0xF);
-    const __m256i m2 = _mm256_set1_epi8(3);
-    const __m256i m32s = _mm256_set1_epi8(32);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * GGML_RESTRICT q4 = x[i].ql;
-        const uint8_t * GGML_RESTRICT qh = x[i].qh;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-
-        const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
-
-        __m256i sumi = _mm256_setzero_si256();
-
-        int is = 0;
-
-        for (int j = 0; j < QK_K/128; ++j) {
-
-            const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0));
-            const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1));
-            const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2));
-            const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3));
-            is += 4;
-
-            const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
-            const __m256i q4bits2 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
-            const __m256i q4bitsH = _mm256_loadu_si256((const __m256i*)qh); qh += 32;
-
-            const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(q4bitsH, m2), 4);
-            const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 2), m2), 4);
-            const __m256i q4h_2 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 4), m2), 4);
-            const __m256i q4h_3 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 6), m2), 4);
-
-            const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0);
-            const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(q4bits2, m4), q4h_1);
-            const __m256i q4_2 =
_mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_2); - const __m256i q4_3 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits2, 4), m4), q4h_3); - - const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - - __m256i q8s_0 = _mm256_maddubs_epi16(m32s, q8_0); - __m256i q8s_1 = _mm256_maddubs_epi16(m32s, q8_1); - __m256i q8s_2 = _mm256_maddubs_epi16(m32s, q8_2); - __m256i q8s_3 = _mm256_maddubs_epi16(m32s, q8_3); - - __m256i p16_0 = _mm256_maddubs_epi16(q4_0, q8_0); - __m256i p16_1 = _mm256_maddubs_epi16(q4_1, q8_1); - __m256i p16_2 = _mm256_maddubs_epi16(q4_2, q8_2); - __m256i p16_3 = _mm256_maddubs_epi16(q4_3, q8_3); - - p16_0 = _mm256_sub_epi16(p16_0, q8s_0); - p16_1 = _mm256_sub_epi16(p16_1, q8s_1); - p16_2 = _mm256_sub_epi16(p16_2, q8s_2); - p16_3 = _mm256_sub_epi16(p16_3, q8s_3); - - p16_0 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_0), p16_0); - p16_1 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_1), p16_1); - p16_2 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_2), p16_2); - p16_3 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_3), p16_3); - - sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1)); - sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_2, p16_3)); - - } - - acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc); - } - - *s = hsum_float_8(acc); - -#elif defined __AVX__ - - const __m128i m3 = _mm_set1_epi8(3); - const __m128i m15 = _mm_set1_epi8(15); - - __m256 acc = _mm256_setzero_ps(); - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - - const uint8_t * GGML_RESTRICT q4 = x[i].ql; - const uint8_t * GGML_RESTRICT qh = x[i].qh; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - // handle the q6_k -32 offset separately using bsums - const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)y[i].bsums); - const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)y[i].bsums + 1); - const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales); - const __m128i scales_16_0 = _mm_cvtepi8_epi16(scales); - const __m128i scales_16_1 = _mm_cvtepi8_epi16(_mm_bsrli_si128(scales, 8)); - const __m128i q8sclsub_0 = _mm_slli_epi32(_mm_madd_epi16(q8sums_0, scales_16_0), 5); - const __m128i q8sclsub_1 = _mm_slli_epi32(_mm_madd_epi16(q8sums_1, scales_16_1), 5); - - __m128i sumi_0 = _mm_setzero_si128(); - __m128i sumi_1 = _mm_setzero_si128(); - - int is = 0; - - for (int j = 0; j < QK_K/128; ++j) { - - const __m128i q4bitsH_0 = _mm_loadu_si128((const __m128i*)qh); qh += 16; - const __m128i q4bitsH_1 = _mm_loadu_si128((const __m128i*)qh); qh += 16; - - const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, m3), 4); - const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, m3), 4); - const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(12)), 2); - const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(12)), 2); - const __m128i q4h_4 = _mm_and_si128(q4bitsH_0, _mm_set1_epi8(48)); - const __m128i q4h_5 = _mm_and_si128(q4bitsH_1, _mm_set1_epi8(48)); - const __m128i q4h_6 = _mm_srli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(-64)), 2); - const __m128i q4h_7 = _mm_srli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(-64)), 2); - - const __m128i q4bits1_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; - 
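// These four loads (one above, three below) fetch the 64 bytes of packed low
- // nibbles for this 128-weight chunk. Scalar equivalent of the reconstruction
- // done below (cf. the generic fallback at the end of this function):
- //     w = (int8_t)((ql & 0xF) | (((qh >> shift) & 3) << 4)) - 32;
- // except that, instead of subtracting the offset of 32 per weight, q8sclsub_*
- // above precomputes 32 * bsum * scale (hence the << 5), which is subtracted
- // once after the loop.
-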
const __m128i q4bits1_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; - const __m128i q4bits2_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; - const __m128i q4bits2_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; - - const __m128i q4_0 = _mm_or_si128(_mm_and_si128(q4bits1_0, m15), q4h_0); - const __m128i q4_1 = _mm_or_si128(_mm_and_si128(q4bits1_1, m15), q4h_1); - const __m128i q4_2 = _mm_or_si128(_mm_and_si128(q4bits2_0, m15), q4h_2); - const __m128i q4_3 = _mm_or_si128(_mm_and_si128(q4bits2_1, m15), q4h_3); - const __m128i q4_4 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m15), q4h_4); - const __m128i q4_5 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m15), q4h_5); - const __m128i q4_6 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m15), q4h_6); - const __m128i q4_7 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m15), q4h_7); - - const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - - __m128i p16_0 = _mm_maddubs_epi16(q4_0, q8_0); - __m128i p16_1 = _mm_maddubs_epi16(q4_1, q8_1); - __m128i p16_2 = _mm_maddubs_epi16(q4_2, q8_2); - __m128i p16_3 = _mm_maddubs_epi16(q4_3, q8_3); - __m128i p16_4 = _mm_maddubs_epi16(q4_4, q8_4); - __m128i p16_5 = _mm_maddubs_epi16(q4_5, q8_5); - __m128i p16_6 = _mm_maddubs_epi16(q4_6, q8_6); - __m128i p16_7 = _mm_maddubs_epi16(q4_7, q8_7); - - const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0)); - const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1)); - const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2)); - const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3)); - is += 4; - - p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0); - p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_0, 8)), p16_1); - p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2); - p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_1, 8)), p16_3); - p16_4 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_2), p16_4); - p16_5 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_2, 8)), p16_5); - p16_6 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_3), p16_6); - p16_7 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_3, 8)), p16_7); - - sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2)); - sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3)); - sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_4, p16_6)); - sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_5, p16_7)); - - } - - sumi_0 = _mm_sub_epi32(sumi_0, q8sclsub_0); - sumi_1 = _mm_sub_epi32(sumi_1, q8sclsub_1); - const __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); - acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi)), acc); - } - - *s = hsum_float_8(acc); - -#elif defined __wasm_simd128__ - int8_t aux8[QK_K] __attribute__((aligned(16))); - int32_t aux32[8] __attribute__((aligned(16))) = {0}; - float sums[8] __attribute__((aligned(16))) = {0}; - - for (int i = 0; i < nb; ++i) { - // Unpack 6-bit quantized data into aux8 
(unchanged) - const uint8_t * GGML_RESTRICT q4 = x[i].ql; - const uint8_t * GGML_RESTRICT qh = x[i].qh; - int8_t * a = aux8; - for (int j = 0; j < QK_K; j += 128) { - for (int l = 0; l < 32; ++l) { - a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; - a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; - a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; - a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; - } - a += 128; - q4 += 64; - qh += 32; - } - - const int8_t * GGML_RESTRICT a_ptr = aux8; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - v128_t acc0 = wasm_i32x4_splat(0); - v128_t acc1 = wasm_i32x4_splat(0); - - for (int j = 0; j < QK_K/16; ++j) { - const int scale = x[i].scales[j]; - const v128_t vscale = wasm_i32x4_splat(scale); - - // Load 16 elements from a and q8 - const v128_t a_vec = wasm_v128_load(a_ptr); - const v128_t q8_vec = wasm_v128_load(q8); - - // Process low 8 elements - v128_t a_low = wasm_i16x8_extend_low_i8x16(a_vec); - v128_t q8_low = wasm_i16x8_extend_low_i8x16(q8_vec); - v128_t prod_low = wasm_i16x8_mul(a_low, q8_low); - v128_t prod_lo_lo = wasm_i32x4_extend_low_i16x8(prod_low); - v128_t prod_lo_hi = wasm_i32x4_extend_high_i16x8(prod_low); - - // Process high 8 elements - v128_t a_high = wasm_i16x8_extend_high_i8x16(a_vec); - v128_t q8_high = wasm_i16x8_extend_high_i8x16(q8_vec); - v128_t prod_high = wasm_i16x8_mul(a_high, q8_high); - v128_t prod_hi_lo = wasm_i32x4_extend_low_i16x8(prod_high); - v128_t prod_hi_hi = wasm_i32x4_extend_high_i16x8(prod_high); - - // Scale and accumulate - prod_lo_lo = wasm_i32x4_mul(prod_lo_lo, vscale); - prod_lo_hi = wasm_i32x4_mul(prod_lo_hi, vscale); - prod_hi_lo = wasm_i32x4_mul(prod_hi_lo, vscale); - prod_hi_hi = wasm_i32x4_mul(prod_hi_hi, vscale); - - acc0 = wasm_i32x4_add(acc0, wasm_i32x4_add(prod_lo_lo, prod_hi_lo)); - acc1 = wasm_i32x4_add(acc1, wasm_i32x4_add(prod_lo_hi, prod_hi_hi)); - - a_ptr += 16; - q8 += 16; - } - - // Store accumulated results - wasm_v128_store(&aux32[0], acc0); - wasm_v128_store(&aux32[4], acc1); - - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - for (int l = 0; l < 8; ++l) { - sums[l] += d * aux32[l]; - } - } - - // Sum final results - float sumf = 0; - for (int l = 0; l < 8; ++l) { - sumf += sums[l]; - } - *s = sumf; - -#elif defined __riscv_xtheadvector - - float sumf = 0; - - for (int i = 0; i < nb; ++i) { - - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - - const uint8_t * restrict q6 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; - - const int8_t * restrict scale = x[i].scales; - - int sum_t = 0; - int t0; - - for (int j = 0; j < QK_K/128; ++j) { - __asm__ __volatile__( - "th.vsetvli zero, %[vl32], e8, m2\n\t" // vl == 32 - "th.vlb.v v4, (%[qh])\n\t" - "th.vsll.vi v0, v4, 4\n\t" - "th.vsll.vi v2, v4, 2\n\t" - "th.vsrl.vi v6, v4, 2\n\t" - "th.vsetvli zero, %[vl64], e8, m4\n\t" // vl == 64 - "th.vlb.v v8, (%[q6])\n\t" - "th.vsrl.vi v12, v8, 4\n\t" - "th.vand.vi v8, v8, 0xF\n\t" - "th.vsetvli zero, %[vl128], e8, m8\n\t" // vl == 128 - "th.vand.vx v0, v0, %[mask]\n\t" - "th.vor.vv v8, v8, v0\n\t" - "th.vlb.v v0, (%[q8])\n\t" - "th.vsub.vx v8, v8, %[vl32]\n\t" - "th.vsetvli zero, %[vl64], e8, m4\n\t" // vl == 64 - "th.vwmul.vv v16, v0, v8\n\t" - "th.vwmul.vv v24, v4, v12\n\t" - "li %[t0], 16\n\t" - "th.vsetvli zero, %[t0], e16, m2\n\t" // vl == 16 - "th.vmv.v.x v0, zero\n\t" - "th.vwredsum.vs v10, v16, v0\n\t" - "th.vwredsum.vs v9, v18, v0\n\t" - 
"th.vwredsum.vs v8, v20, v0\n\t" - "th.vwredsum.vs v7, v22, v0\n\t" - "th.vwredsum.vs v11, v24, v0\n\t" - "th.vwredsum.vs v12, v26, v0\n\t" - "th.vwredsum.vs v13, v28, v0\n\t" - "th.vwredsum.vs v14, v30, v0\n\t" - "li %[t0], 4\n\t" - "th.vsetvli zero, %[t0], e32, m1\n\t" // vl == 4 - "th.vslideup.vi v10, v9, 1\n\t" - "th.vslideup.vi v8, v7, 1\n\t" - "th.vslideup.vi v11, v12, 1\n\t" - "th.vslideup.vi v13, v14, 1\n\t" - "th.vslideup.vi v10, v8, 2\n\t" - "th.vslideup.vi v11, v13, 2\n\t" - "li %[t0], 8\n\t" - "th.vsetvli zero, %[t0], e32, m2\n\t" // vl == 8 - "th.vlb.v v4, (%[scale])\n\t" - "th.vmul.vv v2, v4, v10\n\t" - "th.vredsum.vs v0, v2, v0\n\t" - "th.vmv.x.s %[t0], v0\n\t" - "add %[sumi], %[sumi], %[t0]" - : [sumi] "+&r" (sum_t), [t0] "=&r" (t0) - : [qh] "r" (qh), [q6] "r" (q6), [q8] "r" (q8), [scale] "r" (scale) - , [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128) - , [mask] "r" (0x30) - : "memory" - , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" - , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" - , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); - q6 += 64; qh += 32; q8 += 128; scale += 8; - } - - sumf += d * sum_t; - - } - - *s = sumf; - -#elif defined __riscv_v - - float sumf = 0; - const int vector_length = __riscv_vlenb() * 8; - - switch (vector_length) { - case 256: - for (int i = 0; i < nb; ++i) { - - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - - const uint8_t * GGML_RESTRICT q6 = x[i].ql; - const uint8_t * GGML_RESTRICT qh = x[i].qh; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - const int8_t * GGML_RESTRICT scale = x[i].scales; - - size_t vl; - - vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); - - int sum_t = 0; - int is = 0; - - for (int j = 0; j < QK_K/128; ++j) { - - vl = 32; - - // load qh - vuint8m1_t qh_x = __riscv_vle8_v_u8m1(qh, vl); - - // load Q6 - vuint8m1_t q6_0 = __riscv_vle8_v_u8m1(q6, vl); - vuint8m1_t q6_1 = __riscv_vle8_v_u8m1(q6+32, vl); - - vuint8m1_t q6a_0 = __riscv_vand_vx_u8m1(q6_0, 0x0F, vl); - vuint8m1_t q6a_1 = __riscv_vand_vx_u8m1(q6_1, 0x0F, vl); - vuint8m1_t q6s_0 = __riscv_vsrl_vx_u8m1(q6_0, 0x04, vl); - vuint8m1_t q6s_1 = __riscv_vsrl_vx_u8m1(q6_1, 0x04, vl); - - vuint8m1_t qh_0 = __riscv_vand_vx_u8m1(qh_x, 0x03, vl); - vuint8m1_t qh_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x2, vl), 0x03 , vl); - vuint8m1_t qh_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x4, vl), 0x03 , vl); - vuint8m1_t qh_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x6, vl), 0x03 , vl); - - vuint8m1_t qhi_0 = __riscv_vor_vv_u8m1(q6a_0, __riscv_vsll_vx_u8m1(qh_0, 0x04, vl), vl); - vuint8m1_t qhi_1 = __riscv_vor_vv_u8m1(q6a_1, __riscv_vsll_vx_u8m1(qh_1, 0x04, vl), vl); - vuint8m1_t qhi_2 = __riscv_vor_vv_u8m1(q6s_0, __riscv_vsll_vx_u8m1(qh_2, 0x04, vl), vl); - vuint8m1_t qhi_3 = __riscv_vor_vv_u8m1(q6s_1, __riscv_vsll_vx_u8m1(qh_3, 0x04, vl), vl); - - vint8m1_t a_0 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_0), 32, vl); - vint8m1_t a_1 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_1), 32, vl); - vint8m1_t a_2 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_2), 32, vl); - vint8m1_t a_3 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_3), 32, vl); - - // load Q8 and take product - vint16m2_t va_q_0 = __riscv_vwmul_vv_i16m2(a_0, __riscv_vle8_v_i8m1(q8, vl), vl); - vint16m2_t va_q_1 = __riscv_vwmul_vv_i16m2(a_1, __riscv_vle8_v_i8m1(q8+32, vl), vl); - vint16m2_t va_q_2 = __riscv_vwmul_vv_i16m2(a_2, 
__riscv_vle8_v_i8m1(q8+64, vl), vl); - vint16m2_t va_q_3 = __riscv_vwmul_vv_i16m2(a_3, __riscv_vle8_v_i8m1(q8+96, vl), vl); - - vl = 16; - - vint32m2_t vaux_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 0), scale[is+0], vl); - vint32m2_t vaux_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 1), scale[is+1], vl); - vint32m2_t vaux_2 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 0), scale[is+2], vl); - vint32m2_t vaux_3 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 1), scale[is+3], vl); - vint32m2_t vaux_4 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 0), scale[is+4], vl); - vint32m2_t vaux_5 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 1), scale[is+5], vl); - vint32m2_t vaux_6 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 0), scale[is+6], vl); - vint32m2_t vaux_7 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 1), scale[is+7], vl); - - vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_0, vaux_1, vl), vzero, vl); - vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_2, vaux_3, vl), isum0, vl); - vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_4, vaux_5, vl), isum1, vl); - vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_6, vaux_7, vl), isum2, vl); - - sum_t += __riscv_vmv_x_s_i32m1_i32(isum3); - - q6 += 64; qh += 32; q8 += 128; is=8; - - } - - sumf += d * sum_t; - - } - break; - case 128: - for (int i = 0; i < nb; ++i) { - - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - - const uint8_t * restrict q6 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; - - const int8_t * restrict scale = x[i].scales; - - int sum_t = 0; - int t0; - - for (int j = 0; j < QK_K/128; ++j) { - __asm__ __volatile__( - "vsetvli zero, %[vl32], e8, m2\n\t" - "vle8.v v4, (%[qh])\n\t" - "vsll.vi v0, v4, 4\n\t" - "vsll.vi v2, v4, 2\n\t" - "vsrl.vi v6, v4, 2\n\t" - "vsetvli zero, %[vl64], e8, m4\n\t" - "vle8.v v8, (%[q6])\n\t" - "vsrl.vi v12, v8, 4\n\t" - "vand.vi v8, v8, 0xF\n\t" - "vsetvli zero, %[vl128], e8, m8\n\t" - "vand.vx v0, v0, %[mask]\n\t" - "vor.vv v8, v8, v0\n\t" - "vle8.v v0, (%[q8])\n\t" - "vsub.vx v8, v8, %[vl32]\n\t" - "vsetvli zero, %[vl64], e8, m4\n\t" - "vwmul.vv v16, v0, v8\n\t" - "vwmul.vv v24, v4, v12\n\t" - "vsetivli zero, 16, e16, m2\n\t" - "vmv.v.x v0, zero\n\t" - "vwredsum.vs v10, v16, v0\n\t" - "vwredsum.vs v9, v18, v0\n\t" - "vwredsum.vs v8, v20, v0\n\t" - "vwredsum.vs v7, v22, v0\n\t" - "vwredsum.vs v11, v24, v0\n\t" - "vwredsum.vs v12, v26, v0\n\t" - "vwredsum.vs v13, v28, v0\n\t" - "vwredsum.vs v14, v30, v0\n\t" - "vsetivli zero, 4, e32, m1\n\t" - "vslideup.vi v10, v9, 1\n\t" - "vslideup.vi v8, v7, 1\n\t" - "vslideup.vi v11, v12, 1\n\t" - "vslideup.vi v13, v14, 1\n\t" - "vslideup.vi v10, v8, 2\n\t" - "vslideup.vi v11, v13, 2\n\t" - "vsetivli zero, 8, e32, m2\n\t" - "vle8.v v2, (%[scale])\n\t" - "vsext.vf4 v4, v2\n\t" - "vmul.vv v2, v4, v10\n\t" - "vredsum.vs v0, v2, v0\n\t" - "vmv.x.s %[t0], v0\n\t" - "add %[sumi], %[sumi], %[t0]" - : [sumi] "+&r" (sum_t), [t0] "=&r" (t0) - : [qh] "r" (qh), [q6] "r" (q6), [q8] "r" (q8), [scale] "r" (scale) - , [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128) - , [mask] "r" (0x30) - : "memory" - , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" - , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" - , "v24", "v25", "v26", "v27", 
"v28", "v29", "v30", "v31" - ); - q6 += 64; qh += 32; q8 += 128; scale += 8; - } - - sumf += d * sum_t; - - } - break; - default: - assert(false && "Unsupported vector length"); - break; - } - - *s = sumf; - -#elif defined(__POWER9_VECTOR__) - const vector signed char lowMask = vec_splats((signed char)0xF); - const vector int v0 = vec_splats((int32_t)0); - const vector unsigned char v2 = vec_splats((unsigned char)0x2); - const vector unsigned char v3 = vec_splats((unsigned char)0x3); - const vector unsigned char v4 = vec_splats((unsigned char)0x4); - const vector unsigned char v6 = vec_splats((unsigned char)0x6); - const vector signed char off = vec_splats((signed char)0x20); - - vector float vsumf0 = vec_splats(0.0f); - vector float vsumf1 = vec_splats(0.0f); - vector float vsumf2 = vec_splats(0.0f); - vector float vsumf3 = vec_splats(0.0f); - - for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); - vector float vyd = vec_splats(y[i].d); - vector float vd = vec_mul(vxd, vyd); - - vector signed int vsumi0 = v0; - vector signed int vsumi1 = v0; - vector signed int vsumi2 = v0; - vector signed int vsumi3 = v0; - vector signed int vsumi4 = v0; - vector signed int vsumi5 = v0; - vector signed int vsumi6 = v0; - vector signed int vsumi7 = v0; - - const uint8_t * GGML_RESTRICT q6 = x[i].ql; - const uint8_t * GGML_RESTRICT qh = x[i].qh; - const int8_t * GGML_RESTRICT qs = x[i].scales; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - for (int j = 0; j < QK_K/128; ++j) { - __builtin_prefetch(q6, 0, 0); - __builtin_prefetch(qh, 0, 0); - __builtin_prefetch(q8, 0, 0); - - vector signed char qxs0 = (vector signed char)vec_xl( 0, q6); - vector signed char qxs1 = (vector signed char)vec_xl(16, q6); - vector signed char qxs2 = (vector signed char)vec_xl(32, q6); - vector signed char qxs3 = (vector signed char)vec_xl(48, q6); - q6 += 64; - - vector signed char qxs00 = vec_and(qxs0, lowMask); - vector signed char qxs01 = vec_sr(qxs0, v4); - vector signed char qxs10 = vec_and(qxs1, lowMask); - vector signed char qxs11 = vec_sr(qxs1, v4); - vector signed char qxs20 = vec_and(qxs2, lowMask); - vector signed char qxs21 = vec_sr(qxs2, v4); - vector signed char qxs30 = vec_and(qxs3, lowMask); - vector signed char qxs31 = vec_sr(qxs3, v4); - - vector signed char qxhs0 = (vector signed char)vec_xl( 0, qh); - vector signed char qxhs1 = (vector signed char)vec_xl(16, qh); - qh += 32; - - vector signed char qxh00 = vec_sl(vec_and((vector signed char)v3, qxhs0), v4); - vector signed char qxh01 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v4)), v4); - vector signed char qxh10 = vec_sl(vec_and((vector signed char)v3, qxhs1), v4); - vector signed char qxh11 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v4)), v4); - vector signed char qxh20 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v2)), v4); - vector signed char qxh21 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v6)), v4); - vector signed char qxh30 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v2)), v4); - vector signed char qxh31 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v6)), v4); - - vector signed char q6x00 = vec_sub(vec_or(qxh00, qxs00), off); - vector signed char q6x01 = vec_sub(vec_or(qxh01, qxs01), off); - vector signed char q6x10 = vec_sub(vec_or(qxh10, qxs10), off); - vector signed char q6x11 = vec_sub(vec_or(qxh11, qxs11), off); - vector signed char q6x20 = vec_sub(vec_or(qxh20, qxs20), off); - vector signed char q6x21 = vec_sub(vec_or(qxh21, qxs21), off); - 
vector signed char q6x30 = vec_sub(vec_or(qxh30, qxs30), off); - vector signed char q6x31 = vec_sub(vec_or(qxh31, qxs31), off); - - vector signed char q8y00 = vec_xl( 0, q8); - vector signed char q8y10 = vec_xl( 16, q8); - vector signed char q8y20 = vec_xl( 32, q8); - vector signed char q8y30 = vec_xl( 48, q8); - vector signed char q8y01 = vec_xl( 64, q8); - vector signed char q8y11 = vec_xl( 80, q8); - vector signed char q8y21 = vec_xl( 96, q8); - vector signed char q8y31 = vec_xl(112, q8); - q8 += 128; - - vector signed short qv00 = vec_add(vec_mule(q6x00, q8y00), vec_mulo(q6x00, q8y00)); - vector signed short qv10 = vec_add(vec_mule(q6x10, q8y10), vec_mulo(q6x10, q8y10)); - vector signed short qv20 = vec_add(vec_mule(q6x20, q8y20), vec_mulo(q6x20, q8y20)); - vector signed short qv30 = vec_add(vec_mule(q6x30, q8y30), vec_mulo(q6x30, q8y30)); - vector signed short qv01 = vec_add(vec_mule(q6x01, q8y01), vec_mulo(q6x01, q8y01)); - vector signed short qv11 = vec_add(vec_mule(q6x11, q8y11), vec_mulo(q6x11, q8y11)); - vector signed short qv21 = vec_add(vec_mule(q6x21, q8y21), vec_mulo(q6x21, q8y21)); - vector signed short qv31 = vec_add(vec_mule(q6x31, q8y31), vec_mulo(q6x31, q8y31)); - - vector signed short vscales = vec_unpackh(vec_xl_len(qs, 8)); - qs += 8; - - vector signed short vs0 = vec_splat(vscales, 0); - vector signed short vs1 = vec_splat(vscales, 1); - vector signed short vs2 = vec_splat(vscales, 2); - vector signed short vs3 = vec_splat(vscales, 3); - vector signed short vs4 = vec_splat(vscales, 4); - vector signed short vs5 = vec_splat(vscales, 5); - vector signed short vs6 = vec_splat(vscales, 6); - vector signed short vs7 = vec_splat(vscales, 7); - - vsumi0 = vec_msum(qv00, vs0, vsumi0); - vsumi1 = vec_msum(qv01, vs4, vsumi1); - vsumi2 = vec_msum(qv10, vs1, vsumi2); - vsumi3 = vec_msum(qv11, vs5, vsumi3); - vsumi4 = vec_msum(qv20, vs2, vsumi4); - vsumi5 = vec_msum(qv21, vs6, vsumi5); - vsumi6 = vec_msum(qv30, vs3, vsumi6); - vsumi7 = vec_msum(qv31, vs7, vsumi7); - } - - vsumi0 = vec_add(vsumi0, vsumi4); - vsumi1 = vec_add(vsumi1, vsumi5); - vsumi2 = vec_add(vsumi2, vsumi6); - vsumi3 = vec_add(vsumi3, vsumi7); - - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); - vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); - vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); - vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); - } - - vsumf0 = vec_add(vsumf0, vsumf2); - vsumf1 = vec_add(vsumf1, vsumf3); - - vsumf0 = vec_add(vsumf0, vsumf1); - - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - - *s = vec_extract(vsumf0, 0); - -#elif defined __loongarch_asx - - const __m256i m32s = __lasx_xvreplgr2vr_b(32); - - __m256 acc = (__m256)__lasx_xvldi(0); - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - - const uint8_t * GGML_RESTRICT q4 = x[i].ql; - const uint8_t * GGML_RESTRICT qh = x[i].qh; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - const __m128i scales128 = __lsx_vld((const __m128i*)x[i].scales, 0); - const v16i8 shuffle_mask = {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15}; - const __m256i scales_shuffled = lasx_ext8_16(__lsx_vshuf_b(scales128, scales128, (__m128i)shuffle_mask)); - - __m256i sumi = __lasx_xvldi(0); - - for (int j = 0; j < QK_K/128; ++j) { - - const __m256i q4bits1 = __lasx_xvld((const __m256i*)q4, 0); q4 += 32; - const __m256i q4bits2 = __lasx_xvld((const __m256i*)q4, 0); q4 += 32; - const __m256i q4bitsH = __lasx_xvld((const __m256i*)qh, 0); qh += 32; - 
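// qh packs four 2-bit planes per byte; each q4h_* below masks one plane with
- // 3 << (2*k) and shifts it so the two high bits land at bit positions 4..5,
- // ready to be OR-ed with the low nibbles of q4bits1/q4bits2.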
- const __m256i q4h_0 = __lasx_xvslli_b(__lasx_xvandi_b(q4bitsH, 3), 4); - const __m256i q4h_1 = __lasx_xvslli_b(__lasx_xvandi_b(q4bitsH, 3 << 2), 2); - const __m256i q4h_2 = __lasx_xvandi_b(q4bitsH, 3 << 4); - const __m256i q4h_3 = __lasx_xvsrli_b(__lasx_xvandi_b(q4bitsH, 3 << 6), 2); - - const __m256i q4_0 = __lasx_xvor_v(__lasx_xvandi_b(q4bits1, 0xf), q4h_0); - const __m256i q4_1 = __lasx_xvor_v(__lasx_xvandi_b(q4bits2, 0xf), q4h_1); - const __m256i q4_2 = __lasx_xvor_v(__lasx_xvsrli_b(q4bits1, 4), q4h_2); - const __m256i q4_3 = __lasx_xvor_v(__lasx_xvsrli_b(q4bits2, 4), q4h_3); - - const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - const __m256i q8_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - const __m256i q8_3 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - - __m256i p16_0 = lasx_madd_h_b(__lasx_xvsub_b(q4_0, m32s), q8_0); - __m256i p16_1 = lasx_madd_h_b(__lasx_xvsub_b(q4_1, m32s), q8_1); - __m256i p16_2 = lasx_madd_h_b(__lasx_xvsub_b(q4_2, m32s), q8_2); - __m256i p16_3 = lasx_madd_h_b(__lasx_xvsub_b(q4_3, m32s), q8_3); - - p16_0 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 0), p16_0); - p16_1 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 1), p16_1); - p16_2 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 2), p16_2); - p16_3 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 3), p16_3); - - sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_1)); - sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_2, p16_3)); - } - - acc = __lasx_xvfmadd_s((__m256)__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), acc); - } - - *s = hsum_float_8(acc); -#elif defined(__VXE__) || defined(__VXE2__) - float sum = 0; - - // Lower 4-bit and upper 2-bit masks - const uint8x16_t v_lm = vec_splat_u8(0x0F); - const uint8x16_t v_um = vec_splat_u8(0x03); - - const int32x4_t v_z = vec_splat_s32(0); - - int8x16_t q6b[4]; - uint8x16_t q6h[4]; - - uint8x16_t v_xl[4]; - uint8x16_t v_xh[2]; - int8x16_t v_y[4]; - - for (int i = 0; i < nb; ++i) { - const float d_all = GGML_FP16_TO_FP32(x[i].d); - - const uint8_t * GGML_RESTRICT x0l = x[i].ql; - const uint8_t * GGML_RESTRICT x0h = x[i].qh; - const int8_t * GGML_RESTRICT y0 = y[i].qs; - - const int8_t * GGML_RESTRICT scale = x[i].scales; - - const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums); - const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums); - - const int8x16_t v_scale = vec_xl(0, scale); - const int16x8_t v_scalel = vec_unpackh(v_scale); - const int16x8_t v_scaleh = vec_unpackl(v_scale); - - const int32x4_t v_minslo = vec_mulo(v_ysumsl, v_scalel); - const int32x4_t v_minsle = vec_mule(v_ysumsl, v_scalel); - const int32x4_t v_minsho = vec_mulo(v_ysumsh, v_scaleh); - const int32x4_t v_minshe = vec_mule(v_ysumsh, v_scaleh); - const int32x4_t v_mins = v_minslo + v_minsle + v_minsho + v_minshe; - - const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3]; - - int32_t isum = 0; - for (int j = 0; j < QK_K/128; ++j) { - // Load model upper 2 bits - v_xh[0] = vec_xl(0 , x0h); - v_xh[1] = vec_xl(16, x0h); - x0h += 32; - - // Load model lower 4 bits - v_xl[0] = vec_xl(0 , x0l); - v_xl[1] = vec_xl(16, x0l); - v_xl[2] = vec_xl(32, x0l); - v_xl[3] = vec_xl(48, x0l); - x0l += 64; - - // Load activation quants - v_y[0] = vec_xl(0 , y0); - v_y[1] = vec_xl(16, y0); - v_y[2] = vec_xl(32, y0); - v_y[3] = vec_xl(48, y0); - y0 += 64; - - q6h[0] = vec_sl(vec_and(v_um, v_xh[0]), 4); - q6h[1] = vec_sl(vec_and(v_um, v_xh[1]), 4); - uint8x16_t 
shifted = vec_sr(v_xh[0], 2); - q6h[2] = vec_sl(vec_and(v_um, shifted), 4); - shifted = vec_sr(v_xh[1], 2); - q6h[3] = vec_sl(vec_and(v_um, shifted), 4); - - q6b[0] = (int8x16_t)(vec_or(vec_and(v_xl[0], v_lm), q6h[0])); - q6b[1] = (int8x16_t)(vec_or(vec_and(v_xl[1], v_lm), q6h[1])); - q6b[2] = (int8x16_t)(vec_or(vec_and(v_xl[2], v_lm), q6h[2])); - q6b[3] = (int8x16_t)(vec_or(vec_and(v_xl[3], v_lm), q6h[3])); - - int32x4_t summs0 = ggml_vec_dot(v_z, q6b[0], v_y[0]); - int32x4_t summs1 = ggml_vec_dot(v_z, q6b[1], v_y[1]); - int32x4_t summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]); - int32x4_t summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]); - - isum += (summs0[0] + summs0[1] + summs0[2] + summs0[3]) * scale[0] + - (summs1[0] + summs1[1] + summs1[2] + summs1[3]) * scale[1] + - (summs2[0] + summs2[1] + summs2[2] + summs2[3]) * scale[2] + - (summs3[0] + summs3[1] + summs3[2] + summs3[3]) * scale[3]; - - scale += 4; - - - // Load activation quants - v_y[0] = vec_xl(0 , y0); - v_y[1] = vec_xl(16, y0); - v_y[2] = vec_xl(32, y0); - v_y[3] = vec_xl(48, y0); - y0 += 64; - - shifted = vec_sr(v_xh[0], 4); - q6h[0] = vec_sl(vec_and(v_um, shifted), 4); - shifted = vec_sr(v_xh[1], 4); - q6h[1] = vec_sl(vec_and(v_um, shifted), 4); - shifted = vec_sr(v_xh[0], 6); - q6h[2] = vec_sl(vec_and(v_um, shifted), 4); - shifted = vec_sr(v_xh[1], 6); - q6h[3] = vec_sl(vec_and(v_um, shifted), 4); - - q6b[0] = (int8x16_t)(vec_or(vec_sr(v_xl[0], 4), q6h[0])); - q6b[1] = (int8x16_t)(vec_or(vec_sr(v_xl[1], 4), q6h[1])); - q6b[2] = (int8x16_t)(vec_or(vec_sr(v_xl[2], 4), q6h[2])); - q6b[3] = (int8x16_t)(vec_or(vec_sr(v_xl[3], 4), q6h[3])); - - summs0 = ggml_vec_dot(v_z, q6b[0], v_y[0]); - summs1 = ggml_vec_dot(v_z, q6b[1], v_y[1]); - summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]); - summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]); - - isum += (summs0[0] + summs0[1] + summs0[2] + summs0[3]) * scale[0] + - (summs1[0] + summs1[1] + summs1[2] + summs1[3]) * scale[1] + - (summs2[0] + summs2[1] + summs2[2] + summs2[3]) * scale[2] + - (summs3[0] + summs3[1] + summs3[2] + summs3[3]) * scale[3]; - - scale += 4; - } - - sum += d_all * y[i].d * (isum - 32 * mins); - } - - *s = sum; -#else - - int8_t aux8[QK_K]; - int16_t aux16[8]; - float sums [8]; - int32_t aux32[8]; - memset(sums, 0, 8*sizeof(float)); - - float sumf = 0; - for (int i = 0; i < nb; ++i) { - const uint8_t * GGML_RESTRICT q4 = x[i].ql; - const uint8_t * GGML_RESTRICT qh = x[i].qh; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * GGML_RESTRICT a = aux8; - for (int j = 0; j < QK_K; j += 128) { - for (int l = 0; l < 32; ++l) { - a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; - a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; - a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; - a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; - } - a += 128; - q4 += 64; - qh += 32; - } - a = aux8; - int is = 0; - for (int j = 0; j < QK_K/16; ++j) { - int scale = x[i].scales[is++]; - for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; - q8 += 8; a += 8; - for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; - q8 += 8; a += 8; - } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - } - for (int l = 0; l < 8; ++l) sumf += sums[l]; - *s = sumf; -#endif -} - -#if defined (__AVX__) || defined 
(__AVX2__) || defined (__ARM_NEON) || defined (__POWER9_VECTOR__) || defined(__loongarch_asx) -static const int8_t keven_signs_q2xs[1024] = { - 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, - 1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1, - 1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, -1, - 1, 1, -1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, 1, - 1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, -1, - 1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, 1, - 1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, 1, - 1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, -1, - 1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, -1, - 1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, 1, - 1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, 1, - 1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, -1, - 1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, 1, - 1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, -1, - 1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, -1, - 1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, 1, - 1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, -1, - 1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, 1, - 1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, 1, - 1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, -1, - 1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, 1, - 1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, -1, - 1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, -1, - 1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, 1, - 1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, 1, - 1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, -1, - 1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, -1, - 1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, 1, - 1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, -1, - 1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, 1, - 1, 1, 
1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1, - 1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -}; -#endif - -void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - assert(n % QK_K == 0); - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_iq2_xxs * GGML_RESTRICT x = vx; - const block_q8_K * GGML_RESTRICT y = vy; - - const int nb = n / QK_K; - -#if defined(__ARM_NEON) - - const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; - - uint32_t aux32[4]; - const uint8_t * aux8 = (const uint8_t *)aux32; - - ggml_int8x16x4_t q2u; - ggml_int8x16x4_t q2s; - ggml_int8x16x4_t q8b; - - float sumf = 0; - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint16_t * GGML_RESTRICT q2 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - float sumf1 = 0, sumf2 = 0; - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { - q8b = ggml_vld1q_s8_x4(q8); q8 += 64; - memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8; - q2u.val[0] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 0])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 1]))); - q2u.val[1] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 2])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 3]))); - q2u.val[2] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 8])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 9]))); - q2u.val[3] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[10])), vld1_s8((const void *)(iq2xxs_grid + aux8[11]))); - q2s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 7) & 127)))); - q2s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 21) & 127)))); - q2s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[3] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[3] >> 7) & 127)))); - q2s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[3] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[3] >> 21) & 127)))); - q2u.val[0] = vmulq_s8(q2u.val[0], q2s.val[0]); - q2u.val[1] = vmulq_s8(q2u.val[1], q2s.val[1]); - q2u.val[2] = vmulq_s8(q2u.val[2], q2s.val[2]); - q2u.val[3] = vmulq_s8(q2u.val[3], q2s.val[3]); - const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[0], q8b.val[0]), q2u.val[1], q8b.val[1]); - const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[2], q8b.val[2]), q2u.val[3], q8b.val[3]); - sumf1 += vaddvq_s32(p1) * (0.5f + (aux32[1] >> 28)); - sumf2 += vaddvq_s32(p2) * (0.5f + (aux32[3] >> 28)); - } - sumf += d*(sumf1 + sumf2); - } - *s = 0.25f * sumf; - -#elif defined(__AVX2__) - - const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; - - uint32_t aux32[4]; - const uint8_t * aux8 = (const uint8_t *)aux32; - - __m256 accumf = _mm256_setzero_ps(); - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint16_t * GGML_RESTRICT q2 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - __m256i sumi1 = _mm256_setzero_si256(); - __m256i sumi2 = _mm256_setzero_si256(); - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { - const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; - const 
__m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; - memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8; - const __m256i q2_1 = _mm256_set_epi64x(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]); - const __m256i q2_2 = _mm256_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]); - const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127], - signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]); - const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127], - signs64[(aux32[3] >> 7) & 127], signs64[(aux32[3] >> 0) & 127]); - const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1); - const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2); - const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); - const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); - const uint16_t ls1 = aux32[1] >> 28; - const uint16_t ls2 = aux32[3] >> 28; - const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1)); - const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1)); - sumi1 = _mm256_add_epi32(sumi1, p1); - sumi2 = _mm256_add_epi32(sumi2, p2); - } - - accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); - - } - - *s = 0.125f * hsum_float_8(accumf); - -#elif defined(__AVX__) - const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; - - uint32_t aux32[4]; - const uint8_t * aux8 = (const uint8_t *)aux32; - - __m256 accumf = _mm256_setzero_ps(); - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint16_t * GGML_RESTRICT q2 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - __m128i sumi1_0 = _mm_setzero_si128(); - __m128i sumi1_1 = _mm_setzero_si128(); - __m128i sumi2_0 = _mm_setzero_si128(); - __m128i sumi2_1 = _mm_setzero_si128(); - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { - const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8; - const __m128i q2_1_0 = _mm_set_epi64x(iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]); - const __m128i q2_1_1 = _mm_set_epi64x(iq2xxs_grid[aux8[3]], iq2xxs_grid[aux8[2]]); - const __m128i q2_2_0 = _mm_set_epi64x(iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]); - const __m128i q2_2_1 = _mm_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]]); - const __m128i s2_1_0 = _mm_set_epi64x(signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]); - const __m128i s2_1_1 = _mm_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127]); - const __m128i s2_2_0 = _mm_set_epi64x(signs64[(aux32[3] >> 7) & 127], signs64[(aux32[3] >> 0) & 127]); - const __m128i s2_2_1 = _mm_set_epi64x(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127]); - const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, s2_1_0); - const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, s2_1_1); - const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, s2_2_0); - const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, s2_2_1); - const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0); - const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1); - const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0); - const __m128i dot2_1 
= _mm_maddubs_epi16(q2_2_1, q8s_2_1); - const uint16_t ls1 = aux32[1] >> 28; - const uint16_t ls2 = aux32[3] >> 28; - const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1)); - const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1)); - const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1)); - const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1)); - sumi1_0 = _mm_add_epi32(sumi1_0, p1_0); - sumi1_1 = _mm_add_epi32(sumi1_1, p1_1); - sumi2_0 = _mm_add_epi32(sumi2_0, p2_0); - sumi2_1 = _mm_add_epi32(sumi2_1, p2_1); - } - - accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf); - - } - - *s = 0.125f * hsum_float_8(accumf); - -#elif defined(__POWER9_VECTOR__) - const vector int v0 = vec_splats((int32_t)0); - vector float vsumf0 = vec_splats(0.0f); - vector float vsumf1 = vec_splats(0.0f); - vector float vsumf2 = vec_splats(0.0f); - vector float vsumf3 = vec_splats(0.0f); - - const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; - - for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); - vector float vyd = vec_splats(y[i].d); - vector float vd = vec_mul(vxd, vyd); - - vector signed int vsumi0 = v0; - vector signed int vsumi1 = v0; - vector signed int vsumi2 = v0; - vector signed int vsumi3 = v0; - - const uint16_t * GGML_RESTRICT q2 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - for (int j = 0; j < QK_K/32; j += 2) { - __builtin_prefetch(q2, 0, 1); - __builtin_prefetch(q8, 0, 1); - - uint32_t aux32[4]; - const uint8_t * aux8 = (const uint8_t *)aux32; - - memcpy(aux32, q2, 4*sizeof(uint32_t)); - q2 += 8; - - vector signed long long aux64x2_0 = {*(const int64_t *)(iq2xxs_grid + aux8[ 0]), *(const int64_t *)(iq2xxs_grid + aux8[ 1])}; - vector signed long long aux64x2_1 = {*(const int64_t *)(iq2xxs_grid + aux8[ 2]), *(const int64_t *)(iq2xxs_grid + aux8[ 3])}; - vector signed long long aux64x2_2 = {*(const int64_t *)(iq2xxs_grid + aux8[ 8]), *(const int64_t *)(iq2xxs_grid + aux8[ 9])}; - vector signed long long aux64x2_3 = {*(const int64_t *)(iq2xxs_grid + aux8[10]), *(const int64_t *)(iq2xxs_grid + aux8[11])}; - - vector signed long long vsigns0 = {*(const int64_t *)(signs64 + ((aux32[1] >> 0) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 7) & 127))}; - vector signed long long vsigns1 = {*(const int64_t *)(signs64 + ((aux32[1] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 21) & 127))}; - vector signed long long vsigns2 = {*(const int64_t *)(signs64 + ((aux32[3] >> 0) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 7) & 127))}; - vector signed long long vsigns3 = {*(const int64_t *)(signs64 + ((aux32[3] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 21) & 127))}; - - vector signed char q2x0 = (vector signed char)vec_mul((vector signed char)vsigns0, (vector signed char)aux64x2_0); - vector signed char q2x1 = (vector signed char)vec_mul((vector signed char)vsigns1, (vector signed char)aux64x2_1); - vector signed char q2x2 = (vector signed char)vec_mul((vector signed char)vsigns2, (vector signed char)aux64x2_2); - vector signed char q2x3 = (vector signed char)vec_mul((vector signed char)vsigns3, (vector signed char)aux64x2_3); - - vector signed char q8y0 = vec_xl( 0, q8); - vector signed char q8y1 = vec_xl(16, q8); - vector signed char q8y2 = vec_xl(32, q8); - vector signed char q8y3 = vec_xl(48, q8); - q8 += 64; - - vector signed short qv0 = 
vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0)); - vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1)); - vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2)); - vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3)); - - const uint16_t ls0 = aux32[1] >> 28; - const uint16_t ls1 = aux32[3] >> 28; - - vector signed short vscales01 = vec_splats((int16_t)(2*ls0+1)); - vector signed short vscales23 = vec_splats((int16_t)(2*ls1+1)); - - vsumi0 = vec_msum(qv0, vscales01, vsumi0); - vsumi1 = vec_msum(qv1, vscales01, vsumi1); - vsumi2 = vec_msum(qv2, vscales23, vsumi2); - vsumi3 = vec_msum(qv3, vscales23, vsumi3); - } - - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); - vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); - vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); - vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); - } - - vsumf0 = vec_add(vsumf0, vsumf2); - vsumf1 = vec_add(vsumf1, vsumf3); - - vsumf0 = vec_add(vsumf0, vsumf1); - - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - - *s = 0.125f * vec_extract(vsumf0, 0); - -#elif defined(__loongarch_asx) - - const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; - - uint32_t aux32[4]; - const uint8_t * aux8 = (const uint8_t *)aux32; - - __m256 accumf = (__m256)__lasx_xvldi(0); - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint16_t * GGML_RESTRICT q2 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - __m256i sumi1 = __lasx_xvldi(0); - __m256i sumi2 = __lasx_xvldi(0); - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { - const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; - const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; - memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8; - - const __m256i q2_1 = lasx_set_d(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]); - const __m256i q2_2 = lasx_set_d(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]); - const __m256i s2_1 = lasx_set_d(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127], - signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]); - const __m256i s2_2 = lasx_set_d(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127], - signs64[(aux32[3] >> 7) & 127], signs64[(aux32[3] >> 0) & 127]); - const __m256i q8s_1 = __lasx_xvsigncov_b(s2_1, q8_1); - const __m256i q8s_2 = __lasx_xvsigncov_b(s2_2, q8_2); - const __m256i dot1 = lasx_maddubs_h(q2_1, q8s_1); - const __m256i dot2 = lasx_maddubs_h(q2_2, q8s_2); - const uint16_t ls1 = aux32[1] >> 28; - const uint16_t ls2 = aux32[3] >> 28; - const __m256i p1 = lasx_madd_h(dot1, __lasx_xvreplgr2vr_h(2*ls1+1)); - const __m256i p2 = lasx_madd_h(dot2, __lasx_xvreplgr2vr_h(2*ls2+1)); - sumi1 = __lasx_xvadd_w(sumi1, p1); - sumi2 = __lasx_xvadd_w(sumi2, p2); - } - - accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf); - } - - *s = 0.125f * hsum_float_8(accumf); -//#elif defined(__VXE__) || defined(__VXE2__) -// const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; -// -// uint32_t aux32[4]; -// const uint8_t * aux8 = (const uint8_t *)aux32; -// -// float sumf = 0; -// -// for (int i = 0; i < nb; ++i) { -// const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; -// const uint16_t * GGML_RESTRICT q2 = x[i].qs; -// const int8_t * GGML_RESTRICT q8 = y[i].qs; 
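-// // Per 32-weight block, q2 supplies four uint32 words: aux32[0] and aux32[2]
-// // each hold four byte indices into iq2xxs_grid (8 weights per index), while
-// // aux32[1] and aux32[3] each pack four 7-bit sign-table indices plus a 4-bit
-// // scale ls in bits 28..31; the block scale 2*ls + 1 appears here as
-// // (0.5f + ls) combined with the final 0.25f factor.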
-// -// float sumf1 = 0, sumf2 = 0; -// -// for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { -// int8x16_t q8b0 = vec_xl( 0, q8); -// int8x16_t q8b1 = vec_xl(16, q8); -// int8x16_t q8b2 = vec_xl(32, q8); -// int8x16_t q8b3 = vec_xl(48, q8); -// q8 += 64; -// -// memcpy(aux32, q2, 4 * sizeof(uint32_t)); -// q2 += 8; -// -// int8x16_t q2u0 = { *(const int64_t *)(iq2xxs_grid + aux8[ 0]), *(const int64_t *)(iq2xxs_grid + aux8[ 1]) }; -// int8x16_t q2u1 = { *(const int64_t *)(iq2xxs_grid + aux8[ 2]), *(const int64_t *)(iq2xxs_grid + aux8[ 3]) }; -// int8x16_t q2u2 = { *(const int64_t *)(iq2xxs_grid + aux8[ 8]), *(const int64_t *)(iq2xxs_grid + aux8[ 9]) }; -// int8x16_t q2u3 = { *(const int64_t *)(iq2xxs_grid + aux8[10]), *(const int64_t *)(iq2xxs_grid + aux8[11]) }; -// -// int8x16_t q2s0 = { *(const int64_t *)(signs64 + ((aux32[1] >> 0) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 7) & 127)) }; -// int8x16_t q2s1 = { *(const int64_t *)(signs64 + ((aux32[1] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 21) & 127)) }; -// int8x16_t q2s2 = { *(const int64_t *)(signs64 + ((aux32[3] >> 0) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 7) & 127)) }; -// int8x16_t q2s3 = { *(const int64_t *)(signs64 + ((aux32[3] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 21) & 127)) }; -// -// q2u0 = vec_mul(q2u0, q2s0); -// q2u1 = vec_mul(q2u1, q2s1); -// q2u2 = vec_mul(q2u2, q2s2); -// q2u3 = vec_mul(q2u3, q2s3); -// -// const int32x4_t p1 = ggml_vec_dot(ggml_vec_dot(vec_splat_s32(0), q2u0, q8b0), q2u1, q8b1); -// const int32x4_t p2 = ggml_vec_dot(ggml_vec_dot(vec_splat_s32(0), q2u2, q8b2), q2u3, q8b3); -// -// sumf1 += (p1[0] + p1[1] + p1[2] + p1[3]) * (0.5f + (aux32[1] >> 28)); -// sumf2 += (p2[0] + p2[1] + p2[2] + p2[3]) * (0.5f + (aux32[3] >> 28)); -// } -// -// sumf += d * (sumf1 + sumf2); -// } -// -// *s = 0.25f * sumf; -#else - - uint32_t aux32[2]; - const uint8_t * aux8 = (const uint8_t *)aux32; - - float sumf = 0.f; - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint16_t * GGML_RESTRICT q2 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - int32_t bsum = 0; - for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { - memcpy(aux32, q2, 2*sizeof(uint32_t)); - q2 += 4; - const uint32_t ls = 2*(aux32[1] >> 28) + 1; - int32_t sumi = 0; - for (int l = 0; l < 4; ++l) { - const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]); - const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127]; - for (int j = 0; j < 8; ++j) { - sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ?
-1 : 1); - } - q8 += 8; - } - bsum += sumi * ls; - } - sumf += d * bsum; - } - *s = 0.125f * sumf; -#endif -} - -void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - assert(n % QK_K == 0); - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_iq2_xs * GGML_RESTRICT x = vx; - const block_q8_K * GGML_RESTRICT y = vy; - - const int nb = n / QK_K; - -#if defined(__ARM_NEON) - - const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; - - ggml_int8x16x4_t q2u; - ggml_int8x16x4_t q2s; - ggml_int8x16x4_t q8b; - - int32x4x4_t scales32; - - float sumf = 0; - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint16_t * GGML_RESTRICT q2 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - const uint8x8_t scales8 = vld1_u8(x[i].scales); - const uint8x8_t scales_l = vand_u8(scales8, vdup_n_u8(0xf)); - const uint8x8_t scales_h = vshr_n_u8(scales8, 4); - uint8x16_t scales = vcombine_u8(vzip1_u8(scales_l, scales_h), vzip2_u8(scales_l, scales_h)); - scales = vaddq_u8(vshlq_n_u8(scales, 1), vdupq_n_u8(1)); - const uint16x8_t scales1 = vmovl_u8(vget_low_u8(scales)); - const uint16x8_t scales2 = vmovl_u8(vget_high_u8(scales)); - scales32.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales1))); - scales32.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales1))); - scales32.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales2))); - scales32.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales2))); - int32x4_t sumi = vdupq_n_s32(0); - for (int ib64 = 0; ib64 < QK_K/64; ++ib64) { - q8b = ggml_vld1q_s8_x4(q8); q8 += 64; - q2u.val[0] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[0] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[1] & 511)))); - q2u.val[1] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[2] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[3] & 511)))); - q2u.val[2] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[4] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[5] & 511)))); - q2u.val[3] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[6] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[7] & 511)))); - q2s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[0] >> 9))), vld1_s8((const void *)(signs64 + (q2[1] >> 9)))); - q2s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[2] >> 9))), vld1_s8((const void *)(signs64 + (q2[3] >> 9)))); - q2s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[4] >> 9))), vld1_s8((const void *)(signs64 + (q2[5] >> 9)))); - q2s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[6] >> 9))), vld1_s8((const void *)(signs64 + (q2[7] >> 9)))); - q2u.val[0] = vmulq_s8(q2u.val[0], q2s.val[0]); - q2u.val[1] = vmulq_s8(q2u.val[1], q2s.val[1]); - q2u.val[2] = vmulq_s8(q2u.val[2], q2s.val[2]); - q2u.val[3] = vmulq_s8(q2u.val[3], q2s.val[3]); - const int32x4_t p1 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[0], q8b.val[0]); - const int32x4_t p2 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[1], q8b.val[1]); - const int32x4_t p3 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[2], q8b.val[2]); - const int32x4_t p4 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[3], q8b.val[3]); - const int32x4_t p = vpaddq_s32(vpaddq_s32(p1, p2), vpaddq_s32(p3, p4)); - sumi = vmlaq_s32(sumi, p, scales32.val[ib64]); - q2 += 8; - } - sumf += d*vaddvq_s32(sumi); - } - *s = 0.125f * sumf; - -#elif defined(__AVX2__) - - const 
__m256i mone = _mm256_set1_epi8(1); - static const char block_sign_shuffle_mask_1[32] = { - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, - 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, - }; - static const char block_sign_shuffle_mask_2[32] = { - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, - 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, - }; - static const uint8_t bit_selector_mask_bytes[32] = { - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - }; - - const __m256i bit_selector_mask = _mm256_loadu_si256((const __m256i*)bit_selector_mask_bytes); - const __m256i block_sign_shuffle_1 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_1); - const __m256i block_sign_shuffle_2 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_2); - - static const uint8_t k_bit_helper[32] = { - 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, - 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, - }; - const __m256i bit_helper = _mm256_loadu_si256((const __m256i*)k_bit_helper); - const __m256i m511 = _mm256_set1_epi16(511); - const __m128i m4 = _mm_set1_epi8(0xf); - const __m128i m1 = _mm_set1_epi8(1); - - uint64_t aux64; - - // somewhat hacky, but gives a significant boost in performance - __m256i aux_gindex; - const uint16_t * gindex = (const uint16_t *)&aux_gindex; - - __m256 accumf = _mm256_setzero_ps(); - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint16_t * GGML_RESTRICT q2 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - memcpy(&aux64, x[i].scales, 8); - __m128i stmp = _mm_set1_epi64x(aux64); - stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4)); - const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1); - - __m256i sumi1 = _mm256_setzero_si256(); - __m256i sumi2 = _mm256_setzero_si256(); - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) { - - const __m256i q2_data = _mm256_loadu_si256((const __m256i*)q2); q2 += 16; - aux_gindex = _mm256_and_si256(q2_data, m511); - - const __m256i partial_sign_bits = _mm256_srli_epi16(q2_data, 9); - const __m256i partial_sign_bits_upper = _mm256_srli_epi16(q2_data, 13); - const __m256i partial_sign_bits_for_counting = _mm256_xor_si256(partial_sign_bits, partial_sign_bits_upper); - - const __m256i odd_bits = _mm256_shuffle_epi8(bit_helper, partial_sign_bits_for_counting); - const __m256i full_sign_bits = _mm256_or_si256(partial_sign_bits, odd_bits); - - const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; - const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; - const __m256i q8_3 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; - const __m256i q8_4 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; - - const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[gindex[ 3]], iq2xs_grid[gindex[ 2]], - iq2xs_grid[gindex[ 1]], iq2xs_grid[gindex[ 0]]); - const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[gindex[ 7]], iq2xs_grid[gindex[ 6]], - iq2xs_grid[gindex[ 5]], iq2xs_grid[gindex[ 4]]); - const __m256i q2_3 = _mm256_set_epi64x(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]], - 
iq2xs_grid[gindex[ 9]], iq2xs_grid[gindex[ 8]]); - const __m256i q2_4 = _mm256_set_epi64x(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]], - iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]); - - const __m128i full_signs_l = _mm256_castsi256_si128(full_sign_bits); - const __m128i full_signs_h = _mm256_extractf128_si256(full_sign_bits, 1); - const __m256i full_signs_1 = MM256_SET_M128I(full_signs_l, full_signs_l); - const __m256i full_signs_2 = MM256_SET_M128I(full_signs_h, full_signs_h); - - __m256i signs; - signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_1); - signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask); - const __m256i q8s_1 = _mm256_sign_epi8(q8_1, _mm256_or_si256(signs, mone)); - - signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_2); - signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask); - const __m256i q8s_2 = _mm256_sign_epi8(q8_2, _mm256_or_si256(signs, mone)); - - signs = _mm256_shuffle_epi8(full_signs_2, block_sign_shuffle_1); - signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask); - const __m256i q8s_3 = _mm256_sign_epi8(q8_3, _mm256_or_si256(signs, mone)); - - signs = _mm256_shuffle_epi8(full_signs_2, block_sign_shuffle_2); - signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask); - const __m256i q8s_4 = _mm256_sign_epi8(q8_4, _mm256_or_si256(signs, mone)); - - const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); - const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); - const __m256i dot3 = _mm256_maddubs_epi16(q2_3, q8s_3); - const __m256i dot4 = _mm256_maddubs_epi16(q2_4, q8s_4); - - const __m256i sc1 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0))); - const __m256i sc2 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1))); - const __m256i sc3 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2))); - const __m256i sc4 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3))); - - sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot1, sc1)); - sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot2, sc2)); - sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot3, sc3)); - sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot4, sc4)); - } - - accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); - - } - - *s = 0.125f * hsum_float_8(accumf); - -#elif defined(__AVX__) - const __m128i mone = _mm_set1_epi8(1); - static const char block_sign_shuffle_mask_1[32] = { - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, - 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, - }; - static const char block_sign_shuffle_mask_2[32] = { - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, - 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, - }; - static const uint8_t bit_selector_mask_bytes[32] = { - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - }; - - const __m128i bit_selector_mask_0 = _mm_loadu_si128((const __m128i*)bit_selector_mask_bytes); - const __m128i bit_selector_mask_1 = _mm_loadu_si128((const __m128i*)bit_selector_mask_bytes + 1); - const __m128i 
block_sign_shuffle_1_0 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_1); - const __m128i block_sign_shuffle_1_1 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_1 + 1); - const __m128i block_sign_shuffle_2_0 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_2); - const __m128i block_sign_shuffle_2_1 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_2 + 1); - - static const uint8_t k_bit_helper[32] = { - 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, - 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, - }; - const __m128i bit_helper_0 = _mm_loadu_si128((const __m128i*)k_bit_helper); - const __m128i bit_helper_1 = _mm_loadu_si128((const __m128i*)k_bit_helper + 1); - const __m128i m511 = _mm_set1_epi16(511); - const __m128i m4 = _mm_set1_epi8(0xf); - const __m128i m1 = _mm_set1_epi8(1); - - uint64_t aux64; - - // somewhat hacky, but gives a significant boost in performance - __m256i aux_gindex; - const uint16_t * gindex = (const uint16_t *)&aux_gindex; - - __m256 accumf = _mm256_setzero_ps(); - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint16_t * GGML_RESTRICT q2 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - memcpy(&aux64, x[i].scales, 8); - __m128i stmp = _mm_set1_epi64x(aux64); - stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4)); - const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1); - - __m128i sumi1_0 = _mm_setzero_si128(); - __m128i sumi1_1 = _mm_setzero_si128(); - __m128i sumi2_0 = _mm_setzero_si128(); - __m128i sumi2_1 = _mm_setzero_si128(); - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) { - - const __m128i q2_data_0 = _mm_loadu_si128((const __m128i*)q2); - const __m128i q2_data_1 = _mm_loadu_si128((const __m128i*)q2 + 1); q2 += 16; - aux_gindex = MM256_SET_M128I(_mm_and_si128(q2_data_1, m511), _mm_and_si128(q2_data_0, m511)); - - const __m128i partial_sign_bits_0 = _mm_srli_epi16(q2_data_0, 9); - const __m128i partial_sign_bits_1 = _mm_srli_epi16(q2_data_1, 9); - const __m128i partial_sign_bits_upper_0 = _mm_srli_epi16(q2_data_0, 13); - const __m128i partial_sign_bits_upper_1 = _mm_srli_epi16(q2_data_1, 13); - const __m128i partial_sign_bits_for_counting_0 = _mm_xor_si128(partial_sign_bits_0, partial_sign_bits_upper_0); - const __m128i partial_sign_bits_for_counting_1 = _mm_xor_si128(partial_sign_bits_1, partial_sign_bits_upper_1); - - const __m128i odd_bits_0 = _mm_shuffle_epi8(bit_helper_0, partial_sign_bits_for_counting_0); - const __m128i odd_bits_1 = _mm_shuffle_epi8(bit_helper_1, partial_sign_bits_for_counting_1); - const __m128i full_sign_bits_0 = _mm_or_si128(partial_sign_bits_0, odd_bits_0); - const __m128i full_sign_bits_1 = _mm_or_si128(partial_sign_bits_1, odd_bits_1); - - const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8_3_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8_3_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8_4_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8_4_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - - const __m128i q2_1_0 = _mm_set_epi64x(iq2xs_grid[gindex[1]], 
iq2xs_grid[gindex[0]]); - const __m128i q2_1_1 = _mm_set_epi64x(iq2xs_grid[gindex[3]], iq2xs_grid[gindex[2]]); - const __m128i q2_2_0 = _mm_set_epi64x(iq2xs_grid[gindex[5]], iq2xs_grid[gindex[4]]); - const __m128i q2_2_1 = _mm_set_epi64x(iq2xs_grid[gindex[7]], iq2xs_grid[gindex[6]]); - const __m128i q2_3_0 = _mm_set_epi64x(iq2xs_grid[gindex[9]], iq2xs_grid[gindex[8]]); - const __m128i q2_3_1 = _mm_set_epi64x(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]]); - const __m128i q2_4_0 = _mm_set_epi64x(iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]); - const __m128i q2_4_1 = _mm_set_epi64x(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]]); - - // AVX2 full_signs_1 is full_sign_bits_0 here - // AVX2 full_signs_2 is full_sign_bits_1 here - __m128i signs_0, signs_1; - signs_0 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_1_0); - signs_1 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_1_1); - signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0); - signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1); - const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, _mm_or_si128(signs_0, mone)); - const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, _mm_or_si128(signs_1, mone)); - - signs_0 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_2_0); - signs_1 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_2_1); - signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0); - signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1); - const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, _mm_or_si128(signs_0, mone)); - const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, _mm_or_si128(signs_1, mone)); - - signs_0 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_1_0); - signs_1 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_1_1); - signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0); - signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1); - const __m128i q8s_3_0 = _mm_sign_epi8(q8_3_0, _mm_or_si128(signs_0, mone)); - const __m128i q8s_3_1 = _mm_sign_epi8(q8_3_1, _mm_or_si128(signs_1, mone)); - - signs_0 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_2_0); - signs_1 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_2_1); - signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0); - signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1); - const __m128i q8s_4_0 = _mm_sign_epi8(q8_4_0, _mm_or_si128(signs_0, mone)); - const __m128i q8s_4_1 = _mm_sign_epi8(q8_4_1, _mm_or_si128(signs_1, mone)); - - const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0); - const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1); - const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0); - const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1); - const __m128i dot3_0 = _mm_maddubs_epi16(q2_3_0, q8s_3_0); - const __m128i dot3_1 = _mm_maddubs_epi16(q2_3_1, q8s_3_1); - const __m128i dot4_0 = _mm_maddubs_epi16(q2_4_0, q8s_4_0); - const __m128i dot4_1 = _mm_maddubs_epi16(q2_4_1, q8s_4_1); - - __m128i sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0)); - const __m128i sc1_0 = _mm_cvtepi8_epi16(sc_tmp); - const __m128i sc1_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8)); - sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1)); - const __m128i sc2_0 = _mm_cvtepi8_epi16(sc_tmp); - const __m128i sc2_1 = 
_mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8)); - sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2)); - const __m128i sc3_0 = _mm_cvtepi8_epi16(sc_tmp); - const __m128i sc3_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8)); - sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3)); - const __m128i sc4_0 = _mm_cvtepi8_epi16(sc_tmp); - const __m128i sc4_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8)); - - sumi1_0 = _mm_add_epi32(sumi1_0, _mm_madd_epi16(dot1_0, sc1_0)); - sumi1_1 = _mm_add_epi32(sumi1_1, _mm_madd_epi16(dot1_1, sc1_1)); - sumi2_0 = _mm_add_epi32(sumi2_0, _mm_madd_epi16(dot2_0, sc2_0)); - sumi2_1 = _mm_add_epi32(sumi2_1, _mm_madd_epi16(dot2_1, sc2_1)); - sumi1_0 = _mm_add_epi32(sumi1_0, _mm_madd_epi16(dot3_0, sc3_0)); - sumi1_1 = _mm_add_epi32(sumi1_1, _mm_madd_epi16(dot3_1, sc3_1)); - sumi2_0 = _mm_add_epi32(sumi2_0, _mm_madd_epi16(dot4_0, sc4_0)); - sumi2_1 = _mm_add_epi32(sumi2_1, _mm_madd_epi16(dot4_1, sc4_1)); - } - - accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf); - - } - - *s = 0.125f * hsum_float_8(accumf); - -#elif defined(__loongarch_asx) - - const __m256i mone = __lasx_xvreplgr2vr_b(1); - static const char block_sign_shuffle_mask_1[32] = { - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, - 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, - }; - static const char block_sign_shuffle_mask_2[32] = { - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, - 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, - }; - static const uint8_t bit_selector_mask_bytes[32] = { - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - }; - - const __m256i bit_selector_mask = __lasx_xvld((const __m256i*)bit_selector_mask_bytes, 0); - const __m256i block_sign_shuffle_1 = __lasx_xvld((const __m256i*)block_sign_shuffle_mask_1, 0); - const __m256i block_sign_shuffle_2 = __lasx_xvld((const __m256i*)block_sign_shuffle_mask_2, 0); - - static const uint8_t k_bit_helper[32] = { - 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, - 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, - }; - const __m256i bit_helper = __lasx_xvld((const __m256i*)k_bit_helper, 0); - const __m256i m511 = __lasx_xvreplgr2vr_h(511); - const __m128i m4 = __lsx_vreplgr2vr_b(0xf); - const __m128i m1 = __lsx_vreplgr2vr_b(1); - - uint64_t aux64; - - // somewhat hacky, but gives a significant boost in performance - __m256i aux_gindex; - const uint16_t * gindex = (const uint16_t *)&aux_gindex; - - __m256 accumf = (__m256)__lasx_xvldi(0); - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint16_t * GGML_RESTRICT q2 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - memcpy(&aux64, x[i].scales, 8); - __m128i stmp = __lsx_vreplgr2vr_d(aux64); - stmp = __lsx_vilvl_b( __lsx_vand_v(__lsx_vsrli_h(stmp, 4), m4), __lsx_vand_v(stmp, m4)); - const __m128i scales = __lsx_vadd_b(__lsx_vslli_h(stmp, 1), m1); - - __m256i sumi1 = __lasx_xvldi(0); - __m256i sumi2 = __lasx_xvldi(0); - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) { - - const __m256i 
q2_data = __lasx_xvld((const __m256i*)q2, 0); q2 += 16; - aux_gindex = __lasx_xvand_v(q2_data, m511); - - const __m256i partial_sign_bits = __lasx_xvsrli_h(q2_data, 9); - const __m256i partial_sign_bits_upper = __lasx_xvsrli_h(q2_data, 13); - const __m256i partial_sign_bits_for_counting = __lasx_xvxor_v(partial_sign_bits, partial_sign_bits_upper); - - const __m256i odd_bits = lasx_shuffle_b(bit_helper, partial_sign_bits_for_counting); - const __m256i full_sign_bits = __lasx_xvor_v(partial_sign_bits, odd_bits); - - const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; - const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; - const __m256i q8_3 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; - const __m256i q8_4 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; - - const __m256i q2_1 = lasx_set_d(iq2xs_grid[gindex[ 3]], iq2xs_grid[gindex[ 2]], - iq2xs_grid[gindex[ 1]], iq2xs_grid[gindex[ 0]]); - const __m256i q2_2 = lasx_set_d(iq2xs_grid[gindex[ 7]], iq2xs_grid[gindex[ 6]], - iq2xs_grid[gindex[ 5]], iq2xs_grid[gindex[ 4]]); - const __m256i q2_3 = lasx_set_d(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]], - iq2xs_grid[gindex[ 9]], iq2xs_grid[gindex[ 8]]); - const __m256i q2_4 = lasx_set_d(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]], - iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]); - - const __m128i full_signs_l = lasx_extracti128(full_sign_bits, 0); - const __m128i full_signs_h = lasx_extracti128(full_sign_bits, 1); - const __m256i full_signs_1 = lasx_insertf128(full_signs_l, full_signs_l); - const __m256i full_signs_2 = lasx_insertf128(full_signs_h, full_signs_h); - - __m256i signs; - signs = lasx_shuffle_b(full_signs_1, block_sign_shuffle_1); - signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask); - const __m256i q8s_1 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_1); - - signs = lasx_shuffle_b(full_signs_1, block_sign_shuffle_2); - signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask); - const __m256i q8s_2 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_2); - - signs = lasx_shuffle_b(full_signs_2, block_sign_shuffle_1); - signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask); - const __m256i q8s_3 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_3); - - signs = lasx_shuffle_b(full_signs_2, block_sign_shuffle_2); - signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask); - const __m256i q8s_4 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_4); - - const __m256i dot1 = lasx_maddubs_h(q2_1, q8s_1); - const __m256i dot2 = lasx_maddubs_h(q2_2, q8s_2); - const __m256i dot3 = lasx_maddubs_h(q2_3, q8s_3); - const __m256i dot4 = lasx_maddubs_h(q2_4, q8s_4); - - const __m256i sc1 = lasx_ext8_16(lsx_shuffle_b(scales, get_scale_shuffle(ib32+0))); - const __m256i sc2 = lasx_ext8_16(lsx_shuffle_b(scales, get_scale_shuffle(ib32+1))); - const __m256i sc3 = lasx_ext8_16(lsx_shuffle_b(scales, get_scale_shuffle(ib32+2))); - const __m256i sc4 = lasx_ext8_16(lsx_shuffle_b(scales, get_scale_shuffle(ib32+3))); - - sumi1 = __lasx_xvadd_w(sumi1, lasx_madd_h(dot1, sc1)); - sumi2 = __lasx_xvadd_w(sumi2, lasx_madd_h(dot2, sc2)); - sumi1 = __lasx_xvadd_w(sumi1, lasx_madd_h(dot3, sc3)); - sumi2 = __lasx_xvadd_w(sumi2, lasx_madd_h(dot4, sc4)); - } - - accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf); - - } - - *s = 0.125f * hsum_float_8(accumf); -#elif defined(__POWER9_VECTOR__) - const vector 
int v0 = vec_splats((int32_t)0); - vector float vsumf0 = vec_splats(0.0f); - vector float vsumf1 = vec_splats(0.0f); - vector float vsumf2 = vec_splats(0.0f); - vector float vsumf3 = vec_splats(0.0f); - - const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; - - for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); - vector float vyd = vec_splats(y[i].d); - vector float vd = vec_mul(vxd, vyd); - - vector signed int vsumi0 = v0; - vector signed int vsumi1 = v0; - vector signed int vsumi2 = v0; - vector signed int vsumi3 = v0; - - const uint16_t * GGML_RESTRICT q2 = x[i].qs; - const uint8_t * GGML_RESTRICT sc = x[i].scales; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - for (int j = 0; j < QK_K/64; ++j) { - __builtin_prefetch(q2, 0, 1); - __builtin_prefetch(q8, 0, 1); - - vector signed long long aux64x2_0 = {*(const int64_t *)(iq2xs_grid + (q2[0] & 511)), *(const int64_t *)(iq2xs_grid + (q2[1] & 511))}; - vector signed long long aux64x2_1 = {*(const int64_t *)(iq2xs_grid + (q2[2] & 511)), *(const int64_t *)(iq2xs_grid + (q2[3] & 511))}; - vector signed long long aux64x2_2 = {*(const int64_t *)(iq2xs_grid + (q2[4] & 511)), *(const int64_t *)(iq2xs_grid + (q2[5] & 511))}; - vector signed long long aux64x2_3 = {*(const int64_t *)(iq2xs_grid + (q2[6] & 511)), *(const int64_t *)(iq2xs_grid + (q2[7] & 511))}; - - vector signed long long vsigns0 = {*(const int64_t *)(signs64 + ((q2[0] >> 9))), *(const int64_t *)(signs64 + ((q2[1] >> 9)))}; - vector signed long long vsigns1 = {*(const int64_t *)(signs64 + ((q2[2] >> 9))), *(const int64_t *)(signs64 + ((q2[3] >> 9)))}; - vector signed long long vsigns2 = {*(const int64_t *)(signs64 + ((q2[4] >> 9))), *(const int64_t *)(signs64 + ((q2[5] >> 9)))}; - vector signed long long vsigns3 = {*(const int64_t *)(signs64 + ((q2[6] >> 9))), *(const int64_t *)(signs64 + ((q2[7] >> 9)))}; - q2 += 8; - - vector signed char q2x0 = (vector signed char)vec_mul((vector signed char)vsigns0, (vector signed char)aux64x2_0); - vector signed char q2x1 = (vector signed char)vec_mul((vector signed char)vsigns1, (vector signed char)aux64x2_1); - vector signed char q2x2 = (vector signed char)vec_mul((vector signed char)vsigns2, (vector signed char)aux64x2_2); - vector signed char q2x3 = (vector signed char)vec_mul((vector signed char)vsigns3, (vector signed char)aux64x2_3); - - vector signed char q8y0 = vec_xl( 0, q8); - vector signed char q8y1 = vec_xl(16, q8); - vector signed char q8y2 = vec_xl(32, q8); - vector signed char q8y3 = vec_xl(48, q8); - q8 += 64; - - vector signed short qv0 = vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0)); - vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1)); - vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2)); - vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3)); - - const uint16_t ls0 = (uint16_t)(sc[0] & 0xf); - const uint16_t ls1 = (uint16_t)(sc[0] >> 4); - const uint16_t ls2 = (uint16_t)(sc[1] & 0xf); - const uint16_t ls3 = (uint16_t)(sc[1] >> 4); - sc += 2; - - vector signed short vscales0 = vec_splats((int16_t)(2*ls0+1)); - vector signed short vscales1 = vec_splats((int16_t)(2*ls1+1)); - vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1)); - vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1)); - - vsumi0 = vec_msum(qv0, vscales0, vsumi0); - vsumi1 = vec_msum(qv1, vscales1, vsumi1); - vsumi2 = vec_msum(qv2, vscales2, vsumi2); - vsumi3 = vec_msum(qv3, vscales3, vsumi3); - } - - vsumf0 = 
vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); - vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); - vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); - vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); - } - - vsumf0 = vec_add(vsumf0, vsumf2); - vsumf1 = vec_add(vsumf1, vsumf3); - - vsumf0 = vec_add(vsumf0, vsumf1); - - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - - *s = 0.125f * vec_extract(vsumf0, 0); -#else - - float sumf = 0.f; - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint16_t * GGML_RESTRICT q2 = x[i].qs; - const uint8_t * GGML_RESTRICT sc = x[i].scales; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - int32_t bsum = 0; - for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { - const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1; - const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1; - int32_t sumi = 0; - for (int l = 0; l < 2; ++l) { - const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); - const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; - for (int j = 0; j < 8; ++j) { - sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1); - } - q8 += 8; - } - bsum += sumi * ls1; - sumi = 0; - for (int l = 2; l < 4; ++l) { - const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); - const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; - for (int j = 0; j < 8; ++j) { - sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1); - } - q8 += 8; - } - bsum += sumi * ls2; - q2 += 4; - } - sumf += d * bsum; - } - *s = 0.125f * sumf; -#endif -} - -void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - assert(n % QK_K == 0); - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_iq2_s * GGML_RESTRICT x = vx; - const block_q8_K * GGML_RESTRICT y = vy; - - const int nb = n / QK_K; - -#if defined(__ARM_NEON) - - static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, - 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 - }; - - static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,}; - - const ggml_uint8x16x2_t mask1 = ggml_vld1q_u8_x2(k_mask1); - const uint8x16_t mask2 = vld1q_u8(k_mask2); - const uint8x16_t m1 = vdupq_n_u8(1); - const int32x4_t vzero = vdupq_n_s32(0); - - uint8x16x2_t vs; - ggml_int8x16x4_t q2s; - ggml_int8x16x4_t q8b; - - float sumf = 0; - for (int i = 0; i < nb; ++i) { - - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - - const uint8_t * GGML_RESTRICT qs = x[i].qs; - const uint8_t * GGML_RESTRICT qh = x[i].qh; - const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8); - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - int sumi1 = 0, sumi2 = 0; - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { - q8b = ggml_vld1q_s8_x4(q8); q8 += 64; - q2s.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[0] | ((qh[ib32+0] << 8) & 0x300)))), - vld1_s8((const int8_t *)(iq2s_grid + (qs[1] | ((qh[ib32+0] << 6) & 0x300))))); - q2s.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[2] | ((qh[ib32+0] << 4) & 0x300)))), - vld1_s8((const int8_t *)(iq2s_grid + (qs[3] | ((qh[ib32+0] << 2) & 0x300))))); - q2s.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[4] | ((qh[ib32+1] << 8) & 0x300)))), - 
vld1_s8((const int8_t *)(iq2s_grid + (qs[5] | ((qh[ib32+1] << 6) & 0x300))))); - q2s.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[6] | ((qh[ib32+1] << 4) & 0x300)))), - vld1_s8((const int8_t *)(iq2s_grid + (qs[7] | ((qh[ib32+1] << 2) & 0x300))))); - qs += 8; - - vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | ((uint32_t) signs[1] << 16))); - vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2); - vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2); - vs.val[0] = vceqq_u8(vs.val[0], mask2); - vs.val[1] = vceqq_u8(vs.val[1], mask2); - - q2s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[0], m1)), q2s.val[0]); - q2s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[1], m1)), q2s.val[1]); - - vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | ((uint32_t) signs[3] << 16))); - vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2); - vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2); - vs.val[0] = vceqq_u8(vs.val[0], mask2); - vs.val[1] = vceqq_u8(vs.val[1], mask2); - - signs += 4; - - q2s.val[2] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[0], m1)), q2s.val[2]); - q2s.val[3] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[1], m1)), q2s.val[3]); - - const int32x4_t p1 = ggml_vdotq_s32(vzero, q2s.val[0], q8b.val[0]); - const int32x4_t p2 = ggml_vdotq_s32(vzero, q2s.val[1], q8b.val[1]); - const int32x4_t p3 = ggml_vdotq_s32(vzero, q2s.val[2], q8b.val[2]); - const int32x4_t p4 = ggml_vdotq_s32(vzero, q2s.val[3], q8b.val[3]); - - sumi1 += vaddvq_s32(p1) * (1 + 2*(x[i].scales[ib32+0] & 0xf)); - sumi2 += vaddvq_s32(p2) * (1 + 2*(x[i].scales[ib32+0] >> 4)); - sumi1 += vaddvq_s32(p3) * (1 + 2*(x[i].scales[ib32+1] & 0xf)); - sumi2 += vaddvq_s32(p4) * (1 + 2*(x[i].scales[ib32+1] >> 4)); - } - sumf += d*(sumi1 + sumi2); - } - - *s = 0.125f * sumf; - -#elif defined(__AVX2__) - - static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, - 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 - }; - - static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - }; - - const __m128i m4 = _mm_set1_epi8(0xf); - const __m128i m1 = _mm_set1_epi8(1); - - const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1); - const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2); - - uint64_t aux64; - - __m256 accumf = _mm256_setzero_ps(); - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * GGML_RESTRICT qs = x[i].qs; - const uint8_t * GGML_RESTRICT qh = x[i].qh; - const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8); - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - memcpy(&aux64, x[i].scales, 8); - const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1); - const __m256i scales16 = _mm256_cvtepi8_epi16(scales8); // 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15 - - __m256i sumi1 = _mm256_setzero_si256(); - __m256i sumi2 = _mm256_setzero_si256(); - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { - const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; - const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; - const __m256i q2_1 = 
_mm256_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)], - iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)], - iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)], - iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]); - const __m256i q2_2 = _mm256_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)], - iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)], - iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)], - iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]); - qs += 8; - - __m256i aux256 = _mm256_set1_epi32(signs[0] | ((uint32_t) signs[1] << 16)); - aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2); - const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2); - const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1); - - aux256 = _mm256_set1_epi32(signs[2] | ((uint32_t) signs[3] << 16)); - aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2); - const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2); - const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2); - - signs += 4; - - const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); // blocks 2*ib32+0, 2*ib32+1 - const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); // blocks 2*ib32+2, 2*ib32+3 - - const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+0))); - const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+1))); - sumi1 = _mm256_add_epi32(sumi1, p1); - sumi2 = _mm256_add_epi32(sumi2, p2); - } - - accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); - - } - - *s = 0.125f * hsum_float_8(accumf); - -#elif defined(__AVX__) - static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, - 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 - }; - - static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - }; - - const __m128i m4 = _mm_set1_epi8(0xf); - const __m128i m1 = _mm_set1_epi8(1); - - const __m128i mask1_0 = _mm_loadu_si128((const __m128i*)k_mask1); - const __m128i mask1_1 = _mm_loadu_si128((const __m128i*)k_mask1 + 1); - const __m128i mask2_0 = _mm_loadu_si128((const __m128i*)k_mask2); - const __m128i mask2_1 = _mm_loadu_si128((const __m128i*)k_mask2 + 1); - - uint64_t aux64; - - __m256 accumf = _mm256_setzero_ps(); - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * GGML_RESTRICT qs = x[i].qs; - const uint8_t * GGML_RESTRICT qh = x[i].qh; - const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8); - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - memcpy(&aux64, x[i].scales, 8); - const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1); - const __m128i scales16_0 = _mm_cvtepi8_epi16(scales8); - const __m128i scales16_1 = _mm_cvtepi8_epi16(_mm_srli_si128(scales8, 8)); - - __m128i sumi1_0 = _mm_setzero_si128(); - __m128i sumi1_1 = _mm_setzero_si128(); - __m128i sumi2_0 = _mm_setzero_si128(); - __m128i sumi2_1 = _mm_setzero_si128(); - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { - const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - 
const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q2_1_0 = _mm_set_epi64x(iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)], - iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]); - const __m128i q2_1_1 = _mm_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)], - iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)]); - const __m128i q2_2_0 = _mm_set_epi64x(iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)], - iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]); - const __m128i q2_2_1 = _mm_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)], - iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)]); - qs += 8; - - __m128i aux128_0 = _mm_set1_epi32(signs[0] | ((uint32_t) signs[1] << 16)); - __m128i aux128_1 = aux128_0; - aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0); - aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1); - const __m128i s2_1_0 = _mm_cmpeq_epi8(aux128_0, mask2_0); - const __m128i s2_1_1 = _mm_cmpeq_epi8(aux128_1, mask2_1); - const __m128i q8s_1_0 = _mm_sub_epi8(_mm_xor_si128(s2_1_0, q8_1_0), s2_1_0); - const __m128i q8s_1_1 = _mm_sub_epi8(_mm_xor_si128(s2_1_1, q8_1_1), s2_1_1); - - aux128_0 = _mm_set1_epi32(signs[2] | ((uint32_t) signs[3] << 16)); - aux128_1 = aux128_0; - aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0); - aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1); - const __m128i s2_2_0 = _mm_cmpeq_epi8(aux128_0, mask2_0); - const __m128i s2_2_1 = _mm_cmpeq_epi8(aux128_1, mask2_1); - const __m128i q8s_2_0 = _mm_sub_epi8(_mm_xor_si128(s2_2_0, q8_2_0), s2_2_0); - const __m128i q8s_2_1 = _mm_sub_epi8(_mm_xor_si128(s2_2_1, q8_2_1), s2_2_1); - - signs += 4; - - const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0); - const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1); - const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0); - const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1); - - const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_shuffle_epi8(scales16_0, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+0), 0))); - const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_shuffle_epi8(scales16_1, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+0), 1))); - const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_shuffle_epi8(scales16_0, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+1), 0))); - const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_shuffle_epi8(scales16_1, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+1), 1))); - sumi1_0 = _mm_add_epi32(sumi1_0, p1_0); - sumi1_1 = _mm_add_epi32(sumi1_1, p1_1); - sumi2_0 = _mm_add_epi32(sumi2_0, p2_0); - sumi2_1 = _mm_add_epi32(sumi2_1, p2_1); - } - - accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf); - - } - - *s = 0.125f * hsum_float_8(accumf); - -#elif defined(__POWER9_VECTOR__) - static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, - 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 - }; - - static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,}; - - const vector int v0 = vec_splats((int32_t)0); - - vector float vsumf0 = vec_splats(0.0f); - vector float vsumf1 = vec_splats(0.0f); - vector float vsumf2 = vec_splats(0.0f); - 
vector float vsumf3 = vec_splats(0.0f); - - const vector unsigned char mask0 = vec_xl( 0, k_mask1); - const vector unsigned char mask1 = vec_xl(16, k_mask1); - const vector signed char mask2 = (vector signed char)vec_xl( 0, k_mask2); - - for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); - vector float vyd = vec_splats(y[i].d); - vector float vd = vec_mul(vxd, vyd); - - vector signed int vsumi0 = v0; - vector signed int vsumi1 = v0; - vector signed int vsumi2 = v0; - vector signed int vsumi3 = v0; - - const uint8_t * GGML_RESTRICT q2 = x[i].qs; - const uint8_t * GGML_RESTRICT qh = x[i].qh; - const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8); - const uint8_t * GGML_RESTRICT sc = x[i].scales; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - for (int j = 0; j < QK_K/32; j += 2) { - __builtin_prefetch(q2, 0, 1); - __builtin_prefetch(q8, 0, 1); - - vector signed long long aux64x2_0 = {*(const int64_t *)(iq2s_grid + (q2[0] | ((qh[0] << 8) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[1] | ((qh[0] << 6) & 0x300)))}; - vector signed long long aux64x2_1 = {*(const int64_t *)(iq2s_grid + (q2[2] | ((qh[0] << 4) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[3] | ((qh[0] << 2) & 0x300)))}; - vector signed long long aux64x2_2 = {*(const int64_t *)(iq2s_grid + (q2[4] | ((qh[1] << 8) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[5] | ((qh[1] << 6) & 0x300)))}; - vector signed long long aux64x2_3 = {*(const int64_t *)(iq2s_grid + (q2[6] | ((qh[1] << 4) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[7] | ((qh[1] << 2) & 0x300)))}; - q2 += 8; - qh += 2; - - vector signed char vsigns01 = (vector signed char)vec_splats(*(const uint32_t *)&signs[0]); - vector signed char vsigns23 = (vector signed char)vec_splats(*(const uint32_t *)&signs[2]); - signs += 4; - - vector signed char vsigns0 = vec_perm(vsigns01, vsigns01, mask0); - vector signed char vsigns1 = vec_perm(vsigns01, vsigns01, mask1); - vector signed char vsigns2 = vec_perm(vsigns23, vsigns23, mask0); - vector signed char vsigns3 = vec_perm(vsigns23, vsigns23, mask1); - - vsigns0 = (vector signed char)vec_cmpeq(vec_and(vsigns0, mask2), mask2); - vsigns1 = (vector signed char)vec_cmpeq(vec_and(vsigns1, mask2), mask2); - vsigns2 = (vector signed char)vec_cmpeq(vec_and(vsigns2, mask2), mask2); - vsigns3 = (vector signed char)vec_cmpeq(vec_and(vsigns3, mask2), mask2); - - vector signed char q2x0 = vec_sub(vec_xor(vsigns0, (vector signed char)aux64x2_0), vsigns0); - vector signed char q2x1 = vec_sub(vec_xor(vsigns1, (vector signed char)aux64x2_1), vsigns1); - vector signed char q2x2 = vec_sub(vec_xor(vsigns2, (vector signed char)aux64x2_2), vsigns2); - vector signed char q2x3 = vec_sub(vec_xor(vsigns3, (vector signed char)aux64x2_3), vsigns3); - - vector signed char q8y0 = vec_xl( 0, q8); - vector signed char q8y1 = vec_xl(16, q8); - vector signed char q8y2 = vec_xl(32, q8); - vector signed char q8y3 = vec_xl(48, q8); - q8 += 64; - - vector signed short qv0 = vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0)); - vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1)); - vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2)); - vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3)); - - const uint16_t ls0 = (uint16_t)(sc[0] & 0xf); - const uint16_t ls1 = (uint16_t)(sc[0] >> 4); - const uint16_t ls2 = (uint16_t)(sc[1] & 0xf); - const uint16_t ls3 = (uint16_t)(sc[1] >> 4); - sc += 2; - - vector signed short vscales0 = 
vec_splats((int16_t)(2*ls0+1)); - vector signed short vscales1 = vec_splats((int16_t)(2*ls1+1)); - vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1)); - vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1)); - - vsumi0 = vec_msum(qv0, vscales0, vsumi0); - vsumi1 = vec_msum(qv1, vscales1, vsumi1); - vsumi2 = vec_msum(qv2, vscales2, vsumi2); - vsumi3 = vec_msum(qv3, vscales3, vsumi3); - } - - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); - vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); - vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); - vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); - } - - vsumf0 = vec_add(vsumf0, vsumf2); - vsumf1 = vec_add(vsumf1, vsumf3); - - vsumf0 = vec_add(vsumf0, vsumf1); - - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - - *s = 0.125f * vec_extract(vsumf0, 0); - -#elif defined(__loongarch_asx) - - static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, - 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 - }; - - static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - }; - - - const __m128i m4 = __lsx_vreplgr2vr_b(0xf); - const __m128i m1 = __lsx_vreplgr2vr_b(1); - - const __m256i mask1 = __lasx_xvld((const __m256i*)k_mask1, 0); - const __m256i mask2 = __lasx_xvld((const __m256i*)k_mask2, 0); - uint64_t aux64; - - __m256 accumf = (__m256)__lasx_xvldi(0); - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * GGML_RESTRICT qs = x[i].qs; - const uint8_t * GGML_RESTRICT qh = x[i].qh; - const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8); - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - __m128i tmp1; - memcpy(&aux64, x[i].scales, 8); - tmp1 = __lsx_vinsgr2vr_d(tmp1, aux64, 0); - tmp1 = __lsx_vinsgr2vr_d(tmp1, aux64 >> 4, 1); - const __m128i scales8 = __lsx_vadd_b(__lsx_vslli_h(__lsx_vand_v(tmp1, m4), 1), m1); - const __m256i scales16 = lasx_ext8_16(scales8); // 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15 - - __m256i sumi1 = __lasx_xvldi(0); - __m256i sumi2 = __lasx_xvldi(0); - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { - const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; - const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; - const __m256i q2_1 = lasx_set_d(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)], - iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)], - iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)], - iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]); - const __m256i q2_2 = lasx_set_d(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)], - iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)], - iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)], - iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]); - qs += 8; - - __m256i aux256 = __lasx_xvreplgr2vr_w(signs[0] | ((uint32_t) signs[1] << 16)); - aux256 = __lasx_xvand_v(lasx_shuffle_b(aux256,mask1), mask2); - const __m256i s2_1 = __lasx_xvseq_b(aux256, mask2); - const __m256i q8s_1 = __lasx_xvsub_b(__lasx_xvxor_v(s2_1, q8_1), s2_1); - - aux256 = __lasx_xvreplgr2vr_w(signs[2] | ((uint32_t) signs[3] << 16)); - aux256 = __lasx_xvand_v(lasx_shuffle_b(aux256,mask1), mask2); - const __m256i s2_2 = __lasx_xvseq_b(aux256, mask2); - const __m256i q8s_2 = 
__lasx_xvsub_b(__lasx_xvxor_v(s2_2, q8_2), s2_2); - - signs += 4; - - const __m256i dot1 = lasx_maddubs_h(q2_1, q8s_1); // blocks 2*ib32+0, 2*ib32+1 - const __m256i dot2 = lasx_maddubs_h(q2_2, q8s_2); // blocks 2*ib32+2, 2*ib32+3 - - const __m256i p1 = lasx_madd_h(dot1, lasx_shuffle_b(scales16, get_scale_shuffle_k4(ib32+0))); - const __m256i p2 = lasx_madd_h(dot2, lasx_shuffle_b(scales16, get_scale_shuffle_k4(ib32+1))); - sumi1 = __lasx_xvadd_w(sumi1, p1); - sumi2 = __lasx_xvadd_w(sumi2, p2); - } - - accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf); - } - - *s = 0.125f * hsum_float_8(accumf); - -#else - - float sumf = 0; - for (int i = 0; i < nb; i++) { - - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const int8_t * q8 = y[i].qs; - const uint8_t * qs = x[i].qs; - const uint8_t * qh = x[i].qh; - const uint8_t * signs = qs + QK_K/8; - - int bsum = 0; - for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { - int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf); - int ls2 = 1 + 2*(x[i].scales[ib32] >> 4); - int sumi1 = 0, sumi2 = 0; - for (int l = 0; l < 2; ++l) { - const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300))); - for (int j = 0; j < 8; ++j) { - sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1); - } - q8 += 8; - } - for (int l = 2; l < 4; ++l) { - const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300))); - for (int j = 0; j < 8; ++j) { - sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1); - } - q8 += 8; - } - bsum += ls1 * sumi1 + ls2 * sumi2; - qs += 4; - signs += 4; - } - - sumf += d * bsum; - } - - *s = 0.125f * sumf; - -#endif - -} - -void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - assert(n % QK_K == 0); - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_iq3_xxs * GGML_RESTRICT x = vx; - const block_q8_K * GGML_RESTRICT y = vy; - - const int nb = n / QK_K; - -#if defined(__ARM_NEON) - - const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; - - uint32_t aux32[2]; - - ggml_int8x16x4_t q3s; - ggml_int8x16x4_t q8b; - - float sumf = 0; - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * GGML_RESTRICT q3 = x[i].qs; - const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - float sumf1 = 0, sumf2 = 0; - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { - q8b = ggml_vld1q_s8_x4(q8); q8 += 64; - memcpy(aux32, gas, 2*sizeof(uint32_t)); gas += 2*sizeof(uint32_t); - const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]); - const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]); - const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]); - const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]); - q3 += 16; - q3s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 7) & 127)))); - q3s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 21) & 
127)))); - q3s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 7) & 127)))); - q3s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 21) & 127)))); - q3s.val[0] = vmulq_s8(q3s.val[0], vreinterpretq_s8_u32(aux32x4_0)); - q3s.val[1] = vmulq_s8(q3s.val[1], vreinterpretq_s8_u32(aux32x4_1)); - q3s.val[2] = vmulq_s8(q3s.val[2], vreinterpretq_s8_u32(aux32x4_2)); - q3s.val[3] = vmulq_s8(q3s.val[3], vreinterpretq_s8_u32(aux32x4_3)); - const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]); - const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]); - sumf1 += vaddvq_s32(p1) * (0.5f + (aux32[0] >> 28)); - sumf2 += vaddvq_s32(p2) * (0.5f + (aux32[1] >> 28)); - } - sumf += d*(sumf1 + sumf2); - } - *s = 0.5f * sumf; - -#elif defined(__AVX2__) - - const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; - - uint32_t aux32[2]; - - __m256 accumf = _mm256_setzero_ps(); - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * GGML_RESTRICT q3 = x[i].qs; - const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - __m256i sumi1 = _mm256_setzero_si256(); - __m256i sumi2 = _mm256_setzero_si256(); - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { - const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; - const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; - const __m256i q2_1 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]], - iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]); - q3 += 8; - const __m256i q2_2 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]], - iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]); - q3 += 8; - memcpy(aux32, gas, 8); gas += 8; - const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127], - signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]); - const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127], - signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]); - const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1); - const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2); - const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); - const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); - const uint16_t ls1 = aux32[0] >> 28; - const uint16_t ls2 = aux32[1] >> 28; - const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1)); - const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1)); - sumi1 = _mm256_add_epi32(sumi1, p1); - sumi2 = _mm256_add_epi32(sumi2, p2); - } - - accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); - - } - - *s = 0.25f * hsum_float_8(accumf); - -#elif defined(__AVX__) - const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; - - uint32_t aux32[2]; - - __m256 accumf = _mm256_setzero_ps(); - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * GGML_RESTRICT q3 = x[i].qs; - const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - 
__m128i sumi1_0 = _mm_setzero_si128(); - __m128i sumi1_1 = _mm_setzero_si128(); - __m128i sumi2_0 = _mm_setzero_si128(); - __m128i sumi2_1 = _mm_setzero_si128(); - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { - const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q2_1_0 = _mm_set_epi32(iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]); - const __m128i q2_1_1 = _mm_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]]); - q3 += 8; - const __m128i q2_2_0 = _mm_set_epi32(iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]); - const __m128i q2_2_1 = _mm_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]]); - q3 += 8; - memcpy(aux32, gas, 8); gas += 8; - const __m128i s2_1_0 = _mm_set_epi64x(signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]); - const __m128i s2_1_1 = _mm_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127]); - const __m128i s2_2_0 = _mm_set_epi64x(signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]); - const __m128i s2_2_1 = _mm_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127]); - const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, s2_1_0); - const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, s2_1_1); - const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, s2_2_0); - const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, s2_2_1); - const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0); - const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1); - const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0); - const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1); - const uint16_t ls1 = aux32[0] >> 28; - const uint16_t ls2 = aux32[1] >> 28; - const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1)); - const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1)); - const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1)); - const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1)); - sumi1_0 = _mm_add_epi32(sumi1_0, p1_0); - sumi1_1 = _mm_add_epi32(sumi1_1, p1_1); - sumi2_0 = _mm_add_epi32(sumi2_0, p2_0); - sumi2_1 = _mm_add_epi32(sumi2_1, p2_1); - } - - accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf); - - } - - *s = 0.25f * hsum_float_8(accumf); - -#elif defined(__POWER9_VECTOR__) - const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; - - const vector int v0 = vec_splats((int32_t)0); - - vector float vsumf0 = vec_splats(0.0f); - vector float vsumf1 = vec_splats(0.0f); - vector float vsumf2 = vec_splats(0.0f); - vector float vsumf3 = vec_splats(0.0f); - - for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); - vector float vyd = vec_splats(y[i].d); - vector float vd = vec_mul(vxd, vyd); - - vector signed int vsumi0 = v0; - vector signed int vsumi1 = v0; - vector signed int vsumi2 = v0; - vector signed int vsumi3 = v0; - - const uint8_t * GGML_RESTRICT q3 = x[i].qs; - const uint32_t * GGML_RESTRICT signs = (const uint32_t *)(x[i].qs + QK_K/4); - const int8_t * GGML_RESTRICT q8 = y[i].qs; - -#pragma GCC unroll 1 - for (int j = 0; j < QK_K/32; j += 
2) { - __builtin_prefetch(q3, 0, 1); - __builtin_prefetch(q8, 0, 1); - - vector unsigned int aux32x4_0 = {iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]}; - vector unsigned int aux32x4_1 = {iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]}; - vector unsigned int aux32x4_2 = {iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]}; - vector unsigned int aux32x4_3 = {iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]}; - q3 += 16; - - vector unsigned long long aux64x2_0 = {(uint64_t)(signs64[(signs[0] >> 0) & 127]), (uint64_t)(signs64[(signs[0] >> 7) & 127])}; - vector unsigned long long aux64x2_1 = {(uint64_t)(signs64[(signs[0] >> 14) & 127]), (uint64_t)(signs64[(signs[0] >> 21) & 127])}; - vector unsigned long long aux64x2_2 = {(uint64_t)(signs64[(signs[1] >> 0) & 127]), (uint64_t)(signs64[(signs[1] >> 7) & 127])}; - vector unsigned long long aux64x2_3 = {(uint64_t)(signs64[(signs[1] >> 14) & 127]), (uint64_t)(signs64[(signs[1] >> 21) & 127])}; - - vector signed char q3x0 = vec_mul((vector signed char)aux64x2_0, (vector signed char)aux32x4_0); - vector signed char q3x1 = vec_mul((vector signed char)aux64x2_1, (vector signed char)aux32x4_1); - vector signed char q3x2 = vec_mul((vector signed char)aux64x2_2, (vector signed char)aux32x4_2); - vector signed char q3x3 = vec_mul((vector signed char)aux64x2_3, (vector signed char)aux32x4_3); - - vector signed char q8y0 = vec_xl( 0, q8); - vector signed char q8y1 = vec_xl(16, q8); - vector signed char q8y2 = vec_xl(32, q8); - vector signed char q8y3 = vec_xl(48, q8); - q8 += 64; - - vector signed short qv0 = vec_add(vec_mule(q3x0, q8y0), vec_mulo(q3x0, q8y0)); - vector signed short qv1 = vec_add(vec_mule(q3x1, q8y1), vec_mulo(q3x1, q8y1)); - vector signed short qv2 = vec_add(vec_mule(q3x2, q8y2), vec_mulo(q3x2, q8y2)); - vector signed short qv3 = vec_add(vec_mule(q3x3, q8y3), vec_mulo(q3x3, q8y3)); - - const uint16_t ls0 = (uint16_t)(signs[0] >> 28); - const uint16_t ls1 = (uint16_t)(signs[1] >> 28); - signs += 2; - - vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1)); - vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1)); - - vsumi0 = vec_msum(qv0, vscales01, vsumi0); - vsumi1 = vec_msum(qv1, vscales01, vsumi1); - vsumi2 = vec_msum(qv2, vscales23, vsumi2); - vsumi3 = vec_msum(qv3, vscales23, vsumi3); - } - - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); - vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); - vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); - vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); - } - - vsumf0 = vec_add(vsumf0, vsumf2); - vsumf1 = vec_add(vsumf1, vsumf3); - - vsumf0 = vec_add(vsumf0, vsumf1); - - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - - *s = 0.25f * vec_extract(vsumf0, 0); - -#elif defined(__loongarch_asx) - - const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; - - uint32_t aux32[2]; - - __m256 accumf = (__m256)__lasx_xvldi(0); - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * GGML_RESTRICT q3 = x[i].qs; - const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - __m256i sumi1 = __lasx_xvldi(0); - __m256i sumi2 = __lasx_xvldi(0); - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { - const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); 
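// [editor's note] A shared idiom in the iq3_xxs kernels above and below: each
// uint32 sign word carries four 7-bit sign patterns in bits 0..27 plus the
// 4-bit block scale in bits 28..31 (hence the >> 0/7/14/21 and >> 28
// extractions), and signs64[s & 127] expands one 7-bit pattern into 8 sign
// bytes, the eighth sign chosen so the number of negative signs stays even
// (the keven_signs_q2xs convention). A hedged scalar sketch of that
// expansion -- illustrative only, expand_signs7 is not part of this file:
//
//     static inline uint64_t expand_signs7(uint32_t s7) {
//         // bit j set -> sign byte j is 0xff (-1), else 0x01 (+1);
//         // bit 7 is the parity of bits 0..6, keeping overall parity even
//         const uint32_t s8 = (s7 & 127) | ((__builtin_popcount(s7 & 127) & 1) << 7);
//         uint64_t out = 0;
//         for (int j = 0; j < 8; ++j) {
//             out |= (uint64_t)(((s8 >> j) & 1) ? 0xff : 0x01) << (8*j);
//         }
//         return out;
//     }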
q8 += 32; - const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; - const __m256i q2_1 = lasx_set_w(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]], - iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]); - q3 += 8; - const __m256i q2_2 = lasx_set_w(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]], - iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]); - q3 += 8; - memcpy(aux32, gas, 8); gas += 8; - - const __m256i s2_1 = lasx_set_d(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127], - signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]); - const __m256i s2_2 = lasx_set_d(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127], - signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]); - const __m256i q8s_1 = __lasx_xvsigncov_b(s2_1, q8_1); - const __m256i q8s_2 = __lasx_xvsigncov_b(s2_2, q8_2); - const __m256i dot1 = lasx_maddubs_h(q2_1, q8s_1); - const __m256i dot2 = lasx_maddubs_h(q2_2, q8s_2); - const uint16_t ls1 = aux32[0] >> 28; - const uint16_t ls2 = aux32[1] >> 28; - - const __m256i p1 = lasx_madd_h(dot1, __lasx_xvreplgr2vr_h(2*ls1+1)); - const __m256i p2 = lasx_madd_h(dot2, __lasx_xvreplgr2vr_h(2*ls2+1)); - sumi1 = __lasx_xvadd_w(sumi1, p1); - sumi2 = __lasx_xvadd_w(sumi2, p2); - } - - accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf); - } - - *s = 0.25f * hsum_float_8(accumf); - -#else - - uint32_t aux32; - - float sumf = 0.f; - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * GGML_RESTRICT q3 = x[i].qs; - const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - int32_t bsum = 0; - for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { - memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t); - const uint32_t ls = 2*(aux32 >> 28) + 1; - int32_t sumi = 0; - for (int l = 0; l < 4; ++l) { - const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]); - const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]); - const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127]; - for (int j = 0; j < 4; ++j) { - sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1); - sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? 
-1 : 1); - } - q8 += 8; - } - q3 += 8; - bsum += sumi * ls; - } - sumf += d * bsum; - } - *s = 0.25f * sumf; -#endif -} - -void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - assert(n % QK_K == 0); - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_iq3_s * GGML_RESTRICT x = vx; - const block_q8_K * GGML_RESTRICT y = vy; - - const int nb = n / QK_K; - -#if defined(__ARM_NEON) - - typedef union { - uint16x8_t vec_index; - uint16_t index[8]; - } vec_index_t; - - static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, - 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 - }; - - static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,}; - - static const int16_t k_shift[8] = {8, 7, 6, 5, 4, 3, 2, 1}; - - const ggml_uint8x16x2_t mask1 = ggml_vld1q_u8_x2(k_mask1); - const uint8x16_t mask2 = vld1q_u8(k_mask2); - - const int16x8_t hshift = vld1q_s16(k_shift); - const uint16x8_t m256 = vdupq_n_u16(256); - const uint8x16_t m1 = vdupq_n_u8(1); - - uint8x16x2_t vs; - ggml_int8x16x4_t q3s; - ggml_int8x16x4_t q8b; - vec_index_t idx; - - uint32_t scales32[2]; - const uint8_t * scales8 = (const uint8_t *)scales32; - - float sumf = 0; - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * GGML_RESTRICT qs = x[i].qs; - const uint8_t * GGML_RESTRICT qh = x[i].qh; - const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - memcpy(scales32, x[i].scales, 4); - scales32[1] = (((scales32[0] >> 4) & 0x0f0f0f0f) << 1) | 0x01010101; - scales32[0] = ((scales32[0] & 0x0f0f0f0f) << 1) | 0x01010101; - - int sumi1 = 0, sumi2 = 0; - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { - q8b = ggml_vld1q_s8_x4(q8); q8 += 64; - - const uint8x16_t idx_l = vld1q_u8(qs); qs += 16; - idx.vec_index = vorrq_u16(vmovl_u8(vget_low_u8 (idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+0]), hshift), m256)); - const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]], - iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]); - const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]], - iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]); - idx.vec_index = vorrq_u16(vmovl_u8(vget_high_u8(idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+1]), hshift), m256)); - const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]], - iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]); - const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]], - iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]); - - - vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | ((uint32_t) signs[1] << 16))); - vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2); - vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2); - vs.val[0] = vorrq_u8(vceqq_u8(vs.val[0], mask2), m1); - vs.val[1] = vorrq_u8(vceqq_u8(vs.val[1], mask2), m1); - - q3s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vs.val[0]), vreinterpretq_s8_u32(aux32x4_0)); - q3s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vs.val[1]), vreinterpretq_s8_u32(aux32x4_1)); - - vs.val[0] = 
vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | ((uint32_t) signs[3] << 16))); - vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2); - vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2); - vs.val[0] = vorrq_u8(vceqq_u8(vs.val[0], mask2), m1); - vs.val[1] = vorrq_u8(vceqq_u8(vs.val[1], mask2), m1); - - signs += 4; - - q3s.val[2] = vmulq_s8(vreinterpretq_s8_u8(vs.val[0]), vreinterpretq_s8_u32(aux32x4_2)); - q3s.val[3] = vmulq_s8(vreinterpretq_s8_u8(vs.val[1]), vreinterpretq_s8_u32(aux32x4_3)); - - const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]); - const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]); - - sumi1 += vaddvq_s32(p1) * scales8[ib32/2+0]; - sumi2 += vaddvq_s32(p2) * scales8[ib32/2+4]; - } - sumf += d*(sumi1 + sumi2); - } - *s = sumf; - -#elif defined(__AVX2__) - - static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, - 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 - }; - - static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - }; - - const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1); - const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2); - - const __m256i idx_shift = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); - const __m256i idx_mask = _mm256_set1_epi32(256); - - typedef union { - __m256i vec[2]; - uint32_t index[16]; - } index_t; - - index_t idx; - - __m256 accumf = _mm256_setzero_ps(); - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * GGML_RESTRICT qs = x[i].qs; - const uint8_t * GGML_RESTRICT qh = x[i].qh; - const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - __m256i sumi1 = _mm256_setzero_si256(); - __m256i sumi2 = _mm256_setzero_si256(); - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { - const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; - const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; - const __m256i idx_l = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)qs)); qs += 16; - idx.vec[0] = _mm256_set1_epi32(qh[ib32+0]); - idx.vec[1] = _mm256_set1_epi32(qh[ib32+1]); - idx.vec[0] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[0], idx_shift), idx_mask); - idx.vec[1] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[1], idx_shift), idx_mask); - idx.vec[0] = _mm256_or_si256(idx.vec[0], _mm256_cvtepi16_epi32(_mm256_castsi256_si128(idx_l))); - idx.vec[1] = _mm256_or_si256(idx.vec[1], _mm256_cvtepi16_epi32(_mm256_extractf128_si256(idx_l, 1))); - - // At least on my CPU (Ryzen 7950X), using _mm256_i32gather_epi32 is slower than _mm256_set_epi32. Strange.
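// [editor's note] The cmpeq/xor/sub sequence a few lines below is branchless
// conditional negation: s2 is 0xff in every byte whose sign bit is set and
// 0x00 elsewhere, so (q8 ^ s2) - s2 leaves a byte unchanged when s2 == 0x00
// and produces ~q8 + 1 == -q8 when s2 == 0xff. A hedged scalar equivalent --
// illustrative only, cond_negate is not part of this file:
//
//     static inline int8_t cond_negate(int8_t v, uint8_t s /* 0x00 or 0xff */) {
//         return (int8_t)((v ^ (int8_t)s) - (int8_t)s);
//     }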
- //const __m256i q2_1 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[0], 4); - //const __m256i q2_2 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[1], 4); - const __m256i q2_1 = _mm256_set_epi32( - iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]], - iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]] - ); - const __m256i q2_2 = _mm256_set_epi32( - iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]], - iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[ 9]], iq3s_grid[idx.index[ 8]] - ); - - __m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16)); - aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2); - const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2); - const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1); - - aux256 = _mm256_set1_epi32(signs[2] | (signs[3] << 16)); - aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2); - const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2); - const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2); - - signs += 4; - - const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); - const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); - const uint16_t ls1 = x[i].scales[ib32/2] & 0xf; - const uint16_t ls2 = x[i].scales[ib32/2] >> 4; - const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1)); - const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1)); - sumi1 = _mm256_add_epi32(sumi1, p1); - sumi2 = _mm256_add_epi32(sumi2, p2); - } - - accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); - - } - - *s = hsum_float_8(accumf); - -#elif defined(__AVX__) - static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, - 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 - }; - - static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - }; - - const __m128i mask1_0 = _mm_loadu_si128((const __m128i*)k_mask1); - const __m128i mask1_1 = _mm_loadu_si128((const __m128i*)k_mask1 + 1); - const __m128i mask2_0 = _mm_loadu_si128((const __m128i*)k_mask2); - const __m128i mask2_1 = _mm_loadu_si128((const __m128i*)k_mask2 + 1); - - const __m128i idx_mul_0 = _mm_set_epi32(32, 64, 128, 256); - const __m128i idx_mul_1 = _mm_set_epi32(2, 4, 8, 16); - const __m128i idx_mask = _mm_set1_epi32(256); - - typedef union { - __m128i vec[4]; - uint32_t index[16]; - } index_t; - - index_t idx; - - __m256 accumf = _mm256_setzero_ps(); - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * GGML_RESTRICT qs = x[i].qs; - const uint8_t * GGML_RESTRICT qh = x[i].qh; - const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - __m128i sumi1_0 = _mm_setzero_si128(); - __m128i sumi1_1 = _mm_setzero_si128(); - __m128i sumi2_0 = _mm_setzero_si128(); - __m128i sumi2_1 = _mm_setzero_si128(); - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { - const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8_1_1 = _mm_loadu_si128((const __m128i 
*)q8); q8 += 16; - const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i qs_tmp = _mm_loadu_si128((const __m128i *)qs); - const __m128i idx_l_0 = _mm_cvtepu8_epi16(qs_tmp); - const __m128i idx_l_1 = _mm_cvtepu8_epi16(_mm_srli_si128(qs_tmp, 8)); qs += 16; - idx.vec[0] = _mm_set1_epi32(qh[ib32+0]); - idx.vec[1] = idx.vec[0]; - idx.vec[2] = _mm_set1_epi32(qh[ib32+1]); - idx.vec[3] = idx.vec[2]; - - idx.vec[0] = _mm_and_si128(_mm_mullo_epi32(idx.vec[0], idx_mul_0), idx_mask); - idx.vec[1] = _mm_and_si128(_mm_mullo_epi32(idx.vec[1], idx_mul_1), idx_mask); - idx.vec[2] = _mm_and_si128(_mm_mullo_epi32(idx.vec[2], idx_mul_0), idx_mask); - idx.vec[3] = _mm_and_si128(_mm_mullo_epi32(idx.vec[3], idx_mul_1), idx_mask); - - idx.vec[0] = _mm_or_si128(idx.vec[0], _mm_cvtepi16_epi32(idx_l_0)); - idx.vec[1] = _mm_or_si128(idx.vec[1], _mm_cvtepi16_epi32(_mm_srli_si128(idx_l_0, 8))); - idx.vec[2] = _mm_or_si128(idx.vec[2], _mm_cvtepi16_epi32(idx_l_1)); - idx.vec[3] = _mm_or_si128(idx.vec[3], _mm_cvtepi16_epi32(_mm_srli_si128(idx_l_1, 8))); - - const __m128i q2_1_0 = _mm_set_epi32(iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]); - const __m128i q2_1_1 = _mm_set_epi32(iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]]); - const __m128i q2_2_0 = _mm_set_epi32(iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[9]], iq3s_grid[idx.index[8]]); - const __m128i q2_2_1 = _mm_set_epi32(iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]]); - - __m128i aux128_0 = _mm_set1_epi32(signs[0] | (signs[1] << 16)); - __m128i aux128_1 = aux128_0; - aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0); - aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1); - const __m128i s2_1_0 = _mm_cmpeq_epi8(aux128_0, mask2_0); - const __m128i s2_1_1 = _mm_cmpeq_epi8(aux128_1, mask2_1); - const __m128i q8s_1_0 = _mm_sub_epi8(_mm_xor_si128(s2_1_0, q8_1_0), s2_1_0); - const __m128i q8s_1_1 = _mm_sub_epi8(_mm_xor_si128(s2_1_1, q8_1_1), s2_1_1); - - aux128_0 = _mm_set1_epi32(signs[2] | (signs[3] << 16)); - aux128_1 = aux128_0; - aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0); - aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1); - const __m128i s2_2_0 = _mm_cmpeq_epi8(aux128_0, mask2_0); - const __m128i s2_2_1 = _mm_cmpeq_epi8(aux128_1, mask2_1); - const __m128i q8s_2_0 = _mm_sub_epi8(_mm_xor_si128(s2_2_0, q8_2_0), s2_2_0); - const __m128i q8s_2_1 = _mm_sub_epi8(_mm_xor_si128(s2_2_1, q8_2_1), s2_2_1); - - signs += 4; - - const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0); - const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1); - const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0); - const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1); - const uint16_t ls1 = x[i].scales[ib32/2] & 0xf; - const uint16_t ls2 = x[i].scales[ib32/2] >> 4; - const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1)); - const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1)); - const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1)); - const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1)); - sumi1_0 = _mm_add_epi32(sumi1_0, p1_0); - sumi1_1 = _mm_add_epi32(sumi1_1, p1_1); - sumi2_0 = _mm_add_epi32(sumi2_0, p2_0); - sumi2_1 = _mm_add_epi32(sumi2_1, 
p2_1); - } - - accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf); - - } - - *s = hsum_float_8(accumf); - -#elif defined(__POWER9_VECTOR__) - static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, - 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 - }; - - static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,}; - - const vector int v0 = vec_splats((int32_t)0); - - vector float vsumf0 = vec_splats(0.0f); - vector float vsumf1 = vec_splats(0.0f); - vector float vsumf2 = vec_splats(0.0f); - vector float vsumf3 = vec_splats(0.0f); - - const vector unsigned char mask0 = vec_xl( 0, k_mask1); - const vector unsigned char mask1 = vec_xl(16, k_mask1); - const vector signed char mask2 = (vector signed char)vec_xl( 0, k_mask2); - - for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); - vector float vyd = vec_splats(y[i].d); - vector float vd = vec_mul(vxd, vyd); - - const uint8_t * GGML_RESTRICT q3 = x[i].qs; - const uint8_t * GGML_RESTRICT qh = x[i].qh; - const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].signs); - const uint8_t * GGML_RESTRICT sc = x[i].scales; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - vector signed int vsumi0 = v0; - vector signed int vsumi1 = v0; - vector signed int vsumi2 = v0; - vector signed int vsumi3 = v0; - - for (int j = 0; j < QK_K/32; j += 2) { - __builtin_prefetch(q3, 0, 1); - __builtin_prefetch(q8, 0, 1); - - vector unsigned int aux32x4_0 = {iq3s_grid[q3[ 0] | ((qh[0] << 8) & 256)], iq3s_grid[q3[ 1] | ((qh[0] << 7) & 256)], - iq3s_grid[q3[ 2] | ((qh[0] << 6) & 256)], iq3s_grid[q3[ 3] | ((qh[0] << 5) & 256)]}; - vector unsigned int aux32x4_1 = {iq3s_grid[q3[ 4] | ((qh[0] << 4) & 256)], iq3s_grid[q3[ 5] | ((qh[0] << 3) & 256)], - iq3s_grid[q3[ 6] | ((qh[0] << 2) & 256)], iq3s_grid[q3[ 7] | ((qh[0] << 1) & 256)]}; - vector unsigned int aux32x4_2 = {iq3s_grid[q3[ 8] | ((qh[1] << 8) & 256)], iq3s_grid[q3[ 9] | ((qh[1] << 7) & 256)], - iq3s_grid[q3[10] | ((qh[1] << 6) & 256)], iq3s_grid[q3[11] | ((qh[1] << 5) & 256)]}; - vector unsigned int aux32x4_3 = {iq3s_grid[q3[12] | ((qh[1] << 4) & 256)], iq3s_grid[q3[13] | ((qh[1] << 3) & 256)], - iq3s_grid[q3[14] | ((qh[1] << 2) & 256)], iq3s_grid[q3[15] | ((qh[1] << 1) & 256)]}; - q3 += 16; - qh += 2; - - vector signed char vsigns01 = (vector signed char)vec_splats(*(const uint32_t *)&signs[0]); - vector signed char vsigns02 = (vector signed char)vec_splats(*(const uint32_t *)&signs[2]); - signs += 4; - - vector signed char vsigns0 = vec_perm(vsigns01, vsigns01, mask0); - vector signed char vsigns1 = vec_perm(vsigns01, vsigns01, mask1); - vector signed char vsigns2 = vec_perm(vsigns02, vsigns02, mask0); - vector signed char vsigns3 = vec_perm(vsigns02, vsigns02, mask1); - - vsigns0 = (vector signed char)vec_cmpeq(vec_and(vsigns0, mask2), mask2); - vsigns1 = (vector signed char)vec_cmpeq(vec_and(vsigns1, mask2), mask2); - vsigns2 = (vector signed char)vec_cmpeq(vec_and(vsigns2, mask2), mask2); - vsigns3 = (vector signed char)vec_cmpeq(vec_and(vsigns3, mask2), mask2); - - vector signed char q3x0 = vec_sub(vec_xor(vsigns0, (vector signed char)aux32x4_0), vsigns0); - vector signed char q3x1 = vec_sub(vec_xor(vsigns1, (vector signed char)aux32x4_1), vsigns1); - vector 
signed char q3x2 = vec_sub(vec_xor(vsigns2, (vector signed char)aux32x4_2), vsigns2); - vector signed char q3x3 = vec_sub(vec_xor(vsigns3, (vector signed char)aux32x4_3), vsigns3); - - vector signed char q8y0 = vec_xl( 0, q8); - vector signed char q8y1 = vec_xl(16, q8); - vector signed char q8y2 = vec_xl(32, q8); - vector signed char q8y3 = vec_xl(48, q8); - q8 += 64; - - vector signed short qv0 = vec_add(vec_mule(q3x0, q8y0), vec_mulo(q3x0, q8y0)); - vector signed short qv1 = vec_add(vec_mule(q3x1, q8y1), vec_mulo(q3x1, q8y1)); - vector signed short qv2 = vec_add(vec_mule(q3x2, q8y2), vec_mulo(q3x2, q8y2)); - vector signed short qv3 = vec_add(vec_mule(q3x3, q8y3), vec_mulo(q3x3, q8y3)); - - const uint16_t ls0 = (uint16_t)(sc[0] & 0xf); - const uint16_t ls1 = (uint16_t)(sc[0] >> 4); - sc ++; - - vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1)); - vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1)); - - vsumi0 = vec_msum(qv0, vscales01, vsumi0); - vsumi1 = vec_msum(qv1, vscales01, vsumi1); - vsumi2 = vec_msum(qv2, vscales23, vsumi2); - vsumi3 = vec_msum(qv3, vscales23, vsumi3); - } - - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); - vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); - vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); - vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); - } - - vsumf0 = vec_add(vsumf0, vsumf2); - vsumf1 = vec_add(vsumf1, vsumf3); - - vsumf0 = vec_add(vsumf0, vsumf1); - - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - - *s = vec_extract(vsumf0, 0); - -#elif defined(__loongarch_asx) - - static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, - 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 - }; - - static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - }; - - const __m256i mask1 = __lasx_xvld((const __m256i*)k_mask1, 0); - const __m256i mask2 = __lasx_xvld((const __m256i*)k_mask2, 0); - - __m256i idx_shift = lasx_set_w(1, 2, 3, 4, 5, 6, 7, 8); - const __m256i idx_mask = __lasx_xvreplgr2vr_w(256); - - typedef union { - __m256i vec[2]; - uint32_t index[16]; - } index_t; - - index_t idx; - - __m256 accumf = (__m256)__lasx_xvldi(0); - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * GGML_RESTRICT qs = x[i].qs; - const uint8_t * GGML_RESTRICT qh = x[i].qh; - const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - __m256i sumi1 = __lasx_xvldi(0); - __m256i sumi2 = __lasx_xvldi(0); - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { - const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; - const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; - const __m256i idx_l = lasx_extu8_16(__lsx_vld(qs, 0)); qs += 16; - idx.vec[0] = __lasx_xvreplgr2vr_w(qh[ib32+0]); - idx.vec[1] = __lasx_xvreplgr2vr_w(qh[ib32+1]); - idx.vec[0] = __lasx_xvand_v(__lasx_xvsll_w(idx.vec[0], idx_shift), idx_mask); - idx.vec[1] = __lasx_xvand_v(__lasx_xvsll_w(idx.vec[1], idx_shift), idx_mask); - idx.vec[0] = __lasx_xvor_v(idx.vec[0], lasx_ext16_32(lasx_extracti128(idx_l, 0))); - idx.vec[1] = __lasx_xvor_v(idx.vec[1], 
lasx_ext16_32(lasx_extracti128(idx_l, 1))); - - // At least on my CPU (Ryzen 7950X), using _mm256_i32gather_epi32 is slower than _mm256_set_epi32. Strange. - //const __m256i q2_1 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[0], 4); - //const __m256i q2_2 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[1], 4); - const __m256i q2_1 = lasx_set_w( - iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]], - iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]] - ); - const __m256i q2_2 = lasx_set_w( - iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]], - iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[ 9]], iq3s_grid[idx.index[ 8]] - ); - - __m256i aux256 = __lasx_xvreplgr2vr_w(signs[0] | (signs[1] << 16)); - aux256 = __lasx_xvand_v(lasx_shuffle_b(aux256,mask1), mask2); - const __m256i s2_1 = __lasx_xvseq_b(aux256, mask2); - const __m256i q8s_1 = __lasx_xvsub_b(__lasx_xvxor_v(s2_1, q8_1), s2_1); - - aux256 = __lasx_xvreplgr2vr_w(signs[2] | (signs[3] << 16)); - aux256 = __lasx_xvand_v(lasx_shuffle_b(aux256,mask1), mask2); - const __m256i s2_2 = __lasx_xvseq_b(aux256, mask2); - const __m256i q8s_2 = __lasx_xvsub_b(__lasx_xvxor_v(s2_2, q8_2), s2_2); - - signs += 4; - - const __m256i dot1 = lasx_maddubs_h(q2_1, q8s_1); - const __m256i dot2 = lasx_maddubs_h(q2_2, q8s_2); - const uint16_t ls1 = x[i].scales[ib32/2] & 0xf; - const uint16_t ls2 = x[i].scales[ib32/2] >> 4; - const __m256i p1 = lasx_madd_h(dot1, __lasx_xvreplgr2vr_h(2*ls1+1)); - const __m256i p2 = lasx_madd_h(dot2, __lasx_xvreplgr2vr_h(2*ls2+1)); - sumi1 = __lasx_xvadd_w(sumi1, p1); - sumi2 = __lasx_xvadd_w(sumi2, p2); - } - - accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf); - } - - *s = hsum_float_8(accumf); - -#else - - float sumf = 0.f; - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * GGML_RESTRICT qs = x[i].qs; - const uint8_t * GGML_RESTRICT qh = x[i].qh; - const uint8_t * GGML_RESTRICT signs = x[i].signs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - int32_t bsum = 0; - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { - const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1; - const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1; - int32_t sumi = 0; - for (int l = 0; l < 4; ++l) { - const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256))); - const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256))); - for (int j = 0; j < 4; ++j) { - sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1); - sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1); - } - q8 += 8; - } - qs += 8; - signs += 4; - bsum += sumi * ls1; - sumi = 0; - for (int l = 0; l < 4; ++l) { - const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256))); - const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256))); - for (int j = 0; j < 4; ++j) { - sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1); - sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? 
-1 : 1); - } - q8 += 8; - } - qs += 8; - signs += 4; - bsum += sumi * ls2; - } - sumf += d * bsum; - } - *s = sumf; -#endif -} - -#if defined(__AVX2__) -static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) { - const __m256i ax = _mm256_sign_epi8(x, x); - const __m256i sy = _mm256_sign_epi8(y, x); - return _mm256_maddubs_epi16(ax, sy); -} -#elif defined(__loongarch_asx) -static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) { - const __m256i a = __lasx_xvmulwev_h_b(x, y); - const __m256i b = __lasx_xvmulwod_h_b(x, y); - return __lasx_xvadd_h(a, b); -} -#endif - -void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - assert(n % QK_K == 0); - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_iq1_s * GGML_RESTRICT x = vx; - const block_q8_K * GGML_RESTRICT y = vy; - - const int nb = n / QK_K; - -#if defined __ARM_NEON - - ggml_int8x16x4_t q1b; - ggml_int8x16x4_t q8b; - - float sumf = 0; - for (int i = 0; i < nb; ++i) { - - const int8_t * q8 = y[i].qs; - const uint8_t * qs = x[i].qs; - const uint16_t * qh = x[i].qh; - - int sumi1 = 0, sumi2 = 0, sumi3 = 0; - - for (int ib = 0; ib < QK_K/32; ib += 2) { - - q1b.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[0] | ((qh[ib+0] << 8) & 0x700)))), - vld1_s8((const int8_t *)(iq1s_grid + (qs[1] | ((qh[ib+0] << 5) & 0x700))))); - q1b.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[2] | ((qh[ib+0] << 2) & 0x700)))), - vld1_s8((const int8_t *)(iq1s_grid + (qs[3] | ((qh[ib+0] >> 1) & 0x700))))); - q1b.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[4] | ((qh[ib+1] << 8) & 0x700)))), - vld1_s8((const int8_t *)(iq1s_grid + (qs[5] | ((qh[ib+1] << 5) & 0x700))))); - q1b.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[6] | ((qh[ib+1] << 2) & 0x700)))), - vld1_s8((const int8_t *)(iq1s_grid + (qs[7] | ((qh[ib+1] >> 1) & 0x700))))); - qs += 8; - - q8b = ggml_vld1q_s8_x4(q8); q8 += 64; - - const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q1b.val[0], q8b.val[0]), q1b.val[1], q8b.val[1]); - const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q1b.val[2], q8b.val[2]), q1b.val[3], q8b.val[3]); - - const int ls1 = 2*((qh[ib+0] >> 12) & 7) + 1; - const int ls2 = 2*((qh[ib+1] >> 12) & 7) + 1; - sumi1 += vaddvq_s32(p1) * ls1; - sumi2 += vaddvq_s32(p2) * ls2; - sumi3 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * ls1 * (qh[ib+0] & 0x8000 ? -1 : 1) - + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * ls2 * (qh[ib+1] & 0x8000 ? 
-1 : 1); - - } - - sumf += y[i].d * GGML_FP16_TO_FP32(x[i].d) * (sumi1 + sumi2 + IQ1S_DELTA * sumi3); - } - - *s = sumf; - -#elif defined __AVX2__ - - __m256 accum = _mm256_setzero_ps(); - float accum1 = 0; - for (int i = 0; i < nb; ++i) { - - const int8_t * q8 = y[i].qs; - const uint8_t * qs = x[i].qs; - const uint16_t * qh = x[i].qh; - - __m256i sumi = _mm256_setzero_si256(); - int sumi1 = 0; - for (int ib = 0; ib < QK_K/32; ib += 2) { -#ifdef __BMI2__ - const uint64_t packed_idx1 = _pdep_u64(*(const uint32_t *)qs, 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib], 0x700070007000700ULL); - const uint64_t packed_idx2 = _pdep_u64(*(const uint32_t *)(qs + 4), 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib + 1], 0x700070007000700ULL); - const uint16_t *idx1 = (const uint16_t *)(&packed_idx1); - const uint16_t *idx2 = (const uint16_t *)(&packed_idx2); - const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[idx1[3]], iq1s_grid[idx1[2]], iq1s_grid[idx1[1]], iq1s_grid[idx1[0]]); - const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[idx2[3]], iq1s_grid[idx2[2]], iq1s_grid[idx2[1]], iq1s_grid[idx2[0]]); -#else - const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)], - iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]); - const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)], - iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]); -#endif - qs += 8; - const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - - const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1); - const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2); - const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1; - const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1; - const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(ls1)); - const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(ls2)); - - sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p1, p2)); - sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1 - + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? 
-1 : 1) * ls2; - } - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - accum = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi), accum); - accum1 += d * sumi1; - - } - - *s = hsum_float_8(accum) + IQ1S_DELTA * accum1; - -#elif defined __AVX__ - __m256 accum = _mm256_setzero_ps(); - float accum1 = 0; - for (int i = 0; i < nb; ++i) { - - const int8_t * q8 = y[i].qs; - const uint8_t * qs = x[i].qs; - const uint16_t * qh = x[i].qh; - - __m128i sumi1_0 = _mm_setzero_si128(); - __m128i sumi1_1 = _mm_setzero_si128(); - int sumi1 = 0; - for (int ib = 0; ib < QK_K/32; ib += 2) { - const __m128i q1b_1_0 = _mm_set_epi64x(iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]); - const __m128i q1b_1_1 = _mm_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)]); - const __m128i q1b_2_0 = _mm_set_epi64x(iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]); - const __m128i q1b_2_1 = _mm_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)]); - qs += 8; - const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - - const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0); - const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1); - const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0); - const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1); - const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1; - const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1; - const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(ls1)); - const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(ls1)); - const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(ls2)); - const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(ls2)); - - sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0)); - sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1)); - sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1 - + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? 
-1 : 1) * ls2; - } - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum); - accum1 += d * sumi1; - - } - - *s = hsum_float_8(accum) + IQ1S_DELTA * accum1; - -#elif defined(__POWER9_VECTOR__) - const vector unsigned char v0 = vec_splats((unsigned char)0x0); - const vector unsigned short vsign = vec_splats((unsigned short)0x8000); - - vector float vsumf0 = vec_splats(0.0f); - vector float vsumf1 = vec_splats(0.0f); - vector float vsumf2 = vec_splats(0.0f); - vector float vsumf3 = vec_splats(0.0f); - - for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); - vector float vyd = vec_splats(y[i].d); - vector float vd = vec_mul(vxd, vyd); - - vector signed int vsumi0 = vec_splats((int32_t)0); - vector signed int vsumi1 = vec_splats((int32_t)0); - vector signed int vsumi2 = vec_splats((int32_t)0); - vector signed int vsumi3 = vec_splats((int32_t)0); - vector signed int vsumi8 = vec_splats((int32_t)0); - - const uint8_t * GGML_RESTRICT q1 = x[i].qs; - const uint16_t * GGML_RESTRICT qh = x[i].qh; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - const int16_t * GGML_RESTRICT qs = y[i].bsums; - - for (int j = 0; j < QK_K/32; j += 2) { - __builtin_prefetch(q1, 0, 1); - __builtin_prefetch(qh, 0, 1); - __builtin_prefetch(q8, 0, 1); - - vector signed long long aux64x2_0 = {*(const int64_t *)(iq1s_grid + (q1[0] | ((qh[0] << 8) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[1] | ((qh[0] << 5) & 0x700)))}; - vector signed long long aux64x2_1 = {*(const int64_t *)(iq1s_grid + (q1[2] | ((qh[0] << 2) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[3] | ((qh[0] >> 1) & 0x700)))}; - vector signed long long aux64x2_2 = {*(const int64_t *)(iq1s_grid + (q1[4] | ((qh[1] << 8) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[5] | ((qh[1] << 5) & 0x700)))}; - vector signed long long aux64x2_3 = {*(const int64_t *)(iq1s_grid + (q1[6] | ((qh[1] << 2) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[7] | ((qh[1] >> 1) & 0x700)))}; - q1 += 8; - - vector signed char q1x0 = (vector signed char)aux64x2_0; - vector signed char q1x1 = (vector signed char)aux64x2_1; - vector signed char q1x2 = (vector signed char)aux64x2_2; - vector signed char q1x3 = (vector signed char)aux64x2_3; - - vector signed char q8y0 = vec_xl( 0, q8); - vector signed char q8y1 = vec_xl(16, q8); - vector signed char q8y2 = vec_xl(32, q8); - vector signed char q8y3 = vec_xl(48, q8); - q8 += 64; - - vector signed short qv0 = vec_add(vec_mule(q1x0, q8y0), vec_mulo(q1x0, q8y0)); - vector signed short qv1 = vec_add(vec_mule(q1x1, q8y1), vec_mulo(q1x1, q8y1)); - vector signed short qv2 = vec_add(vec_mule(q1x2, q8y2), vec_mulo(q1x2, q8y2)); - vector signed short qv3 = vec_add(vec_mule(q1x3, q8y3), vec_mulo(q1x3, q8y3)); - - const uint16_t ls0 = (uint16_t)((qh[0] >> 12) & 7); - const uint16_t ls1 = (uint16_t)((qh[1] >> 12) & 7); - - vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1)); - vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1)); - vector signed short vscales = vec_sld(vscales23, vscales01, 8); - - vsumi0 = vec_msum(qv0, vscales01, vsumi0); - vsumi1 = vec_msum(qv1, vscales01, vsumi1); - vsumi2 = vec_msum(qv2, vscales23, vsumi2); - vsumi3 = vec_msum(qv3, vscales23, vsumi3); - - vector signed short q8ysums = vec_xl_len(qs, 8); - qs += 4; - q8ysums = vec_mergeh(q8ysums, (vector signed short)v0); - - vector signed short qxh = (vector signed 
short)vec_sld(vec_splats(qh[1]), vec_splats(qh[0]), 8); - qh += 2; - vector __bool short vsel = vec_cmpge(qxh, (vector signed short)v0); - - vector signed short q8ysum = vec_sel((vector signed short)vec_xor((vector unsigned short)q8ysums, vsign), q8ysums, vsel); - - vsumi8 = vec_add(vec_mule(q8ysum, vscales), vsumi8); - } - - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); - vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); - vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); - vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); - - vsumf0 = vec_madd(vec_ctf(vsumi8, 0), vec_mul(vd, vec_splats(IQ1S_DELTA)), vsumf0); - } - - vsumf0 = vec_add(vsumf0, vsumf2); - vsumf1 = vec_add(vsumf1, vsumf3); - - vsumf0 = vec_add(vsumf0, vsumf1); - - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - - *s = vec_extract(vsumf0, 0); - -#elif defined(__loongarch_asx) - - __m256 accum = (__m256)__lasx_xvldi(0); - float accum1 = 0; - for (int i = 0; i < nb; ++i) { - - const int8_t * q8 = y[i].qs; - const uint8_t * qs = x[i].qs; - const uint16_t * qh = x[i].qh; - - __m256i sumi = __lasx_xvldi(0); - int sumi1 = 0; - for (int ib = 0; ib < QK_K/32; ib += 2) { - __m256i q1b_1 = __lasx_xvinsgr2vr_d(q1b_1, iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)], 0); - q1b_1 = __lasx_xvinsgr2vr_d(q1b_1, iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], 1); - q1b_1 = __lasx_xvinsgr2vr_d(q1b_1, iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)], 2); - q1b_1 = __lasx_xvinsgr2vr_d(q1b_1, iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], 3); - - __m256i q1b_2 = __lasx_xvinsgr2vr_d(q1b_2, iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)], 0); - q1b_2 = __lasx_xvinsgr2vr_d(q1b_2, iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], 1); - q1b_2 = __lasx_xvinsgr2vr_d(q1b_2, iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)], 2); - q1b_2 = __lasx_xvinsgr2vr_d(q1b_2, iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], 3); - - qs += 8; - const __m256i q8b_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - const __m256i q8b_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - - const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1); - const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2); - const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1; - const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1; - - __m256i tmp1, tmp5, tmp6; - tmp1 = __lasx_xvreplgr2vr_h(ls1); - tmp5 = __lasx_xvmulwev_w_h(dot1, tmp1); - tmp6 = __lasx_xvmulwod_w_h(dot1, tmp1); - const __m256i p1 = __lasx_xvadd_w(tmp5, tmp6); - - tmp1 = __lasx_xvreplgr2vr_h(ls2); - tmp5 = __lasx_xvmulwev_w_h(dot2, tmp1); - tmp6 = __lasx_xvmulwod_w_h(dot2, tmp1); - const __m256i p2 = __lasx_xvadd_w(tmp5, tmp6); - - sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p1, p2)); - sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1 - + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2; - } - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - accum = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), accum); - accum1 += d * sumi1; - } - - *s = hsum_float_8(accum) + IQ1S_DELTA * accum1; - -#else - - float sumf = 0; - for (int i = 0; i < nb; i++) { - - const int8_t * q8 = y[i].qs; - const uint8_t * qs = x[i].qs; - const uint16_t * qh = x[i].qh; - - int sumi = 0, sumi1 = 0; - for (int ib = 0; ib < QK_K/32; ++ib) { - const int ls = 2*((qh[ib] >> 12) & 7) + 1; - const int delta = qh[ib] & 0x8000 ? 
-1 : 1; - int lsum = 0; - for (int l = 0; l < 4; ++l) { - const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8))); - for (int j = 0; j < 8; ++j) { - lsum += q8[j] * grid[j]; - } - q8 += 8; - } - sumi += ls * lsum; - sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]); - qs += 4; - } - - sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1); - } - - *s = sumf; - -#endif -} - -void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - assert(n % QK_K == 0); - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_iq1_m * GGML_RESTRICT x = vx; - const block_q8_K * GGML_RESTRICT y = vy; - - const int nb = n / QK_K; - - iq1m_scale_t scale; - -#if defined __ARM_NEON - const int32x4_t mask = vdupq_n_s32(0x7); - const int32x4_t mone = vdupq_n_s32(1); - const int32x4_t mzero = vdupq_n_s32(0); - - ggml_int8x16x4_t deltas; - deltas.val[0] = vcombine_s8(vdup_n_s8(+1), vdup_n_s8(+1)); - deltas.val[1] = vcombine_s8(vdup_n_s8(-1), vdup_n_s8(+1)); - deltas.val[2] = vcombine_s8(vdup_n_s8(+1), vdup_n_s8(-1)); - deltas.val[3] = vcombine_s8(vdup_n_s8(-1), vdup_n_s8(-1)); - - ggml_int8x16x4_t q1b; - ggml_int8x16x4_t q8b; - - uint32_t aux32; - const uint8_t * aux8 = (const uint8_t *)&aux32; - - float sumf = 0; - for (int i = 0; i < nb; ++i) { - - const int8_t * q8 = y[i].qs; - const uint8_t * qs = x[i].qs; - const uint8_t * qh = x[i].qh; - const uint16_t * sc = (const uint16_t *)x[i].scales; - - scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); - - int32x4_t sumi1 = mzero; - int32x4_t sumi2 = mzero; - - for (int ib = 0; ib < QK_K/32; ib += 2) { - - q1b.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[0] | ((qh[0] << 8) & 0x700)))), - vld1_s8((const int8_t *)(iq1s_grid + (qs[1] | ((qh[0] << 4) & 0x700))))); - q1b.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[2] | ((qh[1] << 8) & 0x700)))), - vld1_s8((const int8_t *)(iq1s_grid + (qs[3] | ((qh[1] << 4) & 0x700))))); - q1b.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[4] | ((qh[2] << 8) & 0x700)))), - vld1_s8((const int8_t *)(iq1s_grid + (qs[5] | ((qh[2] << 4) & 0x700))))); - q1b.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[6] | ((qh[3] << 8) & 0x700)))), - vld1_s8((const int8_t *)(iq1s_grid + (qs[7] | ((qh[3] << 4) & 0x700))))); - - q8b = ggml_vld1q_s8_x4(q8); q8 += 64; - - const int32x4_t p1 = vpaddq_s32(ggml_vdotq_s32(mzero, q1b.val[0], q8b.val[0]), ggml_vdotq_s32(mzero, q1b.val[1], q8b.val[1])); - const int32x4_t p2 = vpaddq_s32(ggml_vdotq_s32(mzero, q1b.val[2], q8b.val[2]), ggml_vdotq_s32(mzero, q1b.val[3], q8b.val[3])); - const int32x4_t p12 = vpaddq_s32(p1, p2); - - const uint32_t * qh32 = (const uint32_t *)qh; // we are 4-byte aligned, so we can do that - aux32 = ((qh32[0] >> 3) & 0x01010101) | ((qh32[0] >> 6) & 0x02020202); - - const int32x4_t p3 = vpaddq_s32(ggml_vdotq_s32(mzero, deltas.val[aux8[0]], q8b.val[0]), ggml_vdotq_s32(mzero, deltas.val[aux8[1]], q8b.val[1])); - const int32x4_t p4 = vpaddq_s32(ggml_vdotq_s32(mzero, deltas.val[aux8[2]], q8b.val[2]), ggml_vdotq_s32(mzero, deltas.val[aux8[3]], q8b.val[3])); - const int32x4_t p34 = vpaddq_s32(p3, p4); - - int32x4_t scales_4 = ggml_vld1q_u32(sc[ib/2] >> 0, sc[ib/2] >> 3, sc[ib/2] >> 6, sc[ib/2] >> 9); - - scales_4 = vaddq_s32(vshlq_n_s32(vandq_s32(scales_4, mask), 1), mone); - - sumi1 
= vmlaq_s32(sumi1, scales_4, p12); - sumi2 = vmlaq_s32(sumi2, scales_4, p34); - - qs += 8; qh += 4; - - } - - sumf += y[i].d * GGML_FP16_TO_FP32(scale.f16) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2)); - } - - *s = sumf; - -#elif defined __AVX2__ - - const __m256i mask = _mm256_set1_epi16(0x7); - const __m256i mone = _mm256_set1_epi16(1); - const __m256i mone8 = _mm256_set1_epi8(1); - const __m256i mtwo8 = _mm256_set1_epi8(2); - // VPSHUFB cannot cross 128-bit lanes so odd shifts go to upper half. - const __m256i scales_shift = _mm256_set_epi64x(9, 3, 6, 0); - - __m256 accum1 = _mm256_setzero_ps(); - __m256 accum2 = _mm256_setzero_ps(); - for (int i = 0; i < nb; ++i) { - - const int8_t * q8 = y[i].qs; - const uint8_t * qs = x[i].qs; - const uint8_t * qh = x[i].qh; - const uint16_t * sc = (const uint16_t *)x[i].scales; - - scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); - // Extract 3-bit scales (16 values) - __m256i scales = _mm256_set1_epi64x(*(const uint64_t*)sc); - scales = _mm256_srlv_epi64(scales, scales_shift); - scales = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scales, mask), 1), mone); - - // Indices to repeat each scale 8 times. - __m256i scales_idx1 = _mm256_set1_epi16(0x0100); - __m256i scales_idx2 = _mm256_add_epi8(scales_idx1, _mm256_set1_epi8(8)); - - __m256i sumi1 = _mm256_setzero_si256(); - __m256i sumi2 = _mm256_setzero_si256(); - for (int ib = 0; ib < QK_K/32; ib += 2) { -#ifdef __BMI2__ - const uint64_t packed_idx1 = _pdep_u64(*(const uint32_t *)qs, 0x00ff00ff00ff00ffULL) - | _pdep_u64(*(const uint16_t*)(qh) & 0x7777, 0xf000f000f000f00ULL); - const uint64_t packed_idx2 = _pdep_u64(*(const uint32_t *)(qs + 4), 0x00ff00ff00ff00ffULL) - | _pdep_u64(*(const uint16_t*)(qh + 2) & 0x7777, 0xf000f000f000f00ULL); - const uint16_t *idx1 = (const uint16_t *)(&packed_idx1); - const uint16_t *idx2 = (const uint16_t *)(&packed_idx2); - const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[idx1[3]], iq1s_grid[idx1[2]], iq1s_grid[idx1[1]], iq1s_grid[idx1[0]]); - const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[idx2[3]], iq1s_grid[idx2[2]], iq1s_grid[idx2[1]], iq1s_grid[idx2[0]]); - - // Convert signs to bytes 0x81 (negative) or 0x01 (positive) - const uint64_t delta_sign = _pdep_u64(*(const uint32_t*)(qh) & 0x88888888, 0xf0f0f0f0f0f0f0f0ULL); - const __m256i delta1 = _mm256_or_si256(mone8, _mm256_cvtepi8_epi64(_mm_set1_epi32(delta_sign))); - const __m256i delta2 = _mm256_or_si256(mone8, _mm256_cvtepi8_epi64(_mm_set1_epi32(delta_sign >> 32))); -#else - const __m256i q1b_1 = _mm256_set_epi64x( - iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)], - iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)] - ); - const __m256i q1b_2 = _mm256_set_epi64x( - iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)], - iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)] - ); - - const __m256i delta1 = _mm256_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, - qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101, - qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, - qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101); - const __m256i delta2 = _mm256_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, - qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101, - qh[2] & 0x80 ? 
0xffffffffffffffff : 0x0101010101010101, - qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101); -#endif - const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - - const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1); - const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2); - const __m256i dot3 = _mm256_maddubs_epi16(mone8, _mm256_sign_epi8(q8b_1, delta1)); - const __m256i dot4 = _mm256_maddubs_epi16(mone8, _mm256_sign_epi8(q8b_2, delta2)); - - __m256i scale1 = _mm256_shuffle_epi8(scales, scales_idx1); - __m256i scale2 = _mm256_shuffle_epi8(scales, scales_idx2); - - scales_idx1 = _mm256_add_epi8(scales_idx1, mtwo8); - scales_idx2 = _mm256_add_epi8(scales_idx2, mtwo8); - - const __m256i p1 = _mm256_madd_epi16(dot1, scale1); - const __m256i p2 = _mm256_madd_epi16(dot2, scale2); - const __m256i p3 = _mm256_madd_epi16(dot3, scale1); - const __m256i p4 = _mm256_madd_epi16(dot4, scale2); - - sumi1 = _mm256_add_epi32(sumi1, _mm256_add_epi32(p1, p2)); - sumi2 = _mm256_add_epi32(sumi2, _mm256_add_epi32(p3, p4)); - - qs += 8; qh += 4; - } - - const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16)); - - accum1 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi1), accum1); - accum2 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi2), accum2); - } - - *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2); - -#elif defined __AVX__ - const __m128i mask = _mm_set1_epi16(0x7); - const __m128i mone = _mm_set1_epi16(1); - - __m256 accum1 = _mm256_setzero_ps(); - __m256 accum2 = _mm256_setzero_ps(); - for (int i = 0; i < nb; ++i) { - - const int8_t * q8 = y[i].qs; - const uint8_t * qs = x[i].qs; - const uint8_t * qh = x[i].qh; - const uint16_t * sc = (const uint16_t *)x[i].scales; - - scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); - - __m128i sumi1_0 = _mm_setzero_si128(); - __m128i sumi1_1 = _mm_setzero_si128(); - __m128i sumi2_0 = _mm_setzero_si128(); - __m128i sumi2_1 = _mm_setzero_si128(); - for (int ib = 0; ib < QK_K/32; ib += 2) { - const __m128i q1b_1_0 = _mm_set_epi64x( - iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]); - const __m128i q1b_1_1 = _mm_set_epi64x( - iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)]); - const __m128i q1b_2_0 = _mm_set_epi64x( - iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]); - const __m128i q1b_2_1 = _mm_set_epi64x( - iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)]); - const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - - const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0); - const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1); - const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0); - const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1); - - const __m128i delta1_0 = _mm_set_epi64x(qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, - qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101); - const __m128i delta1_1 = _mm_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, - qh[1] & 0x08 ? 
0xffffffffffffffff : 0x0101010101010101); - const __m128i delta2_0 = _mm_set_epi64x(qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, - qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101); - const __m128i delta2_1 = _mm_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, - qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101); - - const __m128i dot3_0 = mul_add_epi8_sse(delta1_0, q8b_1_0); - const __m128i dot3_1 = mul_add_epi8_sse(delta1_1, q8b_1_1); - const __m128i dot4_0 = mul_add_epi8_sse(delta2_0, q8b_2_0); - const __m128i dot4_1 = mul_add_epi8_sse(delta2_1, q8b_2_1); - - __m128i scale1_0 = _mm_set1_epi16(sc[ib/2] >> 0); - __m128i scale1_1 = _mm_set1_epi16(sc[ib/2] >> 3); - __m128i scale2_0 = _mm_set1_epi16(sc[ib/2] >> 6); - __m128i scale2_1 = _mm_set1_epi16(sc[ib/2] >> 9); - - scale1_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_0, mask), 1), mone); - scale1_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_1, mask), 1), mone); - scale2_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_0, mask), 1), mone); - scale2_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_1, mask), 1), mone); - const __m128i p1_0 = _mm_madd_epi16(dot1_0, scale1_0); - const __m128i p1_1 = _mm_madd_epi16(dot1_1, scale1_1); - const __m128i p2_0 = _mm_madd_epi16(dot2_0, scale2_0); - const __m128i p2_1 = _mm_madd_epi16(dot2_1, scale2_1); - const __m128i p3_0 = _mm_madd_epi16(dot3_0, scale1_0); - const __m128i p3_1 = _mm_madd_epi16(dot3_1, scale1_1); - const __m128i p4_0 = _mm_madd_epi16(dot4_0, scale2_0); - const __m128i p4_1 = _mm_madd_epi16(dot4_1, scale2_1); - - sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0)); - sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1)); - sumi2_0 = _mm_add_epi32(sumi2_0, _mm_add_epi32(p3_0, p4_0)); - sumi2_1 = _mm_add_epi32(sumi2_1, _mm_add_epi32(p3_1, p4_1)); - - qs += 8; qh += 4; - } - - const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16)); - - accum1 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum1); - accum2 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi2_1, sumi2_0))), accum2); - } - - *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2); - -#else - - int sum1[2], sum2[2], delta[4]; - - float sumf = 0; - for (int i = 0; i < nb; i++) { - - const int8_t * q8 = y[i].qs; - const uint8_t * qs = x[i].qs; - const uint8_t * qh = x[i].qh; - const uint16_t * sc = (const uint16_t *)x[i].scales; - - scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); - - int sumi1 = 0, sumi2 = 0; - for (int ib = 0; ib < QK_K/32; ++ib) { - delta[0] = qh[0] & 0x08 ? -1 : 1; - delta[1] = qh[0] & 0x80 ? -1 : 1; - delta[2] = qh[1] & 0x08 ? -1 : 1; - delta[3] = qh[1] & 0x80 ? 
-1 : 1; - sum1[0] = sum1[1] = sum2[0] = sum2[1] = 0; - for (int l = 0; l < 4; ++l) { - const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((uint16_t)qh[l/2] << (8 - 4*(l%2))) & 0x700))); - int lsum1 = 0, lsum2 = 0; - for (int j = 0; j < 8; ++j) { - lsum1 += q8[j] * grid[j]; - lsum2 += q8[j]; - } - q8 += 8; - sum1[l/2] += lsum1; - sum2[l/2] += lsum2*delta[l]; - } - - const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1; - const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1; - - sumi1 += sum1[0] * ls1 + sum1[1] * ls2; - sumi2 += sum2[0] * ls1 + sum2[1] * ls2; - qs += 4; - qh += 2; - } - - sumf += GGML_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2); - } - - *s = sumf; - -#endif -} - -void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - assert(n % QK4_NL == 0); - static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same"); - - const block_iq4_nl * GGML_RESTRICT x = vx; - const block_q8_0 * GGML_RESTRICT y = vy; - - const int nb = n / QK4_NL; - - int ib = 0; - float sumf = 0; - -#if defined __ARM_NEON - const int8x16_t values = vld1q_s8(kvalues_iq4nl); - const uint8x16_t m4b = vdupq_n_u8(0x0f); - uint8x16x2_t q4bits; - int8x16x4_t q4b; - int8x16x4_t q8b; - int32x4_t prod_1, prod_2; - - for (; ib + 1 < nb; ib += 2) { - - q4bits.val[0] = vld1q_u8(x[ib + 0].qs); - q4bits.val[1] = vld1q_u8(x[ib + 1].qs); - q8b.val[0] = vld1q_s8(y[ib + 0].qs); - q8b.val[1] = vld1q_s8(y[ib + 0].qs + 16); - q8b.val[2] = vld1q_s8(y[ib + 1].qs); - q8b.val[3] = vld1q_s8(y[ib + 1].qs + 16); - - q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[0], m4b)); - q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4)); - q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[1], m4b)); - q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4)); - - prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]); - prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]); - - sumf += - GGML_FP16_TO_FP32(x[ib+0].d) * GGML_FP16_TO_FP32(y[ib + 0].d) * vaddvq_s32(prod_1) + - GGML_FP16_TO_FP32(x[ib+1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) * vaddvq_s32(prod_2); - } - -#elif defined __AVX2__ - - const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl); - const __m128i m4b = _mm_set1_epi8(0x0f); - const __m256i mone = _mm256_set1_epi16(1); - - __m256 accum1 = _mm256_setzero_ps(); - __m256 accum2 = _mm256_setzero_ps(); - for (; ib + 1 < nb; ib += 2) { - const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[ib + 0].qs); - const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[ib + 1].qs); - const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[ib + 0].qs); - const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[ib + 1].qs); - const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)), - _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b))); - const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)), - _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b))); - const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1); - const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2); - const __m256i p_1 = _mm256_madd_epi16(p16_1, mone); - const __m256i p_2 = 
_mm256_madd_epi16(p16_2, mone); - accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)), - _mm256_cvtepi32_ps(p_1), accum1); - accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)), - _mm256_cvtepi32_ps(p_2), accum2); - } - - sumf = hsum_float_8(_mm256_add_ps(accum1, accum2)); - -#elif defined __AVX__ - const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl); - const __m128i m4b = _mm_set1_epi8(0x0f); - - __m256 accum = _mm256_setzero_ps(); - for (; ib + 1 < nb; ib += 2) { - const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs); - const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs); - const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs); - const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1); - const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs); - const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1); - - const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)); - const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)); - const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)); - const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)); - - const __m256 p = mul_sum_i8_quad_float(q4b_1_0, q4b_1_1, q4b_2_0, q4b_2_1, q8b_1_0, q8b_1_1, q8b_2_0, q8b_2_1); - const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d); - accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum); - } - - sumf = hsum_float_8(accum); - -#elif defined(__POWER9_VECTOR__) - const vector signed char lowMask = vec_splats((signed char)0xF); - const vector signed int v0 = vec_splats((int32_t)0); - const vector unsigned char v4 = vec_splats((unsigned char)0x4); - - vector float vsumf0 = vec_splats(0.0f); - vector float vsumf1 = vec_splats(0.0f); - - const vector signed char values = vec_xl( 0, kvalues_iq4nl); - -#pragma GCC unroll 4 - for (; ib < nb; ++ib) { - __builtin_prefetch(x[ib].qs, 0, 1); - __builtin_prefetch(y[ib].qs, 0, 1); - - - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d)); - vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d)); - vector float vd = vec_mul(vxd, vyd); - - vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs); - vector signed char q4x0 = vec_and(qxs, lowMask); - vector signed char q4x1 = vec_sr(qxs, v4); - - q4x0 = vec_perm(values, values, (vector unsigned char)q4x0); - q4x1 = vec_perm(values, values, (vector unsigned char)q4x1); - - vector signed char q8y0 = vec_xl( 0, y[ib].qs); - vector signed char q8y1 = vec_xl(16, y[ib].qs); - - vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0)); - vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1)); - - vector signed int vsumi0 = v0; - vector signed int vsumi1 = v0; - - vsumi0 = vec_sum4s(qv0, vsumi0); - vsumi1 = vec_sum4s(qv1, vsumi1); - - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); - vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); - } - - vsumf0 = vec_add(vsumf0, vsumf1); - - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - - sumf = vec_extract(vsumf0, 0); - -#elif defined (__loongarch_asx) - - const __m128i values128 = __lsx_vld((const __m128i*)kvalues_iq4nl, 0); - const __m128i m4b = __lsx_vreplgr2vr_b(0x0f); - 
const __m256i mone = __lasx_xvreplgr2vr_h(1); - - __m256 accum1 = (__m256)__lasx_xvldi(0); - __m256 accum2 = (__m256)__lasx_xvldi(0); - for (; ib + 1 < nb; ib += 2) { - const __m128i q4bits_1 = __lsx_vld((const __m128i*)x[ib + 0].qs, 0); - const __m128i q4bits_2 = __lsx_vld((const __m128i*)x[ib + 1].qs, 0); - const __m256i q8b_1 = __lasx_xvld((const __m256i *)y[ib + 0].qs, 0); - const __m256i q8b_2 = __lasx_xvld((const __m256i *)y[ib + 1].qs, 0); - const __m256i q4b_1 = lasx_insertf128(lsx_shuffle_b(values128, __lsx_vand_v(__lsx_vsrli_h(q4bits_1, 4), m4b)), - lsx_shuffle_b(values128, __lsx_vand_v(q4bits_1, m4b))); - const __m256i q4b_2 = lasx_insertf128(lsx_shuffle_b(values128, __lsx_vand_v(__lsx_vsrli_h(q4bits_2, 4), m4b)), - lsx_shuffle_b(values128, __lsx_vand_v(q4bits_2, m4b))); - const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1); - const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2); - const __m256i p_1 = lasx_madd_h(p16_1, mone); - const __m256i p_2 = lasx_madd_h(p16_2, mone); - accum1 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)), - __lasx_xvffint_s_w(p_1), accum1); - accum2 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)), - __lasx_xvffint_s_w(p_2), accum2); - } - - sumf = hsum_float_8(__lasx_xvfadd_s(accum1, accum2)); - -#elif defined(__VXE__) || defined(__VXE2__) - const int8x16_t v_k = vec_xl(0, kvalues_iq4nl); - const uint8x16_t v_m = vec_splat_u8(0x0F); - - for (; ib < nb; ++ib) { - const block_iq4_nl * GGML_RESTRICT x0 = &x[ib]; - const block_q8_0 * GGML_RESTRICT y0 = &y[ib]; - - const uint8x16_t v_x = vec_xl(0, x0->qs); - int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m); - int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4); - - v_xl = vec_perm(v_k, v_k, (uchar8x16_t)v_xl); - v_xh = vec_perm(v_k, v_k, (uchar8x16_t)v_xh); - - const int8x16_t v_yl = vec_xl(0 , y0->qs); - const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs); - const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh); - - sumf += GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d) * (v_xy[0] + v_xy[1] + v_xy[2] + v_xy[3]); - } -#endif - for (; ib < nb; ++ib) { - const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d); - int sumi1 = 0, sumi2 = 0; - for (int j = 0; j < QK4_NL/2; ++j) { - sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf]; - sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4]; - } - sumf += d * (sumi1 + sumi2); - } - *s = sumf; -} - -void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - assert(n % QK_K == 0); - - const block_iq4_xs * GGML_RESTRICT x = vx; - const block_q8_K * GGML_RESTRICT y = vy; - - const int nb = n / QK_K; - -#if defined __ARM_NEON - const int8x16_t values = vld1q_s8(kvalues_iq4nl); - const uint8x16_t m4b = vdupq_n_u8(0x0f); - ggml_uint8x16x2_t q4bits; - ggml_int8x16x4_t q4b; - ggml_int8x16x4_t q8b; - int32x4_t prod_1, prod_2; - - float sumf = 0; - - for (int ibl = 0; ibl < nb; ++ibl) { - - const int8_t * q8 = y[ibl].qs; - const uint8_t * q4 = x[ibl].qs; - uint16_t h = x[ibl].scales_h; - - int sumi1 = 0, sumi2 = 0; - for (int ib = 0; ib < QK_K/64; ++ib) { - - q4bits = ggml_vld1q_u8_x2(q4); q4 += 32; - q8b = ggml_vld1q_s8_x4(q8); q8 += 64; - - q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[0], m4b)); - q4b.val[1] = 
ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4)); - q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[1], m4b)); - q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4)); - - prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]); - prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]); - - int ls1 = ((x[ibl].scales_l[ib] & 0xf) | ((h << 4) & 0x30)) - 32; - int ls2 = ((x[ibl].scales_l[ib] >> 4) | ((h << 2) & 0x30)) - 32; - h >>= 4; - sumi1 += vaddvq_s32(prod_1) * ls1; - sumi2 += vaddvq_s32(prod_2) * ls2; - - } - - sumf += GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2); - } - - *s = sumf; - -#elif defined __AVX2__ - - const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl); - const __m128i m4b = _mm_set1_epi8(0x0f); - - __m256 accum = _mm256_setzero_ps(); - for (int ibl = 0; ibl < nb; ++ibl) { - const uint8_t * qs = x[ibl].qs; - const int8_t * q8 = y[ibl].qs; - uint16_t sh = x[ibl].scales_h; - __m256i sumi1 = _mm256_setzero_si256(); - __m256i sumi2 = _mm256_setzero_si256(); - for (int ib = 0; ib < QK_K/32; ib += 2) { - const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)qs); qs += 16; - const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)qs); qs += 16; - const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; - const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; - const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)), - _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b))); - const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)), - _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b))); - const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1); - const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2); - const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32; - const int16_t ls2 = ((x[ibl].scales_l[ib/2] >> 4) | ((sh << 2) & 0x30)) - 32; - sh >>= 4; - const __m256i p_1 = _mm256_madd_epi16(p16_1, _mm256_set1_epi16(ls1)); - const __m256i p_2 = _mm256_madd_epi16(p16_2, _mm256_set1_epi16(ls2)); - sumi1 = _mm256_add_epi32(p_1, sumi1); - sumi2 = _mm256_add_epi32(p_2, sumi2); - } - accum = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d), - _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accum); - } - - *s = hsum_float_8(accum); - -#elif defined __AVX__ - const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl); - const __m128i m4b = _mm_set1_epi8(0x0f); - - __m256 accum = _mm256_setzero_ps(); - for (int ibl = 0; ibl < nb; ++ibl) { - const uint8_t * qs = x[ibl].qs; - const int8_t * q8 = y[ibl].qs; - uint16_t sh = x[ibl].scales_h; - __m128i sumi1_0 = _mm_setzero_si128(); - __m128i sumi1_1 = _mm_setzero_si128(); - __m128i sumi2_0 = _mm_setzero_si128(); - __m128i sumi2_1 = _mm_setzero_si128(); - for (int ib = 0; ib < QK_K/32; ib += 2) { - const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)qs); qs += 16; - const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)qs); qs += 16; - const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, 
_mm_and_si128(q4bits_1, m4b)); - const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)); - const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)); - const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)); - const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0); - const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1); - const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0); - const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1); - const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32; - const int16_t ls2 = ((x[ibl].scales_l[ib/2] >> 4) | ((sh << 2) & 0x30)) - 32; - sh >>= 4; - const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, _mm_set1_epi16(ls1)); - const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, _mm_set1_epi16(ls1)); - const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, _mm_set1_epi16(ls2)); - const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, _mm_set1_epi16(ls2)); - sumi1_0 = _mm_add_epi32(p_1_0, sumi1_0); - sumi1_1 = _mm_add_epi32(p_1_1, sumi1_1); - sumi2_0 = _mm_add_epi32(p_2_0, sumi2_0); - sumi2_1 = _mm_add_epi32(p_2_1, sumi2_1); - } - __m128i sumi12_0 = _mm_add_epi32(sumi1_0, sumi2_0); - __m128i sumi12_1 = _mm_add_epi32(sumi1_1, sumi2_1); - accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d), - _mm256_cvtepi32_ps(MM256_SET_M128I(sumi12_1, sumi12_0))), accum); - } - - *s = hsum_float_8(accum); - -#elif defined(__POWER9_VECTOR__) - const vector signed char lowMask = vec_splats((signed char)0xF); - const vector int v0 = vec_splats((int32_t)0); - const vector unsigned char v4 = vec_splats((unsigned char)0x4); - - vector float vsumf0 = vec_splats(0.0f); - vector float vsumf1 = vec_splats(0.0f); - vector float vsumf2 = vec_splats(0.0f); - vector float vsumf3 = vec_splats(0.0f); - - const vector signed char values = vec_xl( 0, kvalues_iq4nl); - - for (int ibl = 0; ibl < nb; ++ibl) { - - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ibl].d)); - vector float vyd = vec_splats(y[ibl].d); - vector float vd = vec_mul(vxd, vyd); - - vector signed int vsumi0 = v0; - vector signed int vsumi1 = v0; - vector signed int vsumi2 = v0; - vector signed int vsumi3 = v0; - - uint16_t h = x[ibl].scales_h; - - const uint8_t * GGML_RESTRICT q4 = x[ibl].qs; - const uint8_t * GGML_RESTRICT sc = x[ibl].scales_l; - const int8_t * GGML_RESTRICT q8 = y[ibl].qs; - - for (int ib = 0; ib < QK_K/64; ib ++ ) { - __builtin_prefetch(q4, 0, 1); - __builtin_prefetch(q8, 0, 1); - - vector signed char qxs0 = (vector signed char)vec_xl( 0, q4); - vector signed char qxs1 = (vector signed char)vec_xl(16, q4); - q4 += 32; - - vector signed char q4x00 = (vector signed char)vec_and(qxs0, lowMask); - vector signed char q4x01 = (vector signed char)vec_sr(qxs0, v4); - vector signed char q4x10 = (vector signed char)vec_and(qxs1, lowMask); - vector signed char q4x11 = (vector signed char)vec_sr(qxs1, v4); - - q4x00 = vec_perm(values, values, (vector unsigned char)q4x00); - q4x01 = vec_perm(values, values, (vector unsigned char)q4x01); - q4x10 = vec_perm(values, values, (vector unsigned char)q4x10); - q4x11 = vec_perm(values, values, (vector unsigned char)q4x11); - - vector signed char q8y0 = vec_xl( 0, q8); - vector signed char q8y1 = vec_xl(16, q8); - vector signed char q8y2 = vec_xl(32, q8); - vector signed char q8y3 = vec_xl(48, q8); - q8 += 64; - - vector signed short qv0 = vec_add(vec_mule(q4x00, q8y0), vec_mulo(q4x00, q8y0)); - vector signed short 
qv1 = vec_add(vec_mule(q4x01, q8y1), vec_mulo(q4x01, q8y1)); - vector signed short qv2 = vec_add(vec_mule(q4x10, q8y2), vec_mulo(q4x10, q8y2)); - vector signed short qv3 = vec_add(vec_mule(q4x11, q8y3), vec_mulo(q4x11, q8y3)); - - const uint16_t ls0 = (uint16_t)(((sc[0] & 0xf) | ((h << 4) & 0x30)) - 32); - const uint16_t ls1 = (uint16_t)(((sc[0] >> 4) | ((h << 2) & 0x30)) - 32); - h >>= 4; - sc ++; - - vector signed short vscales01 = vec_splats((int16_t)ls0); - vector signed short vscales23 = vec_splats((int16_t)ls1); - - vsumi0 = vec_msum(qv0, vscales01, vsumi0); - vsumi1 = vec_msum(qv1, vscales01, vsumi1); - vsumi2 = vec_msum(qv2, vscales23, vsumi2); - vsumi3 = vec_msum(qv3, vscales23, vsumi3); - } - - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); - vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); - vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); - vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); - } - - vsumf0 = vec_add(vsumf0, vsumf2); - vsumf1 = vec_add(vsumf1, vsumf3); - - vsumf0 = vec_add(vsumf0, vsumf1); - - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - - *s = vec_extract(vsumf0, 0); - -#elif defined(__loongarch_asx) - - const __m128i values128 = __lsx_vld((const __m128i*)kvalues_iq4nl, 0); - - __m256 accum = (__m256)__lasx_xvldi(0); - - for (int ibl = 0; ibl < nb; ++ibl) { - const uint8_t * qs = x[ibl].qs; - const int8_t * q8 = y[ibl].qs; - uint16_t sh = x[ibl].scales_h; - __m256i sumi1 = __lasx_xvldi(0); - __m256i sumi2 = __lasx_xvldi(0); - for (int ib = 0; ib < QK_K/32; ib += 2) { - const __m128i q4bits_1 = __lsx_vld((const __m128i*)qs, 0); qs += 16; - const __m128i q4bits_2 = __lsx_vld((const __m128i*)qs, 0); qs += 16; - const __m256i q8b_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; - const __m256i q8b_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; - const __m256i q4b_1 = lasx_insertf128(__lsx_vshuf_b(values128, values128, __lsx_vsrli_b(q4bits_1, 4)), - __lsx_vshuf_b(values128, values128, __lsx_vandi_b(q4bits_1, 0xf))); - const __m256i q4b_2 = lasx_insertf128(__lsx_vshuf_b(values128, values128, __lsx_vsrli_b(q4bits_2, 4)), - __lsx_vshuf_b(values128, values128, __lsx_vandi_b(q4bits_2, 0xf))); - const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1); - const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2); - const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32; - const int16_t ls2 = ((x[ibl].scales_l[ib/2] >> 4) | ((sh << 2) & 0x30)) - 32; - sh >>= 4; - const __m256i p_1 = lasx_madd_h(p16_1, __lasx_xvreplgr2vr_h(ls1)); - const __m256i p_2 = lasx_madd_h(p16_2, __lasx_xvreplgr2vr_h(ls2)); - sumi1 = __lasx_xvadd_w(p_1, sumi1); - sumi2 = __lasx_xvadd_w(p_2, sumi2); - } - accum = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d), - __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accum); - } - - *s = hsum_float_8(accum); -#elif defined(__VXE__) || defined(__VXE2__) - const int8x16_t v_k = vec_xl(0, kvalues_iq4nl); - const uint8x16_t v_m = vec_splat_u8(0x0F); - - float sumf = 0; - - for (int ibl = 0; ibl < nb; ++ibl) { - const uint8_t * GGML_RESTRICT q4 = x[ibl].qs; - const int8_t * GGML_RESTRICT q8 = y[ibl].qs; - - uint16_t h = x[ibl].scales_h; - - int sumi1 = 0, sumi2 = 0; - for (int ib = 0; ib < QK_K/64; ++ib) { - const uint8x16_t v_x0 = vec_xl(0 , q4); - const uint8x16_t v_x1 = vec_xl(QK4_NL/2, q4); - q4 += 32; - - int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m); - int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4); - int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, 
v_m); - int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4); - - v_x0l = vec_perm(v_k, v_k, (uchar8x16_t)v_x0l); - v_x0h = vec_perm(v_k, v_k, (uchar8x16_t)v_x0h); - v_x1l = vec_perm(v_k, v_k, (uchar8x16_t)v_x1l); - v_x1h = vec_perm(v_k, v_k, (uchar8x16_t)v_x1h); - - const int8x16_t v_y0 = vec_xl( 0, q8); - const int8x16_t v_y1 = vec_xl(16, q8); - const int8x16_t v_y2 = vec_xl(32, q8); - const int8x16_t v_y3 = vec_xl(48, q8); - q8 += 64; - - int32x4_t vsumi0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0l, v_y0), v_x0h, v_y1); - int32x4_t vsumi1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1l, v_y2), v_x1h, v_y3); - - int ls1 = ((x[ibl].scales_l[ib] & 0xF) | ((h << 4) & 0x30)) - 32; - int ls2 = ((x[ibl].scales_l[ib] >> 4) | ((h << 2) & 0x30)) - 32; - - h >>= 4; - - sumi1 += (vsumi0[0] + vsumi0[1] + vsumi0[2] + vsumi0[3]) * ls1; - sumi2 += (vsumi1[0] + vsumi1[1] + vsumi1[2] + vsumi1[3]) * ls2; - } - - sumf += GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2); - } - - *s = sumf; - -#else - float sumf = 0; - for (int ibl = 0; ibl < nb; ++ibl) { - const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d; - uint16_t h = x[ibl].scales_h; - const uint8_t * qs = x[ibl].qs; - const int8_t * q8 = y[ibl].qs; - for (int ib = 0; ib < QK_K/32; ib += 2) { - const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30); - const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30); - h >>= 4; - const float d1 = d4d8*(ls1 - 32); - const float d2 = d4d8*(ls2 - 32); - int sumi1 = 0, sumi2 = 0; - for (int j = 0; j < 16; ++j) { - sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf]; - sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4]; - } - sumf += d1 * (sumi1 + sumi2); - qs += 16; - q8 += 32; - sumi1 = sumi2 = 0; - for (int j = 0; j < 16; ++j) { - sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf]; - sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4]; - } - sumf += d2 * (sumi1 + sumi2); - qs += 16; - q8 += 32; - } - } - *s = sumf; -#endif -} - -// ============================ 4-bit non-linear quants - -void quantize_row_iq4_nl(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { - assert(k % QK4_NL == 0); - quantize_row_iq4_nl_ref(x, y, k); -} - -void quantize_row_iq4_xs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { - assert(k % QK_K == 0); - quantize_iq4_xs(x, y, 1, k, NULL); -} diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index c7426df2b851b..7cae96f4b4885 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -3,11 +3,11 @@ #include "ggml-backend-impl.h" #include "ggml-backend.h" -#include "ggml-cpu-traits.h" +#include "traits.h" #include "ggml-cpu-impl.h" #include "ggml-cpu.h" #include "ggml-impl.h" -#include "ggml-cpu-quants.h" +#include "quants.h" #include "ggml-threading.h" #include "unary-ops.h" #include "binary-ops.h" @@ -72,15 +72,13 @@ #define UNUSED GGML_UNUSED #define SWAP(x, y, T) do { T SWAP = x; (x) = y; (y) = SWAP; } while (0) +// precomputed f32 table for f16 (256 KB) (simd-mappings.h) +float ggml_table_f32_f16[1 << 16]; + #if defined(__ARM_ARCH) struct ggml_arm_arch_features_type { - int has_neon; - int has_dotprod; - int has_i8mm; - int has_sve; int sve_cnt; - int has_sme; -} ggml_arm_arch_features = {-1, -1, -1, -1, 0, -1}; +} ggml_arm_arch_features = { 0 }; #endif @@ -559,6 +557,14 @@ void ggml_barrier(struct ggml_threadpool * tp) { #endif } +void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value) { + atomic_store_explicit(&tp->current_chunk, value, memory_order_relaxed); +} + +int 
ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value) { + return atomic_fetch_add_explicit(&tp->current_chunk, value, memory_order_relaxed); +} + #if defined(__gnu_linux__) static cpu_set_t ggml_get_numa_affinity(void) { cpu_set_t cpuset; @@ -670,87 +676,15 @@ bool ggml_is_numa(void) { #if defined(__linux__) && defined(__aarch64__) #include -#elif defined(__APPLE__) -#include -#endif - -#if !defined(HWCAP2_I8MM) -#define HWCAP2_I8MM (1 << 13) -#endif - -#if !defined(HWCAP2_SME) -#define HWCAP2_SME (1 << 23) #endif static void ggml_init_arm_arch_features(void) { -#if defined(__linux__) && defined(__aarch64__) - uint32_t hwcap = getauxval(AT_HWCAP); - uint32_t hwcap2 = getauxval(AT_HWCAP2); - - ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD); - ggml_arm_arch_features.has_dotprod = !!(hwcap & HWCAP_ASIMDDP); - ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM); - ggml_arm_arch_features.has_sve = !!(hwcap & HWCAP_SVE); - ggml_arm_arch_features.has_sme = !!(hwcap2 & HWCAP2_SME); - -#if defined(__ARM_FEATURE_SVE) +#if defined(__linux__) && defined(__aarch64__) && defined(__ARM_FEATURE_SVE) ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL); #endif -#elif defined(__APPLE__) - int oldp = 0; - size_t size = sizeof(oldp); - if (sysctlbyname("hw.optional.AdvSIMD", &oldp, &size, NULL, 0) != 0) { - oldp = 0; - } - ggml_arm_arch_features.has_neon = oldp; - - if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) != 0) { - oldp = 0; - } - ggml_arm_arch_features.has_dotprod = oldp; - - if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) { - oldp = 0; - } - ggml_arm_arch_features.has_i8mm = oldp; - - if (sysctlbyname("hw.optional.arm.FEAT_SME", &oldp, &size, NULL, 0) != 0) { - oldp = 0; - } - ggml_arm_arch_features.has_sme = oldp; - - ggml_arm_arch_features.has_sve = 0; - ggml_arm_arch_features.sve_cnt = 0; -#else -// Run-time CPU feature detection not implemented for this platform, fallback to compile time -#if defined(__ARM_NEON) - ggml_arm_arch_features.has_neon = 1; -#else - ggml_arm_arch_features.has_neon = 0; -#endif - -#if defined(__ARM_FEATURE_MATMUL_INT8) - ggml_arm_arch_features.has_i8mm = 1; -#else - ggml_arm_arch_features.has_i8mm = 0; -#endif - -#if defined(__ARM_FEATURE_SVE) - ggml_arm_arch_features.has_sve = 1; - ggml_arm_arch_features.sve_cnt = 16; -#else - ggml_arm_arch_features.has_sve = 0; - ggml_arm_arch_features.sve_cnt = 0; -#endif - -#if defined(__ARM_FEATURE_SME) || defined(__ARM_FEATURE_SME2) - ggml_arm_arch_features.has_sme = 1; -#else - ggml_arm_arch_features.has_sme = 0; -#endif -#endif } -#endif + +#endif // __ARM_ARCH struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) { GGML_ASSERT(!ggml_get_no_alloc(ctx)); @@ -805,7 +739,7 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) { { assert(tensor->nb[0] == sizeof(ggml_fp16_t)); for (int i = 0; i < n; i++) { - ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value)); + ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_CPU_FP32_TO_FP16(value)); } } break; case GGML_TYPE_BF16: @@ -864,7 +798,7 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) { { assert(tensor->nb[0] == sizeof(ggml_fp16_t)); for (int i = 0; i < n; i++) { - ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value)); + ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_CPU_FP32_TO_FP16(value)); } } break; case GGML_TYPE_BF16: @@ -915,7 +849,7 @@ 
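// ---------------------------------------------------------------------------
// A freestanding sketch (hypothetical names, not ggml's code) of the idea
// behind the ggml_table_f32_f16 declaration and the GGML_CPU_FP16_TO_FP32 /
// GGML_CPU_FP32_TO_FP16 conversions in the surrounding hunks: fp16 has only
// 2^16 bit patterns, so filling a 256 KB float table once turns every later
// f16 -> f32 conversion into a single indexed load.
#include <stdint.h>

static float example_table_f32_f16[1 << 16];    // 65536 floats = 256 KB

// portable bitwise IEEE 754 binary16 -> binary32 conversion
static float example_fp16_to_fp32(uint16_t h) {
    const uint32_t sign = (uint32_t)(h & 0x8000) << 16;
    const uint32_t exp  = (h >> 10) & 0x1f;
    const uint32_t mant = h & 0x3ff;
    union { uint32_t u; float f; } out;
    if (exp == 0) {                        // zero or subnormal
        out.f = (float) mant * (1.0f / 16777216.0f);   // mant * 2^-24
        out.u |= sign;
    } else if (exp == 31) {                // infinity or NaN
        out.u = sign | 0x7f800000u | (mant << 13);
    } else {                               // normal: rebias exponent 15 -> 127
        out.u = sign | ((exp + 112) << 23) | (mant << 13);
    }
    return out.f;
}

static void example_init_table(void) {     // one-time init, e.g. at startup
    for (uint32_t i = 0; i < (1u << 16); ++i) {
        example_table_f32_f16[i] = example_fp16_to_fp32((uint16_t) i);
    }
}

static inline float example_lookup(uint16_t h) {
    return example_table_f32_f16[h];       // conversion is now just a load
}
// ---------------------------------------------------------------------------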
int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) { case GGML_TYPE_F16: { GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); - return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); + return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); } case GGML_TYPE_BF16: { @@ -960,7 +894,7 @@ void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) { case GGML_TYPE_F16: { GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); - ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value); + ((ggml_fp16_t *)(tensor->data))[i] = GGML_CPU_FP32_TO_FP16(value); } break; case GGML_TYPE_BF16: { @@ -989,7 +923,7 @@ int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i case GGML_TYPE_I32: return ((int32_t *) data)[0]; case GGML_TYPE_F16: - return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]); + return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *) data)[0]); case GGML_TYPE_BF16: return GGML_BF16_TO_FP32(((ggml_bf16_t *) data)[0]); case GGML_TYPE_F32: @@ -1016,7 +950,7 @@ void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, } break; case GGML_TYPE_F16: { - ((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value); + ((ggml_fp16_t *)(data))[0] = GGML_CPU_FP32_TO_FP16(value); } break; case GGML_TYPE_BF16: { @@ -1054,7 +988,7 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) { } case GGML_TYPE_F16: { - return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); + return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); } case GGML_TYPE_BF16: { @@ -1093,7 +1027,7 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) { } break; case GGML_TYPE_F16: { - ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value); + ((ggml_fp16_t *)(tensor->data))[i] = GGML_CPU_FP32_TO_FP16(value); } break; case GGML_TYPE_BF16: { @@ -1120,7 +1054,7 @@ float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, case GGML_TYPE_I32: return ((int32_t *) data)[0]; case GGML_TYPE_F16: - return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]); + return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *) data)[0]); case GGML_TYPE_BF16: return GGML_BF16_TO_FP32(((ggml_bf16_t *) data)[0]); case GGML_TYPE_F32: @@ -1147,7 +1081,7 @@ void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, } break; case GGML_TYPE_F16: { - ((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value); + ((ggml_fp16_t *)(data))[0] = GGML_CPU_FP32_TO_FP16(value); } break; case GGML_TYPE_BF16: { @@ -1959,6 +1893,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_pad_reflect_1d(params, tensor); } break; + case GGML_OP_ROLL: + { + ggml_compute_forward_roll(params, tensor); + } break; case GGML_OP_ARANGE: { ggml_compute_forward_arange(params, tensor); @@ -2283,6 +2221,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_OP_UPSCALE: case GGML_OP_PAD: case GGML_OP_PAD_REFLECT_1D: + case GGML_OP_ROLL: case GGML_OP_ARANGE: case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_ARGSORT: @@ -3205,9 +3144,24 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) { __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); _mm_storel_epi64((__m128i *)(y + i), y_vec); } +#elif defined(__NNPA__) + for (; i + 7 < n; i += 8) { + float32x4_t v_xh = vec_xl(0, (const float *)(x + i + 0)); + float32x4_t v_xl = vec_xl(0, (const float *)(x + i + 4)); + uint16x8_t v_yd = vec_round_from_fp32(v_xh, v_xl, 0); + 
uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0); + vec_xst(v_y, 0, (ggml_fp16_t *)(y + i)); + } + for (; i + 3 < n; i += 4) { + float32x4_t v_x = vec_xl(0, (const float *)(x + i)); + float32x4_t v_zero = vec_splats(0.0f); + uint16x8_t v_yd = vec_round_from_fp32(v_x, v_zero, 0); + uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0); + vec_xst(v_y, 0, (ggml_fp16_t *)(y + i)); + } #endif for (; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(x[i]); + y[i] = GGML_CPU_FP32_TO_FP16(x[i]); } } @@ -3231,9 +3185,25 @@ void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) { __m128 y_vec = _mm_cvtph_ps(x_vec); _mm_storeu_ps(y + i, y_vec); } +#elif defined(__NNPA__) + for (; i + 7 < n; i += 8) { + uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i)); + uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0); + float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0); + float32x4_t v_yl = vec_extend_to_fp32_lo(v_yd, 0); + vec_xst(v_yh, 0, (float *)(y + i + 0)); + vec_xst(v_yl, 0, (float *)(y + i + 4)); + } + for (; i + 3 < n; i += 4) { + uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i)); + uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0); + float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0); + vec_xst(v_yh, 0, (float *)(y + i)); + } #endif + for (; i < n; ++i) { - y[i] = GGML_FP16_TO_FP32(x[i]); + y[i] = GGML_CPU_FP16_TO_FP32(x[i]); } } @@ -3433,9 +3403,17 @@ int ggml_cpu_has_vxe(void) { #endif } +int ggml_cpu_has_nnpa(void) { +#if defined(GGML_NNPA) + return 1; +#else + return 0; +#endif +} + int ggml_cpu_has_neon(void) { #if defined(__ARM_ARCH) && defined(__ARM_NEON) - return ggml_arm_arch_features.has_neon; + return 1; #else return 0; #endif @@ -3443,7 +3421,7 @@ int ggml_cpu_has_neon(void) { int ggml_cpu_has_dotprod(void) { #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_DOTPROD) - return ggml_arm_arch_features.has_dotprod; + return 1; #else return 0; #endif @@ -3451,7 +3429,7 @@ int ggml_cpu_has_dotprod(void) { int ggml_cpu_has_sve(void) { #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE) - return ggml_arm_arch_features.has_sve; + return 1; #else return 0; #endif @@ -3459,7 +3437,7 @@ int ggml_cpu_has_sve(void) { int ggml_cpu_has_matmul_int8(void) { #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_MATMUL_INT8) - return ggml_arm_arch_features.has_i8mm; + return 1; #else return 0; #endif @@ -3475,14 +3453,14 @@ int ggml_cpu_get_sve_cnt(void) { int ggml_cpu_has_sme(void) { #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SME) - return ggml_arm_arch_features.has_sme; + return 1; #else return 0; #endif } void ggml_cpu_init(void) { - // needed to initialize f16 tables + // needed to initialize ggml_time { struct ggml_init_params params = { 0, NULL, false }; struct ggml_context * ctx = ggml_init(params); @@ -3503,9 +3481,10 @@ void ggml_cpu_init(void) { uint16_t u16; ggml_fp16_t fp16; } u = {i}; - float f = GGML_FP16_TO_FP32(u.fp16); - ggml_table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f)); - ggml_table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f)); + float f = GGML_COMPUTE_FP16_TO_FP32(u.fp16); + ggml_table_f32_f16[i] = f; + ggml_table_gelu_f16[i] = GGML_CPU_FP32_TO_FP16(ggml_gelu_f32(f)); + ggml_table_gelu_quick_f16[i] = GGML_CPU_FP32_TO_FP16(ggml_gelu_quick_f32(f)); } const uint64_t t_end = ggml_time_us(); UNUSED(t_end); diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index e013e8b416222..a98866a2d8052 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -1,8 +1,8 @@ #include "ggml-backend.h" #include 
"ggml-backend-impl.h" #include "ggml-cpu.h" -#include "ggml-cpu-aarch64.h" -#include "ggml-cpu-traits.h" +#include "repack.h" +#include "traits.h" #include "ggml-impl.h" #include "amx/amx.h" @@ -11,7 +11,7 @@ #include #ifdef GGML_USE_CPU_HBM -# include "ggml-cpu-hbm.h" +# include "hbm.h" #endif #ifdef GGML_USE_CPU_KLEIDIAI @@ -51,9 +51,9 @@ std::vector& ggml_backend_cpu_get_extra_buffers_type } #endif -#ifdef GGML_USE_CPU_AARCH64 - if (ggml_backend_cpu_aarch64_buffer_type()) { - bufts.push_back(ggml_backend_cpu_aarch64_buffer_type()); +#ifdef GGML_USE_CPU_REPACK + if (ggml_backend_cpu_repack_buffer_type()) { + bufts.push_back(ggml_backend_cpu_repack_buffer_type()); } #endif @@ -578,6 +578,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r if (ggml_cpu_has_vxe()) { features.push_back({ "VXE", "1" }); } + if (ggml_cpu_has_nnpa()) { + features.push_back({ "NNPA", "1" }); + } if (ggml_cpu_has_wasm_simd()) { features.push_back({ "WASM_SIMD", "1" }); } @@ -596,8 +599,8 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r #ifdef GGML_USE_CPU_KLEIDIAI features.push_back({ "KLEIDIAI", "1" }); #endif - #ifdef GGML_USE_CPU_AARCH64 - features.push_back({ "AARCH64_REPACK", "1" }); + #ifdef GGML_USE_CPU_REPACK + features.push_back({ "REPACK", "1" }); #endif features.push_back({ nullptr, nullptr }); diff --git a/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp b/ggml/src/ggml-cpu/hbm.cpp similarity index 98% rename from ggml/src/ggml-cpu/ggml-cpu-hbm.cpp rename to ggml/src/ggml-cpu/hbm.cpp index fa8dea2af9c72..a4073c15e6c90 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +++ b/ggml/src/ggml-cpu/hbm.cpp @@ -5,7 +5,7 @@ #include "ggml-cpu.h" #include "ggml-impl.h" -#include "ggml-cpu-hbm.h" +#include "hbm.h" // buffer type HBM diff --git a/ggml/src/ggml-cpu/ggml-cpu-hbm.h b/ggml/src/ggml-cpu/hbm.h similarity index 100% rename from ggml/src/ggml-cpu/ggml-cpu-hbm.h rename to ggml/src/ggml-cpu/hbm.h diff --git a/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp b/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp index 15f0cd1540686..fafe45e6c5c51 100644 --- a/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +++ b/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp @@ -26,7 +26,7 @@ #include "ggml-impl.h" #include "ggml-backend-impl.h" #include "ggml-threading.h" -#include "ggml-cpu-traits.h" +#include "traits.h" #include "kernels.h" diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp index 1d46158f928c4..ed61869a5508a 100644 --- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp +++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp @@ -52,8 +52,8 @@ #include "ggml-impl.h" #include "ggml-cpu-impl.h" #include "ggml-quants.h" +#include "simd-mappings.h" -#include #include #include @@ -63,7 +63,7 @@ #define NOINLINE __attribute__((__noinline__)) #endif -#if defined(__ARM_NEON) || defined(__AVX512F__) +#if defined(__ARM_NEON) || defined(__AVX512F__) || defined(__VXE__) || defined(__VXE2__) #define VECTOR_REGISTERS 32 #else #define VECTOR_REGISTERS 16 @@ -74,7 +74,7 @@ namespace { inline float unhalf(ggml_fp16_t d) { - return GGML_FP16_TO_FP32(d); + return GGML_CPU_FP16_TO_FP32(d); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -110,6 +110,12 @@ inline float16x8_t sub(float16x8_t x, float16x8_t y) { return vsubq_f16(x, y); } inline float16x8_t mul(float16x8_t x, float16x8_t y) { return vmulq_f16(x, y); } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#if defined(__VXE__) || defined(__VXE2__) +inline float32x4_t add(float32x4_t 
x, float32x4_t y) { return vec_add(x, y); }
+inline float32x4_t sub(float32x4_t x, float32x4_t y) { return vec_sub(x, y); }
+inline float32x4_t mul(float32x4_t x, float32x4_t y) { return vec_mul(x, y); }
+#endif
+
 #if defined(__MMA__)
 typedef vector unsigned char vec_t;
 typedef __vector_quad acc_t;
@@ -163,6 +169,13 @@ inline float16x8_t madd(float16x8_t a, float16x8_t b, float16x8_t c) {
 #endif
 #endif
 
+#if defined(__VXE__) || defined(__VXE2__)
+template <>
+inline float32x4_t madd(float32x4_t a, float32x4_t b, float32x4_t c) {
+    return vec_madd(a, b, c);
+}
+#endif
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // VECTORIZED HORIZONTAL SUM
 
@@ -179,6 +192,13 @@ inline float hsum(float16x8_t x) {
 }
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
+#if defined(__VXE__) || defined(__VXE2__)
+inline float hsum(float32x4_t x) {
+    float32x4_t tmp = x + vec_reve(x);
+    return tmp[0] + tmp[1];
+}
+#endif
+
 #if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
 inline float hsum(__m128 x) {
 #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
@@ -228,6 +248,21 @@ template <> inline float32x4_t load(const ggml_fp16_t *p) {
 #endif // _MSC_VER
 #endif // __ARM_NEON
 
+#if defined(__VXE__) || defined(__VXE2__)
+template <> inline float32x4_t load(const ggml_fp16_t * p) {
+    float tmp[4];
+
+    for (int i = 0; i < 4; i++) {
+        tmp[i] = GGML_CPU_FP16_TO_FP32(p[i]);
+    }
+
+    return vec_xl(0, (const float *)(tmp));
+}
+template <> inline float32x4_t load(const float * p) {
+    return vec_xl(0, p);
+}
+#endif
+
 #if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
 template <> inline __m128 load(const float *p) {
   return _mm_loadu_ps(p);
@@ -394,8 +429,6 @@ class tinyBLAS {
 
     template <int RM, int RN, int BM>
     NOINLINE void gemm(int64_t m, int64_t n, int64_t BN) {
-        static std::atomic<int64_t> current_chunk;
-
         GGML_ASSERT(m % (RM * BM) == 0);
         const int64_t ytiles = m / (RM * BM);
         const int64_t xtiles = (n + RN -1) / RN;
@@ -410,7 +443,7 @@ class tinyBLAS {
         if (params->ith == 0) {
             GGML_ASSERT( jj_BN * SIZE_BN + (NB_BN - jj_BN) * (SIZE_BN - 1) == xtiles);
             // Every thread starts at ith, so the first unprocessed chunk is nth.  This save a bit of coordination right at the start.
-            std::atomic_store_explicit(&current_chunk, (int64_t)params->nth, std::memory_order_relaxed);
+            ggml_threadpool_chunk_set(params->threadpool, params->nth);
         }
 
         ggml_barrier(params->threadpool);
@@ -439,8 +472,7 @@
                 GGML_ASSERT(jj == jj2);
             }
 
-            // next step.
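// ---------------------------------------------------------------------------
// A freestanding sketch (plain C11 atomics, hypothetical names) of the
// chunk-claiming scheme used by the gemm loop here. Thread i starts on chunk
// i, and the shared counter is seeded with nth, so the first fetch_add hands
// out chunk nth, the next nth+1, and so on until every chunk is claimed. The
// diff replaces the function-local static std::atomic with
// ggml_threadpool_chunk_set/add so the counter lives in the shared threadpool.
#include <stdatomic.h>

typedef struct {
    atomic_int current_chunk;   // stand-in for the threadpool's shared counter
} example_pool;

static void example_worker(example_pool * pool, int ith, int nth,
                           int n_chunks, void (*process)(int chunk)) {
    if (ith == 0) {
        // seed: chunks 0 .. nth-1 are implicitly claimed by the nth threads
        atomic_store_explicit(&pool->current_chunk, nth, memory_order_relaxed);
    }
    // (a barrier goes here in ggml so no thread reads the counter before the seed)

    int job = ith;
    while (job < n_chunks) {
        process(job);
        // atomically claim the next unprocessed chunk
        job = atomic_fetch_add_explicit(&pool->current_chunk, 1, memory_order_relaxed);
    }
}
// ---------------------------------------------------------------------------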
-            job = std::atomic_fetch_add_explicit(&current_chunk, (int64_t)1, std::memory_order_relaxed);
+            job = ggml_threadpool_chunk_add(params->threadpool, 1);
         }
 
         ggml_barrier(params->threadpool);
@@ -3323,6 +3355,14 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
                     (const float *)B, ldb,
                     (float *)C, ldc};
         return tb.matmul(m, n);
+#elif defined(__VXE__) || defined(__VXE2__)
+    if (n < 4)
+        return false;
+    tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{ params,
+        k, (const float *)A, lda,
+        (const float *)B, ldb,
+        (float *)C, ldc};
+    return tb.matmul(m, n);
 #elif defined(__MMA__)
     if (k % 8)
         return false;
@@ -3414,6 +3454,16 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
                     (float *)C, ldc};
         return tb.matmul(m, n);
     }
+#elif defined(__VXE__) || defined(__VXE2__)
+    if (n < 4)
+        return false;
+    if (Btype == GGML_TYPE_F16) {
+        tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, ggml_fp16_t, float> tb{ params,
+            k, (const ggml_fp16_t *)A, lda,
+            (const ggml_fp16_t *)B, ldb,
+            (float *)C, ldc};
+        return tb.matmul(m, n);
+    }
 #endif
     return false;
 }
diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.h b/ggml/src/ggml-cpu/llamafile/sgemm.h
index 3d2909515242a..729e8853d516c 100644
--- a/ggml/src/ggml-cpu/llamafile/sgemm.h
+++ b/ggml/src/ggml-cpu/llamafile/sgemm.h
@@ -1,6 +1,11 @@
 #pragma once
 #include
 #include
+
+#if defined(__VXE__) || defined(__VXE2__)
+#include
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index d8de7531b0e5f..8531baf6c57fb 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -108,7 +108,7 @@ static void ggml_compute_forward_dup_f16(
                 for (int i01 = ir0; i01 < ir1; i01++) {
                     const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
                     for (int i00 = 0; i00 < ne00; i00++) {
-                        dst_ptr[id] = GGML_FP16_TO_FP32(src0_ptr[i00]);
+                        dst_ptr[id] = GGML_CPU_FP16_TO_FP32(src0_ptr[i00]);
                         id++;
                     }
                 }
@@ -130,7 +130,7 @@ static void ggml_compute_forward_dup_f16(
 
                     const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
                     for (int i00 = 0; i00 < ne00; i00++) {
-                        src0_f32[i00] = GGML_FP16_TO_FP32(src0_ptr[i00]);
+                        src0_f32[i00] = GGML_CPU_FP16_TO_FP32(src0_ptr[i00]);
                     }
 
                     quantize_row_q(src0_f32, dst_ptr + id, ne00);
@@ -156,7 +156,7 @@ static void ggml_compute_forward_dup_f16(
                         for (int i00 = 0; i00 < ne00; i00++) {
                             const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
 
-                            dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr);
+                            dst_ptr[id] = GGML_CPU_FP16_TO_FP32(*src0_ptr);
                             id++;
                         }
                     }
@@ -267,7 +267,7 @@ static void ggml_compute_forward_dup_f16(
                   const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
                         char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
 
-                        *(float *) dst_ptr = GGML_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr);
+                        *(float *) dst_ptr = GGML_CPU_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr);
 
                         if (++i10 == ne0) {
                             i10 = 0;
@@ -372,7 +372,7 @@ static void ggml_compute_forward_dup_bf16(
                 for (int i01 = ir0; i01 < ir1; i01++) {
                     const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
                     for (int i00 = 0; i00 < ne00; i00++) {
-                        dst_ptr[id] = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(src0_ptr[i00]));
+                        dst_ptr[id] = GGML_CPU_FP32_TO_FP16(GGML_BF16_TO_FP32(src0_ptr[i00]));
                         id++;
                     }
                 }
@@ -473,7 +473,7 @@ static void
ggml_compute_forward_dup_bf16( for (int i00 = 0; i00 < ne00; i00++) { const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - dst_ptr[id] = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(*src0_ptr)); + dst_ptr[id] = GGML_CPU_FP32_TO_FP16(GGML_BF16_TO_FP32(*src0_ptr)); id++; } } @@ -566,7 +566,7 @@ static void ggml_compute_forward_dup_bf16( const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); - *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(*(const ggml_bf16_t *) src0_ptr)); + *(ggml_fp16_t *) dst_ptr = GGML_CPU_FP32_TO_FP16(GGML_BF16_TO_FP32(*(const ggml_bf16_t *) src0_ptr)); if (++i10 == ne0) { i10 = 0; @@ -765,7 +765,7 @@ static void ggml_compute_forward_dup_f32( for (int i00 = 0; i00 < ne00; i00++) { const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr); + dst_ptr[id] = GGML_CPU_FP32_TO_FP16(*src0_ptr); id++; } } @@ -878,7 +878,7 @@ static void ggml_compute_forward_dup_f32( const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); - *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(*(const float *) src0_ptr); + *(ggml_fp16_t *) dst_ptr = GGML_CPU_FP32_TO_FP16(*(const float *) src0_ptr); if (++i10 == ne0) { i10 = 0; @@ -1419,7 +1419,7 @@ static void ggml_compute_forward_add1_f16_f32( ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); for (int i = 0; i < ne0; i++) { - dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v); + dst_ptr[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(src0_ptr[i]) + v); } } } @@ -1435,7 +1435,7 @@ static void ggml_compute_forward_add1_f16_f16( GGML_ASSERT(ggml_is_scalar(src1)); // scalar to add - const float v = GGML_FP16_TO_FP32(*(ggml_fp16_t *) src1->data); + const float v = GGML_CPU_FP16_TO_FP32(*(ggml_fp16_t *) src1->data); const int ith = params->ith; const int nth = params->nth; @@ -1467,7 +1467,7 @@ static void ggml_compute_forward_add1_f16_f16( ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); for (int i = 0; i < ne0; i++) { - dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v); + dst_ptr[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(src0_ptr[i]) + v); } } } @@ -1889,7 +1889,7 @@ static void ggml_compute_forward_sum_f16( } } } - ((ggml_fp16_t *) dst->data)[0] = GGML_FP32_TO_FP16(sum); + ((ggml_fp16_t *) dst->data)[0] = GGML_CPU_FP32_TO_FP16(sum); } static void ggml_compute_forward_sum_bf16( @@ -2660,7 +2660,7 @@ static void ggml_compute_forward_gelu_f16( #ifndef NDEBUG for (int k = 0; k < nc; k++) { const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k]; - const float v = GGML_FP16_TO_FP32(x); + const float v = GGML_CPU_FP16_TO_FP32(x); GGML_UNUSED(v); assert(!isnan(v)); assert(!isinf(v)); @@ -2763,7 +2763,7 @@ static void ggml_compute_forward_gelu_erf_f16( #ifndef NDEBUG for (int k = 0; k < nc; k++) { const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k]; - const float v = GGML_FP16_TO_FP32(x); + const float v = 
GGML_CPU_FP16_TO_FP32(x); GGML_UNUSED(v); assert(!isnan(v)); assert(!isinf(v)); @@ -2866,7 +2866,7 @@ static void ggml_compute_forward_gelu_quick_f16( #ifndef NDEBUG for (int k = 0; k < nc; k++) { const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k]; - const float v = GGML_FP16_TO_FP32(x); + const float v = GGML_CPU_FP16_TO_FP32(x); GGML_UNUSED(v); assert(!isnan(v)); assert(!isinf(v)); @@ -2969,7 +2969,7 @@ static void ggml_compute_forward_silu_f16( #ifndef NDEBUG for (int k = 0; k < nc; k++) { const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])))[k]; - const float v = GGML_FP16_TO_FP32(x); + const float v = GGML_CPU_FP16_TO_FP32(x); GGML_UNUSED(v); assert(!isnan(v)); assert(!isinf(v)); @@ -3163,7 +3163,7 @@ static void ggml_compute_forward_silu_back_f16( #ifndef NDEBUG for (int k = 0; k < nc; k++) { const float x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k]; - const float v = GGML_FP16_TO_FP32(x); + const float v = GGML_CPU_FP16_TO_FP32(x); GGML_UNUSED(v); assert(!isnan(v)); assert(!isinf(v)); @@ -4500,7 +4500,7 @@ static void ggml_compute_forward_get_rows_back_f32_f16( for (int j = 0; j < nc; ++j) { ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + i*src0->nb[1]))[j]; - ((float *) ((char *) dst->data + r*dst->nb[1]))[j] += GGML_FP16_TO_FP32(v); + ((float *) ((char *) dst->data + r*dst->nb[1]))[j] += GGML_CPU_FP16_TO_FP32(v); } } } @@ -4792,7 +4792,7 @@ static void ggml_compute_forward_soft_max_f32( if (mp_f32) { if (use_f16) { for (int i = 0; i < nc; ++i) { - wp[i] += slope*GGML_FP16_TO_FP32(mp_f16[i]); + wp[i] += slope*GGML_CPU_FP16_TO_FP32(mp_f16[i]); } } else { for (int i = 0; i < nc; ++i) { @@ -5018,8 +5018,8 @@ static void ggml_compute_forward_clamp_f16( ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01); for (int i = 0; i < nc; i++) { - float v = GGML_FP16_TO_FP32(src0_ptr[i]); - dst_ptr[i] = GGML_FP32_TO_FP16(MAX(MIN(v, max), min)); + float v = GGML_CPU_FP16_TO_FP32(src0_ptr[i]); + dst_ptr[i] = GGML_CPU_FP32_TO_FP16(MAX(MIN(v, max), min)); } } } @@ -5476,11 +5476,11 @@ static void ggml_compute_forward_rope_f16( const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00); ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0); - const float x0 = GGML_FP16_TO_FP32(src[0]); - const float x1 = GGML_FP16_TO_FP32(src[n_dims]); + const float x0 = GGML_CPU_FP16_TO_FP32(src[0]); + const float x1 = GGML_CPU_FP16_TO_FP32(src[n_dims]); - dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); - dst_data[n_dims] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + dst_data[0] = GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[n_dims] = GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); } } else { for (int64_t i0 = 0; i0 < n_dims; i0 += 2) { @@ -5492,11 +5492,11 @@ static void ggml_compute_forward_rope_f16( const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00); ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0); - const float x0 = GGML_FP16_TO_FP32(src[0]); - const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]); + const float x0 = GGML_CPU_FP16_TO_FP32(src[0]); + const float x1 = GGML_CPU_FP16_TO_FP32(src[n_dims/2]); - dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); - dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + dst_data[0] = 
GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[n_dims/2] = GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); } } } else { @@ -5507,11 +5507,11 @@ static void ggml_compute_forward_rope_f16( const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - const float x0 = GGML_FP16_TO_FP32(src[0]); - const float x1 = GGML_FP16_TO_FP32(src[1]); + const float x0 = GGML_CPU_FP16_TO_FP32(src[0]); + const float x1 = GGML_CPU_FP16_TO_FP32(src[1]); - dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); - dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + dst_data[0] = GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[1] = GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); } } @@ -5525,11 +5525,11 @@ static void ggml_compute_forward_rope_f16( const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00); ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0); - const float x0 = GGML_FP16_TO_FP32(src[0]); - const float x1 = GGML_FP16_TO_FP32(src[n_dims]); + const float x0 = GGML_CPU_FP16_TO_FP32(src[0]); + const float x1 = GGML_CPU_FP16_TO_FP32(src[n_dims]); - dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); - dst_data[n_dims] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + dst_data[0] = GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[n_dims] = GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); } } else { for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) { @@ -5640,7 +5640,7 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32( for (int64_t i11 = 0; i11 < ne11; i11++) { const float * const src = (float *)((char *) src1->data + i11*nb11); for (int64_t i10 = 0; i10 < ne10; i10++) { - dst_data[i10*ne11 + i11] = GGML_FP32_TO_FP16(src[i10]); + dst_data[i10*ne11 + i11] = GGML_CPU_FP32_TO_FP16(src[i10]); } } } @@ -5933,7 +5933,7 @@ static void ggml_compute_forward_im2col_f16( if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0; } else { - dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]); + dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_CPU_FP32_TO_FP16(src_data[iih*IW + iiw]); } } } @@ -6109,7 +6109,7 @@ void ggml_compute_forward_conv_transpose_2d( const float * const src = (float *)((char *) src1->data + i12*nb12 + i11*nb11); ggml_fp16_t * dst_data = wdata + i11*ne10*ne12; for (int i10 = 0; i10 < ne10; i10++) { - dst_data[i10*ne12 + i12] = GGML_FP32_TO_FP16(src[i10]); + dst_data[i10*ne12 + i12] = GGML_CPU_FP32_TO_FP16(src[i10]); } } } @@ -6358,7 +6358,7 @@ static void ggml_compute_forward_pool_1d_sk_p0( case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); } for (int ki = 0; ki < k; ++ki) { - const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]); + const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_CPU_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]); switch (op) { case GGML_OP_POOL_AVG: drow[i] += srow_j; break; case GGML_OP_POOL_MAX: if (srow_j > drow[i]) drow[i] = srow_j; break; @@ -6450,7 +6450,7 @@ void ggml_compute_forward_pool_2d( for (int kx = 0; kx < k0; ++kx) { int j = ix + kx; if (j < 0 || j >= src->ne[0]) continue; - const float srow_j = (src->type == GGML_TYPE_F32) ? 
((const float*)srow)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]);
+                        const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_CPU_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]);
                         switch (op) {
                             case GGML_OP_POOL_AVG:                    *out += srow_j; break;
                             case GGML_OP_POOL_MAX: if (srow_j > *out) *out  = srow_j; break;
@@ -6538,7 +6538,7 @@ void ggml_compute_forward_pool_2d_back(
                 }
                 const float val = dst->type == GGML_TYPE_F32 ?
-                    ((const float *) drowf)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t *) drowf)[j]);
+                    ((const float *) drowf)[j] : GGML_CPU_FP16_TO_FP32(((const ggml_fp16_t *) drowf)[j]);
                 if (val <= maxval) {
                     continue;
                 }
@@ -6558,7 +6558,7 @@
                     if (dst->type == GGML_TYPE_F32) {
                         ((float *) drow)[j] += grad0;
                     } else {
-                        ((ggml_fp16_t *) drow)[j] = GGML_FP32_TO_FP16(grad0 + GGML_FP16_TO_FP32(((const ggml_fp16_t *) drow)[j]));
+                        ((ggml_fp16_t *) drow)[j] = GGML_CPU_FP32_TO_FP16(grad0 + GGML_CPU_FP16_TO_FP32(((const ggml_fp16_t *) drow)[j]));
                     }
                 } else if (op == GGML_OP_POOL_AVG) {
                     const float grad = grad0 / ka;
@@ -6577,7 +6577,7 @@
                             ((float *) drow)[j] += grad;
                         } else {
-                            ((ggml_fp16_t *) drow)[j] += GGML_FP32_TO_FP16(grad);
+                            ((ggml_fp16_t *) drow)[j] += GGML_CPU_FP32_TO_FP16(grad);
                         }
                     }
                 }
@@ -6793,6 +6793,73 @@ void ggml_compute_forward_pad_reflect_1d(
     }
 }
 
+// ggml_compute_forward_roll
+
+static int64_t ggml_wrap_index(int64_t i, int64_t ne) {
+    if (i < 0) {
+        return i + ne;
+    } else if (i >= ne) {
+        return i - ne;
+    }
+    return i;
+}
+
+static void ggml_compute_forward_roll_f32(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src_data = (const float *) src0->data;
+    float * dst_data = (float *) dst->data;
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    const int s0 = ggml_get_op_params_i32(dst, 0);
+    const int s1 = ggml_get_op_params_i32(dst, 1);
+    const int s2 = ggml_get_op_params_i32(dst, 2);
+    const int s3 = ggml_get_op_params_i32(dst, 3);
+
+    const int64_t total      = ne1 * ne2 * ne3;
+    const int64_t per_thread = (total + params->nth) / params->nth;
+    const int64_t start      = params->ith * per_thread;
+    const int64_t end        = std::min(start + per_thread, total);
+
+    for (int64_t i = start; i < end; ++i) {
+        const int64_t i1 = i % ne1;
+        const int64_t i2 = (i / ne1) % ne2;
+        const int64_t i3 = i / (ne2 * ne1);
+        float * dst_row = dst_data + (i3*nb3 + i2*nb2 + i1*nb1) / sizeof(float);
+
+        const int64_t i01 = ggml_wrap_index(i1 - s1, ne01);
+        const int64_t i02 = ggml_wrap_index(i2 - s2, ne02);
+        const int64_t i03 = ggml_wrap_index(i3 - s3, ne03);
+        const float * src_row = src_data + (i03*nb03 + i02*nb02 + i01*nb01) / sizeof(float);
+
+        const int64_t s = ggml_wrap_index(-s0, ne00);
+        const int64_t n = ne00 - s;
+        ggml_vec_cpy_f32(n, dst_row, src_row + s);
+        ggml_vec_cpy_f32(s, dst_row + n, src_row);
+    }
+}
+
+void ggml_compute_forward_roll(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_roll_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
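// ---------------------------------------------------------------------------
// A tiny self-contained illustration (hypothetical names) of the rotation
// performed per row by ggml_compute_forward_roll_f32 above. Since
// dst[i] = src[wrap(i - s0)], the source row splits at s = wrap(-s0, ne00)
// and is copied as two contiguous spans, which is exactly what the two
// ggml_vec_cpy_f32 calls do. Example: ne00 = 5, s0 = 2 gives s = 3, so
// {0,1,2,3,4} -> {3,4,0,1,2}.
#include <stdio.h>
#include <string.h>

// same wrapping rule as ggml_wrap_index (assumes |shift| < ne)
static long example_wrap(long i, long ne) {
    if (i < 0)   return i + ne;
    if (i >= ne) return i - ne;
    return i;
}

static void example_roll_row(const float * src, float * dst, long ne00, long s0) {
    const long s = example_wrap(-s0, ne00);       // split point in the source
    const long n = ne00 - s;
    memcpy(dst,     src + s, n * sizeof(float));  // src tail goes to dst head
    memcpy(dst + n, src,     s * sizeof(float));  // src head goes to dst tail
}

int main(void) {
    const float src[5] = {0, 1, 2, 3, 4};
    float dst[5];
    example_roll_row(src, dst, 5, 2);
    for (int i = 0; i < 5; ++i) {
        printf("%g ", dst[i]);                    // prints: 3 4 0 1 2
    }
    printf("\n");
    return 0;
}
// ---------------------------------------------------------------------------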
slope*GGML_FP16_TO_FP32(mp[ic]) : 0.0f;
+                const float mv = mp ? slope*GGML_CPU_FP16_TO_FP32(mp[ic]) : 0.0f;
                 if (mv == -INFINITY) {
                     continue;
                 }
@@ -7143,7 +7210,7 @@ static void ggml_compute_forward_flash_attn_ext_f16(
         if (v->type == GGML_TYPE_F16) {
             for (int64_t d = 0; d < DV; ++d) {
-                VKQ32[d] = GGML_FP16_TO_FP32(VKQ16[d]);
+                VKQ32[d] = GGML_CPU_FP16_TO_FP32(VKQ16[d]);
             }
         }
@@ -8132,8 +8199,8 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
     #define WKV_VECTOR_SIZE 4
 #endif
-    int wkv_vector_size;
 #ifdef WKV_VECTOR_SIZE
+    int wkv_vector_size;
     #if defined(__ARM_FEATURE_SVE)
         wkv_vector_size = svcntw();
     #else
@@ -8348,8 +8415,8 @@ static void ggml_compute_forward_gla_f32(
     #define GLA_VECTOR_SIZE 4
 #endif
-    int gla_vector_size;
 #ifdef GLA_VECTOR_SIZE
+    int gla_vector_size;
     #if defined(__ARM_FEATURE_SVE)
         gla_vector_size = svcntw();
     #else
diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h
index dc081b9e66397..2d8544d7d3d43 100644
--- a/ggml/src/ggml-cpu/ops.h
+++ b/ggml/src/ggml-cpu/ops.h
@@ -72,6 +72,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params
 void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_roll(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_arange(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
diff --git a/ggml/src/ggml-cpu/quants.c b/ggml/src/ggml-cpu/quants.c
new file mode 100644
index 0000000000000..ee35ab42fda07
--- /dev/null
+++ b/ggml/src/ggml-cpu/quants.c
@@ -0,0 +1,1158 @@
+#define GGML_COMMON_IMPL_C
+#include "ggml-common.h"
+
+#include "ggml-cpu-impl.h"
+#include "simd-mappings.h"
+#include "ggml-quants.h"
+#include "quants.h"
+
+#include "arch-fallback.h"
+
+#include <string.h>
+#include <assert.h>
+#include <float.h>
+#include <stdlib.h> // for qsort
+#include <stdio.h>  // for GGML_ASSERT
+
+#define GROUP_MAX_EPS 1e-15f
+#define GROUP_MAX_EPS_IQ3_XXS 1e-8f
+#define GROUP_MAX_EPS_IQ2_S 1e-8f
+#define GROUP_MAX_EPS_IQ1_M 1e-7f
+#define GROUP_MAX_EPS_IQ1_S 1e-12f
+
+#define UNUSED GGML_UNUSED
+
+void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
+    quantize_row_q4_0_ref(x, y, k);
+}
+
+void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
+    quantize_row_q4_1_ref(x, y, k);
+}
+
+void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
+    quantize_row_q5_0_ref(x, y, k);
+}
+
+void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
+    quantize_row_q5_1_ref(x, y, k);
+}
+
+void quantize_row_q8_0_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
+    quantize_row_q8_0_ref(x, y, k);
+}
+
+void quantize_row_q8_1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
+    quantize_row_q8_1_ref(x, y, k);
+}
+
+//
+// 2-6 bit quantization in super-blocks
+//
+
+//========================- 2-bit (de)-quantization
+
+void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
+    quantize_row_q2_K_ref(x, vy, k);
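+    // no SIMD fast path for q2_K quantization: always use the reference code
+    // (usage sketch: for a row of k floats with k % QK_K == 0:
+    //    block_q2_K dst[k/QK_K]; quantize_row_q2_K(src, dst, k);)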
+} + +//========================= 3-bit (de)-quantization + +void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + quantize_row_q3_K_ref(x, vy, k); +} + +// ====================== 4-bit (de)-quantization + +void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(k % QK_K == 0); + block_q4_K * GGML_RESTRICT y = vy; + quantize_row_q4_K_ref(x, y, k); +} + +// ====================== 5-bit (de)-quantization + +void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(k % QK_K == 0); + block_q5_K * GGML_RESTRICT y = vy; + quantize_row_q5_K_ref(x, y, k); +} + +// ====================== 6-bit (de)-quantization + +void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(k % QK_K == 0); + block_q6_K * GGML_RESTRICT y = vy; + quantize_row_q6_K_ref(x, y, k); +} + +// ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs) + +void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(k % QK_K == 0); + block_tq1_0 * GGML_RESTRICT y = vy; + quantize_row_tq1_0_ref(x, y, k); +} + +void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(k % QK_K == 0); + block_tq2_0 * GGML_RESTRICT y = vy; + quantize_row_tq2_0_ref(x, y, k); +} + +//===================================== Q8_K ============================================== + +void quantize_row_q8_K_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { + quantize_row_q8_K_ref(x, y, k); +} + +//===================================== Dot products ================================= + +void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0; + + for (; ib < nb; ++ib) { + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[ib].qs[j] & 0x0F) - 8; + const int v1 = (x[ib].qs[j] >> 4) - 8; + + sumi0 += (v0 * y[ib].qs[j]); + sumi1 += (v1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d); + } + + *s = sumf; +} + +// TODO: add WASM SIMD +void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_1; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_1 * GGML_RESTRICT x = vx; + const block_q8_1 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0; + + for (; ib < nb; ++ib) { + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[ib].qs[j] & 0x0F); + const int v1 = (x[ib].qs[j] >> 4); + + sumi0 += (v0 * y[ib].qs[j]); + sumi1 += (v1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); + } + + *s = sumf; +} + +void ggml_vec_dot_q5_0_q8_0_generic(int n, float * 
GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + int ib = 0; + float sumf = 0; + + assert(n % qk == 0); + assert(qk == QK5_0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + for (; ib < nb; ++ib) { + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); + + const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16); + const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16); + + sumi0 += (x0 * y[ib].qs[j]); + sumi1 += (x1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi; + } + + *s = sumf; +} + +void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_1; + const int nb = n / qk; + + int ib = 0; + float sumf = 0; + + assert(n % qk == 0); + assert(qk == QK5_1); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_1 * GGML_RESTRICT x = vx; + const block_q8_1 * GGML_RESTRICT y = vy; + + for (; ib < nb; ++ib) { + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + + const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0; + const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1; + + sumi0 += (x0 * y[ib].qs[j]); + sumi1 += (x1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); + } + + *s = sumf; +} + +void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q8_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0; + + for (; ib < nb; ++ib) { + int sumi = 0; + + for (int j = 0; j < qk; j++) { + sumi += x[ib].qs[j]*y[ib].qs[j]; + } + + sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)); + } + + *s = sumf; +} + +void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_tq1_0 * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + const uint8_t pow3[6] = {1, 3, 9, 27, 81, 243}; + + float sumf = 0.0f; + + for (int i = 0; i < nb; ++i) { + int sum = 0; + + for (size_t j = 0; j < sizeof(x->qs) - sizeof(x->qs) % 32; j += 32) { + for (size_t l = 0; l < 5; ++l) { + for (size_t m = 0; m < 32; ++m) { + uint8_t q = x[i].qs[j + m] * pow3[l]; + uint16_t xi = ((uint16_t) q * 3) >> 8; + sum += (xi 
- 1) * y[i].qs[j*5 + l*32 + m]; + } + } + } + for (size_t j = sizeof(x->qs) - sizeof(x->qs) % 32; j < sizeof(x->qs); j += 16) { + for (size_t l = 0; l < 5; ++l) { + for (size_t m = 0; m < 16; ++m) { + uint8_t q = x[i].qs[j + m] * pow3[l]; + uint16_t xi = ((uint16_t) q * 3) >> 8; + sum += (xi - 1) * y[i].qs[j*5 + l*16 + m]; + } + } + } + + for (size_t l = 0; l < 4; ++l) { + for (size_t j = 0; j < sizeof(x->qh); ++j) { + uint8_t q = x[i].qh[j] * pow3[l]; + uint16_t xi = ((uint16_t) q * 3) >> 8; + sum += (xi - 1) * y[i].qs[sizeof(x->qs)*5 + l*sizeof(x->qh) + j]; + } + } + + sumf += (float) sum * (GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d); + } + + *s = sumf; +} + +void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_tq2_0 * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + float sumf = 0.0f; + + for (int i = 0; i < nb; ++i) { + int32_t sumi = 0; + + for (size_t j = 0; j < sizeof(x->qs); j += 32) { + for (size_t l = 0; l < 4; ++l) { + for (size_t k = 0; k < 32; ++k) { + sumi += y[i].qs[j*4 + l*32 + k] * (((x[i].qs[j + k] >> (l*2)) & 3) - 1); + } + } + } + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + + sumf += (float) sumi * d; + } + + *s = sumf; +} + +void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q2_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + + const uint8_t * q2 = x[i].qs; + const int8_t * q8 = y[i].qs; + const uint8_t * sc = x[i].scales; + + int summs = 0; + for (int j = 0; j < 16; ++j) { + summs += y[i].bsums[j] * (sc[j] >> 4); + } + + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + int isum = 0; + int is = 0; + int d; + for (int k = 0; k < QK_K/128; ++k) { + int shift = 0; + for (int j = 0; j < 4; ++j) { + d = sc[is++] & 0xF; + int isuml = 0; + for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3); + isum += d * isuml; + d = sc[is++] & 0xF; + isuml = 0; + for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3); + isum += d * isuml; + shift += 2; + q8 += 32; + } + q2 += 32; + } + sumf += dall * isum - dmin * summs; + } + *s = sumf; +} + +void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const uint32_t kmask1 = 0x03030303; + const uint32_t kmask2 = 0x0f0f0f0f; + + const block_q3_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + // scalar version + // This function is written like this so the compiler can manage to vectorize most of it + // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the + // manually vectorized version above. Every other version I tried would run at least 4 times slower. 
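+    // (The scalar loop below first expands each 128-value chunk into plain int8 values in aux8,
+    // combining the 2-bit quants with the hmask high bit to get values in [-4, 3], so the hot
+    // inner loops are simple multiply-accumulates that auto-vectorize well.)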
+ // The ideal situation would be if we could just write the code once, and the compiler would + // automatically produce the best possible set of machine instructions, instead of us having to manually + // write vectorized versions for AVX, ARM_NEON, etc. + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + uint32_t auxs[4]; + const int8_t * scales = (const int8_t*)auxs; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + uint8_t m = 1; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + q3 += 32; + } + a = aux8; + + memcpy(auxs, x[i].scales, 12); + uint32_t tmp = auxs[2]; + auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); + auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); + auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4); + auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); + for (int j = 0; j < QK_K/16; ++j) { + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +} + +void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + uint32_t utmp[4]; + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + for (int j = 0; j < QK_K/64; ++j) { + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); + a += 32; + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); + a += 32; q4 += 32; + } + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] 
>> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/32; ++j) { + int32_t scale = scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +} + +void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + uint32_t utmp[4]; + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + uint8_t m = 1; + for (int j = 0; j < QK_K/64; ++j) { + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); + for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); + for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 
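/* the hmask bit supplies the 5th quant bit (+16) */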
16 : 0); + a += 32; m <<= 1; + q4 += 32; + } + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/32; ++j) { + int32_t scale = scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +} + +void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q6_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) { + a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; + a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; + a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; + a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; + } + a += 128; + q4 += 64; + qh += 32; + } + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/16; ++j) { + int scale = x[i].scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +} + +void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq2_xxs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + uint32_t aux32[2]; + const uint8_t * aux8 = (const uint8_t *)aux32; + + float sumf = 0.f; + for (int i = 0; i 
< nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + memcpy(aux32, q2, 2*sizeof(uint32_t)); + q2 += 4; + const uint32_t ls = 2*(aux32[1] >> 28) + 1; + int32_t sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]); + const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127]; + for (int j = 0; j < 8; ++j) { + sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + bsum += sumi * ls; + } + sumf += d * bsum; + } + *s = 0.125f * sumf; +} + +void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq2_xs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const uint8_t * GGML_RESTRICT sc = x[i].scales; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1; + const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1; + int32_t sumi = 0; + for (int l = 0; l < 2; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); + const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; + for (int j = 0; j < 8; ++j) { + sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + bsum += sumi * ls1; + sumi = 0; + for (int l = 2; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); + const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; + for (int j = 0; j < 8; ++j) { + sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + bsum += sumi * ls2; + q2 += 4; + } + sumf += d * bsum; + } + *s = 0.125f * sumf; +} + +void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq2_s * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + float sumf = 0; + for (int i = 0; i < nb; i++) { + + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint8_t * qh = x[i].qh; + const uint8_t * signs = qs + QK_K/8; + + int bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf); + int ls2 = 1 + 2*(x[i].scales[ib32] >> 4); + int sumi1 = 0, sumi2 = 0; + for (int l = 0; l < 2; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300))); + for (int j = 0; j < 8; ++j) { + sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + for (int l = 2; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300))); + for (int j = 0; j < 8; ++j) { + sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? 
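/* each bit of the packed sign byte flips one grid value */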
-1 : 1); + } + q8 += 8; + } + bsum += ls1 * sumi1 + ls2 * sumi2; + qs += 4; + signs += 4; + } + + sumf += d * bsum; + } + + *s = 0.125f * sumf; +} + +void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq3_xxs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + uint32_t aux32; + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t); + const uint32_t ls = 2*(aux32 >> 28) + 1; + int32_t sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]); + const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]); + const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127]; + for (int j = 0; j < 4; ++j) { + sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1); + sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1); + } + q8 += 8; + } + q3 += 8; + bsum += sumi * ls; + } + sumf += d * bsum; + } + *s = 0.25f * sumf; +} + +void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq3_s * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint8_t * GGML_RESTRICT signs = x[i].signs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1; + const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1; + int32_t sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256))); + const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256))); + for (int j = 0; j < 4; ++j) { + sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1); + sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1); + } + q8 += 8; + } + qs += 8; + signs += 4; + bsum += sumi * ls1; + sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256))); + const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256))); + for (int j = 0; j < 4; ++j) { + sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1); + sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? 
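/* signs[l] carries 8 sign bits, one per dequantized grid value */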
-1 : 1); + } + q8 += 8; + } + qs += 8; + signs += 4; + bsum += sumi * ls2; + } + sumf += d * bsum; + } + *s = sumf; +} + +void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq1_s * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + float sumf = 0; + for (int i = 0; i < nb; i++) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint16_t * qh = x[i].qh; + + int sumi = 0, sumi1 = 0; + for (int ib = 0; ib < QK_K/32; ++ib) { + const int ls = 2*((qh[ib] >> 12) & 7) + 1; + const int delta = qh[ib] & 0x8000 ? -1 : 1; + int lsum = 0; + for (int l = 0; l < 4; ++l) { + const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8))); + for (int j = 0; j < 8; ++j) { + lsum += q8[j] * grid[j]; + } + q8 += 8; + } + sumi += ls * lsum; + sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]); + qs += 4; + } + + sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1); + } + + *s = sumf; +} + +void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq1_m * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + iq1m_scale_t scale; + + int sum1[2], sum2[2], delta[4]; + + float sumf = 0; + for (int i = 0; i < nb; i++) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint8_t * qh = x[i].qh; + const uint16_t * sc = (const uint16_t *)x[i].scales; + + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + + int sumi1 = 0, sumi2 = 0; + for (int ib = 0; ib < QK_K/32; ++ib) { + delta[0] = qh[0] & 0x08 ? -1 : 1; + delta[1] = qh[0] & 0x80 ? -1 : 1; + delta[2] = qh[1] & 0x08 ? -1 : 1; + delta[3] = qh[1] & 0x80 ? 
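/* bits 3 and 7 of each qh byte select the per-group delta sign */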
-1 : 1; + sum1[0] = sum1[1] = sum2[0] = sum2[1] = 0; + for (int l = 0; l < 4; ++l) { + const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((uint16_t)qh[l/2] << (8 - 4*(l%2))) & 0x700))); + int lsum1 = 0, lsum2 = 0; + for (int j = 0; j < 8; ++j) { + lsum1 += q8[j] * grid[j]; + lsum2 += q8[j]; + } + q8 += 8; + sum1[l/2] += lsum1; + sum2[l/2] += lsum2*delta[l]; + } + + const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1; + const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1; + + sumi1 += sum1[0] * ls1 + sum1[1] * ls2; + sumi2 += sum2[0] * ls1 + sum2[1] * ls2; + qs += 4; + qh += 2; + } + + sumf += GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2); + } + + *s = sumf; +} + +void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + assert(n % QK4_NL == 0); + static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same"); + + const block_iq4_nl * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + const int nb = n / QK4_NL; + + int ib = 0; + float sumf = 0; + + for (; ib < nb; ++ib) { + const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d); + int sumi1 = 0, sumi2 = 0; + for (int j = 0; j < QK4_NL/2; ++j) { + sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf]; + sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4]; + } + sumf += d * (sumi1 + sumi2); + } + *s = sumf; +} + +void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + assert(n % QK_K == 0); + + const block_iq4_xs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + float sumf = 0; + for (int ibl = 0; ibl < nb; ++ibl) { + const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d; + uint16_t h = x[ibl].scales_h; + const uint8_t * qs = x[ibl].qs; + const int8_t * q8 = y[ibl].qs; + for (int ib = 0; ib < QK_K/32; ib += 2) { + const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30); + const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30); + h >>= 4; + const float d1 = d4d8*(ls1 - 32); + const float d2 = d4d8*(ls2 - 32); + int sumi1 = 0, sumi2 = 0; + for (int j = 0; j < 16; ++j) { + sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf]; + sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4]; + } + sumf += d1 * (sumi1 + sumi2); + qs += 16; + q8 += 32; + sumi1 = sumi2 = 0; + for (int j = 0; j < 16; ++j) { + sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf]; + sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4]; + } + sumf += d2 * (sumi1 + sumi2); + qs += 16; + q8 += 32; + } + } + *s = sumf; +} + +// ============================ 4-bit non-linear quants + +void quantize_row_iq4_nl(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { + assert(k % QK4_NL == 0); + quantize_row_iq4_nl_ref(x, y, k); +} + +void quantize_row_iq4_xs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { + assert(k % QK_K == 0); + quantize_iq4_xs(x, y, 1, k, NULL); +} diff --git a/ggml/src/ggml-cpu/ggml-cpu-quants.h b/ggml/src/ggml-cpu/quants.h similarity index 56% rename from ggml/src/ggml-cpu/ggml-cpu-quants.h rename to ggml/src/ggml-cpu/quants.h index e33d9d473ea66..dc4342c87f592 100644 --- 
a/ggml/src/ggml-cpu/ggml-cpu-quants.h +++ b/ggml/src/ggml-cpu/quants.h @@ -58,6 +58,32 @@ void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +// Generic implementation +void quantize_row_q8_0_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); +void quantize_row_q8_1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); +void quantize_row_q8_K_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * 
GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp
new file mode 100644
index 0000000000000..72ee93a5abc7c
--- /dev/null
+++ b/ggml/src/ggml-cpu/repack.cpp
@@ -0,0 +1,1571 @@
+#define GGML_COMMON_IMPL_CPP
+#define GGML_COMMON_DECL_CPP
+#include "ggml-common.h"
+#include "ggml-backend-impl.h"
+
+#include "ggml-impl.h"
+#include "ggml-cpu.h"
+#include "ggml-cpu-impl.h"
+#include "simd-mappings.h"
+#include "traits.h"
+
+#include "arch-fallback.h"
+
+#include <cmath>
+#include <cstring>
+#include <cassert>
+#include <cstdlib> // for qsort
+#include <cstdio>  // for GGML_ASSERT
+
+#include "repack.h"
+
+#if defined(__GNUC__)
+#pragma GCC diagnostic ignored "-Woverlength-strings"
+#endif
+
+#define UNUSED GGML_UNUSED
+
+// fast round-to-nearest: adding 1.5*2^23 pins the integer part of fval in the low
+// mantissa bits, which are then extracted and re-biased by 2^22
+static inline int nearest_int(float fval) {
+    assert(fabsf(fval) <= 4194303.f);
+    float val = fval + 12582912.f;
+    int i; memcpy(&i, &val, sizeof(int));
+    return (i & 0x007fffff) - 0x00400000;
+}
+
+// Functions to create the interleaved data layout formats
+
+// interleave 4 block_q4_0s in blocks of blck_size_interleave
+// returns an interleaved block_q4_0x4
+// in the interleaved block_q4_0x4, place deltas for 4 block_q4_0 blocks
+// first, then interleave quants from 4 block_q4_0s in blocks of blck_size_interleave
+//
+// - in                   : an array of block_q4_0 pointers
+// - blck_size_interleave : the block_q4_0 quants bytes are interleaved in blocks of
+//                          blck_size_interleave bytes
+// - xor_mask             : the mask to convert the nibbles in block_q4_0 quants bytes
+//                          from bias offset form to pure sign form (this saves subtract
+//                          operations during unpacking)
+//
+
+extern "C" {
+
+void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
+    assert(QK8_0 == 32);
+    assert(k % QK8_0 == 0);
+    const int nb = k / QK8_0;
+
+    block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;
+
+    // scalar
+    const int blck_size_interleave = 4;
+    float srcv[4][QK8_0];
+    float id[4];
+
+    for (int i = 0; i < nb; i++) {
+        for (int row_iter = 0; row_iter < 4; row_iter++) {
+            float amax = 0.0f; // absolute max
+
+            for (int j = 0; j < QK8_0; j++) {
+                srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
+                amax = MAX(amax, fabsf(srcv[row_iter][j]));
+            }
+
+            const float d = amax / ((1 << 7) - 1);
+            id[row_iter] = d ?
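/* guard against division by zero for all-zero rows */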
1.0f / d : 0.0f; + + y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d); + } + + for (int j = 0; j < QK8_0 * 4; j++) { + int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave; + int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave; + src_offset += (j % blck_size_interleave); + + float x0 = srcv[src_id][src_offset] * id[src_id]; + y[i].qs[j] = roundf(x0); + } + } +} + +void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(QK8_0 == 32); + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; + + block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy; + + // scalar + const int blck_size_interleave = 8; + float srcv[4][QK8_0]; + float id[4]; + + for (int i = 0; i < nb; i++) { + for (int row_iter = 0; row_iter < 4; row_iter++) { + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_0; j++) { + srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j]; + amax = MAX(amax, fabsf(srcv[row_iter][j])); + } + + const float d = amax / ((1 << 7) - 1); + id[row_iter] = d ? 1.0f / d : 0.0f; + + y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d); + } + + for (int j = 0; j < QK8_0 * 4; j++) { + int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave; + int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave; + src_offset += (j % blck_size_interleave); + + float x0 = srcv[src_id][src_offset] * id[src_id]; + y[i].qs[j] = roundf(x0); + } + } +} + +void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(QK_K == 256); + assert(k % QK_K == 0); + const int nb = k / QK_K; + + block_q8_Kx4 * GGML_RESTRICT y = (block_q8_Kx4 *) vy; + + // scalar + const int blck_size_interleave = 8; + float srcv[4][QK_K]; + float iscale[4]; + + for (int i = 0; i < nb; i++) { + for (int row_iter = 0; row_iter < 4; row_iter++) { + float amax = 0.0f; // absolute max + float max = 0; + + for (int j = 0; j < QK_K; j++) { + srcv[row_iter][j] = x[row_iter * k + i * QK_K + j]; + // Update the maximum value of the corresponding super block + if(amax < fabsf(srcv[row_iter][j])) { + amax = fabsf(srcv[row_iter][j]); + max = srcv[row_iter][j]; + } + } + + iscale[row_iter] = amax ? -127.f/max : 0; + + y[i].d[row_iter] = amax ? 
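/* q8_K stores the row scale as a plain float, not fp16 */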
1/iscale[row_iter] : 0; + } + + for (int j = 0; j < QK_K / 4; j++) { + y[i].bsums[j] = 0; + } + + // Quants values are interleaved in sequence of eight bytes from corresponding super blocks + // Bsums values are interleaved in sequence of four bsums from each super block taken for interleaving + // i.e first four bsums from the first super block, followed by first four bsums from second super block and so on + for (int j = 0; j < QK_K * 4; j++) { + int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave; + int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave; + src_offset += (j % blck_size_interleave); + int index = (((j & 31) >> 3) << 2) + ((j >> 8) << 4) + ((j >> 6) & 3); + + float x0 = srcv[src_id][src_offset] * iscale[src_id]; + y[i].qs[j] = nearest_int(x0); + y[i].bsums[index] += y[i].qs[j]; + } + } +} + +} // extern "C" + +template +void ggml_quantize_mat_t(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row); + +template <> void ggml_quantize_mat_t<4, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) { + assert(nrow == 4); + UNUSED(nrow); + ggml_quantize_mat_q8_0_4x4(x, vy, n_per_row); +} + +template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) { + assert(nrow == 4); + UNUSED(nrow); + ggml_quantize_mat_q8_0_4x8(x, vy, n_per_row); +} + +template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) { + assert(nrow == 4); + UNUSED(nrow); + ggml_quantize_mat_q8_K_4x8(x, vy, n_per_row); +} + +extern "C" { + +void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 4; + const int blocklen = 4; + + assert (n % qk == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + + float sumf[4]; + int sumi; + + const block_q8_0 * a_ptr = (const block_q8_0 *) vy; + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); + + for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; + } + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); + } + } + } + for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; + } +} + +void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 4; + const int blocklen = 8; + + assert (n % qk == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + 
UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + + float sumf[4]; + int sumi; + + const block_q8_0 * a_ptr = (const block_q8_0 *) vy; + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); + + for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; + } + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); + } + } + } + for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; + } +} + +void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 8; + const int blocklen = 8; + + assert (n % qk == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + + { + float sumf[8]; + int sumi; + + const block_q8_0 * a_ptr = (const block_q8_0 *) vy; + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb); + + for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; + } + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); + } + } + } + for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; + } + } +} + +void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK_K; + const int nb = n / qk; + const int ncols_interleaved = 8; + const int blocklen = 8; + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + assert (n % qk == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + + float sumf[8]; + float sum_minf[8]; + uint32_t utmp[32]; + int sumi1; + int sumi2; + int sumi; + + const block_q8_K * a_ptr = (const block_q8_K *) vy; + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb); + + for (int j = 0; j < ncols_interleaved; j++) { + sumf[j] = 0.0; + sum_minf[j] = 0.0; + } + for (int l = 0; l < nb; 
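/* one interleaved q4_K super-block per iteration */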
l++) { + for (int sb = 0; sb < 8; sb++) { + memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12); + utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4); + const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1; + utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4); + utmp[sb * 4 + 2] = uaux_0; + utmp[sb * 4 + 0] &= kmask1; + } + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32; + uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16; + for (int j = 0; j < ncols_interleaved; j++) { + sumi1 = 0; + sumi2 = 0; + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4); + sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i]); + sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i + 32]); + sumi1 = sumi1 * scales_0[j]; + sumi2 = sumi2 * scales_1[j]; + sumi += sumi1 + sumi2; + } + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d; + } + } + for (int sb = 0; sb < 8; sb++) { + uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16; + for (int j = 0; j < ncols_interleaved; j++) { + sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d; + } + } + } + for (int j = 0; j < ncols_interleaved; j++) { + s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j]; + } + } +} + +void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 4; + const int blocklen = 4; + + assert (n % qk == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + + { + float sumf[4]; + int sumi; + + const block_q8_0 * a_ptr = (const block_q8_0 *) vy; + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb); + + for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F]; + const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4]; + sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])); + } + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); + } + } + } + for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; + } + } +} + +void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 4; + const int blocklen = 4; + + assert (n % qk == 0); + assert (nr % 4 == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + 
UNUSED(ncols_interleaved); + UNUSED(blocklen); + + { + float sumf[4][4]; + int sumi; + + for (int y = 0; y < nr / 4; y++) { + const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; + } + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; + } + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); + } + } + } + } + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) + s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; + } + } + } + } +} + +void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 4; + const int blocklen = 8; + + assert (n % qk == 0); + assert (nr % 4 == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + + float sumf[4][4]; + int sumi; + + for (int y = 0; y < nr / 4; y++) { + const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; + } + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; + } + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); + } + } + } + } + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) + s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; + } + } + } +} + +void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 8; + const int blocklen = 8; + + assert (n % qk == 0); + assert (nr % 4 == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + + float sumf[4][8]; + int sumi; + + for (int y = 0; y < nr / 4; 
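/* activations arrive pre-packed in 4-row tiles (block_q8_0x4) */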
y++) { + const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb); + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; + } + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; + } + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); + } + } + } + } + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) + s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; + } + } + } +} + +void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK_K; + const int nb = n / qk; + const int ncols_interleaved = 8; + const int blocklen = 8; + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + assert (n % qk == 0); + assert (nr % 4 == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + + float sumf[4][8]; + float sum_minf[4][8]; + uint32_t utmp[32]; + int sumi1; + int sumi2; + int sumi; + + for (int y = 0; y < nr / 4; y++) { + const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb); + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb); + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumf[m][j] = 0.0; + sum_minf[m][j] = 0.0; + } + } + for (int l = 0; l < nb; l++) { + for (int sb = 0; sb < 8; sb++) { + memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12); + utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4); + const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1; + utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4); + utmp[sb * 4 + 2] = uaux_0; + utmp[sb * 4 + 0] &= kmask1; + } + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32; + uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16; + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi1 = 0; + sumi2 = 0; + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4); + sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i]); + sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]); + sumi1 = sumi1 * scales_0[j]; + sumi2 = sumi2 * scales_1[j]; + sumi += sumi1 + sumi2; + } + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * 
a_ptr[l].d[m]; + } + } + } + for (int sb = 0; sb < 8; sb++) { + uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16; + for(int m = 0; m < 4; m++) { + const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6); + for(int j = 0; j < ncols_interleaved; j++) { + sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m]; + } + } + } + } + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) { + s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j]; + } + } + } + } +} + +void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 4; + const int blocklen = 4; + + assert (n % qk == 0); + assert (nr % 4 == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + + { + float sumf[4][4]; + int sumi; + + for (int y = 0; y < nr / 4; y++) { + const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb); + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; + } + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F]; + const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4]; + sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])); + } + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); + } + } + } + } + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) + s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; + } + } + } + } +} + +} // extern "C" + +static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) { + block_q4_0x4 out; + + for (int i = 0; i < 4; i++) { + out.d[i] = in[i].d; + } + + const int end = QK4_0 * 2 / blck_size_interleave; + + if (blck_size_interleave == 8) { + const uint64_t xor_mask = 0x8888888888888888ULL; + for (int i = 0; i < end; ++i) { + int src_id = i % 4; + int src_offset = (i / 4) * blck_size_interleave; + int dst_offset = i * blck_size_interleave; + + uint64_t elems; + // Using memcpy to avoid unaligned memory accesses + memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t)); + elems ^= xor_mask; + memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t)); + } + } else if (blck_size_interleave == 4) { + const uint32_t xor_mask = 0x88888888; + for (int i = 0; i < end; ++i) { + int src_id = i % 4; + int src_offset = (i / 4) * blck_size_interleave; + int dst_offset = i * blck_size_interleave; + + uint32_t elems; + memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t)); + elems ^= xor_mask; + memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t)); + } + } else { + GGML_ASSERT(false); + } + + return out; +} + +// interleave 8 block_q4_0s in blocks of blck_size_interleave +// returns an 
interleaved block_q4_0x8 +// in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
+// first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
+static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) { + block_q4_0x8 out; + + for (int i = 0; i < 8; i++) { + out.d[i] = in[i].d; + } + + const int end = QK4_0 * 4 / blck_size_interleave; + const uint64_t xor_mask = 0x8888888888888888ULL; + + for (int i = 0; i < end; ++i) { + int src_id = i % 8; + int src_offset = (i / 8) * blck_size_interleave; + int dst_offset = i * blck_size_interleave; + + uint64_t elems; + memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t)); + elems ^= xor_mask; + memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t)); + } + + return out; +} +
+static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_interleave) { + block_q4_Kx8 out; + // Delta (scale) and dmin values of the eight Q4_K structures are copied onto the output interleaved structure + for (int i = 0; i < 8; i++) { + out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d; + } + + for (int i = 0; i < 8; i++) { + out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin; + } + + const int end = QK_K * 4 / blck_size_interleave; + + // Interleave Q4_K quants by taking 8 bytes at a time + for (int i = 0; i < end; ++i) { + int src_id = i % 8; + int src_offset = (i / 8) * blck_size_interleave; + int dst_offset = i * blck_size_interleave; + + uint64_t elems; + memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t)); + memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t)); + } +
+ // The logic below unpacks and rearranges the scale and min values of Q4_K
+ // Currently the Q4_K structure packs 8 scales and 8 mins into 12 bytes (6 bits per value)
+ // The output Q4_Kx8 structure has 96 scale bytes
+ // Each 12-byte group packs the scales and mins of the corresponding sub-block from the 8 Q4_K structures
+ // E.g. the first 12 bytes contain the 8 scales and 8 mins of the first sub-block of each Q4_K structure
+ uint8_t s[8], m[8]; + + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 8; j++) { + s[j] = in[j].scales[i] & 63; + m[j] = in[j].scales[i + 4] & 63; + } + + out.scales[i * 12] = (s[0] & 63) + ((s[4] & 48) << 2); + out.scales[i * 12 + 1] = (s[1] & 63) + ((s[5] & 48) << 2); + out.scales[i * 12 + 2] = (s[2] & 63) + ((s[6] & 48) << 2); + out.scales[i * 12 + 3] = (s[3] & 63) + ((s[7] & 48) << 2); + out.scales[i * 12 + 4] = (m[0] & 63) + ((m[4] & 48) << 2); + out.scales[i * 12 + 5] = (m[1] & 63) + ((m[5] & 48) << 2); + out.scales[i * 12 + 6] = (m[2] & 63) + ((m[6] & 48) << 2); + out.scales[i * 12 + 7] = (m[3] & 63) + ((m[7] & 48) << 2); + out.scales[i * 12 + 8] = (s[4] & 15) + ((m[4] & 15) << 4); + out.scales[i * 12 + 9] = (s[5] & 15) + ((m[5] & 15) << 4); + out.scales[i * 12 + 10] = (s[6] & 15) + ((m[6] & 15) << 4); + out.scales[i * 12 + 11] = (s[7] & 15) + ((m[7] & 15) << 4); + + } +
+ for (int i = 0; i < 4; i++) { + for (int j = 0; j < 8; j++) { + s[j] = ((in[j].scales[i] & 192) >> 2) | (in[j].scales[i+8] & 15); + m[j] = ((in[j].scales[i + 4] & 192) >> 2) | ((in[j].scales[i+8] & 240) >> 4); + } + + out.scales[i * 12 + 48] = (s[0] & 63) + ((s[4] & 48) << 2); + out.scales[i * 12 + 49] = (s[1] & 63) + ((s[5] & 48) << 2); + out.scales[i * 12 + 50] = (s[2] & 63) + ((s[6] & 48) << 2); + out.scales[i * 12 + 51] = (s[3] & 63) + ((s[7] & 48) << 2); + out.scales[i * 12 + 52] = (m[0] & 63) + ((m[4] & 48) << 2); + out.scales[i * 
12 + 53] = (m[1] & 63) + ((m[5] & 48) << 2); + out.scales[i * 12 + 54] = (m[2] & 63) + ((m[6] & 48) << 2); + out.scales[i * 12 + 55] = (m[3] & 63) + ((m[7] & 48) << 2); + out.scales[i * 12 + 56] = (s[4] & 15) + ((m[4] & 15) << 4); + out.scales[i * 12 + 57] = (s[5] & 15) + ((m[5] & 15) << 4); + out.scales[i * 12 + 58] = (s[6] & 15) + ((m[6] & 15) << 4); + out.scales[i * 12 + 59] = (s[7] & 15) + ((m[7] & 15) << 4); + + } + + return out; +} + +static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) { + GGML_ASSERT(t->type == GGML_TYPE_Q4_0); + GGML_ASSERT(interleave_block == 4 || interleave_block == 8); + constexpr int nrows_interleaved = 4; + + block_q4_0x4 * dst = (block_q4_0x4 *)t->data; + const block_q4_0 * src = (const block_q4_0 *)data; + block_q4_0 dst_tmp[4]; + int nrow = ggml_nrows(t); + int nblocks = t->ne[0] / QK4_0; + + GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0)); + + if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) { + return -1; + } + + for (int b = 0; b < nrow; b += nrows_interleaved) { + for (int64_t x = 0; x < nblocks; x++) { + for (int i = 0; i < nrows_interleaved; i++) { + dst_tmp[i] = src[x + i * nblocks]; + } + *dst++ = make_block_q4_0x4(dst_tmp, interleave_block); + } + src += nrows_interleaved * nblocks; + } + return 0; + + GGML_UNUSED(data_size); +} +static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) { + GGML_ASSERT(t->type == GGML_TYPE_Q4_K); + GGML_ASSERT(interleave_block == 8); + constexpr int nrows_interleaved = 8; + + block_q4_Kx8 * dst = (block_q4_Kx8*)t->data; + const block_q4_K * src = (const block_q4_K*) data; + block_q4_K dst_tmp[8]; + int nrow = ggml_nrows(t); + int nblocks = t->ne[0] / QK_K; + + GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_K)); + + if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) { + return -1; + } + + for (int b = 0; b < nrow; b += nrows_interleaved) { + for (int64_t x = 0; x < nblocks; x++) { + for (int i = 0; i < nrows_interleaved; i++ ) { + dst_tmp[i] = src[x + i * nblocks]; + } + *dst++ = make_block_q4_Kx8(dst_tmp, interleave_block); + } + src += nrows_interleaved * nblocks; + } + return 0; + + GGML_UNUSED(data_size); +} + +static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) { + GGML_ASSERT(t->type == GGML_TYPE_Q4_0); + GGML_ASSERT(interleave_block == 8); + constexpr int nrows_interleaved = 8; + + block_q4_0x8 * dst = (block_q4_0x8*)t->data; + const block_q4_0 * src = (const block_q4_0*) data; + block_q4_0 dst_tmp[8]; + int nrow = ggml_nrows(t); + int nblocks = t->ne[0] / QK4_0; + + GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0)); + + if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) { + return -1; + } + + for (int b = 0; b < nrow; b += nrows_interleaved) { + for (int64_t x = 0; x < nblocks; x++) { + for (int i = 0; i < nrows_interleaved; i++ ) { + dst_tmp[i] = src[x + i * nblocks]; + } + *dst++ = make_block_q4_0x8(dst_tmp, interleave_block); + } + src += nrows_interleaved * nblocks; + } + return 0; + + GGML_UNUSED(data_size); +} + +static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_size_interleave) { + block_iq4_nlx4 out; + + for (int i = 0; i < 4; i++) { + out.d[i] = in[i].d; + } + + const int end = QK4_NL * 2 / blck_size_interleave; + + // TODO: this branch seems wrong + //if 
(blck_size_interleave == 8) { + // for (int i = 0; i < end; ++i) { + // int src_id = i % 4; + // int src_offset = (i / 4) * blck_size_interleave; + // int dst_offset = i * blck_size_interleave; + + // // Using memcpy to avoid unaligned memory accesses + // memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t)); + // } + //} else + if (blck_size_interleave == 4) { + for (int i = 0; i < end; ++i) { + int src_id = i % 4; + int src_offset = (i / 4) * blck_size_interleave; + int dst_offset = i * blck_size_interleave; + + memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint32_t)); + } + } else { + GGML_ASSERT(false); + } + + return out; +} +
+static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) { + GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL); + //GGML_ASSERT(interleave_block == 4 || interleave_block == 8); + GGML_ASSERT(interleave_block == 4); + + block_iq4_nlx4 * dst = (block_iq4_nlx4 *)t->data; + const block_iq4_nl * src = (const block_iq4_nl *)data; + block_iq4_nl dst_tmp[4]; + int nrow = ggml_nrows(t); + int nrows_interleaved = 4; + int nblocks = t->ne[0] / QK4_0; + + GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl)); + + if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) { + return -1; + } + + for (int b = 0; b < nrow; b += nrows_interleaved) { + for (int64_t x = 0; x < nblocks; x++) { + for (int i = 0; i < nrows_interleaved; i++) { + dst_tmp[i] = src[x + i * nblocks]; + } + *dst++ = make_block_iq4_nlx4(dst_tmp, interleave_block); + } + src += nrows_interleaved * nblocks; + } + return 0; + + GGML_UNUSED(data_size); +} +
+namespace ggml::cpu::repack { +// repack +template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS> +int repack(struct ggml_tensor *, const void *, size_t); + +// TODO: generalise. 
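// Editorial sketch (not part of the patch): the specializations below resolve a
// compile-time (block type, interleave size, column count) triple to one of the
// repack_*_bl helpers, so dispatch costs nothing at run time. A minimal
// standalone instance of the same pattern, with hypothetical names:
template <typename BLOCK, int64_t INTER, int64_t COLS>
int repack_example(struct ggml_tensor *, const void *, size_t); // primary: declared, never defined

template <> int repack_example<block_q4_0, 8, 8>(struct ggml_tensor * t, const void * d, size_t n) {
    return repack_q4_0_to_q4_0_8_bl(t, 8, d, n); // statically selected, no vtable involved
}
// Any unsupported combination fails at link time rather than at run time.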
+template <> int repack<block_q4_0, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) { + return repack_q4_0_to_q4_0_4_bl(t, 4, data, data_size); +} +
+template <> int repack<block_q4_0, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) { + return repack_q4_0_to_q4_0_4_bl(t, 8, data, data_size); +} +
+template <> int repack<block_q4_0, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) { + return repack_q4_0_to_q4_0_8_bl(t, 8, data, data_size); +} +
+template <> int repack<block_q4_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) { + return repack_q4_K_to_q4_K_8_bl(t, 8, data, data_size); +} +
+template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) { + return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size); +} +
+// TODO: needs to be revisited +//template <> int repack<block_iq4_nl, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) { +// return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size); +//} +
+// gemv +template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE> +void gemv(int, float *, size_t, const void *, const void *, int, int); +
+template <> void gemv<block_q4_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemv_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc); +} +
+template <> void gemv<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemv_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc); +} +
+template <> void gemv<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemv_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc); +} +
+template <> void gemv<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc); +} +
+template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc); +} +
+// gemm +template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE> +void gemm(int, float *, size_t, const void *, const void *, int, int); +
+template <> void gemm<block_q4_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemm_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc); +} +
+template <> void gemm<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemm_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc); +} +
+template <> void gemm<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemm_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc); +} +
+template <> void gemm<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemm_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc); +} +
+template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc); +} +
+class tensor_traits_base : public ggml::cpu::tensor_traits { + public: + virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0; +}; +
+template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE> class tensor_traits : public tensor_traits_base { + + bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override { + // not really a GGML_TYPE_Q8_0 but same size. + switch (op->op) { + case GGML_OP_MUL_MAT: + { + size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1])); + return true; + } + case GGML_OP_MUL_MAT_ID: + { + size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1])); + size = GGML_PAD(size, sizeof(int64_t)); // + padding for next block. 
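// Editorial note (not part of the patch): a worked example of this sizing,
// under assumed shapes. Take PARAM_TYPE = GGML_TYPE_Q8_0 (34 bytes per 32
// values), src1 with ne10 = 4096, ne11 = 1, ne12 = 8 tokens, and ne02 = 4
// experts:
//   ggml_row_size(Q8_0, 4096*1*8) = 32768/32 * 34 = 34816 bytes
//   GGML_PAD(34816, sizeof(int64_t))              = 34816 (already aligned)
//   + mappings: sizeof(int64_t) * 4 * (8 + 1)     =   288 bytes
//   total                                         = 35104 bytes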
+ + const int64_t ne02 = op->src[0]->ne[2]; // n_as, n_expert + const int64_t ne12 = op->src[1]->ne[2]; // n_tokens + + const size_t sizeof_mmid_row_mapping = sizeof(int64_t); + + size += sizeof_mmid_row_mapping*ne02*(ne12 + 1); + + return true; + } + default: + // GGML_ABORT("fatal error"); + break; + } + return false; + } +
+ bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override { + switch (op->op) { + case GGML_OP_MUL_MAT: + forward_mul_mat(params, op); + return true; + case GGML_OP_MUL_MAT_ID: + forward_mul_mat_id(params, op); + return true; + default: + // GGML_ABORT("fatal error"); + break; + } + return false; + } +
+ void forward_mul_mat(ggml_compute_params * params, ggml_tensor * op) { + const ggml_tensor * src0 = op->src[0]; + const ggml_tensor * src1 = op->src[1]; + ggml_tensor * dst = op; + + GGML_TENSOR_BINARY_OP_LOCALS + + const int ith = params->ith; + const int nth = params->nth; + + GGML_ASSERT(ne0 == ne01); + GGML_ASSERT(ne1 == ne11); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + GGML_ASSERT(ggml_n_dims(op->src[0]) == 2); + // GGML_ASSERT(ggml_n_dims(op->src[1]) == 2); +
+ char * wdata = static_cast<char *>(params->wdata); + const size_t nbw1 = ggml_row_size(PARAM_TYPE, ne10); + + assert(params->wsize >= nbw1 * ne11); + + const ggml_from_float_t from_float = ggml_get_type_traits_cpu(PARAM_TYPE)->from_float; +
+ int64_t i11_processed = 0; + for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) { + ggml_quantize_mat_t<INTER_SIZE, PARAM_TYPE>((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), 4, ne10); + } +
+ i11_processed = ne11 - ne11 % 4; + for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) { + from_float((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), ne10); + } +
+ ggml_barrier(params->threadpool); +
+ const void * src1_wdata = params->wdata; + const size_t src1_col_stride = ggml_row_size(PARAM_TYPE, ne10); + int64_t src0_start = (ith * ne01) / nth; + int64_t src0_end = ((ith + 1) * ne01) / nth; + src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start; + src0_end = (src0_end % NB_COLS) ? src0_end + NB_COLS - (src0_end % NB_COLS) : src0_end; + if (src0_start >= src0_end) { + return; + } +
+ // If there are more than three rows in src1, use gemm; otherwise, use gemv. 
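// Editorial note (not part of the patch): concretely, with ne11 = 10 rows of
// src1, the gemm call below processes rows 0..7 (ne11 - ne11 % 4 = 8) four at
// a time, and the gemv loop then handles rows 8 and 9 one at a time; with
// ne11 <= 3 the gemm branch is skipped and gemv covers every row.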
+ if (ne11 > 3) { + gemm<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00, + (float *) ((char *) dst->data) + src0_start, ne01, + (const char *) src0->data + src0_start * nb01, + (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start); + } + for (int iter = ne11 - ne11 % 4; iter < ne11; iter++) { + gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00, + (float *) ((char *) dst->data + (iter * nb1)) + src0_start, ne01, + (const char *) src0->data + src0_start * nb01, + (const char *) src1_wdata + (src1_col_stride * iter), 1, + src0_end - src0_start); + } + } +
+ void forward_mul_mat_id(ggml_compute_params * params, ggml_tensor * op) { + const ggml_tensor * src0 = op->src[0]; + const ggml_tensor * src1 = op->src[1]; + const ggml_tensor * ids = op->src[2]; + ggml_tensor * dst = op; + + GGML_TENSOR_BINARY_OP_LOCALS + + const int ith = params->ith; + const int nth = params->nth; + + const ggml_from_float_t from_float = ggml_get_type_traits_cpu(PARAM_TYPE)->from_float; + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == ggml_type_size(src0->type)); + GGML_ASSERT(nb10 == ggml_type_size(src1->type)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + GGML_ASSERT(ne03 == 1); + GGML_ASSERT(ne13 == 1); + GGML_ASSERT(ne3 == 1); + + GGML_ASSERT(src1->type == GGML_TYPE_F32); +
+ // row groups + const int n_ids = ids->ne[0]; // n_expert_used + const int n_as = ne02; // n_expert + + const size_t nbw1 = ggml_row_size(PARAM_TYPE, ne10); + const size_t nbw2 = nbw1*ne11; + const size_t nbw3 = nbw2*ne12; + + struct mmid_row_mapping { + int32_t i1; + int32_t i2; + }; + + GGML_ASSERT(params->wsize >= + (GGML_PAD(nbw3, sizeof(int64_t)) + + n_as*(ne12 + 1)*sizeof(mmid_row_mapping)) + ); + + auto * wdata = (char *)params->wdata; + auto * wdata_src1_end = (char *)wdata + GGML_PAD(nbw3, sizeof(int64_t)); +
+ // total of [n_as][ne12 + 1] elements of type mmid_row_mapping (2*int32_t = int64_t) + auto * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as] + struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) (matrix_row_counts + n_as); // [n_as][ne12] +
+ // src1: float32 => param type + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = ith; i11 < ne11; i11 += nth) { + from_float((float *)((char *) src1->data + i12 * nb12 + i11 * nb11), + (void *) (wdata + i12 * nbw2 + i11 * nbw1), + ne10); + } + } +
+#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id) * ne12 + (i1)] +
+ if (ith == 0) { + // initialize matrix_row_counts + memset(matrix_row_counts, 0, n_as * sizeof(int64_t)); + + // group rows by src0 matrix + for (int32_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) { + for (int32_t id = 0; id < n_ids; ++id) { + const int32_t i02 = + *(const int32_t *) ((const char *) ids->data + iid1 * ids->nb[1] + id * ids->nb[0]); + + GGML_ASSERT(i02 >= 0 && i02 < n_as); + + MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = { id, iid1 }; + matrix_row_counts[i02] += 1; + } + } + } +
+ ggml_barrier(params->threadpool); +
+ // compute each matrix multiplication in sequence + for (int cur_a = 0; cur_a < n_as; ++cur_a) { + const int64_t cne1 = matrix_row_counts[cur_a]; + + if (cne1 == 0) { + continue; + } + + const auto * src0_cur = (const char *) src0->data + cur_a*nb02; + + //const int64_t nr0 = ne01; // src0 rows + const int64_t nr1 = cne1; // src1 rows + + int64_t src0_cur_start = (ith * ne01) / nth; + int64_t src0_cur_end = ((ith + 1) * ne01) / nth; +
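// Editorial note (not part of the patch): as in forward_mul_mat above, each
// thread's [start, end) range of src0 rows is rounded up to a multiple of
// NB_COLS, since the weights are stored NB_COLS-rows interleaved and a kernel
// call cannot begin or end inside an interleaved group; a thread whose rounded
// range collapses to empty bails out.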
+ src0_cur_start = (src0_cur_start % NB_COLS) ? src0_cur_start + NB_COLS - (src0_cur_start % NB_COLS) : src0_cur_start; + src0_cur_end = (src0_cur_end % NB_COLS) ? src0_cur_end + NB_COLS - (src0_cur_end % NB_COLS) : src0_cur_end; + + if (src0_cur_start >= src0_cur_end) { + return; + } +
+ for (int ir1 = 0; ir1 < nr1; ir1++) { + struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1); + + const int id = row_mapping.i1; // selected expert index + + const int64_t i11 = id % ne11; + const int64_t i12 = row_mapping.i2; // row index in src1 + + const int64_t i1 = id; // selected expert index + const int64_t i2 = i12; // row + + const auto * src1_col = (const char *) wdata + (i11 * nbw1 + i12 * nbw2); + + gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00, + (float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01, + src0_cur + src0_cur_start * nb01, + src1_col, 1, src0_cur_end - src0_cur_start); + } + } +#undef MMID_MATRIX_ROW + } +
+ int repack(struct ggml_tensor * t, const void * data, size_t data_size) override { + GGML_LOG_DEBUG("%s: repack tensor %s with %s_%dx%d\n", __func__, t->name, ggml_type_name(t->type), + (int) NB_COLS, (int) INTER_SIZE); + return ggml::cpu::repack::repack<BLOC_TYPE, INTER_SIZE, NB_COLS>(t, data, data_size); + } +}; +
+} // namespace ggml::cpu::repack +
+static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(const struct ggml_tensor * cur) { + + // instance for Q4 + static const ggml::cpu::repack::tensor_traits<block_q4_0, 4, 4, GGML_TYPE_Q8_0> q4_0_4x4_q8_0; + static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 4, GGML_TYPE_Q8_0> q4_0_4x8_q8_0; + static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0; + static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K; + + // instance for IQ4 + static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0; +
+ if (cur->type == GGML_TYPE_Q4_0) { + if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) { + if (cur->ne[1] % 8 == 0) { + return &q4_0_8x8_q8_0; + } + } + if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) { + if (cur->ne[1] % 4 == 0) { + return &q4_0_4x8_q8_0; + } + } + if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { + if (cur->ne[1] % 4 == 0) { + return &q4_0_4x4_q8_0; + } + } + } else if (cur->type == GGML_TYPE_Q4_K) { + if (ggml_cpu_has_avx2()) { + if (cur->ne[1] % 8 == 0) { + return &q4_K_8x8_q8_K; + } + } + } else if (cur->type == GGML_TYPE_IQ4_NL) { + if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { + if (cur->ne[1] % 4 == 0) { + return &iq4_nl_4x4_q8_0; + } + } + } + + return nullptr; +} +
+static enum ggml_status ggml_backend_cpu_repack_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { + tensor->extra = (void *) const_cast<ggml::cpu::tensor_traits *>(ggml_repack_get_optimal_repack_type(tensor)); + + GGML_UNUSED(buffer); + return GGML_STATUS_SUCCESS; +} +
+static void ggml_backend_cpu_repack_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, + const void * data, size_t offset, size_t size) { + GGML_ASSERT(offset == 0); + GGML_ASSERT(size == ggml_nbytes(tensor)); + + auto tensor_traits = (ggml::cpu::repack::tensor_traits_base *) tensor->extra; + auto OK = tensor_traits->repack(tensor, data, size); + + GGML_ASSERT(OK == 0); + GGML_UNUSED(buffer); +} +
+static const char * ggml_backend_cpu_repack_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + return "CPU_REPACK"; + + GGML_UNUSED(buft); +} +
+static ggml_backend_buffer_t ggml_backend_cpu_repack_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); + + 
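// Editorial note (not part of the patch): the pattern here is to borrow the
// plain CPU buffer and override only the hooks that matter: init_tensor
// stashes the repack traits in tensor->extra, and set_tensor intercepts the
// weight upload so the data is rewritten into the interleaved layout instead
// of being copied verbatim. get_tensor is deliberately left null below: once
// repacked, the original byte layout can no longer be read back.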
if (buffer == nullptr) { + return nullptr; + } + + buffer->buft = buft; + buffer->iface.init_tensor = ggml_backend_cpu_repack_buffer_init_tensor; + buffer->iface.set_tensor = ggml_backend_cpu_repack_buffer_set_tensor; + buffer->iface.get_tensor = nullptr; + buffer->iface.cpy_tensor = nullptr; + return buffer; +} +
+static size_t ggml_backend_cpu_repack_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + return TENSOR_ALIGNMENT; + + GGML_UNUSED(buft); +} +
+namespace ggml::cpu::repack { +class extra_buffer_type : ggml::cpu::extra_buffer_type { + bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override { + if ( op->op == GGML_OP_MUL_MAT && + op->src[0]->buffer && + (ggml_n_dims(op->src[0]) == 2) && + op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type() && + ggml_repack_get_optimal_repack_type(op->src[0]) + ) { + if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) { + return false; + } + if (op->src[1]->type == GGML_TYPE_F32) { + return true; + } + //if (op->src[1]->type == GGML_TYPE_Q8_0) { + // return true; + //} + // may be possible if Q8_0 packed... + } else if (op->op == GGML_OP_MUL_MAT_ID + && op->src[0]->buffer + && (ggml_n_dims(op->src[0]) == 3) + && op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type() + && ggml_repack_get_optimal_repack_type(op->src[0]) + ) { + if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) { + return false; + } + if (op->src[1]->type == GGML_TYPE_F32) { + return true; + } + //if (op->src[1]->type == GGML_TYPE_Q8_0) { + // return true; + //} + } + return false; + } +
+ ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override { + if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_MUL_MAT_ID) { + if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type()) { + return (ggml::cpu::tensor_traits *) op->src[0]->extra; + } + } + return nullptr; + } +}; +} // namespace ggml::cpu::repack +
+ggml_backend_buffer_type_t ggml_backend_cpu_repack_buffer_type(void) { + static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_repack = { + /* .iface = */ { + /* .get_name = */ ggml_backend_cpu_repack_buffer_type_get_name, + /* .alloc_buffer = */ ggml_backend_cpu_repack_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cpu_repack_buffer_type_get_alignment, + /* .get_max_size = */ nullptr, // defaults to SIZE_MAX + /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes + /* .is_host = */ nullptr, + }, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), + /* .context = */ new ggml::cpu::repack::extra_buffer_type(), + }; + + return &ggml_backend_cpu_buffer_type_repack; +} 
diff --git a/ggml/src/ggml-cpu/repack.h b/ggml/src/ggml-cpu/repack.h new file mode 100644 index 0000000000000..4421e5f8e7046 --- /dev/null +++ b/ggml/src/ggml-cpu/repack.h @@ -0,0 +1,98 @@ +#pragma once + +#define GGML_COMMON_DECL_CPP +#include "ggml-common.h" + +#include "traits.h" +#include "ggml.h" + +// GGML internal header + +ggml_backend_buffer_type_t ggml_backend_cpu_repack_buffer_type(void); +
+template <int K> constexpr int QK_0() { + if constexpr (K == 4) { + return QK4_0; + } + if constexpr (K == 8) { + return QK8_0; + } + return -1; +} +
+template <int K, int N> struct block { + ggml_half d[N]; // deltas for N qK_0 blocks + int8_t qs[(QK_0<K>() * N * K) / 8]; // quants for N qK_0 blocks +}; +
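// Editorial note (not part of the patch): worked size check for block<4, 4>,
// i.e. four interleaved q4_0 blocks: 4 ggml_half deltas = 8 bytes, plus
// (QK_0<4>() * 4 * 4) / 8 = (32 * 4 * 4) / 8 = 64 quant bytes, 72 bytes in
// total, which is what the first static_assert below (4 * sizeof(ggml_half)
// + QK8_0 * 2 = 8 + 64) pins down.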
size/padding"); +static_assert(sizeof(block<4, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<4,8> size/padding"); +static_assert(sizeof(block<8, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<8,4> size/padding"); +static_assert(sizeof(block<8, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong block<8,8> size/padding"); + +using block_q4_0x4 = block<4, 4>; +using block_q4_0x8 = block<4, 8>; +using block_q8_0x4 = block<8, 4>; +using block_q8_0x8 = block<8, 8>; + +struct block_q4_Kx8 { + ggml_half d[8]; // super-block scale for quantized scales + ggml_half dmin[8]; // super-block scale for quantized mins + uint8_t scales[96]; // scales and mins, quantized with 6 bits + uint8_t qs[1024]; // 4--bit quants +}; + +static_assert(sizeof(block_q4_Kx8) == sizeof(ggml_half) * 16 + K_SCALE_SIZE * 8 + QK_K * 4, "wrong q4_K block size/padding"); + +struct block_q8_Kx4 { + float d[4]; // delta + int8_t qs[QK_K * 4]; // quants + int16_t bsums[QK_K / 4]; // sum of quants in groups of 16 +}; + +static_assert(sizeof(block_q8_Kx4) == sizeof(float) * 4 + QK_K * 4 + (QK_K / 4) * sizeof(int16_t), "wrong q8_K block size/padding"); + +struct block_iq4_nlx4 { + ggml_half d[4]; // deltas for 4 iq4_nl blocks + uint8_t qs[QK4_NL * 2]; // nibbles / quants for 4 iq4_nl blocks +}; + +static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding"); + +#if defined(__cplusplus) +extern "C" { +#endif + +void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); +void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); +void ggml_quantize_mat_q8_K_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); +void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); + +// Native implementations +void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); +void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); +void 
+// Native implementations +void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); +void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); +void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); +void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +
+#if defined(__cplusplus) +} // extern "C" +#endif 
diff --git a/ggml/src/ggml-cpu/simd-mappings.h b/ggml/src/ggml-cpu/simd-mappings.h index 2e3669c0186c9..b68ac0dd68b40 100644 --- a/ggml/src/ggml-cpu/simd-mappings.h +++ b/ggml/src/ggml-cpu/simd-mappings.h @@ -2,10 +2,167 @@ #include "ggml-cpu-impl.h" +#ifdef __ARM_FEATURE_SVE +#include <arm_sve.h> +#endif // __ARM_FEATURE_SVE +
+#if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__) +// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example: +// +// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ +// +#include <arm_neon.h> +#endif +
+#if defined(__F16C__) +#include <immintrin.h> +#endif +
+#ifdef __cplusplus +extern "C" { +#endif + // // simd mappings // +// FP16 to FP32 conversion +
+// 16-bit float +// on Arm, we use __fp16 +// on x86, we use uint16_t +// +// for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616 +// for MUSA compilers, we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843 +// +#if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__) + #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) neon_compute_fp16_to_fp32(x) + #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) neon_compute_fp32_to_fp16(x) + + #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x) + + static inline float neon_compute_fp16_to_fp32(ggml_fp16_t h) { + __fp16 tmp; + memcpy(&tmp, &h, sizeof(ggml_fp16_t)); + return (float)tmp; + } + + static inline ggml_fp16_t neon_compute_fp32_to_fp16(float f) { + ggml_fp16_t res; + __fp16 tmp = f; + memcpy(&res, &tmp, sizeof(ggml_fp16_t)); + return res; + } +#elif defined(__F16C__) + #ifdef _MSC_VER + #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x))) + #define 
GGML_CPU_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0) + #else + #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x) + #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0) + #endif +#elif defined(__POWER9_VECTOR__) + #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) power_compute_fp16_to_fp32(x) + #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) power_compute_fp32_to_fp16(x) + /* the inline asm below is about 12% faster than the lookup method */ + #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x) + #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x) + + static inline float power_compute_fp16_to_fp32(ggml_fp16_t h) { + float f; + double d; + __asm__( + "mtfprd %0,%2\n" + "xscvhpdp %0,%0\n" + "frsp %1,%0\n" : + /* temp */ "=d"(d), + /* out */ "=f"(f): + /* in */ "r"(h)); + return f; + } + + static inline ggml_fp16_t power_compute_fp32_to_fp16(float f) { + double d; + ggml_fp16_t r; + __asm__( /* xscvdphp can work on double or single precision */ + "xscvdphp %0,%2\n" + "mffprd %1,%0\n" : + /* temp */ "=d"(d), + /* out */ "=r"(r): + /* in */ "f"(f)); + return r; + } +#elif defined(__riscv) && defined(__riscv_zfhmin) + static inline float riscv_compute_fp16_to_fp32(ggml_fp16_t h) { + float f; + __asm__( + "fmv.h.x %[f], %[h]\n\t" + "fcvt.s.h %[f], %[f]" + : [f] "=&f" (f) + : [h] "r" (h) + ); + return f; + } + + static inline ggml_fp16_t riscv_compute_fp32_to_fp16(float f) { + ggml_fp16_t res; + __asm__( + "fcvt.h.s %[f], %[f]\n\t" + "fmv.x.h %[h], %[f]" + : [h] "=&r" (res) + : [f] "f" (f) + ); + return res; + } + + #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) riscv_compute_fp16_to_fp32(x) + #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) riscv_compute_fp32_to_fp16(x) + #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x) + #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x) +#elif defined(__NNPA__) + #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) nnpa_compute_fp16_to_fp32(x) + #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) nnpa_compute_fp32_to_fp16(x) + + #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x) + #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x) + + static inline float nnpa_compute_fp16_to_fp32(ggml_fp16_t h) { + uint16x8_t v_h = vec_splats(h); + uint16x8_t v_hd = vec_convert_from_fp16(v_h, 0); + return vec_extend_to_fp32_hi(v_hd, 0)[0]; + } + + static inline ggml_fp16_t nnpa_compute_fp32_to_fp16(float f) { + float32x4_t v_f = vec_splats(f); + float32x4_t v_zero = vec_splats(0.0f); + uint16x8_t v_hd = vec_round_from_fp32(v_f, v_zero, 0); + uint16x8_t v_h = vec_convert_to_fp16(v_hd, 0); + return vec_extract(v_h, 0); + } +#endif + +// precomputed f32 table for f16 (256 KB) +// defined in ggml-cpu.c, initialized in ggml_cpu_init() +extern float ggml_table_f32_f16[1 << 16]; + +// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32, +// so we define GGML_CPU_FP16_TO_FP32 and GGML_CPU_FP32_TO_FP16 elsewhere for NEON. +// This is also true for POWER9. 
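// Editorial sketch (not part of the patch): the table covers all 1 << 16 fp16
// bit patterns (256 KB), so the fallback below turns conversion into a single
// indexed load. Its initialization in ggml_cpu_init() is conceptually:
//
//   for (uint32_t i = 0; i < (1u << 16); ++i) {
//       union { uint16_t u; ggml_fp16_t f; } u = { (uint16_t) i };
//       ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.f);
//   }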
+#if !defined(GGML_CPU_FP16_TO_FP32) +inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { + uint16_t s; + memcpy(&s, &f, sizeof(uint16_t)); + return ggml_table_f32_f16[s]; +} + +#define GGML_CPU_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) +#endif + +#if !defined(GGML_CPU_FP32_TO_FP16) +#define GGML_CPU_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) +#endif + + // we define a common set of C macros which map to specific intrinsics based on the current architecture // we then implement the fundamental computation operations below using only these macros // adding support for new architectures requires to define the corresponding SIMD macros @@ -415,7 +572,7 @@ static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) { float tmp[8]; for (int i = 0; i < 8; i++) { - tmp[i] = GGML_FP16_TO_FP32(x[i]); + tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]); } return _mm256_loadu_ps(tmp); @@ -426,7 +583,7 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) { _mm256_storeu_ps(arr, y); for (int i = 0; i < 8; i++) - x[i] = GGML_FP32_TO_FP16(arr[i]); + x[i] = GGML_CPU_FP32_TO_FP16(arr[i]); } #define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x) #define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y) @@ -574,10 +731,10 @@ static inline unsigned char ggml_endian_byte(int i) { inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) { float tmp[4]; - tmp[0] = GGML_FP16_TO_FP32(p[0]); - tmp[1] = GGML_FP16_TO_FP32(p[1]); - tmp[2] = GGML_FP16_TO_FP32(p[2]); - tmp[3] = GGML_FP16_TO_FP32(p[3]); + tmp[0] = GGML_CPU_FP16_TO_FP32(p[0]); + tmp[1] = GGML_CPU_FP16_TO_FP32(p[1]); + tmp[2] = GGML_CPU_FP16_TO_FP32(p[2]); + tmp[3] = GGML_CPU_FP16_TO_FP32(p[3]); return wasm_v128_load(tmp); } @@ -587,10 +744,10 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) { wasm_v128_store(tmp, x); - p[0] = GGML_FP32_TO_FP16(tmp[0]); - p[1] = GGML_FP32_TO_FP16(tmp[1]); - p[2] = GGML_FP32_TO_FP16(tmp[2]); - p[3] = GGML_FP32_TO_FP16(tmp[3]); + p[0] = GGML_CPU_FP32_TO_FP16(tmp[0]); + p[1] = GGML_CPU_FP32_TO_FP16(tmp[1]); + p[2] = GGML_CPU_FP32_TO_FP16(tmp[2]); + p[3] = GGML_CPU_FP32_TO_FP16(tmp[3]); } #define GGML_F16x4 v128_t @@ -690,10 +847,10 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) { static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) { float tmp[4]; - tmp[0] = GGML_FP16_TO_FP32(x[0]); - tmp[1] = GGML_FP16_TO_FP32(x[1]); - tmp[2] = GGML_FP16_TO_FP32(x[2]); - tmp[3] = GGML_FP16_TO_FP32(x[3]); + tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]); + tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]); + tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]); + tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]); return _mm_loadu_ps(tmp); } @@ -703,10 +860,10 @@ static inline void __sse_f16x4_store(ggml_fp16_t * x, __m128 y) { _mm_storeu_ps(arr, y); - x[0] = GGML_FP32_TO_FP16(arr[0]); - x[1] = GGML_FP32_TO_FP16(arr[1]); - x[2] = GGML_FP32_TO_FP16(arr[2]); - x[3] = GGML_FP32_TO_FP16(arr[3]); + x[0] = GGML_CPU_FP32_TO_FP16(arr[0]); + x[1] = GGML_CPU_FP32_TO_FP16(arr[1]); + x[2] = GGML_CPU_FP32_TO_FP16(arr[2]); + x[3] = GGML_CPU_FP32_TO_FP16(arr[3]); } #define GGML_F32Cx4 __m128 @@ -828,7 +985,7 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) { #define GGML_F32x4_ZERO __lsx_vldi(0) #define GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0) #define GGML_F32x4_LOAD(x) __lsx_vld((x), 0) -#define GGML_F32x4_STORE((x),(y)) __lsx_vst((y), (x), 0) +#define GGML_F32x4_STORE(x, y) __lsx_vst(y, x, 0) #define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a) #define GGML_F32x4_ADD __lsx_vfadd_s #define GGML_F32x4_MUL 
__lsx_vfmul_s @@ -874,10 +1031,10 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) { static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) { float tmp[4]; - tmp[0] = GGML_FP16_TO_FP32(x[0]); - tmp[1] = GGML_FP16_TO_FP32(x[1]); - tmp[2] = GGML_FP16_TO_FP32(x[2]); - tmp[3] = GGML_FP16_TO_FP32(x[3]); + tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]); + tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]); + tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]); + tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]); return __lsx_vld(tmp, 0); } @@ -887,10 +1044,10 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) { __lsx_vst(y, arr, 0); - x[0] = GGML_FP32_TO_FP16(arr[0]); - x[1] = GGML_FP32_TO_FP16(arr[1]); - x[2] = GGML_FP32_TO_FP16(arr[2]); - x[3] = GGML_FP32_TO_FP16(arr[3]); + x[0] = GGML_CPU_FP32_TO_FP16(arr[0]); + x[1] = GGML_CPU_FP32_TO_FP16(arr[1]); + x[2] = GGML_CPU_FP32_TO_FP16(arr[2]); + x[3] = GGML_CPU_FP32_TO_FP16(arr[3]); } #define GGML_F32Cx4 __m128 @@ -922,7 +1079,7 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) { #define GGML_F32_STEP 32 #define GGML_F32_EPR 4 -#define GGML_F32x4 __vector float +#define GGML_F32x4 float32x4_t #define GGML_F32x4_ZERO vec_splats(0.0f) #define GGML_F32x4_SET1 vec_splats #define GGML_F32x4_LOAD(p) vec_xl(0, p) @@ -944,10 +1101,8 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) { for (int i = 0; i < offset; ++i) { \ x[i] = vec_add(x[i], x[offset + i]); \ } \ - res = vec_extract(x[0], 0) + \ - vec_extract(x[0], 1) + \ - vec_extract(x[0], 2) + \ - vec_extract(x[0], 3); \ + float32x4_t tmp = x[0] + vec_reve(x[0]); \ + res = tmp[0] + tmp[1]; \ } #define GGML_F32_VEC GGML_F32x4 @@ -964,28 +1119,45 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) { #define GGML_F16_STEP GGML_F32_STEP #define GGML_F16_EPR GGML_F32_EPR -static inline __vector float __lzs_f16cx4_load(const ggml_fp16_t * x) { +static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) { +#if defined(__NNPA__) + uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)x); + uint16x8_t v_xd = vec_convert_from_fp16(v_x, 0); + return vec_extend_to_fp32_hi(v_xd, 0); +#else float tmp[4]; for (int i = 0; i < 4; i++) { - tmp[i] = GGML_FP16_TO_FP32(x[i]); + tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]); } // note: keep type-cast here to prevent compiler bugs // see: https://github.com/ggml-org/llama.cpp/issues/12846 return vec_xl(0, (const float *)(tmp)); +#endif } -static inline void __lzs_f16cx4_store(ggml_fp16_t * x, __vector float y) { +static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) { +#if defined(__NNPA__) + float32x4_t v_zero = vec_splats(0.0f); + uint16x8_t v_xd = vec_round_from_fp32(v_y, v_zero, 0); + uint16x8_t v_x = vec_convert_to_fp16(v_xd, 0); + + x[0] = vec_extract(v_x, 0); + x[1] = vec_extract(v_x, 1); + x[2] = vec_extract(v_x, 2); + x[3] = vec_extract(v_x, 3); +#else float arr[4]; // note: keep type-cast here to prevent compiler bugs // see: https://github.com/ggml-org/llama.cpp/issues/12846 - vec_xst(y, 0, (float *)(arr)); + vec_xst(v_y, 0, (float *)(arr)); for (int i = 0; i < 4; i++) { - x[i] = GGML_FP32_TO_FP16(arr[i]); + x[i] = GGML_CPU_FP32_TO_FP16(arr[i]); } +#endif } #define GGML_F16_VEC GGML_F32x4 @@ -1006,3 +1178,7 @@ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, __vector float y) { #define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR) #define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR) #endif + +#ifdef __cplusplus +} +#endif diff --git a/ggml/src/ggml-cpu/ggml-cpu-traits.cpp b/ggml/src/ggml-cpu/traits.cpp similarity 
index 97% rename from ggml/src/ggml-cpu/ggml-cpu-traits.cpp rename to ggml/src/ggml-cpu/traits.cpp index 62a0712dabbf6..139fa59641440 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +++ b/ggml/src/ggml-cpu/traits.cpp @@ -1,4 +1,4 @@ -#include "ggml-cpu-traits.h" +#include "traits.h" #include "ggml-backend-impl.h" #include "ggml-backend.h" diff --git a/ggml/src/ggml-cpu/ggml-cpu-traits.h b/ggml/src/ggml-cpu/traits.h similarity index 100% rename from ggml/src/ggml-cpu/ggml-cpu-traits.h rename to ggml/src/ggml-cpu/traits.h diff --git a/ggml/src/ggml-cpu/vec.cpp b/ggml/src/ggml-cpu/vec.cpp index f7614568ea388..5e34d79a1695f 100644 --- a/ggml/src/ggml-cpu/vec.cpp +++ b/ggml/src/ggml-cpu/vec.cpp @@ -219,11 +219,11 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G // leftovers for (int i = np; i < n; ++i) { - sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i])); + sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i])); } #else for (int i = 0; i < n; ++i) { - sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i])); + sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i])); } #endif diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h index 09dbade2179fb..84f6c0e6d26c4 100644 --- a/ggml/src/ggml-cpu/vec.h +++ b/ggml/src/ggml-cpu/vec.h @@ -58,7 +58,7 @@ inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; } inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) { for (int i = 0; i < n; ++i) { - z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) + GGML_FP16_TO_FP32(y[i])); + z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) + GGML_CPU_FP16_TO_FP32(y[i])); } } inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; } @@ -67,7 +67,7 @@ inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; } inline static void ggml_vec_sub_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) { for (int i = 0; i < n; ++i) { - z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) - GGML_FP16_TO_FP32(y[i])); + z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) - GGML_CPU_FP16_TO_FP32(y[i])); } } inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; } @@ -75,20 +75,20 @@ inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; } inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(-GGML_FP16_TO_FP32(x[i])); + y[i] = GGML_CPU_FP32_TO_FP16(-GGML_CPU_FP16_TO_FP32(x[i])); } } inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } inline static void ggml_vec_mul_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) { for (int i = 0; i < n; ++i) { - z[i] = 
GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) * GGML_FP16_TO_FP32(y[i])); + z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) * GGML_CPU_FP16_TO_FP32(y[i])); } } inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } inline static void ggml_vec_div_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) { for (int i = 0; i < n; ++i) { - z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) / GGML_FP16_TO_FP32(y[i])); + z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) / GGML_CPU_FP16_TO_FP32(y[i])); } } @@ -131,13 +131,13 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG // leftovers for (int i = np; i < n; ++i) { for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) { - sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i])); + sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i])); } } #else for (int i = 0; i < n; ++i) { for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) { - sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i])); + sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i])); } } #endif @@ -280,12 +280,12 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, // leftovers for (int i = np; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v); + y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v); } #else // scalar for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v); + y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v); } #endif } @@ -430,12 +430,12 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float // leftovers for (int i = np; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v); + y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v); } #else // scalar for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v); + y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v); } #endif } @@ -444,103 +444,103 @@ inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; } inline static void ggml_vec_sqr_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - float v = GGML_FP16_TO_FP32(x[i]); - y[i] = GGML_FP32_TO_FP16(v*v); + float v = GGML_CPU_FP16_TO_FP32(x[i]); + y[i] = GGML_CPU_FP32_TO_FP16(v*v); } } inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); } inline static void ggml_vec_sqrt_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(sqrtf(GGML_FP16_TO_FP32(x[i]))); + y[i] = GGML_CPU_FP32_TO_FP16(sqrtf(GGML_CPU_FP16_TO_FP32(x[i]))); } } inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); } inline static void ggml_vec_log_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(logf(GGML_FP16_TO_FP32(x[i]))); + y[i] = GGML_CPU_FP32_TO_FP16(logf(GGML_CPU_FP16_TO_FP32(x[i]))); } } inline static void 
ggml_vec_sin_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]); } inline static void ggml_vec_sin_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(sinf(GGML_FP16_TO_FP32(x[i]))); + y[i] = GGML_CPU_FP32_TO_FP16(sinf(GGML_CPU_FP16_TO_FP32(x[i]))); } } inline static void ggml_vec_cos_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]); } inline static void ggml_vec_cos_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(cosf(GGML_FP16_TO_FP32(x[i]))); + y[i] = GGML_CPU_FP32_TO_FP16(cosf(GGML_CPU_FP16_TO_FP32(x[i]))); } } inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); } inline static void ggml_vec_abs_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(fabsf(GGML_FP16_TO_FP32(x[i]))); + y[i] = GGML_CPU_FP32_TO_FP16(fabsf(GGML_CPU_FP16_TO_FP32(x[i]))); } } inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); } inline static void ggml_vec_sgn_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - float v = GGML_FP16_TO_FP32(x[i]); - y[i] = GGML_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f)); + float v = GGML_CPU_FP16_TO_FP32(x[i]); + y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f)); } } inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; } inline static void ggml_vec_step_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16((GGML_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f); + y[i] = GGML_CPU_FP32_TO_FP16((GGML_CPU_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f); } } inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); } inline static void ggml_vec_tanh_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(tanhf(GGML_FP16_TO_FP32(x[i]))); + y[i] = GGML_CPU_FP32_TO_FP16(tanhf(GGML_CPU_FP16_TO_FP32(x[i]))); } } inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); } inline static void ggml_vec_elu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(expm1f(GGML_FP16_TO_FP32(x[i]))); + y[i] = GGML_CPU_FP32_TO_FP16(expm1f(GGML_CPU_FP16_TO_FP32(x[i]))); } } inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; } inline static void ggml_vec_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - float v = GGML_FP16_TO_FP32(x[i]); - y[i] = GGML_FP32_TO_FP16((v > 0.f) ? v : 0.f); + float v = GGML_CPU_FP16_TO_FP32(x[i]); + y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : 0.f); } } inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? 
x[i] : 0.f); } inline static void ggml_vec_leaky_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const float ns) { for (int i = 0; i < n; ++i) { - float v = GGML_FP16_TO_FP32(x[i]); - y[i] = GGML_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f)); + float v = GGML_CPU_FP16_TO_FP32(x[i]); + y[i] = GGML_CPU_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f)); } } inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); } inline static void ggml_vec_sigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(1.f / (1.f + expf(-GGML_FP16_TO_FP32(x[i])))); + y[i] = GGML_CPU_FP32_TO_FP16(1.f / (1.f + expf(-GGML_CPU_FP16_TO_FP32(x[i])))); } } // TODO: optimize performance inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); } inline static void ggml_vec_hardswish_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - float v = GGML_FP16_TO_FP32(x[i]); - y[i] = GGML_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f))); + float v = GGML_CPU_FP16_TO_FP32(x[i]); + y[i] = GGML_CPU_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f))); } } inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); } inline static void ggml_vec_hardsigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (GGML_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f))); + y[i] = GGML_CPU_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (GGML_CPU_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f))); } } inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); } inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(expf(GGML_FP16_TO_FP32(x[i]))); + y[i] = GGML_CPU_FP32_TO_FP16(expf(GGML_CPU_FP16_TO_FP32(x[i]))); } } @@ -562,9 +562,9 @@ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp inline static void ggml_vec_gelu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - float xi = GGML_FP16_TO_FP32(x[i]); + float xi = GGML_CPU_FP16_TO_FP32(x[i]); float res = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV)); - y[i] = GGML_FP32_TO_FP16(res); + y[i] = GGML_CPU_FP32_TO_FP16(res); } } @@ -577,9 +577,9 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) { } else if (x[i] >= 10.0f) { y[i] = x[i]; } else { - ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); + ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]); memcpy(&t, &fp16, sizeof(uint16_t)); - y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]); + y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]); } } } @@ -613,9 +613,9 @@ inline static float ggml_gelu_quick_f32(float x) { inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) { uint16_t t; for (int i = 0; i < n; ++i) { - ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); + ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]); memcpy(&t, &fp16, sizeof(uint16_t)); - y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]); + y[i] = 
GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]); } } #else @@ -628,8 +628,8 @@ inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - float v = GGML_FP16_TO_FP32(x[i]); - y[i] = GGML_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v)))); + float v = GGML_CPU_FP16_TO_FP32(x[i]); + y[i] = GGML_CPU_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v)))); } } @@ -638,8 +638,8 @@ inline static float ggml_silu_f32(float x) { return x/(1.0f + expf(-x)); } inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) { - float v = GGML_FP16_TO_FP32(x); - return GGML_FP32_TO_FP16(v/(1.0f + expf(-v))); + float v = GGML_CPU_FP16_TO_FP32(x); + return GGML_CPU_FP32_TO_FP16(v/(1.0f + expf(-v))); } #if __FINITE_MATH_ONLY__ @@ -888,9 +888,9 @@ inline static float ggml_silu_backward_f32(float x, float dy) { } inline static ggml_fp16_t ggml_silu_backward_f16(ggml_fp16_t x, ggml_fp16_t dy) { - const float v = GGML_FP16_TO_FP32(x); + const float v = GGML_CPU_FP16_TO_FP32(x); const float s = 1.0f/(1.0f + expf(-v)); - return GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s))); + return GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s))); } inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) { @@ -928,7 +928,7 @@ inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) { float sum = 0.0f; for (int i = 0; i < n; ++i) { - sum += GGML_FP16_TO_FP32(x[i]); + sum += GGML_CPU_FP16_TO_FP32(x[i]); } *s = sum; } diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index e1ce1d4cd1558..ea20355023825 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -19,10 +19,10 @@ #endif #include "ggml-common.h" -#include #include #include #include +#include #include #include @@ -76,11 +76,9 @@ #define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1) // Moore Threads -#define GGML_CUDA_MUSA_ARCH_IS_QY1 (__MUSA_ARCH__ <= 210) - -#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000 -#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000 -#define GGML_CUDA_CC_NG (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD +#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000 +#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000 +#define GGML_CUDA_CC_NG (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD #define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD) #define GGML_CUDA_CC_IS_QY1(cc) (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2) @@ -203,13 +201,13 @@ typedef float2 dfloat2; #define FAST_FP16_AVAILABLE #endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610 -#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA +#if (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA) #define FP16_MMA_AVAILABLE -#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA +#endif // (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA) -#if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || 
defined(RDNA4)) +#if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4))) #define FP16_MMA_AVAILABLE -#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || defined(RDNA4)) +#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4))) #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING #define NEW_MMA_AVAILABLE @@ -219,9 +217,9 @@ typedef float2 dfloat2; #define CP_ASYNC_AVAILABLE #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE -#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && GGML_CUDA_MUSA_ARCH_IS_QY1) +#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220) #define FLASH_ATTN_AVAILABLE -#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && GGML_CUDA_MUSA_ARCH_IS_QY1) +#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220) static bool fp16_available(const int cc) { return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL; @@ -233,7 +231,8 @@ static bool fast_fp16_available(const int cc) { // To be used for feature selection of external libraries, e.g. cuBLAS. static bool fast_fp16_hardware_available(const int cc) { - return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) || GGML_CUDA_CC_IS_AMD(cc); + return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) || GGML_CUDA_CC_IS_AMD(cc) || + (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2); } // Any FP16 tensor core instructions are available for ggml code. @@ -241,15 +240,35 @@ static bool fp16_mma_available(const int cc) { #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN) return false; #else - return (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) || - GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc); + if ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) || + GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || + GGML_CUDA_CC_IS_MTHREADS(cc)) { + return true; + } else if (GGML_CUDA_CC_IS_RDNA4(cc)) { +#if defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_HIP_ROCWMMA_FATTN_GFX12) + return true; +#else + return false; +#endif // defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_HIP_ROCWMMA_FATTN_GFX12) + } else { + return false; + } #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN) } // To be used for feature selection of external libraries, e.g. cuBLAS. static bool fp16_mma_hardware_available(const int cc) { return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) || - GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc); + GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc) || + (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2); +} + +static bool bf16_mma_hardware_available(const int cc) { + return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE) || GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3; +} + +static bool fp32_mma_hardware_available(const int cc) { + return GGML_CUDA_CC_IS_CDNA(cc); } // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later. 
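Note on the capability helpers above: the patch keeps the split between "the compiled code can use the instructions" (fp16_mma_available) and "the physical GPU has the units" (fp16_mma_hardware_available), and now also admits Moore Threads QY2-and-newer parts to the fast-FP16 and FP16-MMA hardware checks. A hedged, self-contained sketch of how such layered checks are typically consumed by a dispatch site — pick_gemm_path() and the numeric cc thresholds below are illustrative stand-ins, not code from this patch:

```cpp
#include <cstdio>

// Stand-ins for the real GGML_CUDA_CC_* constants and helpers (cc is encoded
// as major*100 + minor*10, so 610 == compute capability 6.1).
static bool fast_fp16(int cc) { return cc >= 600 && cc != 610; } // Pascal+, minus 6.1's slow FP16
static bool fp16_mma(int cc)  { return cc >= 700; }              // Volta+ tensor cores

static const char * pick_gemm_path(int cc) {
    if (fp16_mma(cc))  return "FP16 tensor-core GEMM";
    if (fast_fp16(cc)) return "FP16 cuBLAS GEMM";
    return "FP32 fallback";
}

int main() {
    const int ccs[] = {520, 610, 700, 890};
    for (int cc : ccs) {
        std::printf("cc %d -> %s\n", cc, pick_gemm_path(cc));
    }
    return 0;
}
```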
@@ -262,11 +281,11 @@ static bool cp_async_available(const int cc) { } static constexpr __device__ int ggml_cuda_get_physical_warp_size() { -#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) - return __AMDGCN_WAVEFRONT_SIZE; +#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__)) + return 64; #else return 32; -#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) +#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__)) } [[noreturn]] @@ -362,6 +381,26 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) { #endif // FP16_AVAILABLE } +// Row reduction kernel template - compute sum (norm=false) or mean (norm=true) +template <bool norm> +static __global__ void reduce_rows_f32(const float * x, float * dst, const int ncols) { + const int row = blockIdx.x; + const int col = threadIdx.x; + + float sum = 0.0f; + for (int i = col; i < ncols; i += blockDim.x) { + sum += x[row * ncols + i]; + } + + sum = warp_reduce_sum(sum); + + if (col != 0) { + return; + } + + dst[row] = norm ? sum / ncols : sum; +} + template <int width = WARP_SIZE> static __device__ __forceinline__ float warp_reduce_max(float x) { #pragma unroll @@ -466,9 +505,6 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) } -// TODO: move to ggml-common.h -static constexpr __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; - typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v); static __device__ __forceinline__ float get_alibi_slope( @@ -770,21 +806,7 @@ struct ggml_backend_cuda_context { name(GGML_CUDA_NAME + std::to_string(device)) { } - ~ggml_backend_cuda_context() { - if (copy_event != nullptr) { - CUDA_CHECK(cudaEventDestroy(copy_event)); - } - for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) { - for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) { - if (streams[i][j] != nullptr) { - CUDA_CHECK(cudaStreamDestroy(streams[i][j])); - } - } - if (cublas_handles[i] != nullptr) { - CUBLAS_CHECK(cublasDestroy(cublas_handles[i])); - } - } - } + ~ggml_backend_cuda_context(); cudaStream_t stream(int device, int stream) { if (streams[device][stream] == nullptr) { diff --git a/ggml/src/ggml-cuda/conv2d-dw.cu b/ggml/src/ggml-cuda/conv2d-dw.cu new file mode 100644 index 0000000000000..7583233b1b7cd --- /dev/null +++ b/ggml/src/ggml-cuda/conv2d-dw.cu @@ -0,0 +1,161 @@ +#include "conv2d-dw.cuh" + +struct conv_params { + int in_w, in_h; + int out_w, out_h; + int kernel_w, kernel_h; + int stride_x, stride_y; + int padding_x, padding_y; + int dilation_x, dilation_y; + int channels, batches; +}; + +struct kernel_bounds { + int y_min, y_max; + int x_min, x_max; +}; + +__device__ __forceinline__ kernel_bounds calculate_kernel_bounds(int out_x, int out_y, const conv_params & params) { + kernel_bounds bounds; + bounds.y_min = max(0, (params.padding_y - out_y * params.stride_y + params.dilation_y - 1) / params.dilation_y); + bounds.y_max = + min(params.kernel_h, + (params.in_h + params.padding_y - out_y * params.stride_y + params.dilation_y - 1) / params.dilation_y); + bounds.x_min = max(0, (params.padding_x - out_x * params.stride_x + params.dilation_x - 1) / params.dilation_x); + bounds.x_max = + min(params.kernel_w, + (params.in_w + params.padding_x - out_x * params.stride_x + params.dilation_x - 1) / params.dilation_x); + return bounds; +} + +__device__
__forceinline__ int calculate_input_coord(int out_coord, int kern_coord, int stride, int dilation, int padding) { + return out_coord * stride + kern_coord * dilation - padding; +} + +struct whcn_layout { + __device__ static int input_index(int n, int c, int y, int x, const conv_params & params) { + return n * (params.channels * params.in_w * params.in_h) + c * params.in_w * params.in_h + y * params.in_w + x; + } + + __device__ static int kernel_index(int c, int ky, int kx, const conv_params & params) { + return c * params.kernel_h * params.kernel_w + ky * params.kernel_w + kx; + } + + __device__ static int output_index(int n, int c, int y, int x, const conv_params & params) { + return n * (params.channels * params.out_w * params.out_h) + c * params.out_w * params.out_h + + y * params.out_w + x; + } + + __device__ static void unpack_indices(int global_idx, const conv_params & params, int & n, int & c, int & out_y, + int & out_x) { + out_x = global_idx % params.out_w; + out_y = (global_idx / params.out_w) % params.out_h; + c = (global_idx / (params.out_w * params.out_h)) % params.channels; + n = global_idx / (params.out_w * params.out_h * params.channels); + } +}; + +struct cwhn_layout { + __device__ static int input_index(int n, int c, int y, int x, const conv_params & params) { + return n * (params.channels * params.in_w * params.in_h) + (y * params.in_w + x) * params.channels + c; + } + + __device__ static int kernel_index(int c, int ky, int kx, const conv_params & params) { + return (ky * params.kernel_w + kx) * params.channels + c; + } + + __device__ static int output_index(int n, int c, int y, int x, const conv_params & params) { + return n * (params.channels * params.out_w * params.out_h) + y * (params.out_w * params.channels) + + x * params.channels + c; + } + + __device__ static void unpack_indices(int global_idx, const conv_params & params, int & n, int & c, int & out_y, + int & out_x) { + c = global_idx % params.channels; + out_x = (global_idx / params.channels) % params.out_w; + out_y = (global_idx / (params.channels * params.out_w)) % params.out_h; + n = global_idx / (params.channels * params.out_w * params.out_h); + } +}; + +template <typename T, typename Layout> +__global__ void conv2d_dw_kernel(const T * __restrict__ input, const T * __restrict__ kernel, T * __restrict__ output, + const int in_w, const int in_h, const int out_w, const int out_h, + const int kernel_w, const int kernel_h, const int stride_x, const int stride_y, + const int padding_x, const int padding_y, const int dilation_x, const int dilation_y, + const int channels, const int batches) { + const int global_idx = blockIdx.x * blockDim.x + threadIdx.x; + const int total_elements = batches * channels * out_h * out_w; + + if (global_idx >= total_elements) { + return; + } + + conv_params params = { in_w, in_h, out_w, out_h, kernel_w, kernel_h, stride_x, + stride_y, padding_x, padding_y, dilation_x, dilation_y, channels, batches }; + + int batch_idx, channel_idx, out_y_idx, out_x_idx; + Layout::unpack_indices(global_idx, params, batch_idx, channel_idx, out_y_idx, out_x_idx); + + T accumulator = 0; + kernel_bounds bounds = calculate_kernel_bounds(out_x_idx, out_y_idx, params); + + for (int kern_y = bounds.y_min; kern_y < bounds.y_max; ++kern_y) { + int in_y_idx = calculate_input_coord(out_y_idx, kern_y, params.stride_y, params.dilation_y, params.padding_y); + + for (int kern_x = bounds.x_min; kern_x < bounds.x_max; ++kern_x) { + int in_x_idx = calculate_input_coord(out_x_idx, kern_x, params.stride_x, params.dilation_x, params.padding_x); + 
const T input_val = input[Layout::input_index(batch_idx, channel_idx, in_y_idx, in_x_idx, params)]; + const T kernel_val = kernel[Layout::kernel_index(channel_idx, kern_y, kern_x, params)]; + + accumulator += input_val * kernel_val; + } + } + + output[Layout::output_index(batch_idx, channel_idx, out_y_idx, out_x_idx, params)] = accumulator; +} + +void ggml_cuda_op_conv2d_dw(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * kernel = dst->src[0]; + const ggml_tensor * input = dst->src[1]; + + GGML_ASSERT(kernel->type == GGML_TYPE_F32 && input->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); + const float * w_d = (const float *) kernel->data; + const float * x_d = (const float *) input->data; + float * y_d = (float *) dst->data; + + const int32_t * p = (const int32_t *) dst->op_params; + const int stride_x = p[0]; + const int stride_y = p[1]; + const int padding_x = p[2]; + const int padding_y = p[3]; + const int dilation_x = p[4]; + const int dilation_y = p[5]; + + const int in_w = input->ne[0]; + const int in_h = input->ne[1]; + const int kernel_w = kernel->ne[0]; + const int kernel_h = kernel->ne[1]; + const int out_w = dst->ne[0]; + const int out_h = dst->ne[1]; + const int channels = dst->ne[2]; + const int batches = dst->ne[3]; + + cudaStream_t st = ctx.stream(); + + const int total = batches * channels * out_h * out_w; + const int blocks = (total + CUDA_CONV2D_DW_BLOCK_SIZE - 1) / CUDA_CONV2D_DW_BLOCK_SIZE; + + if (ggml_is_contiguous(input)) { + conv2d_dw_kernel<float, whcn_layout><<<blocks, CUDA_CONV2D_DW_BLOCK_SIZE, 0, st>>>( + x_d, w_d, y_d, in_w, in_h, out_w, out_h, kernel_w, kernel_h, stride_x, stride_y, padding_x, padding_y, + dilation_x, dilation_y, channels, batches); + } else if (ggml_is_contiguous_channels(input)) { + conv2d_dw_kernel<float, cwhn_layout><<<blocks, CUDA_CONV2D_DW_BLOCK_SIZE, 0, st>>>( + x_d, w_d, y_d, in_w, in_h, out_w, out_h, kernel_w, kernel_h, stride_x, stride_y, padding_x, padding_y, + dilation_x, dilation_y, channels, batches); + } else { + GGML_ABORT("Unsupported memory layout for conv_2d_dw"); + } +} diff --git a/ggml/src/ggml-cuda/conv2d-dw.cuh b/ggml/src/ggml-cuda/conv2d-dw.cuh new file mode 100644 index 0000000000000..b5d5a69d345cf --- /dev/null +++ b/ggml/src/ggml-cuda/conv2d-dw.cuh @@ -0,0 +1,5 @@ +#pragma once +#include "common.cuh" + +#define CUDA_CONV2D_DW_BLOCK_SIZE 256 +void ggml_cuda_op_conv2d_dw(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-cuda/conv2d-transpose.cu b/ggml/src/ggml-cuda/conv2d-transpose.cu new file mode 100644 index 0000000000000..03224e404d32d --- /dev/null +++ b/ggml/src/ggml-cuda/conv2d-transpose.cu @@ -0,0 +1,91 @@ +#include + +#include "conv2d-transpose.cuh" +#include "ggml.h" + +__global__ void conv2d_transpose_kernel(const float * __restrict__ input, const half * __restrict__ kernel, + float * __restrict__ output, const int in_w, const int in_h, const int out_w, + const int out_h, const int kernel_w, const int kernel_h, const int stride, + const int c_in, const int c_out, const int batches) { + const int global_idx = blockIdx.x * blockDim.x + threadIdx.x; + + const int total_elements = out_w * out_h * c_out * batches; + + if (global_idx >= total_elements) { + return; + } + + const int out_x_idx = global_idx % out_w; + const int out_y_idx = (global_idx / out_w) % out_h; + const int c_idx = (global_idx / (out_w * out_h)) % c_out; + const int n_idx = global_idx / (out_w * out_h * c_out); + + float accumulator = 0; + // For each output idx, find the inputs that contribute to it by checking stride alignment and bounds + + for (int c_in_idx = 0; c_in_idx < c_in; c_in_idx++) { + for
(int kh = 0; kh < kernel_h; ++kh) { + int in_y = out_y_idx - kh; + if (in_y < 0 || in_y % stride) continue; + in_y /= stride; + if (in_y >= in_h) continue; + + for (int kw = 0; kw < kernel_w; ++kw) { + int in_x = out_x_idx - kw; + if (in_x < 0 || in_x % stride) continue; + in_x /= stride; + if (in_x >= in_w) continue; + + const int input_idx = (in_w * in_h * c_in) * n_idx + (in_w * in_h) * c_in_idx + (in_w) *in_y + in_x; + const int kernel_idx = + (kernel_h * kernel_w * c_out) * c_in_idx + (kernel_h * kernel_w) * c_idx + (kernel_w) *kh + kw; + + float input_val = input[input_idx]; + half kern_val = kernel[kernel_idx]; + + accumulator += input_val * (float) kern_val; + } + } + } + + output[(out_w * out_h * c_out) * n_idx + (out_w * out_h) * c_idx + (out_w) *out_y_idx + out_x_idx] = accumulator; +} + +//input is (W, H, C_in, N), Kernel is (W, H, C_out, C_in) +void ggml_cuda_conv_2d_transpose_p0(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * kernel = dst->src[0]; + const ggml_tensor * input = dst->src[1]; + + GGML_ASSERT(kernel->type == GGML_TYPE_F16 && input->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); + + const float * input_data = (const float *) input->data; + float * output_data = (float *) dst->data; + const half * kernel_data = (const half *) kernel->data; + + const int input_w = input->ne[0]; + const int input_h = input->ne[1]; + const int output_w = dst->ne[0]; + const int output_h = dst->ne[1]; + const int channels_in = input->ne[2]; + const int channels_out = kernel->ne[2]; + const int kernel_w = kernel->ne[0]; + const int kernel_h = kernel->ne[1]; + const int stride = dst->op_params[0]; + const int batches = input->ne[3]; + + GGML_ASSERT(channels_in == kernel->ne[3]); + GGML_ASSERT(stride > 0); + + cudaStream_t st = ctx.stream(); + + GGML_ASSERT(ggml_is_contiguous(input)); + GGML_ASSERT(ggml_is_contiguous(kernel)); + GGML_ASSERT(ggml_is_contiguous(dst)); + + const int total = (output_w * output_h * channels_out * batches); + const int blocks = (total + CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE - 1) / CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE; + + conv2d_transpose_kernel<<<blocks, CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE, 0, st>>>( + input_data, kernel_data, output_data, input_w, input_h, output_w, output_h, kernel_w, kernel_h, stride, + channels_in, channels_out, batches); } diff --git a/ggml/src/ggml-cuda/conv2d-transpose.cuh b/ggml/src/ggml-cuda/conv2d-transpose.cuh new file mode 100644 index 0000000000000..c9430b2485021 --- /dev/null +++ b/ggml/src/ggml-cuda/conv2d-transpose.cuh @@ -0,0 +1,4 @@ +#include "common.cuh" + +#define CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE 256 +void ggml_cuda_conv_2d_transpose_p0(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh index 925f39e890db9..e230f6d494d77 100644 --- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh @@ -652,9 +652,12 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( float KQ_max_scale[cols_per_thread]; #pragma unroll for (int col = 0; col < cols_per_thread; ++col) { - KQ_max_scale[col] = expf(KQ_max[col] - KQ_max_new[col]); + const float KQ_max_diff = KQ_max[col] - KQ_max_new[col]; + KQ_max_scale[col] = expf(KQ_max_diff); KQ_max[col] = KQ_max_new[col]; + *((uint32_t *) &KQ_max_scale[col]) *= KQ_max_diff >= SOFTMAX_FTZ_THRESHOLD; + // Scale previous KQ_rowsum to account for a potential increase in KQ_max: KQ_rowsum[col] = KQ_max_scale[col]*KQ_rowsum[col] + KQ_rowsum_add[col]; } diff --git a/ggml/src/ggml-cuda/fattn-wmma-f16.cu 
b/ggml/src/ggml-cuda/fattn-wmma-f16.cu index c5668adb152b2..f3b794c3644c8 100644 --- a/ggml/src/ggml-cuda/fattn-wmma-f16.cu +++ b/ggml/src/ggml-cuda/fattn-wmma-f16.cu @@ -9,7 +9,11 @@ #ifdef FP16_MMA_AVAILABLE #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) #include +#ifdef GGML_USE_MUSA +namespace wmma = mtmusa::wmma; +#else // GGML_USE_MUSA namespace wmma = nvcuda::wmma; +#endif // GGML_USE_MUSA #elif defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE) #undef HIP_ENABLE_WARP_SYNC_BUILTINS // conflicts with rocWMMA headers #include diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 2a6f7f108b3f8..b30c13c62f25c 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -11,6 +11,8 @@ #include "ggml-cuda/clamp.cuh" #include "ggml-cuda/concat.cuh" #include "ggml-cuda/conv-transpose-1d.cuh" +#include "ggml-cuda/conv2d-dw.cuh" +#include "ggml-cuda/conv2d-transpose.cuh" #include "ggml-cuda/convert.cuh" #include "ggml-cuda/count-equal.cuh" #include "ggml-cuda/cpy.cuh" @@ -35,6 +37,7 @@ #include "ggml-cuda/ssm-scan.cuh" #include "ggml-cuda/sum.cuh" #include "ggml-cuda/sumrows.cuh" +#include "ggml-cuda/mean.cuh" #include "ggml-cuda/tsembd.cuh" #include "ggml-cuda/unary.cuh" #include "ggml-cuda/upscale.cuh" @@ -47,6 +50,7 @@ #include #include #include +#include #include #include #include @@ -54,9 +58,8 @@ #include #include #include -#include -#include #include +#include #include #include #include @@ -97,8 +100,7 @@ int ggml_cuda_get_device() { static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) { ggml_cuda_set_device(device); cudaError_t err; - if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr) - { + if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr) { err = cudaMallocManaged(ptr, size); #if defined(GGML_USE_HIP) if (err == hipSuccess) { @@ -116,9 +118,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) err = cudaMalloc(ptr, size); } #endif // defined(GGML_USE_HIP) - } - else - { + } else { err = cudaMalloc(ptr, size); } return err; @@ -514,6 +514,33 @@ std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(i return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device)); } +// destroying a cuBLAS handle while a graph is being captured in a different thread can result in a CUDA error +// this lock is used to ensure that no cuBLAS handle is destroyed while a graph is being captured + +static std::mutex ggml_cuda_lock; +static std::condition_variable ggml_cuda_lock_cv; +static std::atomic<int> ggml_cuda_lock_counter; + +ggml_backend_cuda_context::~ggml_backend_cuda_context() { + std::unique_lock<std::mutex> lock(ggml_cuda_lock); + ggml_cuda_lock_cv.wait(lock, []{ return ggml_cuda_lock_counter.load(std::memory_order_relaxed) == 0; }); + + if (copy_event != nullptr) { + CUDA_CHECK(cudaEventDestroy(copy_event)); + } + for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) { + for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) { + if (streams[i][j] != nullptr) { + CUDA_CHECK(cudaStreamDestroy(streams[i][j])); + } + } + if (cublas_handles[i] != nullptr) { + CUBLAS_CHECK(cublasDestroy(cublas_handles[i])); + } + } +} + + // cuda buffer struct ggml_backend_cuda_buffer_context { @@ -615,9 +642,8 @@ static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_cuda_set_device(ctx->device); - CUDA_CHECK(cudaDeviceSynchronize()); - 
CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size)); - CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(cudaMemsetAsync(ctx->dev_ptr, value, buffer->size, cudaStreamPerThread)); + CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); } static const ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = { @@ -1144,7 +1170,6 @@ typedef void (*ggml_cuda_op_mul_mat_t)( static cudaError_t ggml_cuda_cpy_tensor_2d( void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) { - GGML_ASSERT(ggml_backend_buffer_is_cuda(src->buffer)); const char * src_ptr = (const char *) src->data; char * dst_ptr = (char *) dst; @@ -1202,9 +1227,12 @@ static void ggml_cuda_op_mul_mat_cublas( const int cc = ggml_cuda_info().devices[id].cc; + const bool supports_bf16 = GGML_CUDA_CC_IS_NVIDIA(cc) || GGML_CUDA_CC_IS_AMD(cc) || + (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2); + const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT; - if (src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) { + if (supports_bf16 && src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) { ggml_cuda_pool_alloc src1_as_bf16(ctx.pool(id)); if (src1->type != GGML_TYPE_BF16) { const to_bf16_cuda_t to_bf16_cuda = ggml_get_to_bf16_cuda(src1->type); @@ -1232,7 +1260,7 @@ static void ggml_cuda_op_mul_mat_cublas( const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_BF16); to_fp32_cuda(dst_bf16.get(), dst_dd_i, row_diff*src1_ncols, stream); - } else if (((GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) || GGML_CUDA_CC_IS_AMD(cc)) && use_fp16) { + } else if (fast_fp16_hardware_available(cc) && use_fp16) { // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32 ggml_cuda_pool_alloc src0_as_f16(ctx.pool(id)); if (src0->type != GGML_TYPE_F16) { @@ -1427,8 +1455,6 @@ static void ggml_cuda_op_mul_mat( const int64_t nb2 = dst->nb[2]; const int64_t nb3 = dst->nb[3]; - GGML_ASSERT(ggml_backend_buffer_is_cuda(dst->buffer)); - GGML_ASSERT(ggml_backend_buffer_is_cuda(src1->buffer)); ggml_backend_cuda_buffer_context * src1_ctx = (ggml_backend_cuda_buffer_context *) src1->buffer->context; ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *) dst->buffer->context; @@ -1750,7 +1776,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src1)); - GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer)); + GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft)); GGML_ASSERT(src0->type == GGML_TYPE_F16); // Byte offsets and tensor dimensions are currently used in an inconsistent way for dst. 
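Aside on the ggml_backend_cuda_buffer_clear hunk above: it swaps two device-wide cudaDeviceSynchronize() calls around a blocking cudaMemset for an asynchronous memset on the per-thread default stream, then waits on that stream only. A self-contained sketch of the same pattern — buffer size and variable names here are illustrative, not from the patch:

```cpp
#include <cuda_runtime.h>
#include <cstdio>

int main() {
    void * dev_ptr = nullptr;
    const size_t size = 1 << 20;
    if (cudaMalloc(&dev_ptr, size) != cudaSuccess) {
        std::puts("no CUDA device available");
        return 0;
    }
    // Enqueue the clear on the per-thread default stream...
    cudaMemsetAsync(dev_ptr, 0, size, cudaStreamPerThread);
    // ...then drain only this stream, instead of synchronizing the whole device twice.
    cudaStreamSynchronize(cudaStreamPerThread);
    cudaFree(dev_ptr);
    return 0;
}
```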
@@ -1920,16 +1946,14 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor && ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) && src0->view_src; bool use_mul_mat_vec = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16) - && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 - && src0->ne[0] % 2 == 0 && src1->ne[1] == 1; + && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32; bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE; bool use_mul_mat_q = ggml_is_quantized(src0->type) && !bad_padding_clear && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32; - bool any_gpus_with_slow_fp16 = false; - bool any_gpus_without_fp16_mma = false; + bool any_gpus_with_slow_fp16 = false; if (split) { ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context; @@ -1940,16 +1964,16 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor continue; } - const int cc = ggml_cuda_info().devices[id].cc; - use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]); - any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc); - any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_hardware_available(cc); + const int cc = ggml_cuda_info().devices[id].cc; + use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]); + use_mul_mat_vec = use_mul_mat_vec && ggml_cuda_should_use_mmv(src0->type, cc, src0->ne, src1->ne[1]); + any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc); } } else { - const int cc = ggml_cuda_info().devices[ctx.device].cc; - use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]); - any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc); - any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_hardware_available(cc); + const int cc = ggml_cuda_info().devices[ctx.device].cc; + use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]); + use_mul_mat_vec = use_mul_mat_vec && ggml_cuda_should_use_mmv(src0->type, cc, src0->ne, src1->ne[1]); + any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc); } // debug helpers @@ -1960,7 +1984,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name); //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name); - if (!split && use_mul_mat_vec && (src0->ne[1] <= MMV_MAX_ROWS || any_gpus_without_fp16_mma)) { + if (!split && use_mul_mat_vec) { // the custom F16 vector kernel can be used over batched cuBLAS GEMM // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention) ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst); @@ -2314,6 +2338,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_IM2COL: ggml_cuda_op_im2col(ctx, dst); break; + case GGML_OP_CONV_2D_DW: + 
ggml_cuda_op_conv2d_dw(ctx, dst); + break; + case GGML_OP_CONV_TRANSPOSE_2D: + ggml_cuda_conv_2d_transpose_p0(ctx, dst); + break; case GGML_OP_CONV_TRANSPOSE_1D: ggml_cuda_op_conv_transpose_1d(ctx,dst); break; @@ -2326,6 +2356,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_SUM_ROWS: ggml_cuda_op_sum_rows(ctx, dst); break; + case GGML_OP_MEAN: + ggml_cuda_op_mean(ctx, dst); + break; case GGML_OP_SSM_CONV: ggml_cuda_op_ssm_conv(ctx, dst); break; @@ -2668,7 +2701,9 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft) || (integrated && ggml_backend_buft_is_cuda_host(node->src[j]->buffer->buft))); } } -#endif +#else + GGML_UNUSED(integrated); +#endif // NDEBUG bool ok = ggml_cuda_compute_forward(*cuda_ctx, node); if (!ok) { @@ -2687,6 +2722,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_ctx->cuda_graph->graph)); graph_evaluated_or_captured = true; // CUDA graph has been captured + + std::lock_guard lock(ggml_cuda_lock); + if (ggml_cuda_lock_counter.fetch_sub(1, std::memory_order_relaxed) == 1) { + ggml_cuda_lock_cv.notify_all(); + } } else { graph_evaluated_or_captured = true; // ggml graph has been directly evaluated } @@ -2762,7 +2802,13 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, } } - if (use_cuda_graph && cuda_graph_update_required) { // Start CUDA graph capture + if (use_cuda_graph && cuda_graph_update_required) { + // Start CUDA graph capture + { + std::lock_guard lock(ggml_cuda_lock); + ggml_cuda_lock_counter.fetch_add(1, std::memory_order_relaxed); + } + CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed)); } @@ -3018,9 +3064,16 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g return false; } #ifdef GGML_USE_MUSA - if (b->type == GGML_TYPE_F16 && b->ne[2]*b->ne[3] > 1 && - !ggml_is_transposed(a) && !ggml_is_transposed(b)) { - return false; + const int cc = ggml_cuda_info().devices[dev_ctx->device].cc; + if (b->ne[2]*b->ne[3] > 1 && !ggml_is_transposed(a) && !ggml_is_transposed(b)) { + if (GGML_CUDA_CC_IS_QY1(cc) && op->op == GGML_OP_MUL_MAT && + a->type == GGML_TYPE_F16 && b->type == GGML_TYPE_F16) { + return false; + } + if (GGML_CUDA_CC_IS_QY2(cc) && op->op == GGML_OP_MUL_MAT_ID && + a->type == GGML_TYPE_Q2_K && b->type == GGML_TYPE_F32) { + return false; + } } #endif // GGML_USE_MUSA switch (a->type) { @@ -3047,11 +3100,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_TYPE_IQ4_NL: case GGML_TYPE_IQ4_XS: case GGML_TYPE_BF16: -#ifdef GGML_USE_MUSA - if (a->type == GGML_TYPE_Q3_K) { - return false; - } -#endif // GGML_USE_MUSA return true; default: return false; @@ -3211,9 +3259,12 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g return op->src[0]->nb[0] == ggml_type_size(op->src[0]->type) && ggml_is_contiguous_2(op->src[0]); } case GGML_OP_IM2COL: + case GGML_OP_CONV_2D_DW: + case GGML_OP_CONV_TRANSPOSE_2D: case GGML_OP_POOL_2D: case GGML_OP_SUM: case GGML_OP_SUM_ROWS: + case GGML_OP_MEAN: case GGML_OP_ARGSORT: case GGML_OP_ACC: return true; diff --git a/ggml/src/ggml-cuda/mean.cu b/ggml/src/ggml-cuda/mean.cu new file mode 100644 index 0000000000000..4b238a3998ba3 --- /dev/null +++ b/ggml/src/ggml-cuda/mean.cu @@ -0,0 +1,19 @@ +#include "mean.cuh" + +void 
ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *) src0->data; + float * dst_d = (float *) dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_is_contiguous(src0)); + + const int64_t ncols = src0->ne[0]; + const int64_t nrows = ggml_nrows(src0); + + const dim3 block_dims(WARP_SIZE, 1, 1); + const dim3 block_nums(nrows, 1, 1); + reduce_rows_f32<true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols); +} diff --git a/ggml/src/ggml-cuda/mean.cuh b/ggml/src/ggml-cuda/mean.cuh new file mode 100644 index 0000000000000..2b9b10433438e --- /dev/null +++ b/ggml/src/ggml-cuda/mean.cuh @@ -0,0 +1,3 @@ +#include "common.cuh" + +void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-cuda/mmv.cu b/ggml/src/ggml-cuda/mmv.cu index d8c385e2399ae..e14c93516bddf 100644 --- a/ggml/src/ggml-cuda/mmv.cu +++ b/ggml/src/ggml-cuda/mmv.cu @@ -2,25 +2,26 @@ #include "common.cuh" #include "mmv.cuh" -template <typename T, typename type_acc, int block_size> +template <typename T, typename type_acc, int ncols_dst, int block_size> static __global__ void mul_mat_vec( const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, float * __restrict__ dst, - const int64_t ncols2, const int64_t nchannels_y, const int64_t stride_row, - const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, - const int64_t sample_ratio, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst) { - const int64_t row = blockIdx.x; - const int64_t channel_dst = blockIdx.y; - const int64_t channel_x = ids ? ids[channel_dst] : channel_dst / channel_ratio; - const int64_t channel_y = ids ? channel_dst % nchannels_y : channel_dst; - const int64_t sample_dst = blockIdx.z; - const int64_t sample_x = sample_dst / sample_ratio; - const int64_t sample_y = sample_dst; - const int tid = threadIdx.x; + const int ncols2, const int nchannels_y, const int stride_row, const int stride_col_y2, const int stride_col_dst, + const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, + const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) { + const int row = blockIdx.x; + const int channel_dst = blockIdx.y; + const int channel_x = ids ? ids[channel_dst] : channel_dst / channel_ratio; + const int channel_y = ids ? 
channel_dst % nchannels_y : channel_dst; + const int sample_dst = blockIdx.z; + const int sample_x = sample_dst / sample_ratio; + const int sample_y = sample_dst; + const int tid = threadIdx.x; + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); - x += sample_x *stride_sample_x + channel_x *stride_channel_x + row*stride_row; - y += sample_y *stride_sample_y + channel_y *stride_channel_y; - dst += sample_dst*stride_sample_dst + channel_dst*stride_channel_dst; + x += int64_t(sample_x) *stride_sample_x + channel_x *stride_channel_x + row*stride_row; + y += int64_t(sample_y) *stride_sample_y + channel_y *stride_channel_y; + dst += int64_t(sample_dst)*stride_sample_dst + channel_dst*stride_channel_dst; const float2 * y2 = (const float2 *) y; @@ -34,81 +35,108 @@ static __global__ void mul_mat_vec( __syncthreads(); } - float sumf = 0.0f; + float sumf[ncols_dst] = {0.0f}; if constexpr (std::is_same<T, float>::value) { const float2 * x2 = (const float2 *) x; - for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) { + for (int col2 = tid; col2 < ncols2; col2 += block_size) { const float2 tmpx = x2[col2]; - const float2 tmpy = y2[col2]; - sumf += tmpx.x*tmpy.x; - sumf += tmpx.y*tmpy.y; + +#pragma unroll + for (int j = 0; j < ncols_dst; ++j) { + const float2 tmpy = y2[j*stride_col_y2 + col2]; + sumf[j] += tmpx.x*tmpy.x; + sumf[j] += tmpx.y*tmpy.y; + } } } else if constexpr (std::is_same<T, half>::value) { const half2 * x2 = (const half2 *) x; if (std::is_same<type_acc, float>::value) { - for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) { + for (int col2 = tid; col2 < ncols2; col2 += block_size) { const float2 tmpx = __half22float2(x2[col2]); - const float2 tmpy = y2[col2]; - sumf += tmpx.x * tmpy.x; - sumf += tmpx.y * tmpy.y; + +#pragma unroll + for (int j = 0; j < ncols_dst; ++j) { + const float2 tmpy = y2[j*stride_col_y2 + col2]; + sumf[j] += tmpx.x * tmpy.x; + sumf[j] += tmpx.y * tmpy.y; + } } } else { #ifdef FP16_AVAILABLE - half2 sumh2 = make_half2(0.0f, 0.0f); + half2 sumh2[ncols_dst] = {{0.0f, 0.0f}}; + + for (int col2 = tid; col2 < ncols2; col2 += block_size) { + const half2 tmpx = x2[col2]; - for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) { - const float2 tmp = y2[col2]; - sumh2 += x2[col2] * make_half2(tmp.x, tmp.y); +#pragma unroll + for (int j = 0; j < ncols_dst; ++j) { + const float2 tmpy = y2[j*stride_col_y2 + col2]; + sumh2[j] += tmpx * make_half2(tmpy.x, tmpy.y); + } } - sumf = __low2float(sumh2) + __high2float(sumh2); +#pragma unroll + for (int j = 0; j < ncols_dst; ++j) { + sumf[j] = __low2float(sumh2[j]) + __high2float(sumh2[j]); + } #else NO_DEVICE_CODE; #endif // FP16_AVAILABLE } } else if constexpr (std::is_same<T, nv_bfloat16>::value) { const int * x2 = (const int *) x; - for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) { - const int tmpx = x2[col2]; - const float2 tmpy = y2[col2]; - sumf += float(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[0]) * tmpy.x; - sumf += float(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[1]) * tmpy.y; + for (int col2 = tid; col2 < ncols2; col2 += block_size) { + const int tmpx = x2[col2]; +#pragma unroll + for (int j = 0; j < ncols_dst; ++j) { + const float2 tmpy = y2[j*stride_col_y2 + col2]; + sumf[j] += float(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[0]) * tmpy.x; + sumf[j] += float(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[1]) * tmpy.y; + } } } else { static_assert(std::is_same<T, void>::value, "unsupported type"); } - sumf = warp_reduce_sum(sumf); +#pragma unroll + for (int j = 0; j < ncols_dst; ++j) { + sumf[j] = warp_reduce_sum(sumf[j]); - if (block_size > warp_size) { - buf_iw[tid/warp_size] = sumf; - __syncthreads(); - if (tid 
>= warp_size) { - return; + if (block_size > warp_size) { + buf_iw[tid/warp_size] = sumf[j]; + __syncthreads(); + if (tid < warp_size) { + sumf[j] = buf_iw[tid]; + sumf[j] = warp_reduce_sum(sumf[j]); + } + if (j < ncols_dst) { + __syncthreads(); + } } - sumf = buf_iw[tid]; - sumf = warp_reduce_sum(sumf); } - if (tid != 0) { + if (tid >= ncols_dst) { return; } - dst[row] = sumf; + dst[tid*stride_col_dst + row] = sumf[tid]; } -template +template static void launch_mul_mat_vec_cuda( const T * x, const float * y, const int32_t * ids, float * dst, - const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, + const int64_t ncols, const int64_t nrows, + const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst, + const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x, const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, cudaStream_t stream) { - GGML_ASSERT(ncols % 2 == 0); - GGML_ASSERT(stride_row % 2 == 0); + GGML_ASSERT(ncols % 2 == 0); + GGML_ASSERT(stride_row % 2 == 0); + GGML_ASSERT(stride_col_y % 2 == 0); GGML_ASSERT(ids || nchannels_dst % nchannels_x == 0); GGML_ASSERT( nsamples_dst % nsamples_x == 0); const int64_t channel_ratio = nchannels_dst / nchannels_x; @@ -138,44 +166,52 @@ static void launch_mul_mat_vec_cuda( const dim3 block_dims(block_size_best, 1, 1); switch (block_size_best) { case 32: { - mul_mat_vec<<>> - (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, - stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + mul_mat_vec<<>> + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 64: { - mul_mat_vec<<>> - (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, - stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + mul_mat_vec<<>> + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 96: { - mul_mat_vec<<>> - (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, - stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + mul_mat_vec<<>> + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 128: { - mul_mat_vec<<>> - (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, - stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + mul_mat_vec<<>> + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); 
} break; case 160: { - mul_mat_vec<<>> - (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, - stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + mul_mat_vec<<>> + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 192: { - mul_mat_vec<<>> - (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, - stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + mul_mat_vec<<>> + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 224: { - mul_mat_vec<<>> - (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, - stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + mul_mat_vec<<>> + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 256: { - mul_mat_vec<<>> - (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, - stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + mul_mat_vec<<>> + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; default: { GGML_ABORT("fatal error"); @@ -183,23 +219,91 @@ static void launch_mul_mat_vec_cuda( } } +template +static void mul_mat_vec_cuda_switch_ncols_dst( + const T * x, const float * y, const int32_t * ids, float * dst, + const int64_t ncols, const int64_t nrows, const int64_t ncols_dst, + const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst, + const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, + const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x, + const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, + cudaStream_t stream) { + switch (ncols_dst) { + case 1: + launch_mul_mat_vec_cuda + (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + break; + case 2: + launch_mul_mat_vec_cuda + (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + break; + case 3: + launch_mul_mat_vec_cuda + (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, 
stride_sample_x, stride_sample_y, stride_sample_dst, stream); + break; + case 4: + launch_mul_mat_vec_cuda + (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + break; + case 5: + launch_mul_mat_vec_cuda + (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + break; + case 6: + launch_mul_mat_vec_cuda + (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + break; + case 7: + launch_mul_mat_vec_cuda + (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + break; + case 8: + launch_mul_mat_vec_cuda + (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + break; + default: + GGML_ABORT("fatal error"); + break; + } +} + template static void mul_mat_vec_cuda( const T * x, const float * y, const int32_t * ids, float * dst, - const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, + const int64_t ncols, const int64_t nrows, const int64_t ncols_dst, + const int64_t stride_row, const int64_t stride_col_y, const int stride_col_dst, + const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x, const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, enum ggml_prec prec, cudaStream_t stream) { if constexpr(std::is_same::value) { if (prec == GGML_PREC_DEFAULT) { - launch_mul_mat_vec_cuda - (x, y, ids, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + mul_mat_vec_cuda_switch_ncols_dst + (x, y, ids, dst, ncols, nrows, ncols_dst, stride_row, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); return; } } - launch_mul_mat_vec_cuda - (x, y, ids, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + mul_mat_vec_cuda_switch_ncols_dst + (x, y, ids, dst, ncols, nrows, ncols_dst, stride_row, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); } @@ -246,24 +350,24 @@ void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor 
* const int64_t stride_channel_dst = ids ? s1 : s2; const int64_t stride_channel_y = ids ? s11 : s12; - GGML_ASSERT(ncols_dst == 1); + GGML_ASSERT(!ids || ncols_dst == 1); switch (src0->type) { case GGML_TYPE_F32: { const float * src0_d = (const float *) src0->data; - mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, s01, + mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1, ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, ne03, ne3, s03, s13, s3, prec, ctx.stream()); } break; case GGML_TYPE_F16: { const half * src0_d = (const half *) src0->data; - mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, s01, + mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1, ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, ne03, ne3, s03, s13, s3, prec, ctx.stream()); } break; case GGML_TYPE_BF16: { const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0->data; - mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, s01, + mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1, ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, ne03, ne3, s03, s13, s3, prec, ctx.stream()); } break; @@ -282,16 +386,19 @@ void ggml_cuda_op_mul_mat_vec( GGML_ASSERT(dst->type == GGML_TYPE_F32); const int64_t ne00 = src0->ne[0]; + const int64_t ne10 = src1->ne[0]; + const int64_t ne0 = dst->ne[0]; const int64_t row_diff = row_high - row_low; - GGML_ASSERT(src1_ncols == 1); - - const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; + const int id = ggml_cuda_get_device(); + const int cc = ggml_cuda_info().devices[id].cc; const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32; // ggml_cuda_op provides single, contiguous matrices const int64_t stride_row = ne00; + const int64_t stride_col_y = ne10; + const int64_t stride_col_dst = id == ctx.device ? 
ne0 : row_diff; // main device has larger memory buffer const int64_t nchannels_x = 1; const int64_t nchannels_y = 1; const int64_t nchannels_dst = 1; @@ -307,19 +414,19 @@ void ggml_cuda_op_mul_mat_vec( switch (src0->type) { case GGML_TYPE_F32: { const float * src0_d = (const float *) src0_dd_i; - mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, stride_row, + mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream); } break; case GGML_TYPE_F16: { const half * src0_d = (const half *) src0_dd_i; - mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, stride_row, + mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream); } break; case GGML_TYPE_BF16: { const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0_dd_i; - mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, stride_row, + mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream); } break; @@ -334,3 +441,66 @@ void ggml_cuda_op_mul_mat_vec( GGML_UNUSED(src1_ncols); GGML_UNUSED(src1_padded_row_size); } + +bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_ne, int64_t ne11) { + if (src0_ne[0] % 2 != 0) { + return false; + } + switch (type) { + case GGML_TYPE_F32: + if (GGML_CUDA_CC_IS_NVIDIA(cc)) { + if (cc >= GGML_CUDA_CC_ADA_LOVELACE) { + return ne11 <= 8; + } + if (cc >= GGML_CUDA_CC_TURING) { + return ne11 <= 4; + } + return ne11 <= 3; + } else if (GGML_CUDA_CC_IS_AMD(cc)) { + if (fp32_mma_hardware_available(cc)) { + return ne11 <= 3; + } + return ne11 <= 8; + } + return ne11 <= 8; + case GGML_TYPE_F16: + if (GGML_CUDA_CC_IS_NVIDIA(cc)) { + const bool src0_small = (src0_ne[1] <= 512 || src0_ne[2]*src0_ne[3] == 1); + if (cc >= GGML_CUDA_CC_ADA_LOVELACE) { + return src0_small && ne11 <= 4; + } + if (fp16_mma_hardware_available(cc)) { + return src0_small && ne11 <= 3; + } + return ne11 <= 8; + } else if (GGML_CUDA_CC_IS_AMD(cc)) { + if (fp16_mma_hardware_available(cc)) { + if (GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) { + return ne11 <= 5; + } + return ne11 <= 2; + } + return ne11 <= 8; + } + return ne11 <= 8; + case GGML_TYPE_BF16: + if (GGML_CUDA_CC_IS_NVIDIA(cc)) { + const bool src0_small = (src0_ne[1] <= 512 || src0_ne[2]*src0_ne[3] == 1); + if (cc >= GGML_CUDA_CC_ADA_LOVELACE) { + return src0_small && ne11 <= 4; + } + if (bf16_mma_hardware_available(cc)) { + return src0_small && ne11 <= 3; + } + return ne11 <= 8; + } else if (GGML_CUDA_CC_IS_AMD(cc)) { + if (bf16_mma_hardware_available(cc)) { + return ne11 <= 3; + } + return ne11 <= 8; + } + return ne11 <= 8; + default: + return false; + } +} diff --git a/ggml/src/ggml-cuda/mmv.cuh b/ggml/src/ggml-cuda/mmv.cuh index 756e7e1cc7fc3..1330bcb6a8860 100644 --- a/ggml/src/ggml-cuda/mmv.cuh +++ b/ggml/src/ggml-cuda/mmv.cuh 
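The ggml_cuda_should_use_mmv heuristic above gates the custom matrix-vector kernel on the batch size (ne11), the tensor type, and the GPU generation (it also bails out whenever src0_ne[0] is odd), replacing the single MMV_MAX_ROWS cutoff that the next hunk deletes from mmv.cuh. A minimal standalone sketch of the decision shape, assuming simplified tier names in place of the real GGML_CUDA_CC_* checks; the thresholds are copied from the GGML_TYPE_F32/NVIDIA branch:

    #include <cstdint>
    #include <cstdio>

    // Illustrative stand-ins for the compute-capability tiers that the real
    // heuristic derives from GGML_CUDA_CC_* constants.
    enum class cc_tier { older, turing, ada_lovelace };

    // Newer architectures keep the custom mul_mat_vec kernel profitable up to
    // larger batch sizes before cuBLAS/MMQ becomes the better choice.
    static bool should_use_mmv_f32(cc_tier tier, int64_t ne11) {
        switch (tier) {
            case cc_tier::ada_lovelace: return ne11 <= 8;
            case cc_tier::turing:       return ne11 <= 4;
            default:                    return ne11 <= 3;
        }
    }

    int main() {
        for (int64_t ne11 = 1; ne11 <= 9; ++ne11) {
            printf("ne11=%lld ada=%d turing=%d older=%d\n", (long long) ne11,
                   (int) should_use_mmv_f32(cc_tier::ada_lovelace, ne11),
                   (int) should_use_mmv_f32(cc_tier::turing,       ne11),
                   (int) should_use_mmv_f32(cc_tier::older,        ne11));
        }
        return 0;
    }

For F16 and BF16 the same structure applies, with the additional src0_small condition above restricting the fast path to small or non-batched weight matrices on tensor-core hardware.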
@@ -1,8 +1,5 @@ #include "common.cuh" -// maximum number of src0 rows with which to use mul_mat_vec over cuBLAS if FP16 tensor cores are available -#define MMV_MAX_ROWS 512 - void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst); void ggml_cuda_op_mul_mat_vec( @@ -10,3 +7,5 @@ void ggml_cuda_op_mul_mat_vec( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, const int64_t src1_padded_row_size, cudaStream_t stream); + +bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_ne, int64_t ne11); diff --git a/ggml/src/ggml-cuda/ssm-scan.cu b/ggml/src/ggml-cuda/ssm-scan.cu index 37ee208c09d46..2d34b836054f8 100644 --- a/ggml/src/ggml-cuda/ssm-scan.cu +++ b/ggml/src/ggml-cuda/ssm-scan.cu @@ -10,6 +10,8 @@ __global__ void __launch_bounds__(splitD, 2) float * __restrict__ dst, const int64_t L) { GGML_UNUSED(src1_nb0); GGML_UNUSED(src2_nb0); + + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); const int bidx = blockIdx.x; // split along B const int bidy = blockIdx.y; // split along D const int tid = threadIdx.x; @@ -44,16 +46,16 @@ __global__ void __launch_bounds__(splitD, 2) if (N == 16) { #pragma unroll for (size_t i = 0; i < splitD / 4; i += 2) { - float value = A_block[(wid * warpSize + i) * stride_A + wtid]; + float value = A_block[(wid * warp_size + i) * stride_A + wtid]; // todo: bank conflict // I am always confused with how to use the swizzling method to solve // bank conflit. Hoping somebody can tell me. - smem_A[(wid * warpSize + i) * stride_sA + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value; + smem_A[(wid * warp_size + i) * stride_sA + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value; } #pragma unroll for (size_t i = 0; i < splitD / 4; i += 2) { - float value = s0_block[(wid * warpSize + i) * stride_s0 + wtid]; - smem_s0[(wid * warpSize + i) * stride_ss0 + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value; + float value = s0_block[(wid * warp_size + i) * stride_s0 + wtid]; + smem_s0[(wid * warp_size + i) * stride_ss0 + wtid + ((wtid / 16) > 0 ? 
1 : 0)] = value; } } diff --git a/ggml/src/ggml-cuda/sumrows.cu b/ggml/src/ggml-cuda/sumrows.cu index 38dbf1b5e1fa9..2eee08fa07375 100644 --- a/ggml/src/ggml-cuda/sumrows.cu +++ b/ggml/src/ggml-cuda/sumrows.cu @@ -1,25 +1,9 @@ #include "sumrows.cuh" -static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) { - const int row = blockIdx.x; - const int col = threadIdx.x; - - float sum = 0.0f; - for (int i = col; i < ncols; i += blockDim.x) { - sum += x[row * ncols + i]; - } - - sum = warp_reduce_sum(sum); - - if (col == 0) { - dst[row] = sum; - } -} - void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) { const dim3 block_dims(WARP_SIZE, 1, 1); const dim3 block_nums(nrows, 1, 1); - k_sum_rows_f32<<>>(x, dst, ncols); + reduce_rows_f32<<>>(x, dst, ncols); } void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { @@ -35,5 +19,8 @@ void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const int64_t ncols = src0->ne[0]; const int64_t nrows = ggml_nrows(src0); - sum_rows_f32_cuda(src0_d, dst_d, ncols, nrows, stream); + const dim3 block_dims(WARP_SIZE, 1, 1); + const dim3 block_nums(nrows, 1, 1); + + reduce_rows_f32<<>>(src0_d, dst_d, ncols); } diff --git a/ggml/src/ggml-cuda/sumrows.cuh b/ggml/src/ggml-cuda/sumrows.cuh index 191db1c13167e..3431c599b1b89 100644 --- a/ggml/src/ggml-cuda/sumrows.cuh +++ b/ggml/src/ggml-cuda/sumrows.cuh @@ -1,5 +1,4 @@ #include "common.cuh" void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream); - void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-hexagon/CMakeLists.txt b/ggml/src/ggml-hexagon/CMakeLists.txt new file mode 100644 index 0000000000000..d644300387e32 --- /dev/null +++ b/ggml/src/ggml-hexagon/CMakeLists.txt @@ -0,0 +1,139 @@ +project(ggml-hexagon) +message(STATUS "Using HEXAGON backend") +message("CMAKE_SYSTEM_NAME : ${CMAKE_SYSTEM_NAME}") + +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +if(NOT DEFINED QNN_SDK_PATH) + message(FATAL_ERROR "QNN_SDK_PATH not defined") +endif() + +if(NOT DEFINED HEXAGON_SDK_PATH) + message(FATAL_ERROR "HEXAGON_SDK_PATH not defined") +endif() + +message("QNN_SDK_PATH : ${QNN_SDK_PATH}") +message("HEXAGON_SDK_PATH: ${HEXAGON_SDK_PATH}") +message("HTP_ARCH_VERSION: ${HTP_ARCH_VERSION}") + +if (CMAKE_BUILD_TYPE STREQUAL "Debug") + set(DEBUG_FLAG "-DDEBUG -Wall") + message("Debug mode:${DEBUG_FLAG}") +else() + set(DEBUG_FLAG "-DNDEBUG -Wall") +#manually disable all verbose logs in ggml-hexagon/CMakeLists.txt to +#make NPU performance comparisons via llama-bench clearer +#set(DEBUG_FLAG "-DNDEBUG -Wall -DDISABLE_ALL_LOG") + message("Release mode:${DEBUG_FLAG}") +endif() + +#v68 --- Snapdragon 888 +#v69 --- Snapdragon 8 Gen1 +#v73 --- Snapdragon 8 Gen2 +#v75 --- Snapdragon 8 Gen3 +#v79 --- Snapdragon 8 Elite(aka Gen4) +if(NOT DEFINED HTP_ARCH_VERSION) + message(FATAL_ERROR "HTP_ARCH_VERSION not defined, valid htp arch: v68,v69,v73,v75,v79") +endif() + +#check whether the user-specified htp arch is valid +set(CHECK_HTP_ARCH "WRONG") +#ref: https://github.com/quic/ai-hub-apps/tree/main/tutorials/llm_on_genie +#foreach (feat v68 v69 v73 v75 v79) +#foreach (feat v73 v75 v79) +#to simplify the workflow, only v75 and v79 (i.e. 8 Gen 3 and 8 Elite) are supported +foreach (feat v75 v79) + if (${feat} STREQUAL ${HTP_ARCH_VERSION}) + set(CHECK_HTP_ARCH "GOOD") + endif() +endforeach()
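+#NOTE: the foreach() probe above is a plain list-membership test; an equivalent,
+#more direct spelling (illustrative only, not part of this patch; IN_LIST needs
+#CMake >= 3.3) would be:
+# set(SUPPORTED_HTP_ARCHS v75 v79)
+# if(NOT HTP_ARCH_VERSION IN_LIST SUPPORTED_HTP_ARCHS)
+#     message(FATAL_ERROR "ggml-hexagon backend only supports htp arch v75,v79")
+# endif()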
+if (${CHECK_HTP_ARCH} STREQUAL "WRONG") + #message(FATAL_ERROR "ggml-hexagon backend only supports htp arch v68,v69,v73,v75,v79") + #to simplify the workflow, only v75 and v79 (i.e. 8 Gen 3 and 8 Elite) are supported + message(FATAL_ERROR "ggml-hexagon backend only supports htp arch v75,v79") +endif() + +#check optimization flags +set(OPT_FLAG " ") +if (${HTP_ARCH_VERSION} STREQUAL "v75" OR ${HTP_ARCH_VERSION} STREQUAL "v79") + #works fine on Snapdragon 8 Gen 3 & 8 Elite with 1.5x - 3x performance gains compared with the default ggml backend + set(OPT_FLAG " -O3 -march=armv8.7-a -mcpu=cortex-x1 -mtune=cortex-x1 -flto -D_GNU_SOURCE -fvectorize -ffp-model=fast -fno-finite-math-only") +endif() +message("OPT_FLAG:${OPT_FLAG}") + +if(CMAKE_SYSTEM_NAME STREQUAL "Android") + find_library(LOG_LIB log) + + add_library(cdsprpc + SHARED + IMPORTED) + set_target_properties(cdsprpc + PROPERTIES + IMPORTED_LOCATION + ${HEXAGON_SDK_PATH}/ipc/fastrpc/remote/ship/android_aarch64/libcdsprpc.so) + + set(QNN_LINK_LIBRARIES ${LOG_LIB} cdsprpc) + set(QNN_DEFAULT_LIB_SEARCH_PATH "/data/local/tmp/" CACHE STRING "customized library search path for QNN backend") + + include_directories(${HEXAGON_SDK_PATH}/incs) + include_directories(${HEXAGON_SDK_PATH}/incs/stddef) + include_directories(${HEXAGON_SDK_PATH}/ipc/fastrpc/incs) + include_directories(${HEXAGON_SDK_PATH}/ipc/fastrpc/rpcmem/inc) + include_directories(${HEXAGON_SDK_PATH}/ipc/fastrpc/remote/ship/android_Debug_aarch64) + include_directories(${HEXAGON_SDK_PATH}/utils/examples) + include_directories(${HEXAGON_SDK_PATH}/ipc/fastrpc/rtld/ship/android_aarch64) + include_directories(${HEXAGON_SDK_PATH}/libs/atomic/inc) + include_directories(${HEXAGON_SDK_PATH}/libs/atomic/android_Debug_aarch64/ship) + include_directories(${CMAKE_SOURCE_DIR}/ggml/src/ggml-hexagon/) + include_directories(${CMAKE_SOURCE_DIR}/ggml/src/ggml-hexagon/kernels/) +elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows") + set(QNN_DEFAULT_LIB_SEARCH_PATH "C:\\" CACHE STRING "customized library search path for QNN backend") +else() + message(FATAL_ERROR "ggml-hexagon is currently only available on Android and Windows on ARM(WoA)") +endif() + +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}") +set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}") +set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}") + +file(GLOB HEXAGON_SOURCES "${CMAKE_CURRENT_LIST_DIR}/*.cpp" "${CMAKE_CURRENT_LIST_DIR}/kernels/stub.c") +ggml_add_backend_library(ggml-hexagon ${HEXAGON_SOURCES}) + +target_include_directories(ggml-hexagon PRIVATE ${QNN_SDK_PATH}/include/QNN ${HEXAGON_SDK_PATH} ${CMAKE_CURRENT_LIST_DIR}) +target_link_libraries(ggml-hexagon PRIVATE ${QNN_LINK_LIBRARIES}) + +string(REGEX REPLACE "/$" "" QNN_DEFAULT_LIB_SEARCH_PATH "${QNN_DEFAULT_LIB_SEARCH_PATH}") +target_compile_definitions(ggml-hexagon PRIVATE QNN_DEFAULT_LIB_SEARCH_PATH="${QNN_DEFAULT_LIB_SEARCH_PATH}/") + +#cross-compile the hexagon kernel sources which run on the cDSP side +function(ggml_hexagon_build_kernel KNAME) + message(STATUS "ggml_hexagon: build hexagon-kernel ${KNAME}") + + add_custom_command( + TARGET ${PROJECT_NAME} + POST_BUILD + COMMAND echo "current working path:`pwd`\n" + COMMAND echo "${CMAKE_CURRENT_LIST_DIR}/kernels" + COMMAND make -C ${CMAKE_CURRENT_LIST_DIR}/kernels/ clean + COMMAND make -C ${CMAKE_CURRENT_LIST_DIR}/kernels/
HEXAGON_SDK_PATH=${HEXAGON_SDK_PATH} HTP_ARCH_VERSION=${HTP_ARCH_VERSION} DEBUG_FLAG=${DEBUG_FLAG} + COMMAND echo "current working path:`pwd`\n" + COMMAND ls -l ../../../bin/libggmldsp-skel.so + COMMENT "build hexagon-kernel" + ) +endfunction() + +function(ggml_hexagon_setup_cfg KNAME) + message(STATUS "ggml_hexagon: setup runtime configuration file ${KNAME}") + add_custom_command( + TARGET ${PROJECT_NAME} + POST_BUILD + COMMAND echo "current working path:`pwd`\n" + COMMAND /bin/cp -fv ../../../../../scripts/${KNAME} ../../../bin/ + COMMENT "setup runtime configuration file" + ) +endfunction() + +ggml_hexagon_build_kernel("cdsp") +ggml_hexagon_setup_cfg("ggml-hexagon.cfg") diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp new file mode 100644 index 0000000000000..a8ab81a5cdc70 --- /dev/null +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -0,0 +1,7003 @@ +/* + * Copyright (c) 2024-2025 The ggml authors + * + * Qualcomm QNN SDK and reference tech guides could be found at: + * https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk + * Qualcomm Hexagon SDK and reference tech guides could be found at: + * https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools + * + * this single-source-file or self-contained implementation of ggml-hexagon backend has 8 sections: + * section-1 forward/prototype declaration, global vars, macros, data structures + * section-2 internal troubleshooting function/class + * section-3 helper function for WoA(Windows on ARM) + * section-4 general helper function + * section-5 QNN helper function/class + * section-6 implementation of hwaccel approach through QNN: offload ggmlop to QNN + * section-7 cDSP helper function + * section-8 implementation of ggml-hexagon backend according to specification in ggml backend subsystem + * + * currently provide following ggml op' implementation through QNN: + * - GGML_OP_ADD/GGML_OP_SUB/GGML_OP_MUL/GGML_OP_DIV/GGML_OP_LOG/GGML_OP_SQRT: + * this is a simple hwaccel skeleton, can expand other ggml ops according to expertise + * - GGML_OP_MUL_MAT: + * this is a complicated hwaccel skeleton, can expand other ggml ops accordingly + * + * currently provide following ggml op' implementation through cDSP in hexagon-kernels: + * - GGML_OP_ADD & GGML_OP_MUL_MAT: + * this is a hwaccel skeleton, can expand other ggml ops accordingly + * + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__ANDROID__) || defined(__linux__) +#include +#include +#include +#include +#include +#include +#include +#endif + +#if !defined(__ANDROID__) && !defined(__linux__) +#include +#include +#include +#endif + +#if defined(__ANDROID__) +#include "android/log.h" + +#include "rpcmem.h" +#include "remote.h" +#include "os_defines.h" +#include "domain.h" +#include "AEEStdErr.h" +#include "HAP_power.h" +#include "HAP_farf.h" +#endif + +#include "QnnTypes.h" +#include "QnnCommon.h" +#include "QnnContext.h" +#include "QnnBackend.h" +#include "QnnGraph.h" +#include "QnnProperty.h" +#include "QnnTensor.h" +#include "QnnInterface.h" +#include "Saver/QnnSaver.h" +#include "System/QnnSystemInterface.h" +#include "HTP/QnnHtpDevice.h" +#include "HTP/QnnHtpGraph.h" + +#include "ggml-hexagon.h" +#include "ggml-impl.h" +#include 
"ggml-backend-impl.h" + +#include "kernels/skel.h" + +// ================================================================================================= +// section-1: forward/prototype declaration, global vars, macros, data structures +// ================================================================================================= +class qnn_instance; +class hexagon_profiler; +struct ggml_backend_hexagon_context; + +#ifdef NDEBUG +#define GGMLHEXAGON_DEBUG 0 +#else +#define GGMLHEXAGON_DEBUG 1 +#endif + +#ifndef PROJECT_NAME +#define PROJECT_NAME "ggml-hexagon" +#endif + +#define GGMLHEXAGON_LOGBUF_LEN 4096 +#define GGMLHEXAGON_TMPBUF_LEN 256 + +#define GGMLHEXAGON_LOG_ERROR(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_ERROR, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define GGMLHEXAGON_LOG_WARN(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_WARN , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) + +#if !defined (DISABLE_ALL_LOG) +#define GGMLHEXAGON_LOG_INFO(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_INFO , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define GGMLHEXAGON_LOG_VERBOSE(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_CONT , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#else +//manually disable all verbose logs in ggml-hexagon/CMakeLists.txt to +//make compare NPU performance through llama-bench more clear +#define GGMLHEXAGON_LOG_INFO(...) +#define GGMLHEXAGON_LOG_VERBOSE(...) +#endif + +#if GGMLHEXAGON_DEBUG +#define GGMLHEXAGON_LOG_DEBUG(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#else +#define GGMLHEXAGON_LOG_DEBUG(...) +#endif + +#define QNN_VER_PTR(x) (&((x).v1)) +#define RPCMEM_DEFAULT_FLAGS 1 +#define RPCMEM_HEAP_ID_SYSTEM 25 +#define SIZE_IN_MB (1 << 20) +#define STATUS_CONTEXT 0x12345678 + +#if !defined (_WINDOWS) +#pragma weak remote_system_request +#pragma weak remote_session_control +#endif + +#define CHECK_QNN_API(error, result) \ + do { \ + error = (result); \ + if (QNN_SUCCESS != error) { \ + if (error == QNN_COMMON_ERROR_NOT_SUPPORTED) { \ + GGMLHEXAGON_LOG_WARN("WARNING: QNN feature/API not supported\n"); \ + } else { \ + GGMLHEXAGON_LOG_INFO("QNN API error = %d(%s)\n", error, ggmlqnn_get_qnnerror_string(error)); \ + } \ + } \ + } while (0) + +#define GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst) \ + do { \ + if (g_hexagon_appcfg.hwaccel_approach != HWACCEL_CDSP) { \ + if (!ggmlqnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ + return; \ + } \ + } \ + } while (0) \ + +#ifndef ggmlop_URI +#define ggmlop_URI "file:///libggmldsp-skel.so?ggmldsp_skel_handle_invoke&_modver=1.0&_idlver=0.0.1" +#endif +// ================================================================================================= +// section-1: data type, data structure, global vars +// ================================================================================================= +using pfn_rpc_mem_init = void (*)(void); +using pfn_rpc_mem_deinit = void (*)(void); +using pfn_rpc_mem_alloc = void *(*)(int, uint32_t, int); +using pfn_rpc_mem_free = void (*)(void *); +using pfn_rpc_mem_to_fd = int (*)(void *); +using _pfn_QnnSaver_initialize = decltype(QnnSaver_initialize); +using _pfn_QnnInterface_getProviders = decltype(QnnInterface_getProviders); +using _pfn_QnnSystemInterface_getProviders = decltype(QnnSystemInterface_getProviders); + +//QNN resource management for the general approach through QNN +using qnn_tensors_t = std::vector< Qnn_Tensor_t >; +using qnn_ptensors_t = std::vector< Qnn_Tensor_t *>; +using 
qnn_singlenode_res_t = std::tuple; + +typedef void (* ggmlqnn_op_func_t)(ggml_backend_hexagon_context * ctx, ggml_tensor * op); +typedef int (* notify_callback_fn)(void * context, int domain, int session, remote_rpc_status_flags_t status); +typedef int (* ggmlhexagon_op_func_t)(remote_handle64 handle, const dsptensor * src0, const dsptensor * src1, dsptensor * dst); + +enum qnn_index_type { + QNN_TENSOR_INDEX = 0, + QNN_OPCFG_INDEX = 1, +}; + +enum qnn_profile_level { + PROFILE_OFF = 0, + PROFILE_BASIC = 1, + PROFILE_DETAIL = 2, +}; + +enum hexagon_dsp_type { + HEXAGON_ADSP = 0, + HEXAGON_MDSP = 1, + HEXAGON_SDSP = 2, + HEXAGON_CDSP = 3, + HEXAGON_CDSP1 = 4, +}; + +enum qcom_htp_arch { + NONE = 0, + V68 = 68, + V69 = 69, + V73 = 73, + V75 = 75, + V79 = 79, +}; + +enum qcom_chipset_soc_model { + UNKNOWN_SM = 0, + SM7450 = 41, // v69, 7 Gen1 + SM8350 = 30, // v68, 888 + SM8450 = 36, // v69, SD 8 Gen 1 + SM8475 = 42, // v69, SD 8+ Gen 1 + SM8550 = 43, // v73, SD 8 Gen 2 + SM8650 = 57, // v75, SD 8 Gen 3 + SM8750 = 69, // v79, SD 8 Elite +#if !defined(__ANDROID__) && !defined(__linux__) + SC7280X = 44, + SC8280X = 37, + SC8380XP = 60, +#endif +}; + +//borrowed from Android source code, might not be accurate +enum ion_heap_ids { + INVALID_HEAP_ID = -1, + ION_CP_MM_HEAP_ID = 8, + ION_SECURE_HEAP_ID = 9, + ION_SECURE_DISPLAY_HEAP_ID = 10, + ION_CP_MFC_HEAP_ID = 12, + ION_SPSS_HEAP_ID = 13, + ION_CP_WB_HEAP_ID = 16, + ION_CAMERA_HEAP_ID = 20, + ION_SYSTEM_CONTIG_HEAP_ID = 21, + ION_ADSP_HEAP_ID = 22, + ION_PIL1_HEAP_ID = 23, + ION_SF_HEAP_ID = 24, + ION_SYSTEM_HEAP_ID = 25, + ION_PIL2_HEAP_ID = 26, + ION_QSECOM_HEAP_ID = 27, + ION_AUDIO_HEAP_ID = 28, + ION_MM_FIRMWARE_HEAP_ID = 29, + ION_HEAP_ID_RESERVED = 31 +}; + +struct qcom_socinfo { + uint32_t soc_model; + size_t htp_arch; + size_t vtcm_size_in_mb; + char soc_desc[GGML_MAX_NAME]; +}; + +struct ggml_backend_hexagon_context { + int device; + char name[GGML_MAX_NAME]; + char desc[GGML_MAX_NAME]; + char lib[GGML_MAX_NAME]; + qnn_instance * instance; + struct ggml_backend * backend; + QNN_INTERFACE_VER_TYPE raw_interface; + QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; + struct qcom_socinfo socinfo; + + //QNN resource management for the general approach through QNN + std::map qnn_singlenode_graph_map; + + //quantize data -> fp32 + std::unique_ptr work_data; + std::vector> tasks; + size_t work_size; + size_t desired_size; + int n_threads; + + //Hexagon resource management for the general approach through Hexagaon cDSP + size_t rpc_mempool_capacity; + size_t rpc_mempool_len; + size_t rpc_mempool_usage; + void * rpc_mempool; + int rpc_mempool_handle; + remote_handle64 ggmlop_handle; + int domain_id; +}; + +struct qnn_op_caps { + bool supported; + ggml_op op; + const size_t input_param_count; + const char * qnn_op_name; +}; + +struct hexagon_op_caps { + bool supported; + ggml_op op; + const size_t input_param_count; + const char * hexagon_op_name; + ggmlhexagon_op_func_t dsp_op_func; +}; + +struct hexagon_appcfg_t { + int print_qnn_internal_log; // enable/disable QNN's internal log + int enable_perf; // enable/disable perf of a specified ggml op + int enable_profiler; // enable/disable profiler feature to visualization comparison between HWACCEL_CDSP and HWACCEL_QNN + int print_tensors_info; // enable/disable print tensors info in op function + int dump_op_info; // enable/disable dump op info in handle_op + int enable_q_mulmat; // enable/disable offload quantized mulmat + int enable_pinned_memory; // enable/disable pinned-memory feature + int 
precision_mode; // 0: default 1:fp16 + int hvx_threads; + int vtcm_size_in_mb; + int enable_dlbc; + int hwaccel_approach; // 0: HWACCEL_QNN 1: HWACCEL_QNN_SINGLEGRAPH 2: HWACCEL_CDSP + int hexagon_backend; // 0: HEXAGON_BACKEND_QNNCPU 1: HEXAGON_BACKEND_QNNGPU 2: HEXAGON_BACKEND_QNNNPU 3: HEXAGON_BACKEND_CDSP 4: ggml + int enable_rpc_ion_mempool; // enable/disable rpc ion memory pool + int enable_all_q_mulmat; // enable/disable offload all quantized type mulmat to cDSP + int profiler_duration; // threshold of duration in profiler, per seconds + int profiler_counts; // threshold of counts in profiler + int thread_counts; // thread_counts on cDSP side + int mulmat_algotype; // algorithm type of mulmat on cDSP side + const char * cfgfilename; + const char * runtime_libpath; + char ggml_hexagon_version[GGMLHEXAGON_TMPBUF_LEN]; + char ggml_dsp_version[GGMLHEXAGON_TMPBUF_LEN]; +}; + +static struct hexagon_appcfg_t g_hexagon_appcfg = { + .print_qnn_internal_log = 0, + .enable_perf = 1, + .enable_profiler = 0, + .print_tensors_info = 0, + .dump_op_info = 0, + .enable_q_mulmat = 0, + .enable_pinned_memory = 0, + .precision_mode = 0, + .hvx_threads = 4, + .vtcm_size_in_mb = 8, + .enable_dlbc = 1, + .hwaccel_approach = HWACCEL_CDSP, + .hexagon_backend = HEXAGON_BACKEND_CDSP, + .enable_rpc_ion_mempool = 0, + .enable_all_q_mulmat = 0, + .profiler_duration = 5, //seconds + .profiler_counts = 100, + .thread_counts = 4, + .mulmat_algotype = 0, + .cfgfilename = "ggml-hexagon.cfg", +#if defined(__ANDROID__) + #if defined(STANDARD_ANDROID_APP) + .runtime_libpath = "/data/data/com.kantvai.kantvplayer/", + #else + .runtime_libpath = "/data/local/tmp/", + #endif +#elif defined(__linux__) + .runtime_libpath = "/tmp/", +#elif defined(_WIN32) + .runtime_libpath = "C:\\", +#endif + .ggml_hexagon_version = {"1.13"}, + .ggml_dsp_version = {"0.63"}, +}; + +//file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/overview.html#tbl-supported-snapdragon-devices +static struct qcom_socinfo g_qnn_soc_info_table[] = { + /* Qualcomm SnapDragon 7 Gen 1 */ + { + .soc_model = SM7450, + .htp_arch = V69, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 7 Gen 1"}, + + /* Qualcomm SnapDragon 888 */ + { + .soc_model = SM8350, + .htp_arch = V68, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 888 "}, + + /* Qualcomm SnapDragon 8 Gen 1 */ + { + .soc_model = SM8450, + .htp_arch = V69, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8 Gen 1"}, + + /* Qualcomm SnapDragon 8 Gen 1+ */ + { + .soc_model = SM8475, + .htp_arch = V69, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8 Gen 1+"}, + + /* Qualcomm SnapDragon 8 Gen 2 */ + { + .soc_model = SM8550, + .htp_arch = V73, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8 Gen 2"}, + + /* Qualcomm SnapDragon 8 Gen 3 */ + { + .soc_model = SM8650, + .htp_arch = V75, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8 Gen 3 "}, + + /* Qualcomm SnapDragon 8 Gen 4 */ + { + .soc_model = SM8750, + .htp_arch = V79, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8 Elite"}, + +#if !defined(__ANDROID__) && !defined(__linux__) + /* Qualcomm SnapDragon 7c Gen 2 */ + { + .soc_model = SC7280X, + .htp_arch = V68, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 7c Gen 2"}, + + /* Qualcomm SnapDragon 8cx Gen 3 */ + { + .soc_model = SC8280X, + .htp_arch = V68, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8cx Gen 3"}, + + /* Qualcomm SnapDragon 8cx Gen 4 */ + { + .soc_model = SC8380XP, + .htp_arch =
V73, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8cx Gen 4"}, +#endif + +}; + +// file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/quantization.html +// CPU - Choose a non-quantized model.Quantized models are currently incompatible with the CPU backend +// GPU - Choose a non-quantized model.Quantized models are currently incompatible with the GPU backend +// HTP - Choose a quantized model. Quantized models are required when running on the HTP backend +// DSP - Choose a quantized model. Quantized models are required when running on the DSP backend +// HTA - Choose a quantized model. Quantized models are required when running on the HTA backend +static struct ggml_backend_hexagon_context g_hexagon_mgr[GGML_HEXAGON_MAX_DEVICES] = { + { .device = 0, + .name = "qnn-cpu", + .desc = "Qualcomm Kryo CPU", +#if !defined(__ANDROID__) && !defined(__linux__) + .lib = "QnnCpu.dll", +#else + .lib = "libQnnCpu.so", +#endif + .instance = nullptr, + .backend = nullptr, + .raw_interface = {}, + .raw_system_interface = {}, + .socinfo = {}, + .qnn_singlenode_graph_map = {}, + .work_data = nullptr, + .tasks = {}, + .work_size = 0, + .desired_size = 0, + .n_threads = 8, + .rpc_mempool_capacity = 0, + .rpc_mempool_len = 0, + .rpc_mempool_usage = 0, + .rpc_mempool = nullptr, + .rpc_mempool_handle = 0, + .ggmlop_handle = 0, + .domain_id = -1, + }, + + { .device = 1, + .name = "qnn-gpu", + .desc = "Qualcomm Adreno GPU", +#if !defined(__ANDROID__) && !defined(__linux__) + .lib = "QnnGpu.dll", +#else + .lib = "libQnnGpu.so", +#endif + .instance = nullptr, + .backend = nullptr, + .raw_interface = {}, + .raw_system_interface = {}, + .socinfo = {}, + .qnn_singlenode_graph_map = {}, + .work_data = nullptr, + .tasks = {}, + .work_size = 0, + .desired_size = 0, + .n_threads = 8, + .rpc_mempool_capacity = 0, + .rpc_mempool_len = 0, + .rpc_mempool_usage = 0, + .rpc_mempool = nullptr, + .rpc_mempool_handle = 0, + .ggmlop_handle = 0, + .domain_id = -1, + }, + + { .device = 2, + .name = "qnn-npu", + .desc = "Qualcomm NPU(Hexagon Tensor Processor)", +#if !defined(__ANDROID__) && !defined(__linux__) + .lib = "QnnHtp.dll", +#else + .lib = "libQnnHtp.so", +#endif + .instance = nullptr, + .backend = nullptr, + .raw_interface = {}, + .raw_system_interface = {}, + .socinfo = {}, + .qnn_singlenode_graph_map = {}, + .work_data = nullptr, + .tasks = {}, + .work_size = 0, + .desired_size = 0, + .n_threads = 8, + .rpc_mempool_capacity = 0, + .rpc_mempool_len = 0, + .rpc_mempool_usage = 0, + .rpc_mempool = nullptr, + .rpc_mempool_handle = 0, + .ggmlop_handle = 0, + .domain_id = -1, + }, + { .device = 3, + .name = "Hexagon-cDSP", + .desc = "Qualcomm NPU(cDSP)", + .lib = "", + .instance = nullptr, + .backend = nullptr, + .raw_interface = {}, + .raw_system_interface = {}, + .socinfo = {}, + .qnn_singlenode_graph_map = {}, + .work_data = nullptr, + .tasks = {}, + .work_size = 0, + .desired_size = 0, + .n_threads = 8, + .rpc_mempool_capacity = 0, + .rpc_mempool_len = 0, + .rpc_mempool_usage = 0, + .rpc_mempool = nullptr, + .rpc_mempool_handle = 0, + .ggmlop_handle = 0, + .domain_id = HEXAGON_CDSP, + }, +}; + +static domain hexagon_supported_domains[] = { + {ADSP_DOMAIN_ID, ADSP_DOMAIN}, + {MDSP_DOMAIN_ID, MDSP_DOMAIN}, + {SDSP_DOMAIN_ID, SDSP_DOMAIN}, + {CDSP_DOMAIN_ID, CDSP_DOMAIN}, + {CDSP1_DOMAIN_ID, CDSP1_DOMAIN} +}; + +//supported ggml op by HWACCEL_QNN +static constexpr const qnn_op_caps ggmlqnn_k_op_caps[] = { + {true, GGML_OP_NONE, 0, nullptr}, + {false, GGML_OP_DUP, 0, nullptr}, + {true, GGML_OP_ADD, 2, 
QNN_OP_ELEMENT_WISE_ADD}, + {false, GGML_OP_ADD1, 0, nullptr}, + {false, GGML_OP_ACC, 0, nullptr}, + {true, GGML_OP_SUB, 2, QNN_OP_ELEMENT_WISE_SUBTRACT}, + {true, GGML_OP_MUL, 2, QNN_OP_ELEMENT_WISE_MULTIPLY}, + {true, GGML_OP_DIV, 2, QNN_OP_ELEMENT_WISE_DIVIDE}, + {false, GGML_OP_SQR, 0, nullptr}, + {true, GGML_OP_SQRT, 1, QNN_OP_ELEMENT_WISE_SQUARE_ROOT}, + {true, GGML_OP_LOG, 1, QNN_OP_ELEMENT_WISE_LOG}, + {false, GGML_OP_SIN, 0, nullptr}, + {false, GGML_OP_COS, 0, nullptr}, + {false, GGML_OP_SUM, 0, nullptr}, + {false, GGML_OP_SUM_ROWS, 0, nullptr}, + {false, GGML_OP_MEAN, 0, nullptr}, + {false, GGML_OP_ARGMAX, 0, nullptr}, + {false, GGML_OP_COUNT_EQUAL, 0, nullptr}, + {false, GGML_OP_REPEAT, 0, nullptr}, + {false, GGML_OP_REPEAT_BACK, 0, nullptr}, + {false, GGML_OP_CONCAT, 0, nullptr}, + {false, GGML_OP_SILU_BACK, 0, nullptr}, + {false, GGML_OP_NORM, 0, nullptr}, + {false, GGML_OP_RMS_NORM, 0, nullptr}, + {false, GGML_OP_RMS_NORM_BACK, 0, nullptr}, + {false, GGML_OP_GROUP_NORM, 0, nullptr}, + {false, GGML_OP_L2_NORM, 0, nullptr}, + {true, GGML_OP_MUL_MAT, 2, QNN_OP_MAT_MUL}, + {false, GGML_OP_MUL_MAT_ID, 0, nullptr}, + {false, GGML_OP_OUT_PROD, 0, nullptr}, + {false, GGML_OP_SCALE, 0, nullptr}, + {false, GGML_OP_SET, 0, nullptr}, + {false, GGML_OP_CPY, 0, nullptr}, + {false, GGML_OP_CONT, 0, nullptr}, + {false, GGML_OP_RESHAPE, 0, nullptr}, + {false, GGML_OP_VIEW, 0, nullptr}, + {false, GGML_OP_PERMUTE, 0, nullptr}, + {false, GGML_OP_TRANSPOSE, 0, nullptr}, + {false, GGML_OP_GET_ROWS, 0, nullptr}, + {false, GGML_OP_GET_ROWS_BACK, 0, nullptr}, + {false, GGML_OP_DIAG, 0, nullptr}, + {false, GGML_OP_DIAG_MASK_INF, 0, nullptr}, + {false, GGML_OP_DIAG_MASK_ZERO, 0, nullptr}, + {false, GGML_OP_SOFT_MAX, 0, nullptr}, + {false, GGML_OP_SOFT_MAX_BACK, 0, nullptr}, + {false, GGML_OP_ROPE, 0, nullptr}, + {false, GGML_OP_ROPE_BACK, 0, nullptr}, + {false, GGML_OP_CLAMP, 0, nullptr}, + {false, GGML_OP_CONV_TRANSPOSE_1D, 0, nullptr}, + {false, GGML_OP_IM2COL, 0, nullptr}, + {false, GGML_OP_IM2COL_BACK, 0, nullptr}, + {false, GGML_OP_CONV_2D_DW, 0, nullptr}, + {false, GGML_OP_CONV_TRANSPOSE_2D, 0, nullptr}, + {false, GGML_OP_POOL_1D, 0, nullptr}, + {false, GGML_OP_POOL_2D, 0, nullptr}, + {false, GGML_OP_POOL_2D_BACK, 0, nullptr}, + {false, GGML_OP_UPSCALE, 0, nullptr}, + {false, GGML_OP_PAD, 0, nullptr}, + {false, GGML_OP_PAD_REFLECT_1D, 0, nullptr}, + {false, GGML_OP_ROLL, 0, nullptr}, + {false, GGML_OP_ARANGE, 0, nullptr}, + {false, GGML_OP_TIMESTEP_EMBEDDING, 0, nullptr}, + {false, GGML_OP_ARGSORT, 0, nullptr}, + {false, GGML_OP_LEAKY_RELU, 0, nullptr}, + {false, GGML_OP_FLASH_ATTN_EXT, 0, nullptr}, + {false, GGML_OP_FLASH_ATTN_BACK, 0, nullptr}, + {false, GGML_OP_SSM_CONV, 0, nullptr}, + {false, GGML_OP_SSM_SCAN, 0, nullptr}, + {false, GGML_OP_WIN_PART, 0, nullptr}, + {false, GGML_OP_WIN_UNPART, 0, nullptr}, + {false, GGML_OP_GET_REL_POS, 0, nullptr}, + {false, GGML_OP_ADD_REL_POS, 0, nullptr}, + {false, GGML_OP_RWKV_WKV6, 0, nullptr}, + {false, GGML_OP_GATED_LINEAR_ATTN, 0, nullptr}, + {false, GGML_OP_RWKV_WKV7, 0, nullptr}, + {false, GGML_OP_UNARY, 0, nullptr}, + {false, GGML_OP_MAP_CUSTOM1, 0, nullptr}, + {false, GGML_OP_MAP_CUSTOM2, 0, nullptr}, + {false, GGML_OP_MAP_CUSTOM3, 0, nullptr}, + {false, GGML_OP_CUSTOM, 0, nullptr}, + {false, GGML_OP_CROSS_ENTROPY_LOSS, 0, nullptr}, + {false, GGML_OP_CROSS_ENTROPY_LOSS_BACK, 0, nullptr}, + {false, GGML_OP_OPT_STEP_ADAMW, 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_ABS), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_SGN), 0, nullptr}, + 
{false, static_cast(GGML_UNARY_OP_NEG), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_STEP), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_TANH), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_ELU), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_RELU), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_SIGMOID), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_GELU), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_GELU_ERF), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_GELU_QUICK), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_SILU), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_HARDSWISH), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_HARDSIGMOID), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_EXP), 0, nullptr} +}; + +static_assert(ggmlqnn_k_op_caps[GGML_OP_NONE].supported, "GGML_OP_NONE is not true"); +static_assert(ggmlqnn_k_op_caps[GGML_OP_ADD].supported, "GGML_OP_ADD is not true"); +static_assert(ggmlqnn_k_op_caps[GGML_OP_MUL].supported, "GGML_OP_MUL is not true"); +static_assert(ggmlqnn_k_op_caps[GGML_OP_MUL_MAT].supported, "GGML_OP_MUL_MAT is not true"); +static_assert(std::size(ggmlqnn_k_op_caps) == (static_cast(GGML_OP_COUNT) + static_cast(GGML_UNARY_OP_COUNT)), + "pls check ggmlqnn_k_op_caps and ensure is corresponding to latest ggml.h"); + +//supported ggml op by HWACCEL_CDSP +static constexpr const hexagon_op_caps ggmlhexagon_k_op_caps[] = { + {true, GGML_OP_NONE, 0, nullptr, nullptr}, + {false, GGML_OP_DUP, 0, nullptr, nullptr}, + {true, GGML_OP_ADD, 2, "ggmlop_dsp_add", ggmlop_dsp_add}, + {false, GGML_OP_ADD1, 0, nullptr, nullptr}, + {false, GGML_OP_ACC, 0, nullptr, nullptr}, + {false, GGML_OP_SUB, 2, nullptr, nullptr}, + {false, GGML_OP_MUL, 2, nullptr, nullptr}, + {false, GGML_OP_DIV, 2, nullptr, nullptr}, + {false, GGML_OP_SQR, 0, nullptr, nullptr}, + {false, GGML_OP_SQRT, 0, nullptr, nullptr}, + {false, GGML_OP_LOG, 0, nullptr, nullptr}, + {false, GGML_OP_SIN, 0, nullptr, nullptr}, + {false, GGML_OP_COS, 0, nullptr, nullptr}, + {false, GGML_OP_SUM, 0, nullptr, nullptr}, + {false, GGML_OP_SUM_ROWS, 0, nullptr, nullptr}, + {false, GGML_OP_MEAN, 0, nullptr, nullptr}, + {false, GGML_OP_ARGMAX, 0, nullptr, nullptr}, + {false, GGML_OP_COUNT_EQUAL, 0, nullptr, nullptr}, + {false, GGML_OP_REPEAT, 0, nullptr, nullptr}, + {false, GGML_OP_REPEAT_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_CONCAT, 0, nullptr, nullptr}, + {false, GGML_OP_SILU_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_NORM, 0, nullptr, nullptr}, + {true, GGML_OP_RMS_NORM, 1, "ggmlop_dsp_rmsnorm", ggmlop_dsp_rmsnorm}, + {false, GGML_OP_RMS_NORM_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_GROUP_NORM, 0, nullptr, nullptr}, + {false, GGML_OP_L2_NORM, 0, nullptr, nullptr}, + {true, GGML_OP_MUL_MAT, 2, "ggmlop_dsp_mulmat", ggmlop_dsp_mulmat}, + {false, GGML_OP_MUL_MAT_ID, 0, nullptr, nullptr}, + {false, GGML_OP_OUT_PROD, 0, nullptr, nullptr}, + {false, GGML_OP_SCALE, 0, nullptr, nullptr}, + {false, GGML_OP_SET, 0, nullptr, nullptr}, + {false, GGML_OP_CPY, 0, nullptr, nullptr}, + {false, GGML_OP_CONT, 0, nullptr, nullptr}, + {false, GGML_OP_RESHAPE, 0, nullptr, nullptr}, + {false, GGML_OP_VIEW, 0, nullptr, nullptr}, + {false, GGML_OP_PERMUTE, 0, nullptr, nullptr}, + {false, GGML_OP_TRANSPOSE, 0, nullptr, nullptr}, + {false, GGML_OP_GET_ROWS, 0, nullptr, nullptr}, + {false, GGML_OP_GET_ROWS_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_DIAG, 0, nullptr, nullptr}, + {false, GGML_OP_DIAG_MASK_INF, 0, nullptr, nullptr}, + {false, GGML_OP_DIAG_MASK_ZERO, 0, nullptr, nullptr}, + {true, 
GGML_OP_SOFT_MAX, 1, "ggmlop_dsp_softmax", ggmlop_dsp_softmax}, + {false, GGML_OP_SOFT_MAX_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_ROPE, 0, nullptr, nullptr}, + {false, GGML_OP_ROPE_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_CLAMP, 0, nullptr, nullptr}, + {false, GGML_OP_CONV_TRANSPOSE_1D, 0, nullptr, nullptr}, + {false, GGML_OP_IM2COL, 0, nullptr, nullptr}, + {false, GGML_OP_IM2COL_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_CONV_2D_DW, 0, nullptr, nullptr}, + {false, GGML_OP_CONV_TRANSPOSE_2D, 0, nullptr, nullptr}, + {false, GGML_OP_POOL_1D, 0, nullptr, nullptr}, + {true, GGML_OP_POOL_2D, 1, "ggmlop_dsp_pool2d", ggmlop_dsp_pool2d}, + {false, GGML_OP_POOL_2D_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_UPSCALE, 0, nullptr, nullptr}, + {false, GGML_OP_PAD, 0, nullptr, nullptr}, + {false, GGML_OP_PAD_REFLECT_1D, 0, nullptr, nullptr}, + {false, GGML_OP_ROLL, 0, nullptr, nullptr}, + {false, GGML_OP_ARANGE, 0, nullptr, nullptr}, + {false, GGML_OP_TIMESTEP_EMBEDDING, 0, nullptr, nullptr}, + {false, GGML_OP_ARGSORT, 0, nullptr, nullptr}, + {false, GGML_OP_LEAKY_RELU, 0, nullptr, nullptr}, + {false, GGML_OP_FLASH_ATTN_EXT, 0, nullptr, nullptr}, + {false, GGML_OP_FLASH_ATTN_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_SSM_CONV, 0, nullptr, nullptr}, + {false, GGML_OP_SSM_SCAN, 0, nullptr, nullptr}, + {false, GGML_OP_WIN_PART, 0, nullptr, nullptr}, + {false, GGML_OP_WIN_UNPART, 0, nullptr, nullptr}, + {false, GGML_OP_GET_REL_POS, 0, nullptr, nullptr}, + {false, GGML_OP_ADD_REL_POS, 0, nullptr, nullptr}, + {false, GGML_OP_RWKV_WKV6, 0, nullptr, nullptr}, + {false, GGML_OP_GATED_LINEAR_ATTN, 0, nullptr, nullptr}, + {false, GGML_OP_RWKV_WKV7, 0, nullptr, nullptr}, + {false, GGML_OP_UNARY, 0, nullptr, nullptr}, + {false, GGML_OP_MAP_CUSTOM1, 0, nullptr, nullptr}, + {false, GGML_OP_MAP_CUSTOM2, 0, nullptr, nullptr}, + {false, GGML_OP_MAP_CUSTOM3, 0, nullptr, nullptr}, + {false, GGML_OP_CUSTOM, 0, nullptr, nullptr}, + {false, GGML_OP_CROSS_ENTROPY_LOSS, 0, nullptr, nullptr}, + {false, GGML_OP_CROSS_ENTROPY_LOSS_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_OPT_STEP_ADAMW, 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_ABS), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_SGN), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_NEG), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_STEP), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_TANH), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_ELU), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_RELU), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_SIGMOID), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_GELU), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_GELU_ERF), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_GELU_QUICK), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_SILU), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_HARDSWISH), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_HARDSIGMOID), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_EXP), 0, nullptr, nullptr} +}; + +static_assert(ggmlhexagon_k_op_caps[GGML_OP_NONE].supported, "GGML_OP_NONE is not true"); +static_assert(ggmlhexagon_k_op_caps[GGML_OP_ADD].supported, "GGML_OP_ADD is not true"); +static_assert(ggmlhexagon_k_op_caps[GGML_OP_MUL_MAT].supported, "GGML_OP_MUL_MAT is not true"); +static_assert(ggmlhexagon_k_op_caps[GGML_OP_SOFT_MAX].supported, "GGML_OP_SOFT_MAX is not true"); +static_assert(std::size(ggmlhexagon_k_op_caps) 
== (static_cast(GGML_OP_COUNT) + static_cast(GGML_UNARY_OP_COUNT)), + "pls check ggmlhexagon_k_op_caps and ensure is corresponding to latest ggml.h"); + +static int32_t g_qnntensor_idx = 0; //ensure every QNN tensor name is unique +static int32_t g_qnnopcfg_idx = 0; //ensure every QNN opconfig name is unique + +// ================================================================================================= +// section-2: ggml-hexagon internal troubleshooting and profiler function/class +// ================================================================================================= +static const char * ggmlhexagon_get_hwaccel_approach_name(int hwaccle_approach) { + switch (hwaccle_approach) { + case HWACCEL_QNN: + return "HWACCEL_QNN"; + case HWACCEL_QNN_SINGLEGRAPH: + return "HWACCEL_QNN_SINGLEGRAPH"; + case HWACCEL_CDSP: + return "HWACCEL_CDSP"; + default: + return "unknown hwaccel approach"; + } +} + +static void ggmlhexagon_get_timestring(char * p_currenttime) { + if (nullptr == p_currenttime) + return; + + auto time_to_string = [](const std::chrono::system_clock::time_point & tp)->std::string { + auto as_time_t = std::chrono::system_clock::to_time_t(tp); + struct tm tm; + + localtime_r(&as_time_t, &tm); + + std::chrono::milliseconds ms = std::chrono::duration_cast(tp.time_since_epoch()); + char buf[GGMLHEXAGON_TMPBUF_LEN]; + memset(buf, 0, GGMLHEXAGON_TMPBUF_LEN); + snprintf(buf, sizeof(buf), "%04d-%02d-%02d,%02d:%02d:%02d", + tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec); + GGML_UNUSED(ms); + return buf; + }; + + std::chrono::system_clock::time_point tp = std::chrono::system_clock::now(); + snprintf(p_currenttime, GGMLHEXAGON_TMPBUF_LEN, "%s", time_to_string(tp).c_str()); +} + +static void ggmlhexagon_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...) 
{ + static std::mutex ggmlhexagon_log_internal_mutex; + static char s_ggmlhexagon_log_internal_buf[GGMLHEXAGON_LOGBUF_LEN]; + + GGML_UNUSED(file); +#if !(defined __ANDROID__) || !(defined ANDROID) + GGML_UNUSED(level); +#endif + { + std::lock_guard lock(ggmlhexagon_log_internal_mutex); + va_list args; + va_start(args, format); + int len_prefix = snprintf(s_ggmlhexagon_log_internal_buf, GGMLHEXAGON_LOGBUF_LEN, "[%s, %d]: ", func, line); + int len = vsnprintf(s_ggmlhexagon_log_internal_buf + len_prefix, GGMLHEXAGON_LOGBUF_LEN - len_prefix, format, args); + if (len < (GGMLHEXAGON_LOGBUF_LEN - len_prefix)) { +#if (defined __ANDROID__) || (defined ANDROID) + __android_log_print(ANDROID_LOG_INFO, PROJECT_NAME, "%s\n", s_ggmlhexagon_log_internal_buf); + if (GGML_LOG_LEVEL_INFO == level) { + printf("%s\n", s_ggmlhexagon_log_internal_buf); + } +#else + //for Snapdragon based WoA(Windows on ARM) device or Linux + printf("%s\n", s_ggmlhexagon_log_internal_buf); +#endif + } + va_end(args); + } +} + +static void ggmlhexagon_get_processname(char * p_name) { + if (nullptr == p_name) + return; + + char tmpbuf[GGMLHEXAGON_TMPBUF_LEN]; + memset(tmpbuf, 0, GGMLHEXAGON_TMPBUF_LEN); +#if defined(__ANDROID__) || defined(__linux__) + int result = readlink("/proc/self/exe", tmpbuf, GGMLHEXAGON_TMPBUF_LEN - 1); + if (result < 0) { + GGMLHEXAGON_LOG_WARN("failed to get process name, reason:%s", strerror(errno)); + return; + } + GGMLHEXAGON_LOG_DEBUG("process name %s", tmpbuf); + const char * realname = strrchr(tmpbuf, '/') + 1; + GGMLHEXAGON_LOG_DEBUG("process name %s", realname); + snprintf(p_name, GGMLHEXAGON_TMPBUF_LEN, "%s", realname); +#endif +} + +static bool ggmlhexagon_is_llamabench_running() { + char processname[GGMLHEXAGON_TMPBUF_LEN]; + memset(processname, 0, GGMLHEXAGON_TMPBUF_LEN); + + ggmlhexagon_get_processname(processname); + if (0 != processname[0] && 0 != processname[1] && 0 != processname[10]) { + if (0 == memcmp(processname, "llama-bench", strlen("llama-bench"))) { + return true; + } + if (0 == memcmp(processname, "test-thread-safety", strlen("test-thread-safety"))) { + return true; + } + } + return false; +} + +static void ggmlhexagon_print_tensors_info(const char * func_name, const ggml_backend_hexagon_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * dst) { + //skip sanity check of params because of performance concern + if (0 == g_hexagon_appcfg.dump_op_info) { + if (0 == g_hexagon_appcfg.print_tensors_info) + return; + } + + if (nullptr != func_name && nullptr != ctx) { + GGMLHEXAGON_LOG_DEBUG("call %s in dev %s\n", func_name, ctx->name); + } + if (nullptr != src0) { + GGMLHEXAGON_LOG_DEBUG( + "%-6s: type = %i (%s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)", + src0->name, + src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], + src0->ne[3], + src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]); + } + if (nullptr != src1) { + GGMLHEXAGON_LOG_DEBUG( + "%-6s: type = %i (%s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)", + src1->name, + src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->ne[3], + src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]); + } + GGMLHEXAGON_LOG_DEBUG("%-6s: type = %i (%s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)", + dst->name, + dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + 
dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3]); + GGMLHEXAGON_LOG_DEBUG("\n"); +} + +static void ggmlhexagon_dump_op_info(const struct ggml_tensor * tensor) { + //skip sanity check of params because of performance concern + if (0 == g_hexagon_appcfg.dump_op_info) + return; + + const struct ggml_tensor * src0 = tensor->src[0]; + struct ggml_tensor * src1 = tensor->src[1]; + struct ggml_tensor * dst = const_cast(tensor); + GGMLHEXAGON_LOG_DEBUG("op name:%s, tensor type:%s", ggml_op_name(tensor->op), ggml_type_name(tensor->type)); + ggmlhexagon_print_tensors_info(nullptr, nullptr, src0, src1, dst); +} + +static void ggmlhexagon_dump_tensor_elements(const ggml_tensor * tensor) { + float value = 0; + std::ostringstream tmposs; + if (tensor->type == GGML_TYPE_F32) { + for (int h = 0; h < tensor->ne[3]; h++) { + for (int i = 0; i < tensor->ne[2]; i++) { + for (int j = 0; j < tensor->ne[1]; j++) { + for (int k = 0; k < tensor->ne[0]; k++) { + value = ((float *) tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] + + j * tensor->ne[0] + k]; + tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value + << " "; + } + if (strlen(tmposs.str().c_str()) <= (GGMLHEXAGON_LOGBUF_LEN - 96)) { + GGMLHEXAGON_LOG_DEBUG("%s\n", tmposs.str().c_str()); + } + tmposs.clear(); + tmposs.str(""); + } + } + } + } + + GGMLHEXAGON_LOG_DEBUG("\n"); +} + +static void ggmlhexagon_dump_tensor(const ggml_tensor * tensor, const char * name) { + GGMLHEXAGON_LOG_DEBUG("dump ggml tensor %s(%s)\n", name, tensor->name); + GGMLHEXAGON_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64", nb = (%5zi, %5zi, %5zi, %5zi)\n", + name, + tensor->type, ggml_type_name(tensor->type), + tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], + tensor->nb[0], tensor->nb[1], tensor->nb[2], tensor->nb[2]); + ggmlhexagon_dump_tensor_elements(tensor); + + GGMLHEXAGON_LOG_DEBUG("\n"); +} + +//a simple high-cohesion and low-coupling class to collect necessary profiler data and visualize NPU performance accordingly +class hexagon_profiler { +public: + static hexagon_profiler & get_instance() { + //make thread-safety without using complex dynamic resource management + static hexagon_profiler instance; + return instance; + } + +public: + void profiler_init(int profiler_threshold_duration, int profiler_threshold_counts) { + reset(); + //here is not accurate profiler start time because inference wasn't launched at the moment + _profiler_starttime = ggml_time_us(); + + _profiler_threshold_duration = profiler_threshold_duration; + _profiler_threshold_counts = profiler_threshold_counts; + + std::string filename = std::string(g_hexagon_appcfg.runtime_libpath) + "/"; + if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { + if (g_hexagon_appcfg.thread_counts > 1) { + //multi-threading feature enabled on cDSP side + if (0 == g_hexagon_appcfg.enable_rpc_ion_mempool) { + filename = filename + "hexagon_perf_cdsp_mt.dat"; + } else { + filename = filename + "hexagon_perf_cdsp_ion_mt.dat"; + } + } else { + if (0 == g_hexagon_appcfg.enable_rpc_ion_mempool) { + filename = filename + "hexagon_perf_cdsp.dat"; + } else { + filename = filename + "hexagon_perf_cdsp_ion.dat"; + } + } + } else { + filename = filename + "hexagon_perf_qnn.dat"; + } + GGMLHEXAGON_LOG_DEBUG("profiler name:%s", filename.c_str()); + const char * profiler_filename = filename.c_str(); + _fp_profile_file = fopen(profiler_filename, "w"); + if (nullptr == _fp_profile_file) { + GGMLHEXAGON_LOG_WARN("can't open profiler file %s, reason:%s", 
profiler_filename, strerror(errno)); + reset(); + return; + } else { + size_t written_size = 0; + char profiler_info[GGMLHEXAGON_TMPBUF_LEN]; + const char * prefix = "### starting hexagon profiler at "; + + written_size = fwrite(prefix, 1, strlen(prefix), _fp_profile_file); + if (written_size != strlen(prefix)) { + GGMLHEXAGON_LOG_WARN("write data to file %s failed, reason: %s", profiler_filename, strerror(errno)); + profiler_deinit(); + return; + } + + memset(profiler_info, 0, GGMLHEXAGON_TMPBUF_LEN); + ggmlhexagon_get_timestring(profiler_info); + written_size = fwrite(profiler_info, 1, strlen(profiler_info), _fp_profile_file); + if (written_size != strlen(profiler_info)) { + GGMLHEXAGON_LOG_WARN("write data to file %s failed, reason: %s", profiler_filename, strerror(errno)); + profiler_deinit(); + return; + } + fprintf(_fp_profile_file, "\n\n"); + fprintf(_fp_profile_file, + "#frame input max total avg elapse frame max total avg\n"); + fprintf(_fp_profile_file, + "# inference inference inference inference\n"); + fprintf(_fp_profile_file, + "#index len i-len i-len i-speed time time time time time\n"); + fprintf(_fp_profile_file, "\n\n"); + } + _enable_profiler = true; + } + + void profiler_deinit() { + if (nullptr != _fp_profile_file) { + fclose(_fp_profile_file); + _fp_profile_file = nullptr; + } + reset(); + } + +/** + * \param inference_time microseconds, inference time for a single GGML op + * \param inference_input_size bytes, total input data size for a single GGML op + * \param inference_output_size bytes, total output data size for a single GGML op + */ + void profiler_update_profilerdata(const char * ggml_opname, int inference_time, int inference_input_size, int inference_output_size) { + if (!_enable_profiler) + return; + + //1.get the accurate profiler starting time in this function when frame index is 0 + //2.update frame index in this function accordingly + profiler_update_frameindex(); + + int64_t elapse_time = ggml_time_us() - profiler_get_starttime(); + profiler_update_elapsetime(elapse_time); + if (elapse_time > (_profiler_threshold_duration * SIZE_IN_MB)) { + //do nothing when elapsed profiler time > profiler_duration in ggml-hexagon.cfg + return; + } + if (profiler_get_frame_index() >= _profiler_threshold_counts) { + //do nothing when frame_index >= profiler_counts in ggml-hexagon.cfg + return; + } + + if (inference_input_size > profiler_get_max_inputsize()) { + profiler_set_max_inputsize(inference_input_size); + } + + if (inference_output_size > profiler_get_max_outputsize()) { + profiler_set_max_outputsize(inference_output_size); + } + + if (inference_time > profiler_get_max_inferencetime()) { + profiler_set_max_inferencetime(inference_time); + } + + profiler_update_total_inputsize(inference_input_size); + profiler_update_total_outputsize(inference_output_size); + profiler_update_total_inferencetime(inference_time); + profiler_update_elapsetime(elapse_time); + + if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { + if (10 > _frame_index) { + //FIXME:why some initial profiler data in llama-cli looks unusual + //return; + } + } + + if (0 == elapse_time) { + //filter invalid profiler data + return; + } + + if (NULL != _fp_profile_file) { + fprintf(_fp_profile_file, "%-8d %-6d %-6d %-10ld %-11ld %-10ld %-12d %-9d %-11ld %-3ld\n", + profiler_get_frame_index(), + inference_input_size, + profiler_get_max_inputsize(), + profiler_get_total_inputputsize(), + profiler_get_total_inputputsize() / profiler_get_frame_index(), + + elapse_time, + inference_time, + 
profiler_get_max_inferencetime(), + profiler_get_total_inferencetime(), + profiler_get_total_inferencetime() / profiler_get_frame_index() + ); + } + + //print/compare NPU I/O performance between 8 Gen 3 and 8 Elite, to be removed in the future + char bps_string[GGMLHEXAGON_TMPBUF_LEN]; + memset(bps_string, 0, GGMLHEXAGON_TMPBUF_LEN); + profiler_get_bpsstring(_total_inputsize + _total_outputsize, elapse_time, bps_string); + GGMLHEXAGON_LOG_VERBOSE("I/O performance:%s", bps_string); + } + + int profiler_get_frame_index() { + return _frame_index; + } + + int profiler_get_threshold_count() { + return _profiler_threshold_counts; + } + +private: + void profiler_set_max_inputsize(int input_size) { + _max_inputsize = input_size; + } + + void profiler_set_max_outputsize(int output_size) { + _max_outputsize = output_size; + } + + void profiler_set_max_inferencetime(int inference_time) { + _max_inferencetime = inference_time; + } + + void profiler_update_frameindex() { + if (0 == _frame_index) { + _profiler_starttime = ggml_time_us(); + } + _frame_index += 1; + } + + void profiler_update_elapsetime(int64_t elapse_time_microseconds) { + _profiler_elapsetime = elapse_time_microseconds; + } + + void profiler_update_total_inferencetime(int inference_time) { + _total_inferencetime += inference_time; + } + + void profiler_update_total_inputsize(int input_size) { + _total_inputsize += input_size; + } + + void profiler_update_total_outputsize(int output_size) { + _total_outputsize += output_size; + } + + int profiler_get_max_inputsize() { + return _max_inputsize; + } + + int profiler_get_max_outputsize() { + return _max_outputsize; + } + + int profiler_get_max_inferencetime() { + return _max_inferencetime; + } + + int64_t profiler_get_total_inferencetime() { + return _total_inferencetime; + } + + int64_t profiler_get_total_inputputsize() { + return _total_inputsize; + } + + //might be used to calculate total I/O performance in the future + int64_t profiler_get_total_outputsize() { + return _total_outputsize; + } + + int64_t profiler_get_starttime() { + return _profiler_starttime; + } + + int64_t profiler_get_elapsedtime() { + return _profiler_elapsetime; + } + + void profiler_get_bpsstring(int64_t data_size, int64_t elapse_time_microseconds, char * bps_string) { + if (nullptr == bps_string) { + return; + } + + float bps = 0.0f; + bps = (data_size * SIZE_IN_MB * 1.0f) / (elapse_time_microseconds * 1.0f); + if (bps >= SIZE_IN_MB) { + snprintf(bps_string, GGMLHEXAGON_TMPBUF_LEN, "%.2f MiB/s", ((float) bps) / SIZE_IN_MB); + } else if (bps >= 1000) { + snprintf(bps_string, GGMLHEXAGON_TMPBUF_LEN, "%.1f KiB/s", ((float) bps) / 1000); + } else { + snprintf(bps_string, GGMLHEXAGON_TMPBUF_LEN, "%.2f B/s", bps); + } + } + + void reset() { + _frame_index = 0; + + _max_inputsize = 0; + _max_outputsize = 0; + _max_inferencetime = 0; + + _total_inputsize = 0; + _total_outputsize = 0; + _total_inferencetime = 0; + + _profiler_starttime = 0; + _profiler_elapsetime = 0; + _fp_profile_file = nullptr; + _enable_profiler = false; + _profiler_threshold_counts = 100; + _profiler_threshold_duration = 5; + } + +private: + hexagon_profiler() { + reset(); + } + + hexagon_profiler(const hexagon_profiler &) = delete; + + hexagon_profiler(const hexagon_profiler &&) = delete; + + hexagon_profiler & operator= (const hexagon_profiler &) = delete; + +private: + int _frame_index; + + int _max_inputsize; //bytes + int _max_outputsize; //bytes + int _max_inferencetime; //microseconds + + int64_t _total_inputsize; //bytes + int64_t _total_outputsize;
+
+//a simple perf class to probe NPU performance
+class hexagon_perf {
+public:
+    explicit hexagon_perf(std::string perf_name) : _perf_name(std::move(perf_name)) {}
+
+    hexagon_perf(std::string perf_name, const char * op_name, int input_size, int output_size)
+            : _perf_name(std::move(perf_name)),
+              _op_name(op_name),
+              _input_size(input_size),
+              _output_size(output_size) {
+    }
+
+    void start() {
+        if (0 == g_hexagon_appcfg.enable_perf)
+            return;
+        _begin_time = ggml_time_us();
+    }
+
+    //explicit start()/info() calls are used rather than an RAII scope guard
+    void info() {
+        if (0 == g_hexagon_appcfg.enable_perf) {
+            return;
+        }
+
+        _end_time = ggml_time_us();
+        _duration = (_end_time - _begin_time);
+        //the following check is useful for other developers and AI experts although:
+        //  it deviates from the original logic
+        //  it's not mandatory
+        //  it required exposing two public functions in the hexagon_profiler class
+        if (g_hexagon_profiler.profiler_get_frame_index() <= g_hexagon_profiler.profiler_get_threshold_count()) {
+            const char * devname = ggml_backend_hexagon_get_devname(g_hexagon_appcfg.hexagon_backend);
+            //safe here because the backend id was already validated in ggml_backend_hexagon_device_init_backend
+            if (g_hexagon_appcfg.hexagon_backend != HEXAGON_BACKEND_GGML) {
+                devname += 16; //skip the 16-char "HEXAGON_BACKEND_" prefix
+            }
+            GGMLHEXAGON_LOG_VERBOSE("inference duration of %s through %s: %lld microseconds",
+                                    _perf_name.c_str(), devname, (long long)_duration);
+        }
+
+        //update profiler data
+        g_hexagon_profiler.profiler_update_profilerdata(_op_name, (int)_duration, _input_size, _output_size);
+    }
+
+private:
+    hexagon_perf() = delete;
+    hexagon_perf(const hexagon_perf &) = delete;
+    hexagon_perf(hexagon_perf &&) = delete;
+    hexagon_perf & operator= (const hexagon_perf &) = delete;
+
+private:
+    int64_t _begin_time = 0LL;
+    int64_t _end_time   = 0LL;
+    int64_t _duration   = 0LL;
+    std::string _perf_name;
+    const char * _op_name = nullptr;
+    int _input_size  = 0;
+    int _output_size = 0;
+};
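+
+//Illustrative usage sketch (not part of this patch): hexagon_perf relies on
+//explicit calls around the offloaded op; the names below are hypothetical.
+//  hexagon_perf op_perf("ggml_op_mulmat", "GGML_OP_MUL_MAT", input_bytes, output_bytes);
+//  op_perf.start();
+//  /* ... run the op on the cDSP ... */
+//  op_perf.info(); //logs the duration and feeds g_hexagon_profiler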
+
+//a simple class to load/set running configurations in ggml-hexagon.cfg
+class hexagon_appcfg {
+public:
+    hexagon_appcfg() {}
+
+    void dump(std::function<void(const std::string &, const std::string &, const std::string &)> worker) {
+        if (!_load_success) {
+            GGMLHEXAGON_LOG_INFO("hexagon appcfg file %s not loaded", _cfg_filename.c_str());
+            return;
+        }
+        auto iter = _hexagon_appcfg.begin();
+        while (iter != _hexagon_appcfg.end()) {
+            auto kv_iter = iter->second.begin();
+            while (kv_iter != iter->second.end()) {
+                worker(iter->first, kv_iter->first, kv_iter->second);
+                ++kv_iter;
+            }
+            ++iter;
+        }
+    }
+
+    bool load(const std::string & file_name) {
+        if (file_name.empty()) {
+            return false;
+        }
+        _cfg_filename = file_name;
+        std::ifstream in;
+        std::string line;
+        in.open(file_name.c_str());
+        if (not in.is_open()) {
+            GGMLHEXAGON_LOG_WARN("can't open file %s", file_name.c_str());
+            return false;
+        }
+        while (getline(in, line)) {
+            std::string section, key, value;
+            if (not parse_line(line, section, key, value)) {
+                continue;
+            }
+            set_section_keyvalue(section, key, value);
+        }
+        _load_success = true;
+        return true;
+    }
+
+    void get_stringvalue(const std::string & section, const std::string & key, std::string & value, std::string default_value) {
+        value = default_value;
+        if (_hexagon_appcfg.find(section) == _hexagon_appcfg.end()) {
+            return;
+        }
+        if (_hexagon_appcfg[section].find(key) == _hexagon_appcfg[section].end()) {
+            return;
+        }
+        value = _hexagon_appcfg[section][key];
+    }
+
+    void get_intvalue(const std::string & section, const std::string & key, int & value, int default_value) {
+        value = default_value;
+        if (_hexagon_appcfg.find(section) == _hexagon_appcfg.end()) {
+            return;
+        }
+        if (_hexagon_appcfg[section].find(key) == _hexagon_appcfg[section].end()) {
+            return;
+        }
+        value = atol(_hexagon_appcfg[section][key].c_str());
+    }
+
+    bool modify_hexagon_config(std::string & cfg_filename, int new_hexagon_backend, int new_hwaccel_approach, int new_mulmat_algotype) {
+        std::ifstream inputfile(cfg_filename);
+        if (!inputfile.is_open()) {
+            GGMLHEXAGON_LOG_WARN("can't open file %s", cfg_filename.c_str());
+            return false;
+        }
+
+        std::string filedata = "";
+
+        std::string line;
+        std::string backupline;
+        bool is_rewrite = false;
+        bool is_found   = false;
+        bool is_key     = true;
+        std::string key;
+        std::string value;
+        std::string newvalue;
+        while (std::getline(inputfile, line)) {
+            is_found   = false;
+            backupline = line;
+            trim(line);
+            if (0 == line.rfind("#", 0)) {
+                filedata += backupline;
+                filedata += "\n";
+                continue;
+            }
+
+            newvalue = "";
+            if (0 == line.rfind("hexagon_backend", 0)) {
+                if (new_hexagon_backend >= 0) {
+                    is_found   = true;
+                    is_rewrite = true;
+                    newvalue   = std::to_string(new_hexagon_backend);
+                }
+            }
+
+            if (0 == line.rfind("hwaccel_approach", 0)) {
+                //compatible with previous logic
+                if (new_hwaccel_approach >= 0) {
+                    is_found   = true;
+                    is_rewrite = true;
+                    newvalue   = std::to_string(new_hwaccel_approach);
+                }
+            }
+
+            if (0 == line.rfind("mulmat_algotype", 0)) {
+                //compatible with previous logic
+                if (new_mulmat_algotype >= 0) {
+                    is_found   = true;
+                    is_rewrite = true;
+                    newvalue   = std::to_string(new_mulmat_algotype);
+                }
+            }
+
+            if (is_found) {
+                is_key = true;
+                key    = "";
+                value  = "";
+
+                for (size_t i = 0; i < line.size(); ++i) {
+                    if (line[i] == '=') {
+                        is_key = false;
+                        continue;
+                    }
+                    if (is_key) {
+                        key += line[i];
+                    } else {
+                        value += line[i];
+                    }
+                }
+                trim(key);
+                trim(value);
+                GGMLHEXAGON_LOG_VERBOSE("key %s value %s\n", key.c_str(), value.c_str());
+                GGMLHEXAGON_LOG_VERBOSE("key %s new value %s\n", key.c_str(), newvalue.c_str());
+                backupline = key + " = " + newvalue;
+            }
+            filedata += backupline;
+            filedata += "\n";
+        }
+        inputfile.close();
+
+        if (is_rewrite) {
+            std::ofstream outputfile;
+            outputfile.open(cfg_filename);
+            outputfile << filedata;
+            outputfile.close();
+        }
+        return true;
+    }
+
+    //compatible with previous code
+    bool modify_hexagon_config(std::string & cfg_filename, int new_hexagon_backend, int new_hwaccel_approach) {
+        return modify_hexagon_config(cfg_filename, new_hexagon_backend, new_hwaccel_approach, -1);
+    }
+
+private:
+    void ltrim(std::string & str) {
+        if (str.empty()) return;
+        size_t len = 0;
+        const char * temp = str.c_str();
+        while (*temp && isblank(*temp)) {
+            ++len;
+            ++temp;
+        }
+        if (len > 0) str.erase(0, len);
+    }
+
+    void rtrim(std::string & str) {
+        if (str.empty()) return;
+        size_t len = str.length();
+        size_t pos = len;
+        while (pos > 0) {
+            if (not isblank(str[pos - 1])) {
+                break;
+            }
+            --pos;
+        }
+        if (pos != len) str.erase(pos);
+    }
+
+    void trim(std::string & str) {
+        ltrim(str);
+        rtrim(str);
+    }
+
+    void set_section_keyvalue(std::string & section, std::string & key, std::string & value) {
+        if (_hexagon_appcfg.find(section) == _hexagon_appcfg.end()) {
+            std::unordered_map<std::string, std::string> kv_map;
+            _hexagon_appcfg[section] = kv_map;
+        }
+        if (key != "" && value != "") _hexagon_appcfg[section][key] = value;
+    }
+
+    bool parse_line(std::string & line, std::string & section, std::string & key, std::string & value) {
+        static std::string cur_section = "";
+        std::string nodes[2] = {"#", ";"};
+        for (int i = 0; i < 2; ++i) {
+            std::string::size_type pos = line.find(nodes[i]);
+            if (pos != std::string::npos) line.erase(pos);
+        }
+        trim(line);
+        if (line == "") return false;
+        if (line[0] == '[' && line[line.size() - 1] == ']') {
+            section = line.substr(1, line.size() - 2);
+            trim(section);
+            cur_section = section;
+            return false;
+        }
+        if (cur_section == "") return false;
+        bool is_key = true;
+        for (size_t i = 0; i < line.size(); ++i) {
+            if (line[i] == '=') {
+                is_key = false;
+                continue;
+            }
+            if (is_key) {
+                key += line[i];
+            } else {
+                value += line[i];
+            }
+        }
+        section = cur_section;
+        trim(key);
+        trim(value);
+
+        //strip surrounding quotes: "1.00" -> 1.00 (guard against an empty value)
+        if (value.size() >= 2 && value.front() == '"' && value.back() == '"') {
+            value.erase(0, 1);             //erase the leading "
+            value.erase(value.size() - 1); //erase the trailing "
+        }
+
+        return true;
+    }
+
+private:
+    hexagon_appcfg(const hexagon_appcfg &) = delete;
+    hexagon_appcfg(hexagon_appcfg &&) = delete;
+    hexagon_appcfg & operator= (const hexagon_appcfg &) = delete;
+
+private:
+    std::unordered_map<std::string, std::unordered_map<std::string, std::string>> _hexagon_appcfg;
+    bool _load_success = false;
+    std::string _cfg_filename;
+};
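+
+//For reference, a minimal ggml-hexagon.cfg that load()/parse_line() above can
+//digest looks like this (key names match ggmlhexagon_load_cfg(); the values
+//shown are illustrative):
+//  [general]
+//  version = "1.00"
+//  enable_perf = 1
+//  hwaccel_approach = 2
+//  hexagon_backend = 3
+//  [cdsp]
+//  thread_counts = 4
+//  mulmat_algotype = 0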
+
+// =================================================================================================
+//  section-3: helper functions for WoA (Windows on ARM)
+// =================================================================================================
+#if !defined(__ANDROID__) && !defined(__linux__)
+#define RTLD_GLOBAL 0x100
+#define RTLD_LOCAL  0x000
+#define RTLD_LAZY   0x000
+#define RTLD_NOW    0x001
+static void *       dlopen(const char * filename, int flag);
+static int          dlclose(void * handle);
+static void *       dlsym(void * handle, const char * name);
+static const char * dlerror(void);
+
+static const char * last_func = nullptr;
+static long last_err;
+static void * dlopen(const char * dll, int flags) {
+    HINSTANCE h = LoadLibraryA(dll);
+    GGML_UNUSED(flags);
+    if (h == NULL) {
+        last_err  = GetLastError();
+        last_func = "dlopen";
+    }
+    return h;
+}
+
+static int dlclose(void * h) {
+    if (!FreeLibrary((HINSTANCE)h)) {
+        last_err  = GetLastError();
+        last_func = "dlclose";
+        return -1;
+    }
+    return 0;
+}
+
+static void * dlsym(void * h, const char * name) {
+    FARPROC p = GetProcAddress((HINSTANCE)h, name);
+    if (!p) {
+        last_err  = GetLastError();
+        last_func = "dlsym";
+    }
+    return (void *)(intptr_t)p;
+}
+
+static const char * dlerror(void) {
+    static char str[512];
+    if (!last_err) return nullptr;
+
+    snprintf(str, 512, "%s error #%ld", last_func, last_err);
+    last_err  = 0;
+    last_func = nullptr;
+
+    return str;
+}
+#endif
+
+// =================================================================================================
+//  section-4: general helper functions
+// =================================================================================================
+static const char * ggmlhexagon_get_socmodel_desc(uint32_t soc_model) {
+    switch (soc_model) {
+        case SM7450:
+            return "SM7450";
+        case SM8350:
+            return "SM8350";
+        case SM8450:
+            return "SM8450";
+        case SM8475:
+            return "SM8475";
+        case SM8550:
+            return "SM8550";
+        case SM8650:
+            return "SM8650";
+        case SM8750:
+            return
"SM8750"; + default: + return "unknown"; + } +} + +//0x68 -> 68, 0x69 -> 69, 0x73 -> 73, 0x75 -> 75, 0x79 -> 79 +static size_t ggmlhexagon_htparch_hex_to_decimal(size_t htp_arch) { + //naive algorithm + int a = htp_arch / 16; + int b = htp_arch % 16; + return a * 10 + b; +} + +static const char * ggmlhexagon_get_htparch_desc(size_t htp_arch) { + switch (htp_arch) { + case V68: + return "QCOM_HTP_V68"; + case V69: + return "QCOM_HTP_V69"; + case V73: + return "QCOM_HTP_V73"; + case V75: + return "QCOM_HTP_V75"; + case V79: + return "QCOM_HTP_V79"; + default: + return "unknown"; + } +} + +static struct qcom_socinfo * ggmlhexagon_get_socinfo_from_socmodel(uint32_t soc_model) { + size_t items = sizeof(g_qnn_soc_info_table) / sizeof(g_qnn_soc_info_table[0]); + for (size_t idx = 0; idx < items; idx++) { + if (soc_model == g_qnn_soc_info_table[idx].soc_model) { + return &g_qnn_soc_info_table[idx]; + } + } + return nullptr; +} + +static struct qcom_socinfo * ggmlhexagon_get_socinfo_from_socmodel(size_t htp_arch) { + size_t items = sizeof(g_qnn_soc_info_table) / sizeof(g_qnn_soc_info_table[0]); + for (size_t idx = 0; idx < items; idx++) { + if (htp_arch == g_qnn_soc_info_table[idx].htp_arch) { + return &g_qnn_soc_info_table[idx]; + } + } + return nullptr; +} + +static inline uint32_t ggmlqnn_get_tensor_data_size(const ggml_tensor * tensor) { + /* + size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); + size_t n_dims = ggml_get_tensor_rank(tensor); + for (int i = 1; i < n_dims; i++) { + data_size *= tensor->ne[i]; + } + + return data_size; + */ + return ggml_nbytes(tensor); +} + +static inline bool ggmlqnn_is_valid_params(ggml_backend_hexagon_context * ctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { + if ((nullptr == ctx) || (nullptr == src0) || (nullptr == dst)) { + GGMLHEXAGON_LOG_WARN("invalid params\n"); + return false; + } + + qnn_instance * instance = ctx->instance; + if (nullptr == instance) { + GGMLHEXAGON_LOG_WARN("invalid params\n"); + return false; + } + + return true; +} + +static size_t ggmlhexagon_get_system_total_memory_in_bytes() { +#if defined(__ANDROID__) || defined(__linux__) + struct sysinfo info = {}; + if (0 == sysinfo(&info)) { + return (info.totalram + info.totalswap) * info.mem_unit; + } + size_t pages = (size_t)sysconf(_SC_PHYS_PAGES); + size_t page_size = (size_t)sysconf(_SC_PAGE_SIZE); + + return pages * page_size; +#else + //TODO: Snapdragon based WoA(Windows on ARM) + MEMORYSTATUSEX statex; + statex.dwLength = sizeof(statex); + if (GlobalMemoryStatusEx(&statex)) { + GGMLHEXAGON_LOG_INFO("total physical mem:%llu Mb", statex.ullTotalPhys >> 20); + GGMLHEXAGON_LOG_INFO("avail physical mem:%llu Mb", statex.ullAvailPhys >> 20); + return statex.ullTotalPhys; + } + return 0; +#endif +} + +static size_t ggmlhexagon_get_system_free_memory_in_bytes() { +#if defined(__ANDROID__) || defined(__linux__) + struct sysinfo info = {}; + if (0 == sysinfo(&info)) { + return (info.freeram + info.freeswap) * info.mem_unit; + } + size_t avail_pages = (size_t)sysconf(_SC_AVPHYS_PAGES); + size_t page_size = (size_t)sysconf(_SC_PAGE_SIZE); + + return avail_pages * page_size; +#else + //TODO: Snapdragon based WoA(Windows on ARM) + MEMORYSTATUSEX statex; + statex.dwLength = sizeof(statex); + if (GlobalMemoryStatusEx(&statex)) { + GGMLHEXAGON_LOG_INFO("total physical mem:%llu Mb", statex.ullTotalPhys >> 20); + GGMLHEXAGON_LOG_INFO("avail physical mem:%llu Mb", statex.ullAvailPhys >> 20); + return statex.ullAvailPhys; + } + return 0; +#endif +} + +static 
bool ggmlhexagon_same_types(const ggml_backend_hexagon_context * ctx, const ggml_tensor * op_tensor) {
+    GGML_UNUSED(ctx);
+    ggml_tensor * src0 = op_tensor->src[0];
+    ggml_tensor * src1 = op_tensor->src[1];
+    if (nullptr != src1) {
+        if (src0->type != op_tensor->type || src1->type != op_tensor->type) {
+            return false;
+        }
+    } else {
+        if (src0->type != op_tensor->type) {
+            return false;
+        }
+    }
+
+    if (src0->type != GGML_TYPE_F32)
+        return false;
+
+    return true;
+}
+
+static const char * ggmlhexagon_get_ggml_type_name(ggml_type type) {
+    const auto * traits = ggml_get_type_traits(type);
+    return traits->type_name;
+}
+
+static void ggmlhexagon_append_tensor_dimensions(const ggml_tensor * tensor, std::string & output) {
+    char buffer[GGMLHEXAGON_TMPBUF_LEN] = {};
+    const char * type_name = ggmlhexagon_get_ggml_type_name(tensor->type);
+    int len = 0;
+    switch (ggml_n_dims(tensor)) {
+        case 1:
+            len = snprintf(buffer, sizeof(buffer), "%ldx1%s", (long)tensor->ne[0], type_name);
+            break;
+        case 2:
+            len = snprintf(buffer, sizeof(buffer), "%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], type_name);
+            break;
+        case 3:
+            len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1],
+                           (long)tensor->ne[2], type_name);
+            break;
+        case 4:
+        default:
+            len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1],
+                           (long)tensor->ne[2], (long)tensor->ne[3], type_name);
+            break;
+    }
+    GGML_ASSERT(len > 0 && len < (int)sizeof(buffer));
+    output.append(buffer, len);
+}
+
+static size_t ggmlhexagon_get_op_index(const ggml_tensor * tensor) {
+    if (tensor->op == GGML_OP_UNARY) {
+        return static_cast<size_t>(GGML_OP_COUNT) + static_cast<size_t>(ggml_get_unary_op(tensor));
+    }
+
+    return tensor->op;
+}
+
+static size_t ggmlhexagon_get_op_input_param_count(const ggml_tensor * op) {
+    auto op_index = ggmlhexagon_get_op_index(op);
+    GGML_ASSERT(op_index < std::size(ggmlhexagon_k_op_caps));
+    return ggmlhexagon_k_op_caps[op_index].input_param_count;
+}
+
+static void ggmlhexagon_get_opkey_from_op(const ggml_tensor * op, std::string & output) {
+    GGML_ASSERT(op->op != GGML_OP_NONE);
+    output += ggml_op_desc(op);
+    output += ggmlhexagon_get_ggml_type_name(op->type);
+    size_t param_count = ggmlhexagon_get_op_input_param_count(op);
+    for (size_t i = 0; i < param_count; ++i) {
+        auto * input = op->src[i];
+        if (!input) {
+            break;
+        }
+        output += '_';
+        ggmlhexagon_append_tensor_dimensions(input, output);
+    }
+}
+
+static void * ggmlhexagon_type_trait(ggml_backend_hexagon_context * ctx, ggml_tensor * op) {
+    const ggml_tensor * src0 = op->src[0];
+    const ggml_tensor * src1 = op->src[1];
+    ggml_tensor * dst = op;
+    const enum ggml_type src0_type = src0->type;
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+    GGML_ASSERT(ne0 == ne01);
+    GGML_ASSERT(ne1 == ne11);
+    GGML_ASSERT(ne2 == ne12);
+    GGML_ASSERT(ne3 == ne13);
+    GGML_ASSERT(nb00 == ggml_type_size(src0_type));
+    GGML_ASSERT(nb10 == ggml_type_size(src1->type));
+
+    const int64_t ne_plane = ne01 * ne00;
+    const size_t desired_size = ((GGML_TYPE_F32 == src0_type) ?
0 : ne03 * ne02 * ne_plane * sizeof(float)); + ctx->desired_size = desired_size; + if (ctx->work_size < desired_size) { + ctx->work_data.reset(new char[desired_size]); + ctx->work_size = desired_size; + } + ctx->n_threads = std::thread::hardware_concurrency(); + void * wdata = ctx->work_data.get(); + // convert src0 to float + if (src0_type != GGML_TYPE_F32) { + const auto * type_traits = ggml_get_type_traits(src0_type); + ggml_to_float_t const to_float = type_traits->to_float; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + const void * x = (char *)src0->data + i02 * nb02 + i03 * nb03; + float * const wplane = (float *)wdata + i02 * ne_plane + i03 * ne02 * ne_plane; + + const int min_cols_per_thread = 4096; + const int min_rows_per_thread = std::max((int)(min_cols_per_thread / ne00), 1); + const int n_threads = std::max(std::min(ctx->n_threads, (int)(ne01 / min_rows_per_thread)), 1); + for (int i = 1; i < n_threads; i++) { + const int64_t start = i * ne01 / n_threads; + const int64_t end = (i + 1) * ne01 / n_threads; + if (start < end) { + ctx->tasks.push_back(std::async(std::launch::async, [=]() { + for (int64_t i01 = start; i01 < end; i01++) { + to_float((const char *)x + i01 * nb01, wplane + i01 * ne00, ne00); + } + })); + } + } + { + // reuse the current thread for the first task + const int64_t start = 0; + const int64_t end = ne01 / n_threads; + for (int64_t i01 = start; i01 < end; i01++) { + to_float((const char *) x + i01 * nb01, wplane + i01 * ne00, ne00); + } + } + } + } + + // wait for all tasks to finish + for (auto &task: ctx->tasks) { + task.get(); + } + ctx->tasks.clear(); + } + return wdata; +} + +static void ggmlhexagon_set_runtime_path(size_t device, const std::string & path) { +#if defined(__ANDROID__) + if ((HEXAGON_BACKEND_QNNNPU == device) || (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach)) { + std::string lib_runtime_path = path + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images"; + if (0 == setenv("LD_LIBRARY_PATH", lib_runtime_path.c_str(), 1)) { + GGMLHEXAGON_LOG_DEBUG("setenv LD_LIBRARY_PATH %s successfully", lib_runtime_path.c_str()); + } else { + GGMLHEXAGON_LOG_ERROR("setenv LD_LIBRARY_PATH %s failure", lib_runtime_path.c_str()); + } + + std::string adsp_runtime_path = path + ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp"; + if (0 == setenv("ADSP_LIBRARY_PATH", adsp_runtime_path.c_str(), 1)) { + GGMLHEXAGON_LOG_DEBUG("setenv ADSP_LIBRARY_PATH %s successfully", adsp_runtime_path.c_str()); + } else { + GGMLHEXAGON_LOG_ERROR("setenv ADSP_LIBRARY_PATH %s failure", adsp_runtime_path.c_str()); + } + } else { + if (0 == setenv("LD_LIBRARY_PATH", + (path + + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(), + 1)) { + GGMLHEXAGON_LOG_DEBUG("%s backend setenv successfully\n", + ggml_backend_hexagon_get_devname(device)); + } else { + GGMLHEXAGON_LOG_ERROR("%s backend setenv failure\n", + ggml_backend_hexagon_get_devname(device)); + } + } +#endif +} + +static void ggmlhexagon_load_cfg() { + //this function can be called in various scenarios + static bool initialized = false; + if (initialized) { + GGMLHEXAGON_LOG_DEBUG("hexagon appcfg file already loaded\n"); + return; + } + char time_string[GGMLHEXAGON_TMPBUF_LEN]; + memset(time_string, 0, GGMLHEXAGON_TMPBUF_LEN); + ggmlhexagon_get_timestring(time_string); + GGMLHEXAGON_LOG_DEBUG("program running start time:%s", time_string); + std::string cfg_filename = 
std::string(g_hexagon_appcfg.runtime_libpath) + std::string(g_hexagon_appcfg.cfgfilename);
+
+    hexagon_appcfg hexagoncfg_instance;
+    hexagoncfg_instance.load(cfg_filename);
+    hexagoncfg_instance.dump([](const std::string & section, const std::string & key, const std::string & value) {
+        std::ostringstream tmposs;
+        tmposs << "section[" << std::setw(10) << std::left << section << "],[" << std::setw(25) << std::left << key << "] = [" << value << "]";
+        GGMLHEXAGON_LOG_VERBOSE("%s", tmposs.str().c_str());
+    });
+    std::string precision_mode;
+    std::string version;         //version of ggml-hexagon.cpp
+    std::string ggmldsp_version; //version of ggml-dsp.c
+    hexagoncfg_instance.get_stringvalue("general", "version", version, "1.00");
+    hexagoncfg_instance.get_stringvalue("general", "ggmldsp_version", ggmldsp_version, "0.62");
+    hexagoncfg_instance.get_intvalue("general", "enable_perf", g_hexagon_appcfg.enable_perf, 1);
+    hexagoncfg_instance.get_intvalue("general", "print_tensors_info", g_hexagon_appcfg.print_tensors_info, 0);
+    hexagoncfg_instance.get_intvalue("general", "dump_op_info", g_hexagon_appcfg.dump_op_info, 0);
+    hexagoncfg_instance.get_intvalue("general", "hwaccel_approach", g_hexagon_appcfg.hwaccel_approach, HWACCEL_CDSP);
+    hexagoncfg_instance.get_intvalue("general", "hexagon_backend", g_hexagon_appcfg.hexagon_backend, HEXAGON_BACKEND_CDSP);
+    hexagoncfg_instance.get_intvalue("general", "enable_q_mulmat", g_hexagon_appcfg.enable_q_mulmat, 0);
+    hexagoncfg_instance.get_intvalue("general", "enable_profiler", g_hexagon_appcfg.enable_profiler, 0);
+    hexagoncfg_instance.get_intvalue("general", "profiler_duration", g_hexagon_appcfg.profiler_duration, 5);
+    hexagoncfg_instance.get_intvalue("general", "profiler_counts", g_hexagon_appcfg.profiler_counts, 100);
+    hexagoncfg_instance.get_intvalue("general", "enable_pinned_memory", g_hexagon_appcfg.enable_pinned_memory, 0);
+
+    hexagoncfg_instance.get_intvalue("qnn", "hvx_threads", g_hexagon_appcfg.hvx_threads, 4);
+    hexagoncfg_instance.get_intvalue("qnn", "vtcm_size_in_mb", g_hexagon_appcfg.vtcm_size_in_mb, 8);
+    hexagoncfg_instance.get_intvalue("qnn", "enable_dlbc", g_hexagon_appcfg.enable_dlbc, 1);
+    hexagoncfg_instance.get_stringvalue("qnn", "precision_mode", precision_mode, "fp32");
+    hexagoncfg_instance.get_intvalue("qnn", "print_qnn_internal_log", g_hexagon_appcfg.print_qnn_internal_log, 0);
+
+    hexagoncfg_instance.get_intvalue("cdsp", "enable_rpc_ion_mempool", g_hexagon_appcfg.enable_rpc_ion_mempool, 0);
+    hexagoncfg_instance.get_intvalue("cdsp", "enable_all_q_mulmat", g_hexagon_appcfg.enable_all_q_mulmat, 0);
+    hexagoncfg_instance.get_intvalue("cdsp", "thread_counts", g_hexagon_appcfg.thread_counts, 4);
+    hexagoncfg_instance.get_intvalue("cdsp", "mulmat_algotype", g_hexagon_appcfg.mulmat_algotype, 0);
+
+    //bounded copy with guaranteed NUL termination (the original memcpy copied strlen()
+    //bytes without the terminator); assumes ggml_dsp_version is a fixed-size char array
+    snprintf(g_hexagon_appcfg.ggml_dsp_version, sizeof(g_hexagon_appcfg.ggml_dsp_version), "%s", ggmldsp_version.c_str());
+
+    GGMLHEXAGON_LOG_VERBOSE("load hexagon appcfg from %s", cfg_filename.c_str());
+    GGMLHEXAGON_LOG_VERBOSE("internal ggml_hexagon_version=%s", g_hexagon_appcfg.ggml_hexagon_version);
+    GGMLHEXAGON_LOG_VERBOSE("internal ggml_dsp_version=%s", g_hexagon_appcfg.ggml_dsp_version);
+    GGMLHEXAGON_LOG_VERBOSE("external ggml_hexagon_version=%s", version.c_str());
+    GGMLHEXAGON_LOG_VERBOSE("external ggml_dsp_version=%s", ggmldsp_version.c_str());
+    GGMLHEXAGON_LOG_VERBOSE("hwaccel_approach=%d(%s)", g_hexagon_appcfg.hwaccel_approach,
+                            ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach));
GGMLHEXAGON_LOG_VERBOSE("hexagon_backend=%d(%s)", g_hexagon_appcfg.hexagon_backend, + ggml_backend_hexagon_get_devname(g_hexagon_appcfg.hexagon_backend)); + GGMLHEXAGON_LOG_VERBOSE("runtime libpath=%s", g_hexagon_appcfg.runtime_libpath); + GGMLHEXAGON_LOG_VERBOSE("enable_perf=%d", g_hexagon_appcfg.enable_perf); + GGMLHEXAGON_LOG_VERBOSE("enable_profiler=%d", g_hexagon_appcfg.enable_profiler); + + if (precision_mode.find("fp16") != std::string::npos) { + g_hexagon_appcfg.precision_mode = 1; + } else { + g_hexagon_appcfg.precision_mode = 0; + } + + ggmlhexagon_set_runtime_path(HEXAGON_BACKEND_CDSP, g_hexagon_appcfg.runtime_libpath); + + if (1 == g_hexagon_appcfg.enable_profiler) { + //make sure this function is called only once + g_hexagon_profiler.profiler_init(g_hexagon_appcfg.profiler_duration, g_hexagon_appcfg.profiler_counts); + } + + initialized = true; +} + +void ggml_backend_hexagon_set_cfg(int new_hexagon_backend, int new_hwaccel_approach) { + if (new_hexagon_backend < 0 || new_hexagon_backend > HEXAGON_BACKEND_GGML) { + GGMLHEXAGON_LOG_WARN("invalid new_hexagon_backend"); + return; + } + if (new_hwaccel_approach < 0 || new_hwaccel_approach > HWACCEL_CDSP) { + GGMLHEXAGON_LOG_WARN("invalid new_hwaccel_approach"); + return; + } + std::string cfg_filename = std::string(g_hexagon_appcfg.runtime_libpath) + std::string(g_hexagon_appcfg.cfgfilename); + GGMLHEXAGON_LOG_VERBOSE("load hexagon appcfg from %s", cfg_filename.c_str()); + hexagon_appcfg hexagoncfg_instance; + GGMLHEXAGON_LOG_VERBOSE("set_hexagon_cfg with new_hexagon_backend %d, new_hwaccel_approach %d", new_hexagon_backend, new_hwaccel_approach); + hexagoncfg_instance.modify_hexagon_config(cfg_filename, new_hexagon_backend, new_hwaccel_approach); + hexagoncfg_instance.load(cfg_filename); + hexagoncfg_instance.dump([](const std::string & section, const std::string & key, const std::string value) { + std::ostringstream tmposs; + tmposs << "section[" << std::setw(10) << std::left << section << "],[" << std::setw(25) << std::left << key << "] = [" << value << "]"; + GGMLHEXAGON_LOG_VERBOSE("%s", tmposs.str().c_str()); + }); +} + +int ggml_backend_hexagon_get_mulmat_algotype() { + std::string cfg_filename = std::string(g_hexagon_appcfg.runtime_libpath) + std::string(g_hexagon_appcfg.cfgfilename); + hexagon_appcfg hexagoncfg_instance; + hexagoncfg_instance.load(cfg_filename); + hexagoncfg_instance.get_intvalue("cdsp", "mulmat_algotype", g_hexagon_appcfg.mulmat_algotype, 0); + return g_hexagon_appcfg.mulmat_algotype; +} + +/** + * troubleshooting peformance of mulmat on cDSP during development stage + */ +void ggml_backend_hexagon_set_mulmat_algotype(int new_mulmat_algotype) { + //the logic here is different with logic in the ggml_backend_hexagon_set_cfg(int new_hexagon_backend, int new_hwaccel_approach) + if (new_mulmat_algotype < 0) { + GGMLHEXAGON_LOG_WARN("invalid new_mulmat_algotype"); + return; + } + std::string cfg_filename = std::string(g_hexagon_appcfg.runtime_libpath) + std::string(g_hexagon_appcfg.cfgfilename); + GGMLHEXAGON_LOG_VERBOSE("load hexagon appcfg from %s", cfg_filename.c_str()); + hexagon_appcfg hexagoncfg_instance; + GGMLHEXAGON_LOG_VERBOSE("set_hexagon_cfg with new_mulmat_algotype %d", new_mulmat_algotype); + hexagoncfg_instance.modify_hexagon_config(cfg_filename, -1, -1, new_mulmat_algotype); + hexagoncfg_instance.load(cfg_filename); + hexagoncfg_instance.dump([](const std::string & section, const std::string & key, const std::string value) { + std::ostringstream tmposs; + tmposs << "section[" << std::setw(10) 
<< std::left << section << "],[" << std::setw(25) << std::left << key << "] = [" << value << "]"; + GGMLHEXAGON_LOG_VERBOSE("%s", tmposs.str().c_str()); + }); +} + +static bool ggmlhexagon_check_valid_appcfg() { + bool is_valid_appcfg = true; + + GGMLHEXAGON_LOG_DEBUG("user's specified hwaccel approach=%d(%s)", g_hexagon_appcfg.hwaccel_approach, + ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach)); + GGMLHEXAGON_LOG_DEBUG("user's specified hexagon_backend=%d", g_hexagon_appcfg.hexagon_backend); + if (g_hexagon_appcfg.hexagon_backend >= GGML_HEXAGON_MAX_DEVICES) { + GGMLHEXAGON_LOG_VERBOSE("using default ggml backend"); + is_valid_appcfg = false; + } + + if (HWACCEL_QNN_SINGLEGRAPH == g_hexagon_appcfg.hwaccel_approach) { + GGMLHEXAGON_LOG_VERBOSE("HWACCEL_QNN_SINGLEGRAPH not supported"); + is_valid_appcfg = false; + } + + if (HWACCEL_QNN == g_hexagon_appcfg.hwaccel_approach) { + if (HEXAGON_BACKEND_CDSP == g_hexagon_appcfg.hexagon_backend) { + GGMLHEXAGON_LOG_VERBOSE("hexagon_backend HEXAGON_BACKEND_CDSP must match with hwaccel_approach HWACCEL_CDSP"); + is_valid_appcfg = false; + } + } + + if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { + if ((HEXAGON_BACKEND_CDSP != g_hexagon_appcfg.hexagon_backend) && (HEXAGON_BACKEND_GGML != g_hexagon_appcfg.hexagon_backend)) { + GGMLHEXAGON_LOG_VERBOSE("hwaccel_approach HWACCEL_CDSP must match with hexagon_backend HEXAGON_BACKEND_CDSP"); + is_valid_appcfg = false; + } + + if (1 == g_hexagon_appcfg.enable_all_q_mulmat) { + if (0 == g_hexagon_appcfg.enable_q_mulmat) { + GGMLHEXAGON_LOG_DEBUG("ensure set enable_q_mulmat to 1 firstly when set enable_all_q_mulmat to 1 if you are not currently comparing the performance of GGML_OP_ADD between QNNCPU, QNNGPU, QNNNPU, cDSP, ggml"); + //is_valid_appcfg = false; + } + } + } + + if (!is_valid_appcfg) { + GGMLHEXAGON_LOG_VERBOSE("it seems there is non-default configuration in ggml-hexagon.cfg, will using the default ggml backend accordingly"); + } + return is_valid_appcfg; +} + +static void ggmlhexagon_probe_dspinfo(ggml_backend_hexagon_context * ctx); +static void ggmlhexagon_print_running_timestamp(ggml_backend_hexagon_context * ctx) { + char timestamp[GGMLHEXAGON_TMPBUF_LEN]; + memset(timestamp, 0, GGMLHEXAGON_TMPBUF_LEN); + + if (ggmlhexagon_is_llamabench_running()) { + //make llama-bench happy + return; + } + + GGMLHEXAGON_LOG_INFO("ggml_hexagon_version: %s", g_hexagon_appcfg.ggml_hexagon_version); + GGMLHEXAGON_LOG_INFO("ggml_dsp_version: %s", g_hexagon_appcfg.ggml_dsp_version); + GGMLHEXAGON_LOG_INFO("hwaccel approach: %d(%s)", g_hexagon_appcfg.hwaccel_approach, + ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach)); + GGMLHEXAGON_LOG_INFO("hexagon_backend: %d(%s)", g_hexagon_appcfg.hexagon_backend, + ggml_backend_hexagon_get_devname(g_hexagon_appcfg.hexagon_backend)); + GGMLHEXAGON_LOG_INFO("enable pinned_memory: %s", g_hexagon_appcfg.enable_pinned_memory ? "YES" : "NO"); + ggmlhexagon_get_timestring(timestamp); + if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { + GGMLHEXAGON_LOG_INFO("offload quantize GGML_OP_MUL_MAT: %s", g_hexagon_appcfg.enable_q_mulmat ? "YES" : "NO"); + GGMLHEXAGON_LOG_INFO("using rpc ion memory pool: %s", g_hexagon_appcfg.enable_rpc_ion_mempool ? 
"YES" : "NO"); + GGMLHEXAGON_LOG_INFO("thread_counts with HWACCEL_CDSP: %d", g_hexagon_appcfg.thread_counts); + GGMLHEXAGON_LOG_INFO("mulmat algo type on cDSP : %d", g_hexagon_appcfg.mulmat_algotype); + ggmlhexagon_probe_dspinfo(ctx); + } else { + GGMLHEXAGON_LOG_INFO("thread_counts with HWACCEL_QNN: %d", g_hexagon_appcfg.hvx_threads); + GGMLHEXAGON_LOG_INFO("offload quantize GGML_OP_MUL_MAT: %s", g_hexagon_appcfg.enable_q_mulmat ? "YES" : "NO"); + } + GGMLHEXAGON_LOG_INFO("running timestamp:%s", timestamp); + + if (1 == g_hexagon_appcfg.enable_profiler) { + //make sure this function is called only once + g_hexagon_profiler.profiler_deinit(); + } +} + +// ================================================================================================= +// section-5: QNN helper function/class +// ================================================================================================= +//make sure every QNN tensor/opcfg name is unique, threadsafe is not required at the moment +static void ggmlqnn_reset_idx() { + g_qnntensor_idx = 0; + g_qnnopcfg_idx = 0; +} + +static void ggmlqnn_inc_idx(int idx_type) { + switch (idx_type) { + case QNN_TENSOR_INDEX: + g_qnntensor_idx++; + break; + case QNN_OPCFG_INDEX: + g_qnnopcfg_idx++; + break; + default: + break; + } +} + +static int32_t ggmlqnn_get_idx(int idx_type) { + switch (idx_type) { + case QNN_TENSOR_INDEX: + return g_qnntensor_idx; + case QNN_OPCFG_INDEX: + return g_qnnopcfg_idx; + default: + break; + } + + //it's not make sense, just for fix compiler warning + return g_qnntensor_idx; +} + +static intptr_t ggmlqnn_align_to(size_t alignment, intptr_t offset) { + return offset % alignment == 0 ? offset + : offset + + (static_cast(alignment) - + offset % static_cast(alignment)); +} + +static size_t ggmlqnn_memscpy(void * dst, size_t dst_size, const void * src, size_t copy_size) { + if (!dst || !src || !dst_size || !copy_size) + return 0; + + size_t min_size = dst_size < copy_size ? 
dst_size : copy_size; + + memcpy(dst, src, min_size); + + return min_size; +} + +static char * ggmlqnn_strndup(const char * source, size_t maxlen) { +#if defined(__ANDROID__) || defined(__linux__) + return strndup(source, maxlen); +#else + //TODO:behaviour is not exactly same to Android&Linux + GGML_UNUSED(maxlen); + return strdup(source); +#endif +} + +static inline uint32_t ggmlqnn_get_tensorid(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.id; + } + return 0u; +} + +static inline const char * ggmlqnn_get_tensorname(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.name; + } + return nullptr; +} + +static inline Qnn_TensorType_t ggmlqnn_get_tensortype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.type; + } + return QNN_TENSOR_TYPE_UNDEFINED; +} + +static inline Qnn_TensorDataFormat_t ggmlqnn_get_tensor_dataformat(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dataFormat; + } + return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; +} + +static inline Qnn_DataType_t ggmlqnn_get_tensor_datatype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dataType; + } + return QNN_DATATYPE_UNDEFINED; +} + +static inline Qnn_QuantizeParams_t ggmlqnn_get_tensor_quantparams(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.quantizeParams; + } + return QNN_QUANTIZE_PARAMS_INIT; +} + +static inline uint32_t ggmlqnn_get_tensor_rank(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.rank; + } + return 0u; +} + +static inline uint32_t * ggmlqnn_get_tensor_dimensions(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dimensions; + } + return nullptr; +} + +static inline Qnn_TensorMemType_t ggmlqnn_get_tensor_memtype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.memType; + } + return QNN_TENSORMEMTYPE_UNDEFINED; +} + +static inline void ggmlqnn_set_tensor_id(Qnn_Tensor_t & tensor, uint32_t id) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.id = id; + } +} + +static inline void ggmlqnn_set_tensor_name(Qnn_Tensor_t & tensor, const char * name) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.name = name; + } +} + +static inline void ggmlqnn_set_tensor_type(Qnn_Tensor_t & tensor, Qnn_TensorType_t type) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.type = type; + } +} + +static inline void ggmlqnn_set_tensor_dataformat(Qnn_Tensor_t & tensor, Qnn_TensorDataFormat_t format) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dataFormat = format; + } +} + +static inline void ggmlqnn_set_tensor_datatype(Qnn_Tensor_t & tensor, Qnn_DataType_t dataType) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dataType = dataType; + } +} + +static inline void ggmlqnn_set_tensor_quantparams(Qnn_Tensor_t & tensor, Qnn_QuantizeParams_t params) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.quantizeParams = params; + } +} + +static inline void ggmlqnn_set_tensor_rank(Qnn_Tensor_t & tensor, uint32_t rank) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.rank = rank; + } +} + +static inline void ggmlqnn_set_tensor_dimensions(Qnn_Tensor_t & tensor, uint32_t * dims) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + 
tensor.v1.dimensions = dims; + } +} + +static inline void ggmlqnn_set_tensor_memtype(Qnn_Tensor_t & tensor, Qnn_TensorMemType_t memType) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.memType = memType; + } +} + +static inline void ggmlqnn_set_tensor_clientbuf(Qnn_Tensor_t & tensor, Qnn_ClientBuffer_t clientBuf) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.clientBuf = clientBuf; + } +} + +static inline void ggmlqnn_set_tensor_memhandle(Qnn_Tensor_t & tensor, Qnn_MemHandle_t handle) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.memHandle = handle; + } +} + +static int ggmlqnn_deep_copy_qnntensor(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { + int err = 0; + + dst.version = src.version; + ggmlqnn_set_tensor_name(dst, ggmlqnn_strndup(ggmlqnn_get_tensorname(src), std::string(ggmlqnn_get_tensorname(src)).size())); + if (nullptr == ggmlqnn_get_tensorname(dst)) { + return 1; + } + ggmlqnn_set_tensor_id(dst, ggmlqnn_get_tensorid(src)); + ggmlqnn_set_tensor_type(dst, ggmlqnn_get_tensortype(src)); + ggmlqnn_set_tensor_dataformat(dst, ggmlqnn_get_tensor_dataformat(src)); + ggmlqnn_set_tensor_datatype(dst, ggmlqnn_get_tensor_datatype(src)); + ggmlqnn_set_tensor_memtype(dst, ggmlqnn_get_tensor_memtype(src)); + + if (ggmlqnn_get_tensor_memtype(src) == QNN_TENSORMEMTYPE_RAW) { + Qnn_ClientBuffer_t client_buf = {nullptr, 0}; + ggmlqnn_set_tensor_clientbuf(dst, client_buf); + } else if (ggmlqnn_get_tensor_memtype(src) == QNN_TENSORMEMTYPE_MEMHANDLE) { + ggmlqnn_set_tensor_memhandle(dst, nullptr); + } else { + return 1; + } + + Qnn_QuantizeParams_t src_qparam = ggmlqnn_get_tensor_quantparams(src); + Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; + if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { + Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; + Qnn_AxisScaleOffset_t & axis_scale_offset = src_qparam_cpy.axisScaleOffsetEncoding; + Qnn_ScaleOffset_t ** scale_offset = &axis_scale_offset.scaleOffset; + size_t scale_offset_size = axis_scale_offset.numScaleOffsets * sizeof(Qnn_ScaleOffset_t); + *scale_offset = (Qnn_ScaleOffset_t *)malloc(scale_offset_size); + ggmlqnn_memscpy(*scale_offset, + scale_offset_size, + src_qparam.axisScaleOffsetEncoding.scaleOffset, + scale_offset_size); + ggmlqnn_set_tensor_quantparams(dst, src_qparam_cpy); + } else if (encoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) { + Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; + Qnn_BwAxisScaleOffset_t & bwaxis_scale_offset = src_qparam_cpy.bwAxisScaleOffsetEncoding; + size_t scale_size = bwaxis_scale_offset.numElements * sizeof(float); + float ** scales = &bwaxis_scale_offset.scales; + int32_t ** offsets = &bwaxis_scale_offset.offsets; + *scales = (float *)malloc(scale_size); + ggmlqnn_memscpy(*scales, scale_size, src_qparam.bwAxisScaleOffsetEncoding.scales, scale_size); + + if (bwaxis_scale_offset.offsets != nullptr) { + size_t offset_size = bwaxis_scale_offset.numElements * sizeof(int32_t); + *offsets = (int32_t *)malloc(offset_size); + ggmlqnn_memscpy(*offsets, offset_size, src_qparam.bwAxisScaleOffsetEncoding.offsets, offset_size); + } + ggmlqnn_set_tensor_quantparams(dst, src_qparam_cpy); + } else { + ggmlqnn_set_tensor_quantparams(dst, src_qparam); + } + + uint32_t rank = ggmlqnn_get_tensor_rank(src); + ggmlqnn_set_tensor_rank(dst, rank); + size_t dim_size = GGML_MAX_DIMS * sizeof(uint32_t); + uint32_t * dimensions = (uint32_t *)malloc(dim_size); + if (nullptr == dimensions) { + GGMLHEXAGON_LOG_WARN("deep_copy_qnn_tensors() allocation error 
while copying tensor %s\n", ggmlqnn_get_tensorname(src)); + return 1; + } + ggmlqnn_memscpy(dimensions, dim_size, ggmlqnn_get_tensor_dimensions(src), dim_size); + ggmlqnn_set_tensor_dimensions(dst, dimensions); + + return err; +} + +static int ggmlqnn_free_qnntensor(Qnn_Tensor_t * tensor) { + int err = 0; + free((void *) ggmlqnn_get_tensorname(*tensor)); + Qnn_QuantizeParams_t src_qparam = ggmlqnn_get_tensor_quantparams(*tensor); + Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; + if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { + free(src_qparam.axisScaleOffsetEncoding.scaleOffset); + } else if (encoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) { + free(src_qparam.bwAxisScaleOffsetEncoding.scales); + if (src_qparam.bwAxisScaleOffsetEncoding.offsets != nullptr) { + free(src_qparam.bwAxisScaleOffsetEncoding.offsets); + } + } + GGMLHEXAGON_LOG_DEBUG("free tensor %p", tensor); + free(ggmlqnn_get_tensor_dimensions(*tensor)); + free(tensor); + + return err; +} + +static const char * ggmlqnn_get_qnnerror_string(Qnn_ErrorHandle_t qnn_error_code) { + // file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/api_error_codes.html + switch (qnn_error_code) { + case QNN_SUCCESS: + return "QNN_SUCCESS"; + case QNN_COMMON_ERROR_GENERAL: + return "QNN_COMMON_ERROR_GENERAL"; + + // QnnGraph_Error_t + case QNN_GRAPH_ERROR_UNSUPPORTED_FEATURE: + return "QNN_GRAPH_ERROR_UNSUPPORTED_FEATURE"; + case QNN_GRAPH_ERROR_MEM_ALLOC: + return "QNN_GRAPH_ERROR_MEM_ALLOC"; + case QNN_GRAPH_ERROR_INVALID_ARGUMENT: + return "QNN_GRAPH_ERROR_INVALID_ARGUMENT"; + case QNN_GRAPH_ERROR_INVALID_HANDLE: + return "QNN_GRAPH_ERROR_INVALID_HANDLE"; + case QNN_GRAPH_ERROR_GRAPH_DOES_NOT_EXIST: + return "QNN_GRAPH_ERROR_GRAPH_DOES_NOT_EXIST"; + case QNN_GRAPH_ERROR_INVALID_NAME: + return "QNN_GRAPH_ERROR_INVALID_NAME"; + case QNN_GRAPH_ERROR_INVALID_TENSOR: + return "QNN_GRAPH_ERROR_INVALID_TENSOR"; + case QNN_GRAPH_ERROR_INVALID_OP_CONFIG: + return "QNN_GRAPH_ERROR_INVALID_OP_CONFIG"; + case QNN_GRAPH_ERROR_SET_PROFILE: + return "QNN_GRAPH_ERROR_SET_PROFILE"; + case QNN_GRAPH_ERROR_UNCONNECTED_NODE: + return "QNN_GRAPH_ERROR_UNCONNECTED_NODE"; + case QNN_GRAPH_ERROR_CREATE_FAILED: + return "QNN_GRAPH_ERROR_CREATE_FAILED"; + case QNN_GRAPH_ERROR_OPTIMIZATION_FAILED: + return "QNN_GRAPH_ERROR_OPTIMIZATION_FAILED"; + case QNN_GRAPH_ERROR_FINALIZE_FAILED: + return "QNN_GRAPH_ERROR_FINALIZE_FAILED"; + case QNN_GRAPH_ERROR_GRAPH_NOT_FINALIZED: + return "QNN_GRAPH_ERROR_GRAPH_NOT_FINALIZED"; + case QNN_GRAPH_ERROR_GRAPH_FINALIZED: + return "QNN_GRAPH_ERROR_GRAPH_FINALIZED"; + case QNN_GRAPH_ERROR_EXECUTION_ASYNC_FIFO_FULL: + return "QNN_GRAPH_ERROR_EXECUTION_ASYNC_FIFO_FULL"; + case QNN_GRAPH_ERROR_SIGNAL_IN_USE: + return "QNN_GRAPH_ERROR_SIGNAL_IN_USE"; + case QNN_GRAPH_ERROR_ABORTED: + return "QNN_GRAPH_ERROR_ABORTED"; + case QNN_GRAPH_ERROR_PROFILE_IN_USE: + return "QNN_GRAPH_ERROR_PROFILE_IN_USE"; + case QNN_GRAPH_ERROR_TIMED_OUT: + return "QNN_GRAPH_ERROR_TIMED_OUT"; + case QNN_GRAPH_ERROR_SUBGRAPH: + return "QNN_GRAPH_ERROR_SUBGRAPH"; + case QNN_GRAPH_ERROR_DISABLED: + return "QNN_GRAPH_ERROR_DISABLED"; + case QNN_GRAPH_ERROR_DYNAMIC_TENSOR_SHAPE: + return "QNN_GRAPH_ERROR_DYNAMIC_TENSOR_SHAPE"; + case QNN_GRAPH_ERROR_TENSOR_SPARSITY: + return "QNN_GRAPH_ERROR_TENSOR_SPARSITY"; + case QNN_GRAPH_ERROR_EARLY_TERMINATION: + return "QNN_GRAPH_ERROR_EARLY_TERMINATION"; + case QNN_GRAPH_ERROR_INVALID_CONTEXT: + return "QNN_GRAPH_ERROR_INVALID_CONTEXT"; + + //QQnnTensor_Error_t + 
//Invalid context/graph handle in creating tensor
+        case QNN_TENSOR_ERROR_INVALID_HANDLE:
+            return "QNN_TENSOR_ERROR_INVALID_HANDLE";
+        //Tensor with specified credentials not registered with a context/graph
+        case QNN_TENSOR_ERROR_DOES_NOT_EXIST:
+            return "QNN_TENSOR_ERROR_DOES_NOT_EXIST";
+        // (deprecated) Tensor has already been registered with backend
+        case QNN_TENSOR_ERROR_ALREADY_EXISTS:
+            return "QNN_TENSOR_ERROR_ALREADY_EXISTS";
+        // Invalid tensor param.
+        case QNN_TENSOR_ERROR_INVALID_TENSOR_PARAM:
+            return "QNN_TENSOR_ERROR_INVALID_TENSOR_PARAM";
+        // This tensor param is currently unsupported
+        case QNN_TENSOR_ERROR_UNSUPPORTED_TENSOR_PARAM:
+            return "QNN_TENSOR_ERROR_UNSUPPORTED_TENSOR_PARAM";
+        // Tensor provided for update is invalid
+        case QNN_TENSOR_ERROR_INCOMPATIBLE_TENSOR_UPDATE:
+            return "QNN_TENSOR_ERROR_INCOMPATIBLE_TENSOR_UPDATE";
+
+        // QnnOpPackage_Error_t
+        case QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED:
+            return "QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED";
+        case QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED:
+            return "QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED";
+        case QNN_OP_PACKAGE_ERROR_INVALID_HANDLE:
+            return "QNN_OP_PACKAGE_ERROR_INVALID_HANDLE";
+        case QNN_OP_PACKAGE_ERROR_INVALID_INFRASTRUCTURE:
+            return "QNN_OP_PACKAGE_ERROR_INVALID_INFRASTRUCTURE";
+        case QNN_OP_PACKAGE_ERROR_INVALID_INFO:
+            return "QNN_OP_PACKAGE_ERROR_INVALID_INFO";
+        case QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE:
+            return "QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE";
+        case QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT:
+            return "QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT";
+
+        default:
+            return "unknown QNN error";
+    }
+}
+
+// ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684
+static Qnn_DataType_t ggmlqnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) {
+    switch (ggmltype) {
+        case GGML_TYPE_F16:
+            return QNN_DATATYPE_FLOAT_16;
+        case GGML_TYPE_F32:
+            return QNN_DATATYPE_FLOAT_32;
+        case GGML_TYPE_I8:
+            return QNN_DATATYPE_INT_8;
+        case GGML_TYPE_Q8_0:
+            return QNN_DATATYPE_SFIXED_POINT_8;
+        case GGML_TYPE_Q4_0:
+            return QNN_DATATYPE_SFIXED_POINT_4;
+        default:
+            break;
+    }
+    return QNN_DATATYPE_UNDEFINED;
+}
+
+static void ggmlqnn_get_qnn_dimensions_from_ggml_dimensions(uint32_t * qnn_dimensions, const uint32_t * ggml_dimensions, uint32_t rank) {
+    if (rank > GGML_MAX_DIMS) {
+        GGMLHEXAGON_LOG_WARN("invalid params");
+        return;
+    }
+    if (nullptr == qnn_dimensions || nullptr == ggml_dimensions) {
+        GGMLHEXAGON_LOG_WARN("invalid params");
+        return;
+    }
+    for (size_t idx = 0; idx < GGML_MAX_DIMS; idx++)
+        qnn_dimensions[idx] = ggml_dimensions[idx];
+
+    if (rank >= 2) {
+        qnn_dimensions[rank - 1] = ggml_dimensions[rank - 2];
+        qnn_dimensions[rank - 2] = ggml_dimensions[rank - 1];
+    }
+}
+
+template<typename Fn>
+Fn ggmlqnn_load_qnn_functionpointers(void * handle, const char * function_name) {
+    return reinterpret_cast<Fn>(dlsym(handle, function_name));
+}
+
+class qnn_interface {
+#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name)                     \
+    template<typename... Args>                                              \
+    inline auto qnn_##F(Args... args) const {                               \
+        return (_qnn_interface->QNN_INTERFACE_VER_NAME.pointer_name)(       \
+                std::forward<Args>(args)...);                               \
+    }
+
+#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name)                 \
+    template<typename... Args>                                              \
+    inline auto qnn_##F(Args...
args) const {                                                               \
+        return (_qnn_sys_interface->QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)( \
+                std::forward<Args>(args)...);                               \
+    }
+
+    friend class qnn_instance;
+
+public:
+    qnn_interface() = default;
+
+    // QnnBackend
+    DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, backendRegisterOpPackage)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, backendValidateOpConfig)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, backendGetApiVersion)
+
+    // QnnDevice
+    DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, deviceGetInfrastructure)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, deviceGetPlatformInfo)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo)
+
+    // QnnContext
+    DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, contextGetBinarySize)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, contextCreateFromBinary)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree)
+
+    // QnnGraph
+    DEFINE_SHIM_FUNCTION_INTERFACE(graph_create, graphCreate)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(graph_add_node, graphAddNode)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(graph_finalize, graphFinalize)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(graph_retrieve, graphRetrieve)
+
+    // QnnLog
+    DEFINE_SHIM_FUNCTION_INTERFACE(log_create, logCreate)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(log_free, logFree)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(log_set_log_level, logSetLogLevel)
+
+    // QnnProfile
+    DEFINE_SHIM_FUNCTION_INTERFACE(profile_create, profileCreate)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_events, profileGetEvents)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_sub_events, profileGetSubEvents)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_event_data, profileGetEventData)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(profile_free, profileFree)
+
+    // QnnMem
+    DEFINE_SHIM_FUNCTION_INTERFACE(mem_register, memRegister)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister)
+
+    // QnnProperty
+    DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, propertyHasCapability)
+
+    // QnnTensor
+    DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, tensorCreateContextTensor)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, tensorCreateGraphTensor)
+
+    // QnnSystem
+    DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, systemContextCreate)
+
+    DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, systemContextGetBinaryInfo)
+
+    DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree)
+
+    void set_qnn_interface(const QnnInterface_t * qnn_interface) {
+        _qnn_interface = qnn_interface;
+    }
+
+    void set_qnn_system_interface(const QnnSystemInterface_t * qnn_sys_interface) {
+        _qnn_sys_interface = qnn_sys_interface;
+    }
+
+    uint32_t get_backend_id() const {
+        return _qnn_interface->backendId;
+    }
+
+    bool is_loaded() const {
+        return ((_qnn_sys_interface != nullptr) && (_qnn_interface != nullptr));
+    }
+
+private:
+    const QnnInterface_t * _qnn_interface = nullptr;
+
+    const QnnSystemInterface_t * _qnn_sys_interface = nullptr;
+};
+
+class
qnn_instance { +public: + using BackendIdType = decltype(QnnInterface_t{}.backendId); + + explicit qnn_instance(const std::string & lib_path, const std::string & backend_name, + const std::string & model_name) : + _lib_path(std::move(lib_path)), + _backend_name(std::move(backend_name)), + _model_name(std::move(model_name)) {} + + ~qnn_instance() { + } + + int qnn_init(const QnnSaver_Config_t ** saver_config); + + int qnn_finalize(); + + const qnn_interface & get_qnn_interface() { + if (!_qnn_interface.is_loaded()) { + GGMLHEXAGON_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_interface; + } + + const QNN_INTERFACE_VER_TYPE & get_qnn_raw_interface() { + if (!_qnn_interface.is_loaded()) { + GGMLHEXAGON_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_raw_interface; + } + + const QNN_SYSTEM_INTERFACE_VER_TYPE & get_qnn_raw_system_interface() { + if (!_qnn_interface.is_loaded()) { + GGMLHEXAGON_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_raw_system_interface; + } + + Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } + + Qnn_ProfileHandle_t get_qnn_profile_handle() { return _qnn_profile_handle; } + + Qnn_DeviceHandle_t get_qnn_device_handle() { return _qnn_device_handle; } + + Qnn_BackendHandle_t get_qnn_backend_handle() { return _qnn_backend_handle; } + + Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; } + + QnnSystemContext_Handle_t get_qnn_system_handle() { return _qnn_system_handle; } + + Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } + + int init_qnn_graph(const char * graph_name, + bool debug, + uint8_t do_node_validation = 1, + const QnnGraph_Config_t ** graph_configs = nullptr + ); + int init_qnn_graph(const std::string & graph_name, HEXAGONBackend device, size_t vtcm_size_in_mb = 8, size_t hvx_threads = 8); + + int finalize_qnn_graph(); + + bool is_valid_graph() const { return _qnn_graph_handle != nullptr; } + + int htp_init_perfinfra(); + + int htp_set_rpc_polling(); + + int htp_set_high_performance_mode(); + + std::string & get_qnn_graph_name() { return _graph_name; } + + bool is_rpcmem_initialized() { + return _rpcmem_initialized; + } + + void set_rpcmem_initialized(bool initialized) { + _rpcmem_initialized = initialized; + } + + size_t get_rpcmem_capacity() { return _rpcmem_capacity; } + size_t get_rpcmem_usage() { return _rpcmem_usage; } + + int32_t rpcmem_to_fd(void * buf); + + int register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor); + Qnn_MemHandle_t register_rpcmem(void * p_data, const uint32_t rank, uint32_t * dimensions, Qnn_DataType_t data_type); + + void unregister_rpcmem(); + void unregister_rpcmem(Qnn_MemHandle_t mem_handle); + + void * alloc_rpcmem(size_t bytes, size_t alignment); + void * get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle); + + void free_rpcmem(void * buf); + void free_rpcmem(); + + bool is_rpcmem_allocated(void * buf); + + bool is_rpcmem_registered(Qnn_MemHandle_t handle) { + return _qnn_mem_set.count(handle) != 0U; + } + + bool enable_qnn_rpc() { + return _enable_qnn_rpc; + } + + HEXAGONBackend get_device_id() { + return _device_id; + } + +private: + int load_system(); + + int unload_system(); + + int load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config); + + int unload_backend(); + + void set_qnn_raw_interface(QNN_INTERFACE_VER_TYPE & raw_interface) { + _qnn_raw_interface = raw_interface; + } + + void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE & 
raw_interface) {
+        _qnn_raw_system_interface = raw_interface;
+    }
+
+    void * alloc_rpcmem_internal(size_t bytes, size_t alignment);
+
+    void htp_probe_rpc_meminfo();
+
+    void htp_print_info();
+
+    void print_backend_info();
+
+    void htp_set_memory_grow_size(size_t size = 1ul * 1024 * 1024);
+
+    void htp_enter_performance_mode();
+
+    void htp_set_n_hvx_threads(size_t n_threads);
+
+private:
+    static constexpr const int _required_num_providers = 1;
+
+private:
+    std::string _lib_path;
+    std::string _backend_name;
+    std::string _model_name; // name of prebuilt QNN model, might be used in the future
+    BackendIdType _backend_id;
+
+    bool _debug_tensor        = false; // flag to indicate if requested graph is to be run in debug mode
+    bool _do_node_validations = true;  // flag to indicate whether all add_node calls need to be validated
+    QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG;
+
+    qnn_profile_level _profile_level = PROFILE_OFF;
+
+    void * _system_lib_handle = nullptr;
+    void * _loaded_lib_handle = nullptr;
+    const QnnInterface_t * _loaded_backend = nullptr;
+
+    Qnn_GraphHandle_t _qnn_graph_handle = nullptr;
+
+    Qnn_LogHandle_t _qnn_log_handle = nullptr;
+
+    Qnn_ProfileHandle_t _qnn_profile_handle = nullptr;
+
+    Qnn_DeviceHandle_t _qnn_device_handle = nullptr;
+
+    Qnn_BackendHandle_t _qnn_backend_handle = nullptr;
+
+    Qnn_ContextHandle_t _qnn_context_handle = nullptr;
+
+    QnnSystemContext_Handle_t _qnn_system_handle = nullptr;
+
+    QnnHtpDevice_PerfInfrastructure_t * _qnn_htp_perfinfra = nullptr;
+    uint32_t _qnn_htp_powerconfig_id = 1;
+    uint32_t _qnn_htp_device_id      = 0;
+    uint32_t _qnn_htp_core_id        = 0;
+
+    uint32_t _qnn_rpc_pollingtime = 9999; // 0-10000 us for high performance
+
+    qnn_interface _qnn_interface;
+    QNN_INTERFACE_VER_TYPE _qnn_raw_interface;
+    QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface;
+
+    std::unordered_map<void *, Qnn_MemHandle_t> _qnn_mem_set;
+    std::unordered_map<void *, Qnn_MemHandle_t> _qnn_rpc_buffer_to_handles;
+
+    std::atomic_bool _rpcmem_initialized{false};
+    pfn_rpc_mem_alloc  _pfn_rpc_mem_alloc;
+    pfn_rpc_mem_free   _pfn_rpc_mem_free;
+    pfn_rpc_mem_to_fd  _pfn_rpc_mem_to_fd;
+    pfn_rpc_mem_init   _pfn_rpc_mem_init;
+    pfn_rpc_mem_deinit _pfn_rpc_mem_deinit;
+    std::unordered_map<void *, void *> _rpcmem_store_map; //aligned ptr -> raw allocation
+    std::unordered_map<void *, size_t> _rpcmem_usage_map; //aligned ptr -> requested bytes
+    size_t _rpcmem_usage    = 0; // mempool usage in bytes
+    size_t _rpcmem_capacity = 0; // mempool size in bytes
+
+    std::string _graph_name;
+    HEXAGONBackend _device_id;
+    void * _rpc_lib_handle = nullptr;
+    bool _enable_qnn_rpc   = false; //TODO: unknown issue with the QNN RPC feature
+
+    qnn_instance(const qnn_instance &) = delete;
+    void operator=(const qnn_instance &) = delete;
+
+    qnn_instance(qnn_instance &&) = delete;
+    void operator=(qnn_instance &&) = delete;
+};
+
+void * qnn_instance::alloc_rpcmem_internal(size_t bytes, size_t alignment) {
+    if (!_rpcmem_initialized) {
+        GGMLHEXAGON_LOG_WARN("rpc memory not initialized\n");
+        return nullptr;
+    }
+
+    auto allocate_bytes = static_cast<size_t>(bytes + alignment);
+    void * buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes);
+    if (nullptr == buf) {
+        GGMLHEXAGON_LOG_WARN("failed to allocate rpc memory\n");
+        return nullptr;
+    }
+
+    auto aligned_buf = reinterpret_cast<void *>(ggmlqnn_align_to(alignment,
+                                                                 reinterpret_cast<intptr_t>(buf)));
+    bool status = _rpcmem_store_map.insert(std::pair<void *, void *>(aligned_buf, buf)).second;
+    if (!status) {
+        GGMLHEXAGON_LOG_WARN("failed to allocate rpc memory\n");
+        _pfn_rpc_mem_free(buf);
+    }
+    return aligned_buf;
+}
+
+void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) {
+    if
+    // reserve 8 MiB of headroom in the rpc mempool and guard against unsigned underflow
+    if ((_rpcmem_capacity <= (8 * SIZE_IN_MB)) ||
+        ((_rpcmem_usage + bytes) > (_rpcmem_capacity - (8 * SIZE_IN_MB)))) {
+        GGMLHEXAGON_LOG_WARN("rpc mempool capacity: %d MiB, usage: %d MiB",
+                             (int)(_rpcmem_capacity / SIZE_IN_MB), (int)(_rpcmem_usage / SIZE_IN_MB));
+        return nullptr;
+    }
+
+    auto aligned_buf = alloc_rpcmem_internal(bytes, alignment);
+    if (nullptr == aligned_buf)
+        return nullptr;
+    _rpcmem_usage_map.insert(std::pair(aligned_buf, bytes));
+
+    _rpcmem_usage += bytes;
+    return aligned_buf;
+}
+
+void qnn_instance::free_rpcmem(void * buf) {
+    if (!_rpcmem_initialized) {
+        GGMLHEXAGON_LOG_WARN("rpc memory not initialized\n");
+    } else if (0 == _rpcmem_store_map.count(buf)) {
+        GGMLHEXAGON_LOG_WARN("buffer %p was not allocated from the rpc mempool\n", buf);
+    } else {
+        GGMLHEXAGON_LOG_DEBUG("free rpc mem %p", _rpcmem_store_map[buf]);
+        // drop the usage accounting for this buffer
+        auto it = _rpcmem_usage_map.find(buf);
+        if (it != _rpcmem_usage_map.end()) {
+            _rpcmem_usage -= it->second;
+            _rpcmem_usage_map.erase(it);
+        }
+        _pfn_rpc_mem_free(_rpcmem_store_map[buf]);
+        _rpcmem_store_map.erase(buf);
+    }
+}
+
+void qnn_instance::free_rpcmem() {
+    if (_rpcmem_store_map.empty()) {
+        GGMLHEXAGON_LOG_WARN("no rpcmem allocated\n");
+        return;
+    }
+
+    for (const auto & [rpcbuffer, raw_rpcbuffer] : _rpcmem_store_map) {
+        GGMLHEXAGON_LOG_DEBUG("free rpc buffer %p", rpcbuffer);
+        _pfn_rpc_mem_free(raw_rpcbuffer); // free the raw allocation, not the aligned alias
+    }
+
+    _rpcmem_store_map.clear();
+    _rpcmem_usage_map.clear();
+    _rpcmem_usage = 0;
+}
+
+int32_t qnn_instance::rpcmem_to_fd(void * buf) {
+    int32_t mem_fd = -1;
+    if (!is_rpcmem_initialized()) {
+        GGMLHEXAGON_LOG_WARN("rpc memory not initialized\n");
+    } else {
+        mem_fd = _pfn_rpc_mem_to_fd(buf);
+    }
+
+    return mem_fd;
+}
+
+int qnn_instance::register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) {
+    if (nullptr == p_data || (nullptr == p_tensor)) {
+        GGMLHEXAGON_LOG_WARN("invalid param\n");
+        return 1;
+    }
+
+    if (!is_rpcmem_initialized()) {
+        GGMLHEXAGON_LOG_WARN("rpc memory not initialized\n");
+        return 2;
+    }
+
+    if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) {
+        GGMLHEXAGON_LOG_WARN("tensor %s is already registered to shared memory\n", (QNN_VER_PTR(*p_tensor)->name));
+        return 3;
+    }
+
+    int32_t mem_fd = rpcmem_to_fd(p_data);
+    if (-1 == mem_fd) {
+        GGMLHEXAGON_LOG_WARN("failed to get file descriptor\n");
+        return 4;
+    }
+    GGMLHEXAGON_LOG_DEBUG("mem_fd %d\n", mem_fd);
+    Qnn_MemDescriptor_t descriptor = {
+            {QNN_VER_PTR(*p_tensor)->rank, QNN_VER_PTR(*p_tensor)->dimensions, nullptr},
+            QNN_VER_PTR(*p_tensor)->dataType,
+            QNN_MEM_TYPE_ION,
+            {{mem_fd}}};
+    Qnn_MemHandle_t handle = nullptr;
+    Qnn_ErrorHandle_t error = QNN_SUCCESS;
+    error = _qnn_interface.qnn_mem_register(
+            _qnn_context_handle,
+            &descriptor,
+            /*numDescriptors=*/1,
+            &handle);
+    if (error != QNN_SUCCESS) {
+        GGMLHEXAGON_LOG_WARN("failed to register shared memory, error %d\n", QNN_GET_ERROR_CODE(error));
+        return 5;
+    } else {
+        GGMLHEXAGON_LOG_INFO("tensor %s successfully registered to shared memory\n", (QNN_VER_PTR(*p_tensor)->name));
+    }
+    QNN_VER_PTR(*p_tensor)->memHandle = handle;
+    _qnn_mem_set.insert((std::pair(p_data, handle)));
+
+    return 0;
+}
+
+Qnn_MemHandle_t qnn_instance::register_rpcmem(void * p_data, const uint32_t rank, uint32_t * dimensions, Qnn_DataType_t data_type) {
+    if (!p_data) {
+        GGMLHEXAGON_LOG_WARN("invalid param");
+        return nullptr;
+    }
+
+    if (!is_rpcmem_initialized()) {
+        GGMLHEXAGON_LOG_WARN("rpc memory not initialized");
+        return nullptr;
+    }
+
+    if (is_rpcmem_registered(p_data)) {
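alloc_rpcmem_internal() above over-allocates by `alignment` bytes and rounds the raw pointer up, keeping the raw pointer in _rpcmem_store_map so it can be handed back to rpcmem_free later. ggmlqnn_align_to itself is outside this hunk; the conventional round-up form it presumably uses looks like this (standalone sketch, assuming a power-of-two alignment):

    #include <cassert>
    #include <cstddef>

    // Round `offset` up to the next multiple of `alignment` (power of two).
    // This is the usual implementation of a helper like ggmlqnn_align_to.
    static size_t align_to_sketch(size_t alignment, size_t offset) {
        assert((alignment & (alignment - 1)) == 0 && "alignment must be a power of two");
        return (offset + alignment - 1) & ~(alignment - 1);
    }

    int main() {
        // With alignment = 4 as used by the rpcmem callers in this file:
        assert(align_to_sketch(4, 0) == 0);
        assert(align_to_sketch(4, 1) == 4);
        assert(align_to_sketch(4, 8) == 8);
        // The aligned pointer is what callers see; the raw pointer stays in
        // _rpcmem_store_map so the real allocation can be freed later.
        return 0;
    }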
GGMLHEXAGON_LOG_WARN("rpc memory already registered"); + return _qnn_rpc_buffer_to_handles[p_data]; + } + + int32_t mem_fd = rpcmem_to_fd(p_data); + if (mem_fd == -1) { + GGMLHEXAGON_LOG_WARN("failed to get file descriptor"); + return nullptr; + } + + GGMLHEXAGON_LOG_DEBUG("mem_fd %d", mem_fd); + Qnn_MemDescriptor_t descriptor = { + {rank, dimensions, nullptr}, + data_type, QNN_MEM_TYPE_ION, + {{mem_fd}} + }; + Qnn_MemHandle_t handle = nullptr; + Qnn_ErrorHandle_t error = _qnn_interface.qnn_mem_register(_qnn_context_handle, &descriptor, /*numDescriptors=*/1, &handle); + if (error != QNN_SUCCESS) { + GGMLHEXAGON_LOG_WARN("failed to register shared memory, error %d, %s", QNN_GET_ERROR_CODE(error), strerror(error)); + return nullptr; + } + + _qnn_rpc_buffer_to_handles.insert({p_data, handle}); + GGMLHEXAGON_LOG_DEBUG("successfully register shared memory handler: %p", handle); + return handle; +} + +void * qnn_instance::get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle) { + for (const auto & [ptr, handle] : _qnn_mem_set) { + if (mem_handle == handle) { + return ptr; + } + } + + GGMLHEXAGON_LOG_WARN("can't find rpcmem from qnn mem handle %p", mem_handle); + return nullptr; +} + +void qnn_instance::unregister_rpcmem() { + if (_qnn_mem_set.empty()) { + GGMLHEXAGON_LOG_WARN("no rpcmem registered\n"); + } + + for (const auto & [ptr, mem_handle] : _qnn_mem_set) { + auto error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); + if (error != QNN_SUCCESS) { + GGMLHEXAGON_LOG_WARN("failed to unregister shared memory, error %d\n", QNN_GET_ERROR_CODE(error)); + } else { + GGMLHEXAGON_LOG_DEBUG("unregister shared memory ok"); + } + } + _qnn_mem_set.clear(); +} + +void qnn_instance::unregister_rpcmem(Qnn_MemHandle_t mem_handle) { + Qnn_ErrorHandle_t error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); + if (error != QNN_SUCCESS) { + GGMLHEXAGON_LOG_WARN("failed to unregister shared memory, error %d", QNN_GET_ERROR_CODE(error)); + } + + auto it = std::find_if(_qnn_mem_set.begin(), _qnn_mem_set.end(), + [mem_handle](const auto &kv) { return kv.second == mem_handle; }); + if (it == _qnn_mem_set.end()) { + GGMLHEXAGON_LOG_WARN("failed to find shared memory handler: %p", mem_handle); + return; + } + + _qnn_mem_set.erase(it); +} + +bool qnn_instance::is_rpcmem_allocated(void * buf) { + return _rpcmem_store_map.count(buf) != 0U; +} + +int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + GGMLHEXAGON_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); + + void * lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); + if (nullptr == lib_handle) { + GGMLHEXAGON_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror()); + return 1; + } + + auto get_providers = ggmlqnn_load_qnn_functionpointers<_pfn_QnnInterface_getProviders *>( + lib_handle, + "QnnInterface_getProviders"); + if (nullptr == get_providers) { + GGMLHEXAGON_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror()); + return 2; + } + + std::uint32_t num_providers = 0; + const QnnInterface_t ** provider_list = nullptr; + error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + GGMLHEXAGON_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); + return 3; + } + GGMLHEXAGON_LOG_DEBUG("num_providers=%d\n", num_providers); + if (num_providers != _required_num_providers) { + GGMLHEXAGON_LOG_WARN("providers is %d instead of required %d", num_providers, 
_required_num_providers); + return 4; + } + + if (nullptr == provider_list) { + GGMLHEXAGON_LOG_WARN("failed to get qnn interface providers\n"); + return 5; + } + bool found_valid_interface = false; + QNN_INTERFACE_VER_TYPE qnn_interface; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_API_VERSION_MAJOR == provider_list[idx]->apiVersion.coreApiVersion.major && + QNN_API_VERSION_MINOR <= provider_list[idx]->apiVersion.coreApiVersion.minor) { + found_valid_interface = true; + qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; + break; + } + } + + if (!found_valid_interface) { + GGMLHEXAGON_LOG_WARN("unable to find a valid qnn interface\n"); + return 6; + } else { + GGMLHEXAGON_LOG_VERBOSE("find a valid qnn interface\n"); + } + set_qnn_raw_interface(qnn_interface); + + BackendIdType backend_id = provider_list[0]->backendId; + _loaded_backend = provider_list[0]; + _loaded_lib_handle = lib_handle; + _backend_id = backend_id; + + auto saver_initialize = + ggmlqnn_load_qnn_functionpointers<_pfn_QnnSaver_initialize *>(_loaded_lib_handle, "QnnSaver_initialize"); + if (nullptr != saver_initialize) { + error = saver_initialize(saver_config); + if (error != QNN_SUCCESS) { + GGMLHEXAGON_LOG_WARN("failed to saver_initialize,error %d", QNN_GET_ERROR_CODE(error)); + return 7; + } + } else { + GGMLHEXAGON_LOG_WARN("saver_initialize is null\n"); + } + + return 0; +} + +int qnn_instance::unload_backend() { + int dlclose_error = 0; + dlclose_error = dlclose(_loaded_lib_handle); + if (dlclose_error != 0) { + GGMLHEXAGON_LOG_WARN("failed to close QNN backend %d, error %s\n", _backend_id, dlerror()); + } + + return 0; +} + +int qnn_instance::load_system() { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + +#if !defined(__ANDROID__) && !defined(__linux__) + std::string system_lib_path = _lib_path + "QnnSystem.dll"; +#else + std::string system_lib_path = _lib_path + "libQnnSystem.so"; +#endif + GGMLHEXAGON_LOG_DEBUG("system_lib_path:%s\n", system_lib_path.c_str()); + + _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); + if (nullptr == _system_lib_handle) { + GGMLHEXAGON_LOG_WARN("can not open QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror()); + //re-try with default path of QNN binary runtime lib + _lib_path = std::string(g_hexagon_appcfg.runtime_libpath); +#if !defined(__ANDROID__) && !defined(__linux__) + system_lib_path = _lib_path + "QnnSystem.dll"; +#else + system_lib_path = _lib_path + "libQnnSystem.so"; +#endif + _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); + if (nullptr == _system_lib_handle) { + GGMLHEXAGON_LOG_WARN("can not open QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror()); + return 1; + } + } + + auto * get_providers = reinterpret_cast<_pfn_QnnSystemInterface_getProviders *>(dlsym( + _system_lib_handle, "QnnSystemInterface_getProviders")); + if (nullptr == get_providers) { + GGMLHEXAGON_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", dlerror()); + return 2; + } + + uint32_t num_providers = 0; + const QnnSystemInterface_t ** provider_list = nullptr; + error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + GGMLHEXAGON_LOG_WARN("failed to get providers, error %d\n", QNN_GET_ERROR_CODE(error)); + return 3; + } + + if (num_providers != _required_num_providers) { + GGMLHEXAGON_LOG_WARN("providers is %d instead of required %d\n", num_providers, _required_num_providers); + return 4; + } + + if (nullptr == provider_list) { + 
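The provider loop above encodes QNN's compatibility rule: the core API major version must match exactly, while the provider may expose a newer minor version than the one the code was built against. As a standalone restatement with plain integers instead of the SDK's version structs:

    #include <cstdint>
    #include <cstdio>

    // QNN-style compatibility check: same major, provider minor >= required minor.
    static bool is_compatible(uint32_t req_major, uint32_t req_minor,
                              uint32_t got_major, uint32_t got_minor) {
        return req_major == got_major && req_minor <= got_minor;
    }

    int main() {
        printf("%d\n", is_compatible(2, 14, 2, 19)); // 1: a newer minor is fine
        printf("%d\n", is_compatible(2, 14, 3, 0));  // 0: a major bump breaks compatibility
        return 0;
    }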
GGMLHEXAGON_LOG_WARN("can not get providers\n"); + return 5; + } + + QNN_SYSTEM_INTERFACE_VER_TYPE qnn_system_interface; + bool found_valid_system_interface = false; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_SYSTEM_API_VERSION_MAJOR == + provider_list[idx]->systemApiVersion.major && + QNN_SYSTEM_API_VERSION_MINOR <= + provider_list[idx]->systemApiVersion.minor) { + found_valid_system_interface = true; + qnn_system_interface = provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; + break; + } + } + if (!found_valid_system_interface) { + GGMLHEXAGON_LOG_WARN("unable to find a valid qnn system interface\n"); + return 6; + } else { + GGMLHEXAGON_LOG_VERBOSE("find a valid qnn system interface\n"); + } + set_qnn_raw_system_interface(qnn_system_interface); + + _qnn_interface.set_qnn_system_interface(provider_list[0]); + + _qnn_interface.qnn_system_context_create(&_qnn_system_handle); + if (nullptr == _qnn_system_handle) { + GGMLHEXAGON_LOG_WARN("can not create QNN system contenxt\n"); + } else { + GGMLHEXAGON_LOG_VERBOSE("initialize qnn system successfully\n"); + } + + return 0; +} + +int qnn_instance::unload_system() { + int result = 0; + + if (nullptr == _system_lib_handle) { + GGMLHEXAGON_LOG_DEBUG("system lib handle is null\n"); + return 1; + } + + if (nullptr != _qnn_system_handle) { + result = _qnn_interface.qnn_system_context_free(_qnn_system_handle); + if (result != QNN_SUCCESS) { + GGMLHEXAGON_LOG_WARN("failed to free QNN system context\n"); + } + _qnn_system_handle = nullptr; + } + + int dlclose_error = dlclose(_system_lib_handle); + if (dlclose_error != 0) { + GGMLHEXAGON_LOG_WARN("failed to close QnnSystem library, error %s\n", dlerror()); + return 2; + } + + _system_lib_handle = nullptr; + + return result; +} + +static void ggmlqnn_sdk_logcallback(const char * fmt, + QnnLog_Level_t level, + uint64_t timestamp, + va_list argp) { + + if (0 == g_hexagon_appcfg.print_qnn_internal_log) + return; + + static std::mutex log_mutex; + static unsigned char s_ggmlqnn_sdk_logbuf[GGMLHEXAGON_LOGBUF_LEN]; + + const char * log_level_desc = ""; + switch (level) { + case QNN_LOG_LEVEL_ERROR: + log_level_desc = " ERROR "; + break; + case QNN_LOG_LEVEL_WARN: + log_level_desc = "WARNING"; + break; + case QNN_LOG_LEVEL_INFO: + log_level_desc = " INFO "; + break; + case QNN_LOG_LEVEL_DEBUG: + log_level_desc = " DEBUG "; + break; + case QNN_LOG_LEVEL_VERBOSE: + log_level_desc = "VERBOSE"; + break; + case QNN_LOG_LEVEL_MAX: + log_level_desc = "UNKNOWN"; + break; + } + + double ms = (double) timestamp / 1000000.0; + { + std::lock_guard lock(log_mutex); + memset(s_ggmlqnn_sdk_logbuf, 0, GGMLHEXAGON_LOGBUF_LEN); + vsnprintf(reinterpret_cast(s_ggmlqnn_sdk_logbuf), GGMLHEXAGON_LOGBUF_LEN, fmt, argp); + GGMLHEXAGON_LOG_DEBUG("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggmlqnn_sdk_logbuf); + } +#if !GGMLHEXAGON_DEBUG + GGML_UNUSED(log_level_desc); + GGML_UNUSED(ms); +#endif +} + +int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { + GGMLHEXAGON_LOG_DEBUG("enter qni_init\n"); + + _device_id = HEXAGON_BACKEND_GGML; + if (_backend_name.find("QnnCpu") != std::string::npos) { + _device_id = HEXAGON_BACKEND_QNNCPU; + } + if (_backend_name.find("QnnGpu") != std::string::npos) { + _device_id = HEXAGON_BACKEND_QNNGPU; + } + if (_backend_name.find("QnnHtp") != std::string::npos) { + _device_id = HEXAGON_BACKEND_QNNNPU; + } + if (HEXAGON_BACKEND_GGML == _device_id) { + GGMLHEXAGON_LOG_INFO("user specified qnn backend is ggml, skip QNN initialize"); + return 0; + } + + if (0 != 
load_system()) { + GGMLHEXAGON_LOG_WARN("can not load QNN system lib, pls check why?\n"); + return 1; + } else { + GGMLHEXAGON_LOG_DEBUG("load QNN system lib successfully\n"); + } + + std::string backend_lib_path = _lib_path + _backend_name; + + int is_load_ok = load_backend(backend_lib_path, saver_config); + if (0 != is_load_ok) { + GGMLHEXAGON_LOG_WARN("failed to load QNN backend\n"); + return 2; + } + + _qnn_interface.set_qnn_interface(_loaded_backend); +#if 1 + _qnn_interface.qnn_log_create(ggmlqnn_sdk_logcallback, _qnn_log_level, &_qnn_log_handle); +#else + _qnn_raw_interface.logCreate(ggmlqnn_sdk_logcallback, _qnn_log_level, &_qnn_log_handle); +#endif + if (nullptr == _qnn_log_handle) { + GGMLHEXAGON_LOG_WARN("why failed to initialize qnn log\n"); //NPU backend not work on Qualcomm SoC based low-end phone + return 3; + } else { + GGMLHEXAGON_LOG_DEBUG("initialize qnn log successfully\n"); + } + + std::vector temp_backend_config; + _qnn_interface.qnn_backend_create(_qnn_log_handle, + temp_backend_config.empty() ? nullptr : temp_backend_config.data(), + &_qnn_backend_handle); + if (nullptr == _qnn_backend_handle) { + GGMLHEXAGON_LOG_WARN("why failed to initialize qnn backend\n"); + return 4; + } else { + GGMLHEXAGON_LOG_DEBUG("initialize qnn backend successfully\n"); + } + + if (nullptr != _qnn_raw_interface.propertyHasCapability) { + auto qnnstatus = _qnn_raw_interface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE); + if (QNN_PROPERTY_NOT_SUPPORTED == qnnstatus) { + GGMLHEXAGON_LOG_WARN("device property is not supported\n"); + } + if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnnstatus) { + GGMLHEXAGON_LOG_WARN("device property is not known to backend\n"); + } + } + + Qnn_ErrorHandle_t qnnstatus = QNN_SUCCESS; + if (_device_id == HEXAGON_BACKEND_QNNNPU) { + const QnnDevice_PlatformInfo_t * p_info = nullptr; + qcom_socinfo soc_info = {}; + qnnstatus = _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); + if (QNN_SUCCESS == qnnstatus) { + GGMLHEXAGON_LOG_VERBOSE("device counts %d\n", p_info->v1.numHwDevices); + QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; + QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = {}; + for (uint32_t i = 0; i < p_info->v1.numHwDevices; i++) { + GGMLHEXAGON_LOG_VERBOSE("deviceID:%d, deviceType:%d, numCores %d\n", (int) infos[i].v1.deviceId, + (int) infos[i].v1.deviceType, (int) infos[i].v1.numCores); + QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; + chipinfo = devinfo->onChipDevice; + size_t htp_arch = (size_t) chipinfo.arch; + GGMLHEXAGON_LOG_VERBOSE("htp_type:%d(%s)\n", devinfo->devType, + (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? 
"ON_CHIP" : ""); + soc_info = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize, {} }; + } + _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); + } else { + GGMLHEXAGON_LOG_WARN("failed to get platform info, are we in emulator?\n"); + soc_info = { NONE, UNKNOWN_SM, 0, {} }; + } + + QnnHtpDevice_CustomConfig_t soc_customconfig; + soc_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC; + soc_customconfig.socModel = soc_info.soc_model; + QnnDevice_Config_t soc_devconfig; + soc_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + soc_devconfig.customConfig = &soc_customconfig; + + /* + QnnHtpDevice_CustomConfig_t arch_customconfig; + arch_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_ARCH; + arch_customconfig.arch.arch = (QnnHtpDevice_Arch_t)soc_info.htp_arch; + arch_customconfig.arch.deviceId = 0; + QnnDevice_Config_t arch_devconfig; + arch_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + arch_devconfig.customConfig = &arch_customconfig; + */ + const QnnDevice_Config_t * p_deviceconfig[] = { &soc_devconfig, nullptr }; + qnnstatus = _qnn_raw_interface.deviceCreate(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle); + } else { + qnnstatus = _qnn_interface.qnn_device_create(_qnn_log_handle, nullptr, &_qnn_device_handle); + } + if (QNN_SUCCESS != qnnstatus && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnnstatus) { + GGMLHEXAGON_LOG_WARN("failed to create QNN device\n"); + } else { + GGMLHEXAGON_LOG_VERBOSE("create device successfully\n"); + } + + if (PROFILE_OFF != _profile_level) { + GGMLHEXAGON_LOG_INFO("profiling turned on; level = %d", _profile_level); + if (PROFILE_BASIC == _profile_level) { + GGMLHEXAGON_LOG_INFO("basic profiling requested. creating Qnn Profile object\n"); + if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( + _qnn_backend_handle, QNN_PROFILE_LEVEL_BASIC, &_qnn_profile_handle)) { + GGMLHEXAGON_LOG_WARN("unable to create profile handle in the backend\n"); + return 5; + } else { + GGMLHEXAGON_LOG_DEBUG("initialize qnn profile successfully\n"); + } + } else if (PROFILE_DETAIL == _profile_level) { + GGMLHEXAGON_LOG_INFO("detailed profiling requested. 
Creating Qnn Profile object\n");
+            if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate(
+                    _qnn_backend_handle, QNN_PROFILE_LEVEL_DETAILED, &_qnn_profile_handle)) {
+                GGMLHEXAGON_LOG_WARN("unable to create profile handle in the backend\n");
+                return 6;
+            } else {
+                GGMLHEXAGON_LOG_DEBUG("initialized qnn profile successfully\n");
+            }
+        }
+    }
+
+#if defined(__ANDROID__) || defined(__linux__)
+    std::filesystem::path full_path(std::string(g_hexagon_appcfg.runtime_libpath) + "libcdsprpc.so");
+    _rpc_lib_handle = dlopen(full_path.string().c_str(), RTLD_NOW | RTLD_LOCAL);
+    if (nullptr == _rpc_lib_handle) {
+        GGMLHEXAGON_LOG_WARN("failed to load %s\n", full_path.string().c_str());
+        _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL);
+    }
+#else
+    _rpc_lib_handle = dlopen("libcdsprpc.dll", RTLD_NOW | RTLD_LOCAL);
+#endif
+    if (nullptr == _rpc_lib_handle) {
+        GGMLHEXAGON_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dlerror());
+        return 7;
+    } else {
+        GGMLHEXAGON_LOG_DEBUG("loaded rpcmem lib successfully\n");
+        set_rpcmem_initialized(true);
+    }
+    _pfn_rpc_mem_init   = reinterpret_cast<pfn_rpc_mem_init>(dlsym(_rpc_lib_handle, "rpcmem_init"));
+    _pfn_rpc_mem_deinit = reinterpret_cast<pfn_rpc_mem_deinit>(dlsym(_rpc_lib_handle, "rpcmem_deinit"));
+    _pfn_rpc_mem_alloc  = reinterpret_cast<pfn_rpc_mem_alloc>(dlsym(_rpc_lib_handle, "rpcmem_alloc"));
+    _pfn_rpc_mem_free   = reinterpret_cast<pfn_rpc_mem_free>(dlsym(_rpc_lib_handle, "rpcmem_free"));
+    _pfn_rpc_mem_to_fd  = reinterpret_cast<pfn_rpc_mem_to_fd>(dlsym(_rpc_lib_handle, "rpcmem_to_fd"));
+    if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free || nullptr == _pfn_rpc_mem_to_fd) {
+        GGMLHEXAGON_LOG_WARN("unable to access symbols in QNN RPC lib, dlerror(): %s", dlerror());
+        dlclose(_rpc_lib_handle);
+        return 8;
+    }
+
+    if (nullptr != _pfn_rpc_mem_init) // make Qualcomm's SoC based low-end phone happy
+        _pfn_rpc_mem_init();
+
+    std::vector<const QnnContext_Config_t *> temp_context_config;
+    _qnn_interface.qnn_context_create(_qnn_backend_handle, _qnn_device_handle,
+                                      temp_context_config.empty() ?
nullptr : temp_context_config.data(),
+                                      &_qnn_context_handle);
+    if (nullptr == _qnn_context_handle) {
+        GGMLHEXAGON_LOG_WARN("failed to initialize qnn context\n");
+        return 9;
+    } else {
+        GGMLHEXAGON_LOG_DEBUG("initialized qnn context successfully\n");
+    }
+
+    if (_backend_name.find("Htp") != std::string::npos) {
+        htp_print_info();
+        htp_probe_rpc_meminfo();
+
+        if (0 != htp_init_perfinfra()) {
+            GGMLHEXAGON_LOG_WARN("failed to initialize HTP perf infrastructure");
+        }
+
+        htp_enter_performance_mode();
+        htp_set_memory_grow_size();
+
+        if (enable_qnn_rpc()) {
+            GGMLHEXAGON_LOG_VERBOSE("NPU RPC feature enabled with QNN-NPU backend");
+        } else {
+            GGMLHEXAGON_LOG_VERBOSE("NPU RPC feature disabled with QNN-NPU backend");
+        }
+    }
+
+    print_backend_info();
+
+    GGMLHEXAGON_LOG_DEBUG("leave qnn_init\n");
+
+    return 0;
+}
+
+int qnn_instance::qnn_finalize() {
+    int ret_status = 0;
+    Qnn_ErrorHandle_t error = QNN_SUCCESS;
+
+    GGMLHEXAGON_LOG_VERBOSE("enter %s\n", __func__);
+    ggmlqnn_reset_idx();
+
+    free_rpcmem();
+    unregister_rpcmem();
+
+    if (nullptr != _pfn_rpc_mem_deinit)
+        _pfn_rpc_mem_deinit();
+
+    if (0 != dlclose(_rpc_lib_handle)) {
+        GGMLHEXAGON_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dlerror());
+    } else {
+        GGMLHEXAGON_LOG_DEBUG("closed rpcmem lib successfully\n");
+    }
+
+    if (nullptr != _qnn_context_handle) {
+        error = _qnn_interface.qnn_context_free(_qnn_context_handle, _qnn_profile_handle);
+        if (error != QNN_SUCCESS) {
+            GGMLHEXAGON_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n",
+                                 _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error));
+        }
+        _qnn_context_handle = nullptr;
+    }
+
+    if (nullptr != _qnn_profile_handle) {
+        error = _qnn_interface.qnn_profile_free(_qnn_profile_handle);
+        if (error != QNN_SUCCESS) {
+            GGMLHEXAGON_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n",
+                                 _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error));
+        }
+        _qnn_profile_handle = nullptr;
+    }
+
+    if (nullptr != _qnn_device_handle) {
+        error = _qnn_interface.qnn_device_free(_qnn_device_handle);
+        if (error != QNN_SUCCESS) {
+            GGMLHEXAGON_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n",
+                                 _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error));
+        }
+        _qnn_device_handle = nullptr;
+    }
+
+    if (nullptr != _qnn_backend_handle) {
+        error = _qnn_interface.qnn_backend_free(_qnn_backend_handle);
+        if (error != QNN_SUCCESS) {
+            GGMLHEXAGON_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n",
+                                 _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error));
+        }
+        _qnn_backend_handle = nullptr;
+    }
+
+    if (nullptr != _qnn_log_handle) {
+        error = _qnn_interface.qnn_log_free(_qnn_log_handle);
+        if (error != QNN_SUCCESS) {
+            GGMLHEXAGON_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n",
+                                 _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error));
+        }
+        _qnn_log_handle = nullptr;
+    }
+
+    unload_backend();
+    unload_system();
+
+    GGMLHEXAGON_LOG_VERBOSE("leave %s\n", __func__);
+    return ret_status;
+}
+
+int qnn_instance::init_qnn_graph(const std::string & graph_name, HEXAGONBackend device, size_t vtcm_size_in_mb, size_t hvx_threads) {
+    _graph_name = graph_name;
+    _device_id = device;
+
+    //GGMLHEXAGON_LOG_DEBUG("[%s][%s]created", ggml_backend_hexagon_get_devname(device), graph_name.c_str());
+
+    Qnn_ErrorHandle_t error = QNN_SUCCESS;
+    if (HEXAGON_BACKEND_QNNNPU == device) {
+        QnnHtpGraph_CustomConfig_t hvx_config;
+        hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS;
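qnn_finalize() above releases resources in the reverse order of qnn_init(): context, profile, device, backend, log, then the backend and system libraries. The backend code invokes it manually on every path; a small scope guard would make that harder to miss (hypothetical helper, not part of this patch):

    // Hypothetical scope guard: guarantees qnn_finalize() runs exactly once,
    // mirroring the manual teardown order in qnn_finalize() above.
    class qnn_instance_guard {
    public:
        explicit qnn_instance_guard(qnn_instance & inst) : _inst(&inst) {}
        ~qnn_instance_guard() { if (_inst) _inst->qnn_finalize(); }
        void release() { _inst = nullptr; } // call if ownership is transferred elsewhere
        qnn_instance_guard(const qnn_instance_guard &) = delete;
        qnn_instance_guard & operator=(const qnn_instance_guard &) = delete;
    private:
        qnn_instance * _inst;
    };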
hvx_config.numHvxThreads = hvx_threads; + QnnGraph_Config_t graph_hvx_config; + graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_hvx_config.customConfig = &hvx_config; + + QnnHtpGraph_CustomConfig_t dlbc_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT; + dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; + if (0 == g_hexagon_appcfg.enable_dlbc) + dlbc_config.optimizationOption.floatValue = 0.0; // set to 0.0 to turn off DLBC + else + dlbc_config.optimizationOption.floatValue = 1.0; // set to 1.0 to turn on DLBC + QnnGraph_Config_t graph_dlbc_config; + graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_dlbc_config.customConfig = &dlbc_config; + + QnnHtpGraph_CustomConfig_t opt_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT; + opt_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; + opt_config.optimizationOption.floatValue = 1; // 1 / 3 + QnnGraph_Config_t graph_opt_config; + graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_opt_config.customConfig = &opt_config; + + QnnHtpGraph_CustomConfig_t vtcm_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT; + vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; + vtcm_config.vtcmSizeInMB = vtcm_size_in_mb; + QnnGraph_Config_t graph_vtcm_config; + graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_vtcm_config.customConfig = &vtcm_config; + + std::vector graph_configs; + graph_configs.push_back(&graph_hvx_config); + graph_configs.push_back(&graph_dlbc_config); + graph_configs.push_back(&graph_vtcm_config); + graph_configs.push_back(&graph_opt_config); + if (1 == g_hexagon_appcfg.precision_mode) { + QnnHtpGraph_CustomConfig_t fp16_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT; + fp16_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_PRECISION; + fp16_config.precision = QNN_PRECISION_FLOAT16; + QnnGraph_Config_t graph_fp16_config; + graph_fp16_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_fp16_config.customConfig = &fp16_config; + graph_configs.push_back(&graph_fp16_config); + } + graph_configs.push_back(nullptr); + error = _qnn_interface.qnn_graph_create(_qnn_context_handle, graph_name.c_str(), graph_configs.data(), &_qnn_graph_handle); + //GGMLHEXAGON_LOG_DEBUG("[%s][%s]created graph %p", ggml_backend_hexagon_get_devname(device), graph_name.c_str(), _qnn_graph_handle); + } else { + error = _qnn_interface.qnn_graph_create(_qnn_context_handle, graph_name.c_str(), nullptr, &_qnn_graph_handle); + } + if (QNN_SUCCESS != error) { + GGMLHEXAGON_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s", + ggml_backend_hexagon_get_devname(device), graph_name.c_str(), + ggmlqnn_get_qnnerror_string(error)); + return error; + } + + GGMLHEXAGON_LOG_DEBUG("[%s]create graph %s succeed", ggml_backend_hexagon_get_devname(device), graph_name.c_str()); + if (HEXAGON_BACKEND_QNNNPU == device) { + htp_set_n_hvx_threads(hvx_threads); + } + return QNN_SUCCESS; +} + +int qnn_instance::init_qnn_graph(const char * graph_name, bool debug, uint8_t do_node_validation, + const QnnGraph_Config_t ** graph_configs) { + Qnn_ErrorHandle_t result = 0; + + if (nullptr == graph_name) { + GGMLHEXAGON_LOG_WARN("graph name is null\n"); + return 1; + } + + if (!_graph_name.empty()) { + GGMLHEXAGON_LOG_WARN("qnn model for graph %s already initialized\n", graph_name); + return 2; + } + + if (!do_node_validation) { + GGMLHEXAGON_LOG_WARN("node 
validation disabled, backend will not perform op validation prior to adding node\n"); + } + + _graph_name = graph_name; + _debug_tensor = debug; + _do_node_validations = do_node_validation; + + result = _qnn_raw_interface.graphCreate(_qnn_context_handle, + graph_name, + graph_configs, + &_qnn_graph_handle); + if (QNN_GRAPH_NO_ERROR != result || nullptr == _qnn_graph_handle) { + GGMLHEXAGON_LOG_WARN("failed to create graph in qnn context\n"); + return 3; + } else { + GGMLHEXAGON_LOG_DEBUG("succeed to create graph %s, %p\n", graph_name, _qnn_graph_handle); + } + + return 0; +} + +int qnn_instance::finalize_qnn_graph() { + if (nullptr != _qnn_graph_handle) { + if (_qnn_raw_interface.graphFinalize(_qnn_graph_handle, + _qnn_profile_handle, nullptr) + != QNN_GRAPH_NO_ERROR) { + GGMLHEXAGON_LOG_WARN("finalizing graph failure\n"); + return 1; + } + } else { + GGMLHEXAGON_LOG_DEBUG("qnn graph handle is null\n"); + } + + return 0; +} + +int qnn_instance::htp_init_perfinfra() { + QnnDevice_Infrastructure_t device_infra = nullptr; + Qnn_ErrorHandle_t error = _qnn_raw_interface.deviceGetInfrastructure(&device_infra); + if (QNN_SUCCESS != error) { + GGMLHEXAGON_LOG_WARN("failed to get qnn device infra\n"); + return 1; + } + + QnnHtpDevice_Infrastructure_t * htp_infra = static_cast(device_infra); + QnnHtpDevice_PerfInfrastructure_t * htp_perfinfra = &htp_infra->perfInfra; + uint32_t power_configid = 1; + uint32_t device_id = 0; + uint32_t core_id = 0; + htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid); + _qnn_htp_perfinfra = htp_perfinfra; + _qnn_htp_powerconfig_id = power_configid; + //TODO:hardcode to 0 and 0 although it's correct + _qnn_htp_device_id = device_id; + _qnn_htp_core_id = core_id; + + return 0; +} + +void qnn_instance::htp_probe_rpc_meminfo() { + size_t candidate_size = 0; + uint8_t * rpc_buffer = nullptr; + size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; + size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); + for (size_t idx = 0; idx < probe_counts; idx++) { + rpc_buffer = static_cast(alloc_rpcmem_internal(probe_slots[idx] * SIZE_IN_MB, 4)); + if (nullptr == rpc_buffer) { + GGMLHEXAGON_LOG_DEBUG("alloc rpcmem %d (MiB) failure during probe rpc memory info, reason: %s\n", probe_slots[idx], strerror(errno)); + break; + } else { + candidate_size = probe_slots[idx]; + free_rpcmem(rpc_buffer); + rpc_buffer = nullptr; + } + } + if (candidate_size > _rpcmem_capacity) + _rpcmem_capacity = candidate_size * SIZE_IN_MB; + + free_rpcmem(); + _rpcmem_usage = 0; + GGMLHEXAGON_LOG_VERBOSE("capacity of rpc ion memory %d MiB\n", _rpcmem_capacity / SIZE_IN_MB); +} + +void qnn_instance::htp_print_info() { + const QnnDevice_PlatformInfo_t * p_info = nullptr; + _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); + GGMLHEXAGON_LOG_DEBUG("HTP device counts %d", p_info->v1.numHwDevices); + QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; + for (size_t i = 0; i < p_info->v1.numHwDevices; i++) { + GGMLHEXAGON_LOG_DEBUG("HTP deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, + infos[i].v1.deviceType, infos[i].v1.numCores); + QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; + QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = devinfo->onChipDevice; + QnnHtpDevice_Arch_t htp_arch = chipinfo.arch; + GGMLHEXAGON_LOG_DEBUG("HTP_TYPE:%d(%s)", devinfo->devType, + (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? 
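htp_probe_rpc_meminfo() above discovers the ION pool size empirically, by walking a fixed ladder of trial allocations, since rpcmem exposes no capacity query; the ladder can only resolve to one of its four slot sizes. A hypothetical refinement would binary-search between the last success and the first failure (sketched as if it were another private member of qnn_instance, reusing alloc_rpcmem_internal, free_rpcmem, and the SIZE_IN_MB constant from this file):

    // Hypothetical alternative to the fixed probe ladder: binary-search the
    // largest allocation (in MiB) that rpcmem can satisfy.
    size_t qnn_instance::probe_rpcmem_capacity_mb(size_t hi_mb) {
        size_t lo = 0;                      // known-good size in MiB
        while (lo < hi_mb) {
            size_t mid = (lo + hi_mb + 1) / 2;
            void * buf = alloc_rpcmem_internal(mid * SIZE_IN_MB, 4);
            if (nullptr != buf) {
                free_rpcmem(buf);           // release the trial allocation
                lo = mid;                   // mid MiB fits, search upward
            } else {
                hi_mb = mid - 1;            // mid MiB failed, search downward
            }
        }
        return lo;
    }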
"QNN_HTP_DEVICE_TYPE_ON_CHIP" : "QNN_HTP_DEVICE_TYPE_UNKNOWN"); + GGMLHEXAGON_LOG_DEBUG("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MiB," \ + "dlbc_support:%d, signedpd_support:%d", \ + chipinfo.socModel, ggmlhexagon_get_socmodel_desc(chipinfo.socModel), \ + htp_arch, ggmlhexagon_get_htparch_desc(htp_arch), chipinfo.vtcmSize, \ + chipinfo.dlbcSupport, chipinfo.signedPdSupport); + struct qcom_socinfo * socinfo = ggmlhexagon_get_socinfo_from_socmodel(chipinfo.socModel); + g_hexagon_mgr[HEXAGON_BACKEND_QNNNPU].socinfo = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize, {}}; + if (nullptr != socinfo) { + memcpy(g_hexagon_mgr[HEXAGON_BACKEND_QNNNPU].socinfo.soc_desc, socinfo->soc_desc, sizeof(socinfo->soc_desc)); + GGMLHEXAGON_LOG_DEBUG("soc info:%s", socinfo->soc_desc); + } else { + memcpy(g_hexagon_mgr[HEXAGON_BACKEND_QNNNPU].socinfo.soc_desc, "unknown", 7); + GGMLHEXAGON_LOG_DEBUG("soc info:unknown"); + } + } + _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); +} + +void qnn_instance::print_backend_info() { + auto print_property = [&](const char * name, QnnProperty_Key_t property) { + auto ret = _qnn_raw_interface.propertyHasCapability(property); + + const char * status = "Unknown"; + if (ret == QNN_PROPERTY_SUPPORTED) { + status = "Yes"; + } else if (ret == QNN_PROPERTY_NOT_SUPPORTED) { + status = "No"; + } + + GGMLHEXAGON_LOG_VERBOSE("%s: %s", name, status); + }; + + GGMLHEXAGON_LOG_VERBOSE("QNN backend properties:"); + print_property("Create context from binary list", QNN_PROPERTY_CONTEXT_SUPPORT_CREATE_FROM_BINARY_LIST_ASYNC); + print_property("Dynamic batch", QNN_PROPERTY_GRAPH_SUPPORT_BATCH_MULTIPLE); + print_property("Early termination", QNN_PROPERTY_GRAPH_SUPPORT_EARLY_TERMINATION); + print_property("Dynamic dimensions", QNN_PROPERTY_TENSOR_SUPPORT_DYNAMIC_DIMENSIONS); + print_property("Blockwise quantization", QNN_PROPERTY_TENSOR_SUPPORT_QUANTIZATION_ENCODING_BLOCK); + print_property("Blockwise quantization with expansion", QNN_PROPERTY_TENSOR_SUPPORT_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION); + print_property("Vector quantization", QNN_PROPERTY_TENSOR_SUPPORT_QUANTIZATION_ENCODING_VECTOR); + print_property("Tensor sparsity", QNN_PROPERTY_TENSOR_SUPPORT_SPARSITY); + print_property("Updateable application tensor", QNN_PROPERTY_TENSOR_SUPPORT_UPDATEABLE_APP_TENSORS); + print_property("Updateable native tensor", QNN_PROPERTY_TENSOR_SUPPORT_UPDATEABLE_NATIVE_TENSORS); + print_property("Updateable static tensor", QNN_PROPERTY_TENSOR_SUPPORT_UPDATEABLE_STATIC_TENSORS); + print_property("Qnn group device", QNN_PROPERTY_GROUP_DEVICE); +} + +void qnn_instance::htp_set_memory_grow_size(size_t size) { + QnnHtpPerfInfrastructure_MemoryConfig_t grow_size_config = { + .option = QNN_HTP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_GROW_SIZE, + .memGrowSizeConfig = (uint32_t)size, + }; + + const QnnHtpPerfInfrastructure_MemoryConfig_t *memory_config[] = { + &grow_size_config, + nullptr, + }; + Qnn_ErrorHandle_t result = _qnn_htp_perfinfra->setMemoryConfig(_qnn_htp_device_id, _qnn_htp_core_id, memory_config); + if (QNN_SUCCESS != result) { + GGMLHEXAGON_LOG_WARN("failed to set HTP memory config"); + } else { + GGMLHEXAGON_LOG_VERBOSE("succeed to set HTP memory config"); + } +} + +void qnn_instance::htp_set_n_hvx_threads(size_t n_threads) { + QnnHtpGraph_CustomConfig_t htp_hvx_thread_config = { + .option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS, + .numHvxThreads = n_threads, + }; + + QnnGraph_Config_t hvx_thread_config = { + .option = QNN_GRAPH_CONFIG_OPTION_CUSTOM, + 
.customConfig = &htp_hvx_thread_config, + }; + + const QnnGraph_Config_t * graph_configs[] = {&hvx_thread_config, nullptr}; + Qnn_ErrorHandle_t result = _qnn_raw_interface.graphSetConfig(_qnn_graph_handle, graph_configs); + if (QNN_SUCCESS != result) { + GGMLHEXAGON_LOG_WARN("failed to set QNN graph config: set hvx threads %d", n_threads); + } else { + //GGMLHEXAGON_LOG_DEBUG("succeed to set QNN graph config: set hvx threads %d", n_threads); + } +} + +void qnn_instance::htp_enter_performance_mode() { + QnnHtpPerfInfrastructure_PowerConfig_t dcvs_v3_config = { + .option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3, + .dcvsV3Config = + { + .contextId = _qnn_htp_powerconfig_id, + + .setDcvsEnable = 1, + .dcvsEnable = 0, + + .powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE, + + .setSleepLatency = 1, + .sleepLatency = 40, + + .setSleepDisable = 1, + .sleepDisable = 1, + + .setBusParams = 1, + .busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER, + .busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER, + .busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER, + + .setCoreParams = 1, + .coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER, + .coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER, + .coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER, + }, + }; + + QnnHtpPerfInfrastructure_PowerConfig_t hmx_config = { + .option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_HMX_V2, + .hmxV2Config = + { + .hmxPickDefault = 0, + .hmxVoltageCornerMin = DCVS_EXP_VCORNER_MAX, + .hmxVoltageCornerTarget = DCVS_EXP_VCORNER_MAX, + .hmxVoltageCornerMax = DCVS_EXP_VCORNER_MAX, + .hmxPerfMode = QNN_HTP_PERF_INFRASTRUCTURE_CLK_PERF_HIGH, + }, + }; + + QnnHtpPerfInfrastructure_PowerConfig_t rpc_ctrl_config = { + .option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY, + .rpcControlLatencyConfig = 100, + }; + + QnnHtpPerfInfrastructure_PowerConfig_t rpc_poll_config = { + .option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME, + .rpcPollingTimeConfig = 9999, + }; + + const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = { + &dcvs_v3_config, + &hmx_config, + &rpc_ctrl_config, + &rpc_poll_config, + nullptr, + }; + Qnn_ErrorHandle_t ret = _qnn_htp_perfinfra->setPowerConfig(_qnn_htp_powerconfig_id, power_configs); + if (ret != QNN_SUCCESS) { + GGMLHEXAGON_LOG_WARN("failed to set HTP power config"); + } else { + GGMLHEXAGON_LOG_VERBOSE("succeed to set HTP power config"); + } +} + +static uint8_t * ggmlqnn_create_rpc_buffer(qnn_instance * instance, const ggml_tensor * ggml_tensor, Qnn_Tensor_t * qnn_tensor, bool b_copydata) { + if (nullptr == instance || nullptr == ggml_tensor || nullptr == qnn_tensor) { + GGMLHEXAGON_LOG_WARN("invalid params\n"); + return nullptr; + } + + uint8_t * qnn_rpcbuffer = static_cast(instance->alloc_rpcmem(ggml_nbytes(ggml_tensor), 4)); + if (nullptr == qnn_rpcbuffer) { + GGMLHEXAGON_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + return nullptr; + } else { + GGMLHEXAGON_LOG_DEBUG("alloc rpcmem %p successfully\n", qnn_rpcbuffer); + } + if (b_copydata) + memcpy(qnn_rpcbuffer, ggml_tensor->data, ggml_nbytes(ggml_tensor)); + instance->register_rpcmem(qnn_rpcbuffer, qnn_tensor); + return qnn_rpcbuffer; +} + +static Qnn_OpConfig_t ggmlqnn_create_op_config(const char * name, const char * package, const char * type, + Qnn_Param_t * params, uint32_t num_params, + Qnn_Tensor_t * inputs, uint32_t num_inputs, + Qnn_Tensor_t * outputs, uint32_t 
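htp_enter_performance_mode() above pins every DCVS, bus, core, and HMX voltage corner to the maximum, trading power for latency. For contrast, a balanced profile would re-enable DCVS and relax the corners; a sketch follows (the SVS/NOM/TURBO corner names and the ADJUST_UP_DOWN power mode are assumptions drawn from the same QNN HTP enum families used above, and the values are illustrative only):

    // Hypothetical balanced power profile, contrasting with the burst profile above.
    // dcvsEnable = 1 lets the DSP scale clocks down when idle.
    static QnnHtpPerfInfrastructure_PowerConfig_t make_balanced_dcvs_config(uint32_t power_ctx_id) {
        QnnHtpPerfInfrastructure_PowerConfig_t cfg = {};
        cfg.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3;
        cfg.dcvsV3Config.contextId       = power_ctx_id;
        cfg.dcvsV3Config.setDcvsEnable   = 1;
        cfg.dcvsV3Config.dcvsEnable      = 1;    // allow down-clocking, unlike the burst profile
        cfg.dcvsV3Config.powerMode       = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_ADJUST_UP_DOWN;
        cfg.dcvsV3Config.setSleepLatency = 1;
        cfg.dcvsV3Config.sleepLatency    = 1000; // tolerate deeper sleep than the 40 us above
        cfg.dcvsV3Config.setBusParams    = 1;
        cfg.dcvsV3Config.busVoltageCornerMin     = DCVS_VOLTAGE_VCORNER_SVS;
        cfg.dcvsV3Config.busVoltageCornerTarget  = DCVS_VOLTAGE_VCORNER_NOM;
        cfg.dcvsV3Config.busVoltageCornerMax     = DCVS_VOLTAGE_VCORNER_TURBO;
        cfg.dcvsV3Config.setCoreParams   = 1;
        cfg.dcvsV3Config.coreVoltageCornerMin    = DCVS_VOLTAGE_VCORNER_SVS;
        cfg.dcvsV3Config.coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_NOM;
        cfg.dcvsV3Config.coreVoltageCornerMax    = DCVS_VOLTAGE_VCORNER_TURBO;
        return cfg;
    }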
num_outputs) { + + char opcfg_name[GGML_MAX_NAME] = {}; + + //ensure the opcfg name is unique + if (nullptr == name) { + snprintf(opcfg_name, GGML_MAX_NAME, "opcfg_%-8d", ggmlqnn_get_idx(QNN_OPCFG_INDEX)); + } else { + snprintf(opcfg_name, GGML_MAX_NAME, "opcfg_%s_%-8d", name, ggmlqnn_get_idx(QNN_OPCFG_INDEX)); + } + //GGMLHEXAGON_LOG_DEBUG("create qnn opconfig %s", opcfg_name); + ggmlqnn_inc_idx(QNN_OPCFG_INDEX); + + Qnn_OpConfigV1_t v1 = {opcfg_name, package, type, + num_params, params, + num_inputs, inputs, + num_outputs, outputs + }; + Qnn_OpConfig_t opcfg = {QNN_OPCONFIG_VERSION_1, {v1}}; + + return opcfg; +} + +static Qnn_Tensor_t * ggmlqnn_create_general_tensor(qnn_instance * instance, Qnn_GraphHandle_t graph_handle, + const ggml_tensor * tensor, const char * name, + Qnn_TensorType_t qnn_tensor_type, + Qnn_DataType_t qnn_data_type, + uint32_t rank, uint32_t * dims, + void * data, uint32_t data_size, + bool b_transpose = false) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + char tensor_name[GGML_MAX_NAME] = {}; + + //ensure the tensor name is unique + if (nullptr == name) { + snprintf(tensor_name, GGML_MAX_NAME, "tensor_%-8d", ggmlqnn_get_idx(QNN_TENSOR_INDEX)); + } else { + snprintf(tensor_name, GGML_MAX_NAME, "tensor_%s%-8d", name, ggmlqnn_get_idx(QNN_TENSOR_INDEX)); + } + GGMLHEXAGON_LOG_DEBUG("init_tensor %s", tensor_name); + ggmlqnn_inc_idx(QNN_TENSOR_INDEX); + + uint32_t reverse_dims[GGML_MAX_DIMS] = {}; + uint32_t transpose_dims[GGML_MAX_DIMS] = {}; + uint32_t * tensor_dims = nullptr; + //case 1:use dims info from ggml tensor + if (nullptr != tensor) { + //there are different dimension order between ggml tensor and qnn tensor + for (size_t idx = 0; idx < rank; idx++) { + reverse_dims[idx] = (uint32_t)tensor->ne[rank - 1 - idx]; + } + tensor_dims = reverse_dims; + } + //case 2: use user's specified tensor_dims + if (nullptr != dims) { + tensor_dims = dims; + } + //case 3: transpose for dst tensor + if (b_transpose) { + GGML_ASSERT(tensor != nullptr); //ensure ggml_tensor is not nullptr for this special case + + ggmlqnn_get_qnn_dimensions_from_ggml_dimensions(transpose_dims, reverse_dims, ggml_n_dims(tensor)); + tensor_dims = transpose_dims; + } + + Qnn_Tensor_t qnn_tensor = { + .version = QNN_TENSOR_VERSION_1, + .v1 = { + .id = 0, + .name = tensor_name, + .type = qnn_tensor_type, + .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, + .dataType = qnn_data_type, + .quantizeParams = {.encodingDefinition = QNN_DEFINITION_UNDEFINED, + .quantizationEncoding = QNN_QUANTIZATION_ENCODING_UNDEFINED, + .scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0} + }, + .rank = rank, + .dimensions = tensor_dims, + .memType = QNN_TENSORMEMTYPE_RAW, + .clientBuf = {.data = nullptr, .dataSize = 0} + } + }; + Qnn_Tensor_t * p_qnn_tensor = (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); + if (nullptr == p_qnn_tensor) { + GGMLHEXAGON_LOG_WARN("calloc failed"); + return nullptr; + } + error = ggmlqnn_deep_copy_qnntensor(qnn_tensor, *p_qnn_tensor); + if (error != QNN_SUCCESS) { + free(p_qnn_tensor); + GGMLHEXAGON_LOG_WARN("init tensor failed"); + return nullptr; + } + + bool enable_npu_rpc = (instance->enable_qnn_rpc() && instance->get_device_id() == HEXAGON_BACKEND_QNNNPU); + if (enable_npu_rpc) { + QNN_VER_PTR(*p_qnn_tensor)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*p_qnn_tensor)->clientBuf = {.data=nullptr, .dataSize=0}; + } else { + QNN_VER_PTR(*p_qnn_tensor)->clientBuf = {data, data_size}; + } + QNN_INTERFACE_VER_TYPE qnn_raw_interface = instance->get_qnn_raw_interface(); + 
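The dimension handling above exists because ggml stores ne[] innermost-first (ne[0] is the contiguous axis) while QNN expects dimensions outermost-first. A standalone illustration of the reversal performed for case 1:

    #include <cstdint>
    #include <cstdio>

    // ggml: ne = {K, M, H, B} with ne[0] contiguous; QNN wants {B, H, M, K}.
    static void ggml_dims_to_qnn(const int64_t * ne, uint32_t rank, uint32_t * out) {
        for (uint32_t i = 0; i < rank; i++) {
            out[i] = (uint32_t) ne[rank - 1 - i];
        }
    }

    int main() {
        const int64_t ne[4] = { 128, 64, 12, 2 };   // K=128, M=64, H=12, B=2
        uint32_t qnn_dims[4];
        ggml_dims_to_qnn(ne, 4, qnn_dims);
        printf("%u %u %u %u\n", qnn_dims[0], qnn_dims[1], qnn_dims[2], qnn_dims[3]); // 2 12 64 128
        return 0;
    }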
CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_qnn_tensor)); + + return p_qnn_tensor; +} + +static Qnn_Tensor_t * ggmlqnn_create_compute_tensor(qnn_instance * instance, Qnn_GraphHandle_t graph_handle, + const ggml_tensor * tensor, Qnn_TensorType_t tensor_type) { + uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], + (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; + Qnn_DataType_t qnn_data_type = QNN_DATATYPE_FLOAT_32; + Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + + if (0 == tensor->flags) { + qnn_tensor_type = tensor_type; + } else { + if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { + qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + } else if (tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { + qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; + } + } + + qnn_data_type = ggmlqnn_datatype_from_ggml_datatype(tensor->type); + Qnn_Tensor_t * p_qnn_tensor = ggmlqnn_create_general_tensor(instance, graph_handle, tensor, nullptr, + qnn_tensor_type, qnn_data_type, + ggml_n_dims(tensor), dimensions, + nullptr, 0); + return p_qnn_tensor; +} + +// ================================================================================================= +// section-6: hwaccel approach through QNN: offload GGML op to QNN backend +// ================================================================================================= +/* + * provide a general skeleton to offload ggml op to QNN backend: perform element-wise + * operation on 1/2 input tensors and 1 output tensors +*/ +static void ggmlqnn_compute_elementwise(ggml_backend_hexagon_context * ctx, ggml_tensor * op) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + qnn_instance * instance = nullptr; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * p_tensor0 = nullptr; + Qnn_Tensor_t * p_tensor1 = nullptr; + Qnn_Tensor_t * p_tensor2 = nullptr; + const ggml_tensor * src0 = op->src[0]; + const ggml_tensor * src1 = op->src[1]; + ggml_tensor * dst = op; + + GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); + instance = ctx->instance; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + size_t qnn_op_index = ggmlhexagon_get_op_index(op); + const char * qnn_op_name = ggmlqnn_k_op_caps[qnn_op_index].qnn_op_name; + size_t input_param_count = ggmlqnn_k_op_caps[qnn_op_index].input_param_count; + const char * ggml_original_opname = ggml_op_name(op->op); + std::string ggml_op_name_string = std::string("ggml_") + ggml_original_opname; + const char * ggml_op_name = ggml_op_name_string.c_str(); + + std::string graph_name; + ggmlhexagon_get_opkey_from_op(op, graph_name); + + int input_size = ggml_nbytes(src0); + if (nullptr != src1) + input_size += ggml_nbytes(src1); + hexagon_perf op_perf(graph_name, ggml_original_opname, input_size, ggml_nbytes(dst)); + op_perf.start(); + + bool enable_npu_rpc = instance->enable_qnn_rpc() && ctx->device == HEXAGON_BACKEND_QNNNPU; + if (ctx->qnn_singlenode_graph_map.find(graph_name) != ctx->qnn_singlenode_graph_map.end()) { + //retrieve computational resource from cached QNN graph + qnn_singlenode_res_t & graph_item = ctx->qnn_singlenode_graph_map[graph_name]; + graph_handle = std::get<0>(graph_item); + qnn_ptensors_t & ptensors = std::get<1>(graph_item); + p_tensor0 = ptensors[0]; + if (2 == input_param_count) { + p_tensor1 = ptensors[1]; + p_tensor2 = ptensors[2]; + } else { + //now p_tensor1 is nullptr + p_tensor2 = ptensors[1]; + } + } else { + GGML_ASSERT(instance->get_device_id() == ctx->device); + GGMLHEXAGON_LOG_VERBOSE("graph name %s", 
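The skeleton above builds a QNN graph once per operation key and then replays it: ggmlhexagon_get_opkey_from_op() encodes the op type and tensor shapes into graph_name, and qnn_singlenode_graph_map caches the finalized graph together with its tensor pointers. A reduced standalone model of that cache pattern:

    #include <string>
    #include <tuple>
    #include <unordered_map>
    #include <utility>
    #include <vector>

    // Reduced model of qnn_singlenode_graph_map: one finalized graph plus its
    // tensors, keyed by a string encoding op type and shapes, so a graph is
    // built once and re-executed on every later call with the same shapes.
    using graph_handle_t = void *;
    using cached_graph_t = std::tuple<graph_handle_t, std::vector<int>>;

    static cached_graph_t & get_or_build(std::unordered_map<std::string, cached_graph_t> & cache,
                                         const std::string & key) {
        auto it = cache.find(key);
        if (it != cache.end()) {
            return it->second;          // cache hit: reuse the graph and its tensors
        }
        // cache miss: build, finalize, then remember (ints stand in for Qnn_Tensor_t *)
        cached_graph_t item = { nullptr, { 0, 1, 2 } };
        return cache.emplace(key, std::move(item)).first->second;
    }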
graph_name.c_str()); + //create QNN graph + error = instance->init_qnn_graph(graph_name, static_cast(ctx->device), + g_hexagon_appcfg.vtcm_size_in_mb, + g_hexagon_appcfg.hvx_threads); + if (QNN_SUCCESS != error) { + GGMLHEXAGON_LOG_WARN("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); + return; + } + graph_handle = instance->get_qnn_graph_handle(); + + //GGMLHEXAGON_LOG_DEBUG("graph_handle %p", graph_handle); + //create computational tensor + p_tensor0 = ggmlqnn_create_compute_tensor(instance, graph_handle, src0, QNN_TENSOR_TYPE_APP_WRITE); + if (2 == input_param_count) { + p_tensor1 = ggmlqnn_create_compute_tensor(instance, graph_handle, src1, QNN_TENSOR_TYPE_APP_WRITE); + } + p_tensor2 = ggmlqnn_create_compute_tensor(instance, graph_handle, dst, QNN_TENSOR_TYPE_APP_READ); + + //compose QNN graph + qnn_tensors_t input_tensors; + input_tensors.reserve(input_param_count); + input_tensors.push_back(*p_tensor0); + if (2 == input_param_count) { + input_tensors.push_back(*p_tensor1); + } + Qnn_Tensor_t output_tensors[] = { + *p_tensor2 + }; + Qnn_OpConfig_t op_config = ggmlqnn_create_op_config(ggml_op_name, + QNN_OP_PACKAGE_NAME_QTI_AISW, + qnn_op_name, nullptr, 0, + input_tensors.data(), + input_param_count, output_tensors, + 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, op_config)); + //finalize QNN graph + CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); + + //cache QNN graph + qnn_ptensors_t qnn_elementwise_tensors; + qnn_elementwise_tensors.reserve(input_param_count + 1); + + qnn_elementwise_tensors.push_back(p_tensor0); + if (2 == input_param_count) { + qnn_elementwise_tensors.push_back(p_tensor1); + } + qnn_elementwise_tensors.push_back(p_tensor2); + auto graph_item = std::make_tuple(graph_handle, qnn_elementwise_tensors); + ctx->qnn_singlenode_graph_map[graph_name] = graph_item; + } + + if (enable_npu_rpc) { + uint8_t * qnn_buffer_0 = static_cast(instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*p_tensor0)->memHandle)); + GGMLHEXAGON_LOG_DEBUG("qnn_rpcbuffer_0 = %p\n", qnn_buffer_0); + if (nullptr != qnn_buffer_0) { + memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); + } + + if (2 == input_param_count) { + uint8_t * qnn_buffer_1 = static_cast(instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*p_tensor1)->memHandle)); + GGMLHEXAGON_LOG_DEBUG("qnn_rpcbuffer_1 = %p\n", qnn_buffer_1); + if (nullptr != qnn_buffer_1) { + memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); + } + } + } else { + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; + if (2 == input_param_count) { + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; + } + QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; + } + + qnn_tensors_t input_tensors; + input_tensors.reserve(input_param_count); + input_tensors.push_back(*p_tensor0); + if (2 == input_param_count) { + input_tensors.push_back(*p_tensor1); + } + Qnn_Tensor_t output_tensors[] = { + *p_tensor2 + }; + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, + input_tensors.data(), input_param_count, + output_tensors, 1, + nullptr, nullptr)); + if (enable_npu_rpc) { + uint8_t * qnn_buffer_2 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor2)->memHandle)); + if (nullptr != qnn_buffer_2) { + memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); + } + } + + op_perf.info(); +} + +/* + * this function is AI-assisted code from 
Grok 3 for purpose of offload 4d matrix mulmat to QNN backend + * various UT has verified and succeed but failed in CT of test-backend-ops + * + * the logic of ggmlqnn_compute_mul_mat_4d is similar to ggmlqnn_compute_mul_mat but much more complicated + * than ggmlqnn_compute_mul_mat, so it's a standalone function. + * it will be combined with ggmlqnn_compute_mul_mat in the future + */ +static void ggmlqnn_compute_mul_mat_4d(ggml_backend_hexagon_context * ctx, ggml_tensor * op) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + qnn_instance * instance = ctx->instance; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + + const ggml_tensor * src0 = op->src[0]; + const ggml_tensor * src1 = op->src[1]; + ggml_tensor * dst = op; + + GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); + GGML_ASSERT(ggml_n_dims(src0) == 4 && ggml_n_dims(src1) == 4); + + hexagon_perf op_perf("ggmlqnn_compute_mul_mat_4d"); + op_perf.start(); + + std::string graph_name; + ggmlhexagon_get_opkey_from_op(op, graph_name); + GGMLHEXAGON_LOG_DEBUG("graph name %s\n", graph_name.c_str()); + + ggmlhexagon_print_tensors_info(__func__, ctx, src0, src1, dst); + + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * p_tensor0 = nullptr; + Qnn_Tensor_t * p_reshape0_out = nullptr; + Qnn_Tensor_t * p_tile0_out = nullptr; + Qnn_Tensor_t * p_tensor1 = nullptr; + Qnn_Tensor_t * p_permute1_out = nullptr; + Qnn_Tensor_t * p_reshape1_out = nullptr; + Qnn_Tensor_t * p_matmul_out = nullptr; + Qnn_Tensor_t * p_reshape2_out = nullptr; + + if (ctx->qnn_singlenode_graph_map.find(graph_name) != ctx->qnn_singlenode_graph_map.end()) { + qnn_singlenode_res_t & graph_item = ctx->qnn_singlenode_graph_map[graph_name]; + graph_handle = std::get<0>(graph_item); + qnn_ptensors_t & tensors = std::get<1>(graph_item); + p_tensor0 = tensors[0]; + p_reshape0_out = tensors[1]; + p_tile0_out = tensors[2]; + p_tensor1 = tensors[3]; + p_permute1_out = tensors[4]; + p_reshape1_out = tensors[5]; + p_matmul_out = tensors[6]; + p_reshape2_out = tensors[7]; + } else { + CHECK_QNN_API(error, qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), NULL, &graph_handle)); + + // Define dimensions + uint32_t K = src0->ne[0]; // Inner dimension + uint32_t M = src0->ne[1]; // Rows of src0 + uint32_t N = src1->ne[1]; // Columns of src1 + uint32_t B0 = src0->ne[2] * src0->ne[3]; // src0 batch + uint32_t B1 = src1->ne[2] * src1->ne[3]; // src1 batch (drives output) + + // Validate K only + GGML_ASSERT(src0->ne[0] == src1->ne[0]); // K must match + + // src0: [K, M, H0, B0] -> QNN: [B0, H0, M, K] + uint32_t src0_dims[] = {static_cast(src0->ne[3]), static_cast(src0->ne[2]), + static_cast(src0->ne[1]), static_cast(src0->ne[0]) + }; + p_tensor0 = ggmlqnn_create_general_tensor(instance, graph_handle, src0, "input0", + QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, + src0_dims, nullptr, 0); + + // Reshape src0 to [B0, M, K] + uint32_t reshape0_out_dims[] = {B0, M, K}; + p_reshape0_out = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "reshape0_out", + QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, + reshape0_out_dims, nullptr, 0); + + Qnn_Tensor_t reshape0_inputs[] = {*p_tensor0}; + Qnn_Tensor_t reshape0_outputs[] = {*p_reshape0_out}; + Qnn_OpConfig_t reshape0_op = ggmlqnn_create_op_config("reshape0", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_RESHAPE, nullptr, 0, + reshape0_inputs, 1, reshape0_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, reshape0_op)); + + // Tile src0 to match B1: [B0, M, 
K] -> [B1, M, K] + uint32_t tile0_out_dims[] = {B1, M, K}; + p_tile0_out = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "tile0_out", + QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, + tile0_out_dims, nullptr, 0); + + uint32_t tile_multiples[] = {B1 / B0, 1, 1}; + uint32_t tile_dims[] = {3}; + Qnn_Tensor_t * p_tile_multiples = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "tile_multiples", + QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, + tile_dims, tile_multiples, sizeof(tile_multiples)); + + Qnn_Param_t tile_params[] = {{.paramType = QNN_PARAMTYPE_TENSOR, .name = "multiples", .tensorParam = *p_tile_multiples}}; + Qnn_Tensor_t tile0_inputs[] = {*p_reshape0_out}; + Qnn_Tensor_t tile0_outputs[] = {*p_tile0_out}; + Qnn_OpConfig_t tile0_op = ggmlqnn_create_op_config("tile0", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_TILE, tile_params, 1, + tile0_inputs, 1, tile0_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, tile0_op)); + + // src1: [N, K, H1, B1] -> QNN: [B1, H1, N, K] + uint32_t src1_dims[] = {static_cast(src1->ne[3]), static_cast(src1->ne[2]), + static_cast(src1->ne[1]), static_cast(src1->ne[0]) + }; + p_tensor1 = ggmlqnn_create_general_tensor(instance, graph_handle, src1, "input1", + QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, + src1_dims, nullptr, 0); + + + // Permute src1 to [B1, H1, K, N] + uint32_t perm_data[] = {0, 1, 3, 2}; + uint32_t perm_dims[] = {4}; + Qnn_Tensor_t * p_perm = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "perm", + QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, + perm_dims, perm_data, sizeof(perm_data)); + + uint32_t permute1_out_dims[] = {static_cast(src1->ne[3]), static_cast(src1->ne[2]), + static_cast(src1->ne[0]), static_cast(src1->ne[1]) + }; + p_permute1_out = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "permute1_out", + QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 4, + permute1_out_dims, nullptr, 0); + + Qnn_Param_t permute1_params[] = {{.paramType = QNN_PARAMTYPE_TENSOR, .name = "perm", .tensorParam = *p_perm}}; + Qnn_Tensor_t permute1_inputs[] = {*p_tensor1}; + Qnn_Tensor_t permute1_outputs[] = {*p_permute1_out}; + Qnn_OpConfig_t permute1_op = ggmlqnn_create_op_config("permute1", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_TRANSPOSE, permute1_params, 1, + permute1_inputs, 1, permute1_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, permute1_op)); + + // Reshape src1 to [B1, K, N] + uint32_t reshape1_out_dims[] = {B1, K, N}; + p_reshape1_out = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "reshape1_out", + QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, + reshape1_out_dims, nullptr, 0); + + Qnn_Tensor_t reshape1_inputs[] = {*p_permute1_out}; + Qnn_Tensor_t reshape1_outputs[] = {*p_reshape1_out}; + Qnn_OpConfig_t reshape1_op = ggmlqnn_create_op_config("reshape1", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_RESHAPE, nullptr, 0, + reshape1_inputs, 1, reshape1_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, reshape1_op)); + + // MatMul: [B1, M, K] x [B1, K, N] -> [B1, M, N] + uint32_t matmul_out_dims[] = {B1, M, N}; + p_matmul_out = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "matmul_out", + QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, + matmul_out_dims, nullptr, 0); + + Qnn_Tensor_t matmul_inputs[] = {*p_tile0_out, *p_reshape1_out}; + Qnn_Tensor_t matmul_outputs[] = {*p_matmul_out}; + Qnn_OpConfig_t matmul_op = ggmlqnn_create_op_config("matmul", 
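The node chain composed above reproduces ggml's broadcast mul_mat with stock QNN ops: src0 is flattened and tiled up to src1's batch, src1 is transposed and flattened, and a single batched MatMul produces the result. A compact restatement of the shape flow (hypothetical checker, with B0 and B1 as defined above):

    #include <cassert>
    #include <cstdint>

    // Shape flow of the 4D mul_mat graph above:
    //   src0 -> Reshape [B0,M,K] -> Tile x(B1/B0) -> [B1,M,K]
    //   src1 -> Transpose(perm {0,1,3,2}) -> Reshape -> [B1,K,N]
    //   MatMul -> [B1,M,N] -> Reshape back to the dst layout.
    static void check_mul_mat_4d_shapes(uint32_t M, uint32_t N, uint32_t K,
                                        uint32_t B0, uint32_t B1) {
        assert(B1 % B0 == 0 && "broadcast requires src1 batch to be a multiple of src0 batch");
        uint32_t tiled0[3]    = { B1, M, K };  // after Reshape + Tile on src0
        uint32_t reshaped1[3] = { B1, K, N };  // after Transpose + Reshape on src1
        uint32_t out[3]       = { B1, M, N };  // batched MatMul result
        assert(tiled0[2] == reshaped1[1]);     // the inner K dimensions must agree
        (void) out;
    }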
QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_MAT_MUL, nullptr, 0, + matmul_inputs, 2, matmul_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, matmul_op)); + + // Output: [N, M, H1, B1] -> QNN: [B1, H1, M, N] + uint32_t reshape2_out_dims[] = {static_cast(dst->ne[3]), static_cast(dst->ne[2]), + static_cast(dst->ne[1]), static_cast(dst->ne[0]) + }; + p_reshape2_out = ggmlqnn_create_general_tensor(instance, graph_handle, dst, "output", + QNN_TENSOR_TYPE_APP_READ, QNN_DATATYPE_FLOAT_32, 4, + reshape2_out_dims, nullptr, 0); + + Qnn_Tensor_t reshape2_inputs[] = {*p_matmul_out}; + Qnn_Tensor_t reshape2_outputs[] = {*p_reshape2_out}; + Qnn_OpConfig_t reshape2_op = ggmlqnn_create_op_config("reshape2", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_RESHAPE, nullptr, 0, + reshape2_inputs, 1, reshape2_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, reshape2_op)); + + // Finalize + CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, NULL, NULL)); + + // Cache + qnn_ptensors_t ggml_op_mulmat_tensors = {p_tensor0, p_reshape0_out, p_tile0_out, p_tensor1, + p_permute1_out, p_reshape1_out, p_matmul_out, p_reshape2_out + }; + ctx->qnn_singlenode_graph_map[graph_name] = std::make_tuple(graph_handle, ggml_op_mulmat_tensors); + } + + // Execute + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, static_cast(ggml_nbytes(src0))}; + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, static_cast(ggml_nbytes(src1))}; + QNN_VER_PTR(*p_reshape2_out)->clientBuf = {dst->data, static_cast(ggml_nbytes(dst))}; + + Qnn_Tensor_t input_tensors[] = {*p_tensor0, *p_tensor1}; + Qnn_Tensor_t output_tensors[] = {*p_reshape2_out}; + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, input_tensors, 2, output_tensors, 1, NULL, NULL)); + + op_perf.info(); +} + +/* + * @brief performs matrix multiplication with FP32 & quantized weights and floating-point inputs + * using the QNN backend. this function performs matrix multiplication of the input tensor + * `src1` and the weight tensor `src0`, handling transposing, and quantization as needed, + * and stores the result in the destination tensor `dst`. + * + there are two key-points in properly handling how to offload mulmat to the QNN + 1. transpose + a 3x2 f32 matrix which means 3 rows and 2 columns. in ggml, it could be created from: + struct ggml_tensor* matrix = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3); + which like this: + +---+---+ + | 0 | 1 | + +---+---+ + | 2 | 3 | + +---+---+ + | 4 | 5 | + +---+---+ + with + ne[0] = 2 + ne[1] = 3 + there are different dimension order between ggml tensor and qnn tensor + + 2. QNN's MatMul can only support input tensors with rank >= 2 + + in the all, there is gap between ggml mulmat and QNN mulmat,we need to perform a transpose + operation when offloading mulmat to QNN backend. this implementation will handle transpose + in func ggmlqnn_create_general_tensor() + + * @param ctx the context of backend + * @param op the destination tensor where the result of the matrix multiplication will be stored. + * + * @note the logic of ggmlqnn_compute_mul_mat is similar to ggmlqnn_compute_op_two_tensors but much more complicated + * than ggmlqnn_compute_op_two_tensors. so it's a standalone function. accordingly, this is another + * typical skeleton for offload other ggml ops to QNN backend. MUL_MAT take most of the compute + * time (about 95%).so to speed up llama inference, should focus on this func. 
+
+/*
+ * @brief performs matrix multiplication with FP32 or quantized weights and floating-point inputs
+ *        using the QNN backend. this function multiplies the input tensor `src1` by the weight
+ *        tensor `src0`, handling transposition and dequantization as needed, and stores the
+ *        result in the destination tensor `dst`.
+ *
+   there are two key points in properly offloading mulmat to QNN:
+   1. transpose
+      a 3x2 f32 matrix means 3 rows and 2 columns. in ggml it could be created as:
+      struct ggml_tensor * matrix = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);
+      which looks like this:
+      +---+---+
+      | 0 | 1 |
+      +---+---+
+      | 2 | 3 |
+      +---+---+
+      | 4 | 5 |
+      +---+---+
+      with
+      ne[0] = 2
+      ne[1] = 3
+      the dimension order differs between a ggml tensor and a QNN tensor
+
+   2. QNN's MatMul only supports input tensors with rank >= 2
+
+   all in all, there is a gap between ggml mulmat and QNN mulmat, so a transpose operation is
+   required when offloading mulmat to the QNN backend. this implementation handles the transpose
+   in ggmlqnn_create_general_tensor()
+
+ * @param ctx the context of the backend
+ * @param op the destination tensor where the result of the matrix multiplication will be stored
+ *
+ * @note the logic of ggmlqnn_compute_mul_mat is similar to ggmlqnn_compute_op_two_tensors but much
+ *       more complicated, so it is a standalone function and another typical skeleton for
+ *       offloading other ggml ops to the QNN backend. MUL_MAT takes most of the compute time
+ *       (about 95%), so speeding up llama inference means focusing on this function. there are
+ *       three kinds of MUL_MAT to compute:
+ *       mul_mat_f32:     both src0 and src1 are F32, naturally handled by the QNN backend
+ *       mul_mat_f16_f32: src0 is F16 and src1 is F32, convert src0 to F32 as src0', then src0' * src1
+ *       mul_mat_q_f32:   src0 is quantized (Q4_0, Q4_1, Q6_K...) and src1 is F32, dequantize src0
+ *                        to F32 as src0', then src0' * src1
+*/
+static void ggmlqnn_compute_mul_mat(ggml_backend_hexagon_context * ctx, ggml_tensor * op) {
+    Qnn_ErrorHandle_t error            = QNN_SUCCESS;
+    qnn_instance * instance            = nullptr;
+    Qnn_GraphHandle_t graph_handle     = nullptr;
+    Qnn_Tensor_t * p_tensor0           = nullptr;
+    Qnn_Tensor_t * p_tensor1           = nullptr;
+    Qnn_Tensor_t * p_tensor2           = nullptr;
+    Qnn_Tensor_t * p_param_tensor      = nullptr;
+    Qnn_Tensor_t * p_tensor2_transpose = nullptr;
+    const ggml_tensor * src0           = op->src[0];
+    const ggml_tensor * src1           = op->src[1];
+    ggml_tensor * dst                  = op;
+
+    GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst);
+    instance = ctx->instance;
+    QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;
+
+    const enum ggml_type src0_type    = src0->type;
+    const uint32_t src0_rank          = ggml_n_dims(src0);
+    const uint32_t src1_rank          = ggml_n_dims(src1);
+    const char * ggml_original_opname = ggml_op_name(op->op);
+    ggmlhexagon_print_tensors_info(__func__, ctx, src0, src1, dst);
+
+    std::string graph_name;
+    ggmlhexagon_get_opkey_from_op(op, graph_name);
+
+    int input_size = ggml_nbytes(src0);
+    if (nullptr != src1)
+        input_size += ggml_nbytes(src1);
+    hexagon_perf op_perf(graph_name, ggml_original_opname, input_size, ggml_nbytes(dst));
+    op_perf.start();
+
+    GGML_ASSERT(src0_rank == src1_rank);
+    GGML_ASSERT(src0_rank >= 2); //QNN SDK's limitation, make QNN SDK happy
+    if (4 == src0_rank) {
+        return ggmlqnn_compute_mul_mat_4d(ctx, op);
+    }
+
+    void * wdata = ggmlhexagon_type_trait(ctx, op);
+    const size_t desired_size = ctx->desired_size;
+
+    if (ctx->qnn_singlenode_graph_map.find(graph_name) != ctx->qnn_singlenode_graph_map.end()) {
+        //retrieve computational resource from cached QNN graph
+        qnn_singlenode_res_t & graph_item = ctx->qnn_singlenode_graph_map[graph_name];
+        graph_handle = std::get<0>(graph_item);
+        qnn_ptensors_t & tensors = std::get<1>(graph_item);
+        p_tensor0           = tensors[0];
+        p_tensor1           = tensors[1];
+        p_tensor2           = tensors[2];
+        p_param_tensor      = tensors[3];
+        p_tensor2_transpose = tensors[4];
+    } else {
+        //create QNN graph
+        GGMLHEXAGON_LOG_VERBOSE("graph name %s", graph_name.c_str());
+        error = instance->init_qnn_graph(graph_name, static_cast<QNNBackend>(ctx->device),
+                                         g_hexagon_appcfg.vtcm_size_in_mb,
+                                         g_hexagon_appcfg.hvx_threads);
+        if (QNN_SUCCESS != error) {
+            GGMLHEXAGON_LOG_WARN("can't create qnn graph handle with graph name %s, error = %d\n",
+                                 graph_name.c_str(), error);
+            return;
+        }
+        graph_handle = instance->get_qnn_graph_handle();
+
+        //create computational tensor
+        p_tensor0 = ggmlqnn_create_general_tensor(instance, graph_handle, src0, nullptr,
+                                                  QNN_TENSOR_TYPE_APP_WRITE,
+                                                  QNN_DATATYPE_FLOAT_32, src0_rank,
+                                                  nullptr, nullptr, 0);
+        p_tensor1 = ggmlqnn_create_general_tensor(instance, graph_handle, src1, nullptr,
+                                                  QNN_TENSOR_TYPE_APP_WRITE,
+                                                  QNN_DATATYPE_FLOAT_32, src0_rank,
+                                                  nullptr, nullptr, 0);
+        p_tensor2 = ggmlqnn_create_general_tensor(instance, graph_handle, dst, nullptr,
+                                                  QNN_TENSOR_TYPE_APP_READ,
+                                                  QNN_DATATYPE_FLOAT_32, src0_rank,
+                                                  nullptr, nullptr, 0);
+
+        //create param tensor for offload 2d/3d/4d matrix multiplication
+        const uint32_t param_tensor_data[GGML_MAX_DIMS][GGML_MAX_DIMS] = {
+                {0},
+                {1, 0},
+                {0, 2, 1},
+                {0, 1, 3, 2},
+        };
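+        // note: row [src0_rank - 1] of param_tensor_data is the permutation that swaps the two
+        // innermost axes for that rank (e.g. {1, 0} for rank 2, {0, 2, 1} for rank 3); it feeds
+        // the transpose node below, which maps the MatMul output back to ggml's expected layout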
+        uint32_t param_tensor_dims[1] = {src0_rank};
+        p_param_tensor = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "param",
+                                                       QNN_TENSOR_TYPE_STATIC,
+                                                       QNN_DATATYPE_UINT_32, 1,
+                                                       param_tensor_dims,
+                                                       (void *) (param_tensor_data[src0_rank - 1]),
+                                                       src0_rank * sizeof(uint32_t));
+
+        //create transpose tensor
+        p_tensor2_transpose = ggmlqnn_create_general_tensor(instance, graph_handle, dst,
+                                                            "transpose",
+                                                            QNN_TENSOR_TYPE_NATIVE,
+                                                            QNN_DATATYPE_FLOAT_32, src0_rank,
+                                                            nullptr, nullptr, 0, true);
+
+        //compose QNN graph: add mulmat node
+        Qnn_Param_t out_0_params[] = {
+                {.paramType = QNN_PARAMTYPE_SCALAR, .name = QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, .scalarParam = {
+                        .dataType = QNN_DATATYPE_BOOL_8, .bool8Value = 1}}};
+        Qnn_Tensor_t out_0_inputs[]  = {*p_tensor0, *p_tensor1};
+        Qnn_Tensor_t out_0_outputs[] = {*p_tensor2_transpose};
+        Qnn_OpConfig_t out_0 = ggmlqnn_create_op_config("mulmat_opconfig",
+                                                        QNN_OP_PACKAGE_NAME_QTI_AISW,
+                                                        QNN_OP_MAT_MUL, out_0_params, 1,
+                                                        out_0_inputs, 2, out_0_outputs, 1);
+        CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, out_0));
+
+        //compose QNN graph: add transpose node
+        Qnn_Param_t out_trans1_0_params[] = {
+                {.paramType = QNN_PARAMTYPE_TENSOR, .name = "perm", .tensorParam = *p_param_tensor}};
+        Qnn_Tensor_t out_trans1_0_inputs[]  = {*p_tensor2_transpose};
+        Qnn_Tensor_t out_trans1_0_outputs[] = {*p_tensor2};
+        Qnn_OpConfig_t out_trans1_0 = ggmlqnn_create_op_config("mulmat_transpose_opconfig",
+                                                               QNN_OP_PACKAGE_NAME_QTI_AISW,
+                                                               QNN_OP_TRANSPOSE,
+                                                               out_trans1_0_params, 1,
+                                                               out_trans1_0_inputs, 1,
+                                                               out_trans1_0_outputs, 1);
+        CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, out_trans1_0));
+
+        //finalize QNN graph
+        CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr));
+
+        //cache QNN graph
+        qnn_ptensors_t ggml_op_mulmat_tensors;
+        ggml_op_mulmat_tensors.reserve(5);
+        ggml_op_mulmat_tensors.push_back(p_tensor0);
+        ggml_op_mulmat_tensors.push_back(p_tensor1);
+        ggml_op_mulmat_tensors.push_back(p_tensor2);
+        ggml_op_mulmat_tensors.push_back(p_param_tensor);
+        ggml_op_mulmat_tensors.push_back(p_tensor2_transpose);
+        auto graph_item = std::make_tuple(graph_handle, ggml_op_mulmat_tensors);
+        ctx->qnn_singlenode_graph_map[graph_name] = graph_item;
+    }
+
+    if (src0_type != GGML_TYPE_F32) {
+        QNN_VER_PTR(*p_tensor0)->clientBuf = {wdata, static_cast<uint32_t>(desired_size)};
+    } else {
+        QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)};
+    }
+    QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)};
+    QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)};
+
+    Qnn_Tensor_t tensor_inputs[] = {
+            *p_tensor0,
+            *p_tensor1
+    };
+    Qnn_Tensor_t tensor_outputs[] = {
+            *p_tensor2
+    };
+    CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle,
+                                                        tensor_inputs, 2,
+                                                        tensor_outputs, 1,
+                                                        nullptr, nullptr));
+    op_perf.info();
+}
+
+static void ggmlqnn_compute_repeat(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) {
+    GGML_UNUSED(ctx);
+    GGML_UNUSED(dst);
+}
+
+static void ggmlqnn_compute_div(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) {
+    GGML_UNUSED(ctx);
+    GGML_UNUSED(dst);
+}
+
+static void ggmlqnn_compute_leaky_relu(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) {
+    GGML_UNUSED(ctx);
+    GGML_UNUSED(dst);
+}
+
+static void ggmlqnn_compute_concat(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) {
+    GGML_UNUSED(ctx);
+    GGML_UNUSED(dst);
+}
+
+static void 
ggmlqnn_compute_arange(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_sqr(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_clamp(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_scale(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_argsort(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_norm(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_group_norm(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_acc(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_sum_rows(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_upsample_nearest2d(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_pad(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_pool2d(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_dup(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_rms_norm(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_im2col(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_timestep_embedding(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_cpy(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + ggmlqnn_compute_dup(ctx, dst); +} + +static void ggmlqnn_compute_softmax(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_get_rows(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_rope(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +// ================================================================================================= +// section-7: cDSP helper function +// ================================================================================================= +static const char * ggmlhexagon_get_dsp_name(int domain_id) { + switch (domain_id) { + case HEXAGON_ADSP: + return "Hexagon-aDSP"; + case HEXAGON_MDSP: + return "Hexagon-mDSP"; + case HEXAGON_SDSP: + return "Hexagon-sDSP"; + case HEXAGON_CDSP: + return "Hexagon-cDSP"; + case HEXAGON_CDSP1: + return "Hexagon-cDSP1"; + default: + return "Hexagon-unknown"; + } +} + +static int ggmlhexagon_pd_status_notifier_callback(void * context, int domain, int session, remote_rpc_status_flags_t status){ + int error = AEE_SUCCESS; + switch (status){ + case 
FASTRPC_USER_PD_UP:
+            GGMLHEXAGON_LOG_DEBUG("PD is up\n");
+            break;
+        case FASTRPC_USER_PD_EXIT:
+            GGMLHEXAGON_LOG_DEBUG("PD closed\n");
+            break;
+        case FASTRPC_USER_PD_FORCE_KILL:
+            GGMLHEXAGON_LOG_DEBUG("PD force kill\n");
+            break;
+        case FASTRPC_USER_PD_EXCEPTION:
+            GGMLHEXAGON_LOG_DEBUG("PD exception\n");
+            break;
+        case FASTRPC_DSP_SSR:
+            GGMLHEXAGON_LOG_DEBUG("DSP SSR\n");
+            break;
+        default:
+            error = AEE_EBADITEM;
+            break;
+    }
+    return error;
+}
+
+static domain * ggmlhexagon_get_domain(int domain_id) {
+    int size = sizeof(hexagon_supported_domains) / sizeof(domain);
+
+    for (int i = 0; i < size; i++) {
+        if (hexagon_supported_domains[i].id == domain_id)
+            return &hexagon_supported_domains[i];
+    }
+
+    return nullptr;
+}
+
+static bool ggmlhexagon_is_cdsp(int domain_id) {
+    return (domain_id == HEXAGON_CDSP) || (domain_id == HEXAGON_CDSP1);
+}
+
+static bool ggmlhexagon_is_valid_domain_id(int domain_id, int compute_only) {
+    int size = sizeof(hexagon_supported_domains) / sizeof(domain);
+
+    if (0 != compute_only) {
+        return ggmlhexagon_is_cdsp(domain_id);
+    }
+
+    for (int i = 0; i < size; i++) {
+        if (hexagon_supported_domains[i].id == domain_id)
+            return true;
+    }
+
+    return false;
+}
+
+static int ggmlhexagon_get_domains_info(const char * domain_type, int * num_domains, fastrpc_domain ** domains_info) {
+    int hexagon_err = AEE_SUCCESS;
+    int ss_info     = 0;
+    void * buffer   = nullptr;
+    ss_info = strcmp(domain_type, "NSP") ? HPASS : NSP;
+    system_req_payload req;
+    memset(&req, 0, sizeof(system_req_payload));
+    req.id = FASTRPC_GET_DOMAINS;
+    req.sys.domains = nullptr;
+    fastrpc_domain * domain = nullptr;
+
+    if (ss_info != 0) {
+        req.sys.flags = DOMAINS_LIST_FLAGS_SET_TYPE(req.sys.flags, ss_info);
+    } else {
+        req.sys.flags = 0;
+    }
+
+#ifdef _WIN32
+    hexagon_err = AEE_EUNSUPPORTED;
+    goto bail;
+#endif
+
+    hexagon_err = remote_system_request(&req);
+    if (hexagon_err != AEE_SUCCESS) {
+        GGMLHEXAGON_LOG_DEBUG("failure in remote_system_request call: %d", hexagon_err);
+        goto bail;
+    }
+    //allocate memory for domain-info array
+    req.sys.max_domains = req.sys.num_domains;
+    buffer = calloc(req.sys.num_domains, sizeof(fastrpc_domain));
+    if (nullptr == buffer) {
+        hexagon_err = AEE_ENOMEMORY;
+        GGMLHEXAGON_LOG_DEBUG("unable to allocate memory for req.sys.domains");
+        goto bail;
+    }
+    req.sys.domains = static_cast<fastrpc_domain *>(buffer);
+    hexagon_err = remote_system_request(&req);
+    if (hexagon_err != AEE_SUCCESS) {
+        GGMLHEXAGON_LOG_DEBUG("failure in remote_system_request call: %d.\n", hexagon_err);
+        goto bail;
+    }
+
+    for (int i = 0; i < req.sys.num_domains; i++) {
+        //verify that only domains of the requested type were returned
+        domain = &req.sys.domains[i];
+        if (domain->type != ss_info) {
+            hexagon_err = -1;
+            GGMLHEXAGON_LOG_DEBUG("incorrect data received from remote_system_request.\n");
+            goto bail;
+        }
+    }
+    *domains_info = req.sys.domains;
+    *num_domains  = req.sys.num_domains;
+
+bail:
+    if (hexagon_err && req.sys.domains) {
+        free(req.sys.domains);
+    }
+    return hexagon_err;
+}
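The DSP probes in this section all follow the same remote_handle_control(DSPRPC_GET_DSP_INFO, ...) pattern with a remote_dsp_capability request. A hedged sketch of that shared shape, reusing only types and constants that already appear in this file (query_dsp_capability is a hypothetical name, not part of the patch):

// hypothetical helper, not part of the patch: one query via the FastRPC capability API
static int query_dsp_capability(int domain_id, uint32_t attribute_id, uint32_t * out_value) {
    struct remote_dsp_capability cap = {(uint32_t)domain_id, attribute_id, 0};
    int err = remote_handle_control(DSPRPC_GET_DSP_INFO, &cap, sizeof(cap));
    if (err == AEE_SUCCESS) {
        *out_value = cap.capability; // the queried attribute's value
    }
    return err;
}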
+
+static int ggmlhexagon_get_dsp_support(int * domain) {
+    int hexagon_error = AEE_SUCCESS;
+    *domain = HEXAGON_CDSP;
+
+    if (remote_handle_control) {
+        struct remote_dsp_capability dsp_capability_domain = {HEXAGON_CDSP, DOMAIN_SUPPORT, 0};
+        hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct remote_dsp_capability));
+        if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
+            GGMLHEXAGON_LOG_DEBUG("FastRPC Capability API is not supported on this device");
+            goto bail;
+        }
+
+        if (0 == dsp_capability_domain.capability) {
+            dsp_capability_domain.domain       = HEXAGON_ADSP;
+            dsp_capability_domain.attribute_ID = DOMAIN_SUPPORT;
+            dsp_capability_domain.capability   = 0;
+            hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct remote_dsp_capability));
+            if (dsp_capability_domain.capability) {
+                *domain = HEXAGON_ADSP;
+            }
+        }
+
+        if (hexagon_error != AEE_SUCCESS) {
+            GGMLHEXAGON_LOG_DEBUG("get_dsp_support failed with error 0x%x", hexagon_error);
+            goto bail;
+        }
+    } else {
+        hexagon_error = AEE_EUNSUPPORTEDAPI;
+        GGMLHEXAGON_LOG_DEBUG("remote_dsp_capability interface is not supported on this device");
+    }
+
+bail:
+    return hexagon_error;
+}
+
+static int ggmlhexagon_get_vtcm_info(int domain, uint32_t attr, uint32_t * capability) {
+    int hexagon_error = AEE_SUCCESS;
+    *capability = 0;
+
+    if (attr != VTCM_PAGE && attr != VTCM_COUNT) {
+        hexagon_error = AEE_EBADPARM;
+        GGMLHEXAGON_LOG_DEBUG("unsupported attr, only VTCM_PAGE and VTCM_COUNT supported");
+        goto bail;
+    }
+
+    if (remote_handle_control) {
+        if (domain == HEXAGON_ADSP || domain == HEXAGON_CDSP) {
+            /*
+             * query the DSP for VTCM information
+             * since the ADSP does not have a dedicated VTCM, we expect the output to be 0
+             */
+            struct remote_dsp_capability dsp_capability_vtcm_dsp;
+            dsp_capability_vtcm_dsp.domain       = (uint32_t)domain;
+            dsp_capability_vtcm_dsp.attribute_ID = attr;
+            dsp_capability_vtcm_dsp.capability   = (uint32_t)0;
+            hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_vtcm_dsp, sizeof(struct remote_dsp_capability));
+            if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
+                GGMLHEXAGON_LOG_DEBUG("FastRPC Capability API is not supported on this device");
+                GGMLHEXAGON_LOG_DEBUG("running the use case without checking the capability");
+                hexagon_error = AEE_SUCCESS;
+                goto bail;
+            } else if (hexagon_error == AEE_SUCCESS) {
+                *capability = dsp_capability_vtcm_dsp.capability;
+            } else {
+                GGMLHEXAGON_LOG_DEBUG("get_vtcm_info failed with error 0x%x", hexagon_error);
+                goto bail;
+            }
+        } else {
+            hexagon_error = AEE_EUNSUPPORTED;
+            GGMLHEXAGON_LOG_DEBUG("unsupported domain %d", domain);
+            goto bail;
+        }
+    } else {
+        hexagon_error = AEE_EUNSUPPORTEDAPI;
+        GGMLHEXAGON_LOG_DEBUG("remote_dsp_capability interface is not supported on this device");
+    }
+
+bail:
+    return hexagon_error;
+}
+
+static bool ggmlhexagon_is_unsignedpd_supported(int domain_id) {
+    int hexagon_error = AEE_SUCCESS;
+    if (remote_handle_control) {
+        struct remote_dsp_capability dsp_capability_domain = {static_cast<uint32_t>(domain_id), UNSIGNED_PD_SUPPORT, 0};
+        hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct remote_dsp_capability));
+        if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
+            GGMLHEXAGON_LOG_WARN("FastRPC Capability API is not supported on this device. falling back to signed PD");
+            return false;
+        }
+
+        if (hexagon_error) {
+            GGMLHEXAGON_LOG_WARN("error 0x%x: FastRPC Capability API failed. falling back to signed PD", hexagon_error);
+            return false;
+        }
+
+        if (dsp_capability_domain.capability == 1) {
+            return true;
+        }
+    } else {
+        hexagon_error = AEE_EUNSUPPORTEDAPI;
+        GGMLHEXAGON_LOG_WARN("remote_dsp_capability interface is not supported on this device. falling back to signed PD");
+        return false;
+    }
+
+    return false;
+}
+
+static bool ggmlhexagon_get_unsignedpd_support(void) {
+    return ggmlhexagon_is_unsignedpd_supported(HEXAGON_CDSP);
+}
+
+static bool ggmlhexagon_is_async_fastrpc_supported(int domain) {
+    int hexagon_error = AEE_SUCCESS;
+    if (remote_handle_control) {
+        if (domain == HEXAGON_CDSP) {
+            /*
+             * query the DSP for ASYNC_FASTRPC_SUPPORT information
+             * async FastRPC is supported only on the cDSP
+             */
+            struct remote_dsp_capability dsp_capability_async_support;
+            dsp_capability_async_support.domain       = (uint32_t)domain;
+            dsp_capability_async_support.attribute_ID = ASYNC_FASTRPC_SUPPORT;
+            dsp_capability_async_support.capability   = (uint32_t)0;
+            hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_async_support, sizeof(struct remote_dsp_capability));
+            if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
+                GGMLHEXAGON_LOG_WARN("FastRPC Capability API is not supported on this device");
+                hexagon_error = AEE_SUCCESS;
+                goto bail;
+            } else if (dsp_capability_async_support.capability == 1) {
+                return true;
+            }
+
+            if (hexagon_error != AEE_SUCCESS) {
+                GGMLHEXAGON_LOG_WARN("failed with error 0x%x", hexagon_error);
+                goto bail;
+            }
+        } else {
+            hexagon_error = AEE_EUNSUPPORTED;
+            GGMLHEXAGON_LOG_WARN("async FastRPC is not supported on domain %d", domain);
+            goto bail;
+        }
+    } else {
+        hexagon_error = AEE_EUNSUPPORTEDAPI;
+        GGMLHEXAGON_LOG_WARN("remote_dsp_capability interface is not supported on this device");
+    }
+
+bail:
+    return false;
+}
+
+static void ggmlhexagon_set_rpc_latency(remote_handle64 handle, int qos, int latency) {
+    int hexagon_error = AEE_SUCCESS;
+
+    if (remote_handle_control) {
+        struct remote_rpc_control_latency data;
+/*
+    qos          | latency
+    -----------------------
+    RPC_PM_QOS   | 100
+    RPC_POLL_QOS | 1000
+*/
+        data.enable  = qos;
+        data.latency = latency;
+        hexagon_error = remote_handle64_control(handle, DSPRPC_CONTROL_LATENCY, (void *)&data, sizeof(data));
+        if (hexagon_error != AEE_SUCCESS) {
GGMLHEXAGON_LOG_WARN("remote_session_control failed with 0x%x when setting thread priority\n", err); + } else { + GGMLHEXAGON_LOG_VERBOSE("thread priority set to %d\n", priority); + } + } else { + GGMLHEXAGON_LOG_WARN("cannot set thread priority\n"); + } + return err; +} + +static bool ggmlhexagon_is_status_notification_supported(int domain) { + int hexagon_error = AEE_SUCCESS; + + if (remote_handle_control) { + /* + * Query the DSP for STATUS_NOTIFICATION_SUPPORT information + * DSP User PD status notification Support + */ + struct remote_dsp_capability dsp_capability_status_notification_support; + dsp_capability_status_notification_support.domain = (uint32_t)domain; + dsp_capability_status_notification_support.attribute_ID = STATUS_NOTIFICATION_SUPPORT; + dsp_capability_status_notification_support.capability = (uint32_t)0; + hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_status_notification_support, sizeof(struct remote_dsp_capability)); + if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { + GGMLHEXAGON_LOG_WARN("FastRPC Capability API is not supported on this device"); + hexagon_error = AEE_SUCCESS; + goto bail; + } else if (1 == dsp_capability_status_notification_support.capability) { + return true; + } + + if (hexagon_error != AEE_SUCCESS){ + GGMLHEXAGON_LOG_WARN("failed with error 0x%x", hexagon_error); + goto bail; + } + } else { + hexagon_error = AEE_EUNSUPPORTEDAPI; + GGMLHEXAGON_LOG_WARN("remote_dsp_capability interface is not supported on this device"); + } + +bail: + return false; +} + +static int ggmlhexagon_get_hmx_support_info(int domain, uint32_t attr, uint32_t * capability) { + int hexagon_error = AEE_SUCCESS; + *capability = 0; + + if (attr != HMX_SUPPORT_SPATIAL && attr != HMX_SUPPORT_DEPTH) { + hexagon_error = AEE_EBADPARM; + GGMLHEXAGON_LOG_WARN("unsupported attr, only HMX_SUPPORT_SPATIAL and HMX_SUPPORT_DEPTH supported"); + goto bail; + } + + if (remote_handle_control) { + if (domain == HEXAGON_CDSP) { + /* + * Query the DSP for HMX SUPPORT information + * HMX is supported on CDSP only + */ + struct remote_dsp_capability dsp_capability_hmx_dsp; + dsp_capability_hmx_dsp.domain = (uint32_t)domain; + dsp_capability_hmx_dsp.attribute_ID = attr; + dsp_capability_hmx_dsp.capability = (uint32_t)0; + hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_hmx_dsp, sizeof(struct remote_dsp_capability)); + if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { + GGMLHEXAGON_LOG_DEBUG("FastRPC Capability API is not supported on this device"); + hexagon_error = AEE_SUCCESS; + goto bail; + } + else if (hexagon_error == AEE_SUCCESS) { + *capability = dsp_capability_hmx_dsp.capability; + } else { + GGMLHEXAGON_LOG_DEBUG("get_hmx_support_info failed with Error 0x%x", hexagon_error); + goto bail; + } + } else { + hexagon_error = AEE_EUNSUPPORTED; + GGMLHEXAGON_LOG_DEBUG("HMX support is not there for domain %d", domain); + goto bail; + } + } else { + hexagon_error = AEE_EUNSUPPORTEDAPI; + GGMLHEXAGON_LOG_DEBUG("remote_dsp_capability interface is not supported on this device"); + } + +bail: + return hexagon_error; +} + +static int ggmlhexagon_get_hvx_arch_ver(int domain, uint32_t * capability) { + int hexagon_error = AEE_SUCCESS; + *capability = 0; + if(remote_handle_control) { + /* + * Query the Hexagon processor architecture version information + */ + struct remote_dsp_capability dsp_capability_arch_ver; + dsp_capability_arch_ver.domain = (uint32_t)domain; + dsp_capability_arch_ver.attribute_ID = ARCH_VER; + 
dsp_capability_arch_ver.capability = (uint32_t)0; + hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_arch_ver, sizeof(struct remote_dsp_capability)); + if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { + GGMLHEXAGON_LOG_DEBUG("FastRPC Capability API is not supported on this device"); + hexagon_error = AEE_SUCCESS; + goto bail; + } else if (hexagon_error == AEE_SUCCESS) { + *capability = dsp_capability_arch_ver.capability & 0xFF; + } else { + GGMLHEXAGON_LOG_DEBUG("get_hex_arch_ver failed with error 0x%x", hexagon_error); + goto bail; + } + } else { + hexagon_error = AEE_EUNSUPPORTEDAPI; + GGMLHEXAGON_LOG_DEBUG("remote_dsp_capability interface is not supported on this device"); + } + +bail: + return hexagon_error; +} + +static int ggmlhexagon_get_hvx_support_info(int domain, uint32_t attr, uint32_t * capability) +{ + int hexagon_error = AEE_SUCCESS; + *capability = 0; + if (attr == HVX_SUPPORT_64B) { + hexagon_error = AEE_EBADPARM; + GGMLHEXAGON_LOG_DEBUG("latest targets have 128 byte HVX register, use HVX_SUPPORT_128B instead of HVX_SUPPORT_64B"); + goto bail; + } + + if (attr != HVX_SUPPORT_128B) { + hexagon_error = AEE_EBADPARM; + GGMLHEXAGON_LOG_DEBUG("unsupported attr. only HVX_SUPPORT_128B supported"); + goto bail; + } + + if (remote_handle_control) { + if (domain == HEXAGON_CDSP) { + /* + * Query the DSP for HVX SUPPORT information + * HVX is supported on CDSP only + */ + struct remote_dsp_capability dsp_capability_hvx_dsp; + dsp_capability_hvx_dsp.domain = (uint32_t)domain; + dsp_capability_hvx_dsp.attribute_ID = attr; + dsp_capability_hvx_dsp.capability = (uint32_t)0; + hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_hvx_dsp, sizeof(struct remote_dsp_capability)); + if ((hexagon_error & 0xFF)==(AEE_EUNSUPPORTEDAPI & 0xFF)) { + GGMLHEXAGON_LOG_DEBUG("FastRPC Capability API is not supported on this device"); + hexagon_error = AEE_SUCCESS; + goto bail; + } else if (hexagon_error == AEE_SUCCESS) { + *capability = dsp_capability_hvx_dsp.capability; + } else { + GGMLHEXAGON_LOG_DEBUG("failed with error 0x%x", hexagon_error); + goto bail; + } + } else { + hexagon_error = AEE_EUNSUPPORTED; + GGMLHEXAGON_LOG_DEBUG("HVX support is not available on domain %d", domain); + goto bail; + } + } else { + hexagon_error = AEE_EUNSUPPORTEDAPI; + GGMLHEXAGON_LOG_DEBUG("remote_dsp_capability interface is not supported on this device"); + } + +bail: + return hexagon_error; +} + +static int ggmlhexagon_request_status_notifications(int domain_id, void * context, notify_callback_fn call_back_fn) { + int hexagon_error = AEE_SUCCESS; + struct remote_rpc_notif_register notif; + bool status_notification_support; + + notif.context = context; + notif.domain = domain_id; + notif.notifier_fn = call_back_fn; + + status_notification_support = ggmlhexagon_is_status_notification_supported(domain_id); + if (status_notification_support) { + hexagon_error = remote_session_control(FASTRPC_REGISTER_STATUS_NOTIFICATIONS, (void*)¬if, sizeof(notif)); + if (hexagon_error != AEE_SUCCESS) { + GGMLHEXAGON_LOG_DEBUG("error 0x%x: remote_session_control failed to enable status notifications", hexagon_error); + } + } else { + hexagon_error = AEE_EUNSUPPORTEDAPI; + } + + return hexagon_error; +} + +static int ggmlhexagon_init_rpcmempool(ggml_backend_hexagon_context * ctx) { + size_t candidate_size = 0; + uint8_t * rpc_buffer = nullptr; +#ifdef SD_USE_HEXAGON // for stable-diffusion.cpp + size_t probe_slots[] = {1024, 1536, 2000, 2048, 1024 + 2048, 4096}; +#else + size_t 
probe_slots[] = {1024, 1536, 2000, 2048};
+#endif
+    size_t probe_counts = sizeof(probe_slots) / sizeof(size_t);
+
+    if (nullptr == ctx)
+        return 1;
+
+    for (size_t idx = 0; idx < probe_counts; idx++) {
+#ifdef SD_USE_HEXAGON // for stable-diffusion.cpp
+        rpc_buffer = static_cast<uint8_t *>(rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, (probe_slots[idx] * SIZE_IN_MB)));
+#else
+        rpc_buffer = static_cast<uint8_t *>(rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, (probe_slots[idx] * SIZE_IN_MB)));
+#endif
+        if (nullptr == rpc_buffer) {
+            GGMLHEXAGON_LOG_DEBUG("alloc rpcmem %d (MiB) failed during rpc memory probe, reason: %s\n", probe_slots[idx], strerror(errno));
+            break;
+        } else {
+            candidate_size = probe_slots[idx];
+            rpcmem_free(rpc_buffer);
+            rpc_buffer = nullptr;
+        }
+    }
+    ctx->rpc_mempool_capacity = candidate_size * SIZE_IN_MB;
+    GGMLHEXAGON_LOG_DEBUG("rpc memory capacity %ld(%d MiB) for device %d",
+                          ctx->rpc_mempool_capacity, ctx->rpc_mempool_capacity / SIZE_IN_MB, ctx->device);
+    if (ggmlhexagon_is_llamabench_running()) {
+        GGMLHEXAGON_LOG_VERBOSE("capacity of rpc memory %d MiB", ctx->rpc_mempool_capacity / SIZE_IN_MB);
+    } else {
+        GGMLHEXAGON_LOG_INFO("capacity of rpc memory %d MiB", ctx->rpc_mempool_capacity / SIZE_IN_MB);
+    }
+
+    if ((g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) {
+        GGML_ASSERT(ctx->rpc_mempool_capacity > (8 * SIZE_IN_MB));
+        ctx->rpc_mempool_len = ctx->rpc_mempool_capacity - (8 * SIZE_IN_MB);
+#ifdef SD_USE_HEXAGON // use rpcmem_alloc2 to alloc 2+ GiB memory, it's a workaround to make stablediffusion.cpp happy
+        ctx->rpc_mempool = rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_TRY_MAP_STATIC, ctx->rpc_mempool_len);
+#else
+        //FIXME: it seems there is an unknown issue with a 2+ GiB memory pool
+        ctx->rpc_mempool = rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_TRY_MAP_STATIC, ctx->rpc_mempool_len);
+#endif
+        if (nullptr == ctx->rpc_mempool) {
+            GGMLHEXAGON_LOG_WARN("alloc rpc memorypool %ld(%d MiB) failed", ctx->rpc_mempool_len, ctx->rpc_mempool_capacity / SIZE_IN_MB);
+            return 2;
+        } else {
+            GGMLHEXAGON_LOG_DEBUG("alloc rpc memorypool %p successfully %ld(%d MiB)",
+                                  ctx->rpc_mempool, ctx->rpc_mempool_len,
+                                  ctx->rpc_mempool_len / SIZE_IN_MB);
+        }
+        ctx->rpc_mempool_handle = rpcmem_to_fd(ctx->rpc_mempool);
+        GGMLHEXAGON_LOG_DEBUG("rpc mempool handle %d", ctx->rpc_mempool_handle);
+        remote_register_buf(ctx->rpc_mempool, ctx->rpc_mempool_len, ctx->rpc_mempool_handle);
+    }
+
+    return 0;
+}
+
+static void ggmlhexagon_deinit_rpcmempool(ggml_backend_hexagon_context * ctx) {
+    if ((g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) {
+        if (ctx->rpc_mempool) {
+            //deregister rpc memory pool
+            remote_register_buf(ctx->rpc_mempool, ctx->rpc_mempool_len, -1);
+            GGMLHEXAGON_LOG_DEBUG("free rpc mempool %p", ctx->rpc_mempool);
+            rpcmem_free(ctx->rpc_mempool);
+            ctx->rpc_mempool          = nullptr;
+            ctx->rpc_mempool_len      = 0;
+            ctx->rpc_mempool_capacity = 0;
+        }
+    }
+}
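ggmlhexagon_init_rpcmempool above sizes the RPC memory pool by probing: it walks an ascending list of candidate sizes and keeps the largest allocation that succeeds. A standalone sketch of that strategy, with malloc standing in for rpcmem_alloc (illustration only, not part of the patch):

#include <cstdio>
#include <cstdlib>

// try each candidate size in ascending order and keep the largest one that succeeds
static size_t probe_max_alloc_mb(const size_t * slots_mb, size_t count) {
    size_t best = 0;
    for (size_t i = 0; i < count; i++) {
        void * buf = malloc(slots_mb[i] * 1024 * 1024);
        if (buf == NULL) {
            break;          // first failure ends the probe, keep the last success
        }
        best = slots_mb[i]; // remember the size, then release the probe buffer
        free(buf);
    }
    return best;
}

int main() {
    size_t slots[] = {1024, 1536, 2000, 2048};
    printf("largest probed size: %zu MiB\n", probe_max_alloc_mb(slots, 4));
    return 0;
}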
+
+static void ggmlhexagon_probe_dspinfo(ggml_backend_hexagon_context * ctx) {
+    uint32_t dsp_version = 0;
+    ggmlhexagon_get_hvx_arch_ver(ctx->domain_id, &dsp_version);
+
+    if (dsp_version == 0x68 || dsp_version == 0x69 || dsp_version == 0x73 || dsp_version == 0x75 || dsp_version == 0x79) {
+        if (ggmlhexagon_is_llamabench_running()) {
+            GGMLHEXAGON_LOG_VERBOSE("dsp arch version 0x%x", dsp_version);
+        } else {
+            GGMLHEXAGON_LOG_INFO("dsp arch version 0x%x", dsp_version);
+        }
+        //0x68 -> 68, 0x69 -> 69, 0x73 -> 73, 0x75 -> 75, 0x79 -> 79
+        size_t htp_arch = ggmlhexagon_htparch_hex_to_decimal(dsp_version);
+        GGMLHEXAGON_LOG_DEBUG("dsp arch version %d", htp_arch);
+        struct qcom_socinfo * socinfo = ggmlhexagon_get_socinfo_from_socmodel(htp_arch);
+        if (nullptr != socinfo) {
+            //got the full description of the SoC when the hwaccel approach is HWACCEL_CDSP
+            if (ggmlhexagon_is_llamabench_running()) {
+                GGMLHEXAGON_LOG_VERBOSE("device info: %s, %s", socinfo->soc_desc, ggmlhexagon_get_htparch_desc(htp_arch));
+            } else {
+                GGMLHEXAGON_LOG_INFO("device info: %s, %s", socinfo->soc_desc, ggmlhexagon_get_htparch_desc(htp_arch));
+            }
+        }
+    } else {
+        GGMLHEXAGON_LOG_WARN("error: dsp arch version 0x%x is not supported", dsp_version);
+    }
+
+    uint32_t vtcm_count = 0;
+    uint32_t vtcm_page  = 0;
+    ggmlhexagon_get_vtcm_info(ctx->domain_id, VTCM_COUNT, &vtcm_count);
+    ggmlhexagon_get_vtcm_info(ctx->domain_id, VTCM_PAGE,  &vtcm_page);
+
+    uint32_t hmx_depth   = 0;
+    uint32_t hmx_spatial = 0;
+    ggmlhexagon_get_hmx_support_info(ctx->domain_id, HMX_SUPPORT_DEPTH,   &hmx_depth);
+    ggmlhexagon_get_hmx_support_info(ctx->domain_id, HMX_SUPPORT_SPATIAL, &hmx_spatial);
+
+    uint32_t hvx_support_128b = 0;
+    ggmlhexagon_get_hvx_support_info(ctx->domain_id, HVX_SUPPORT_128B, &hvx_support_128b);
+
+    if (ggmlhexagon_is_llamabench_running()) {
+        //make llama-bench happy
+        GGMLHEXAGON_LOG_VERBOSE("vtcm_count %d", vtcm_count);
+        GGMLHEXAGON_LOG_VERBOSE("vtcm_page %d", vtcm_page);
+        GGMLHEXAGON_LOG_VERBOSE("hmx_depth %d", hmx_depth);
+        GGMLHEXAGON_LOG_VERBOSE("hmx_spatial %d", hmx_spatial);
+        GGMLHEXAGON_LOG_VERBOSE("hvx_support_128b %d", hvx_support_128b);
+        GGMLHEXAGON_LOG_VERBOSE("unsigned pd supported %d", ggmlhexagon_get_unsignedpd_support());
+        GGMLHEXAGON_LOG_VERBOSE("async fastrpc supported %d", ggmlhexagon_is_async_fastrpc_supported(ctx->domain_id));
+    } else {
+        GGMLHEXAGON_LOG_INFO("vtcm_count %d", vtcm_count);
+        GGMLHEXAGON_LOG_INFO("vtcm_page %d", vtcm_page);
+        GGMLHEXAGON_LOG_INFO("hmx_depth %d", hmx_depth);
+        GGMLHEXAGON_LOG_INFO("hmx_spatial %d", hmx_spatial);
+        GGMLHEXAGON_LOG_INFO("hvx_support_128b %d", hvx_support_128b);
+        GGMLHEXAGON_LOG_INFO("unsigned pd supported %d", ggmlhexagon_get_unsignedpd_support());
+        GGMLHEXAGON_LOG_INFO("async fastrpc supported %d", ggmlhexagon_is_async_fastrpc_supported(ctx->domain_id));
+    }
+}
+
+static void ggmlhexagon_deinit_cdsp(ggml_backend_hexagon_context * ctx) {
+    int hexagon_error = AEE_SUCCESS;
+    if (ggmlhexagon_is_llamabench_running()) {
+        GGMLHEXAGON_LOG_VERBOSE("enter %s", __func__);
+    } else {
+        GGMLHEXAGON_LOG_INFO("enter %s", __func__);
+    }
+    if (0 != ctx->ggmlop_handle) {
+        hexagon_error = ggmlop_dsp_close(ctx->ggmlop_handle);
+        if (AEE_SUCCESS != hexagon_error) {
+            GGMLHEXAGON_LOG_WARN("error 0x%x: failed to close ggmlop dsp handle", hexagon_error);
+        }
+        ctx->ggmlop_handle = 0;
+    }
+
+    ggmlhexagon_deinit_rpcmempool(ctx);
+
+    ctx->domain_id = -1;
+    if (ggmlhexagon_is_llamabench_running()) {
+        GGMLHEXAGON_LOG_VERBOSE("leave %s", __func__);
+    } else {
+        GGMLHEXAGON_LOG_INFO("leave %s", __func__);
+    }
+}
+
+static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) {
+    static std::mutex mutex;
+    std::lock_guard<std::mutex> lock(mutex);
+
+    int hexagon_error = AEE_SUCCESS;
+
+    int domain_id            = HEXAGON_CDSP;
+    const char * domain_type = "NSP";
+
+    int unsignedpd_flag           = 1;
+    bool is_unsignedpd_enabled    = false;
+    int use_logical_id            = 0;
+    int core_id                   = -1;
+    fastrpc_domain * domains_info = NULL;
+    int num_domains               = -1;
+
+    domain * my_domain = NULL;
+    char * 
uri = NULL; + + char * ggmlop_domain_uri = NULL; + int ggmlop_domain_uri_len = 0; + + if (nullptr == ctx) + return 1; + GGMLHEXAGON_LOG_DEBUG("init Hexagon cDSP with backend %d(%s)", ctx->device, ggml_backend_hexagon_get_devname(ctx->device)); + if (0 != ctx->ggmlop_handle) { + GGMLHEXAGON_LOG_DEBUG("already init Hexagon cDSP with backend %d(%s)", ctx->device, ggml_backend_hexagon_get_devname(ctx->device)); + return 0; + } + ctx->ggmlop_handle = 0; + + if (-1 == domain_id) { + if (nullptr != domain_type) { + if ((strcmp(domain_type, "NSP") != 0 && strcmp(domain_type, "HPASS") != 0)) { + GGMLHEXAGON_LOG_WARN("invalid domain_type %s. possible values are NSP or HPASS", domain_type); + goto bail; + } else { + hexagon_error = ggmlhexagon_get_domains_info(domain_type, &num_domains, &domains_info); + if (hexagon_error == AEE_EUNSUPPORTED) { + GGMLHEXAGON_LOG_DEBUG("API is not supported on this target so cannot get domains info from the device. falling back to legacy approach of using default domain id"); + hexagon_error = ggmlhexagon_get_dsp_support(&domain_id); + if (hexagon_error != AEE_SUCCESS) { + GGMLHEXAGON_LOG_DEBUG("error: 0x%x, defaulting to cDSP domain", hexagon_error); + } + } else if (hexagon_error != AEE_SUCCESS) { + GGMLHEXAGON_LOG_DEBUG("error in getting domains information"); + goto bail; + } else { + if (core_id != -1) { + if (core_id < 0 || core_id >= num_domains) { + GGMLHEXAGON_LOG_DEBUG("invalid core_id = %d for %s. core_id should be between 0 to %d", core_id, domain_type, num_domains - 1); + hexagon_error = AEE_EBADPARM; + goto bail; + } + } else { + core_id = 0; + } + use_logical_id = 1; + domain_id = domains_info[core_id].id; + } + } + } else { + GGMLHEXAGON_LOG_DEBUG("DSP domain is not provided, retrieving DSP information using Remote APIs"); + hexagon_error = ggmlhexagon_get_dsp_support(&domain_id); + if (hexagon_error != AEE_SUCCESS) { + GGMLHEXAGON_LOG_DEBUG("error: 0x%x, defaulting to cDSP domain", hexagon_error); + } + } + } + + if (0 == use_logical_id) { + if (!ggmlhexagon_is_valid_domain_id(domain_id, 0)) { + hexagon_error = AEE_EBADPARM; + GGMLHEXAGON_LOG_DEBUG("error 0x%x: invalid domain %d", hexagon_error, domain_id); + goto bail; + } + + my_domain = ggmlhexagon_get_domain(domain_id); + if (nullptr == my_domain) { + GGMLHEXAGON_LOG_DEBUG("unable to get domain struct %d", domain_id); + goto bail; + } + uri = my_domain->uri; + } + GGMLHEXAGON_LOG_DEBUG("temporary domain uri=%s\n", uri); + + if (1 == unsignedpd_flag) { + is_unsignedpd_enabled = ggmlhexagon_is_unsignedpd_supported(domain_id); + if (!is_unsignedpd_enabled) { + GGMLHEXAGON_LOG_DEBUG("overriding user request for unsigned PD, only signed offload is allowed on domain %d", domain_id); + unsignedpd_flag = 0; + } + } + + ctx->domain_id = domain_id; + if (ggmlhexagon_is_llamabench_running()) { + GGMLHEXAGON_LOG_VERBOSE("using Hexagon domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id)); + GGMLHEXAGON_LOG_VERBOSE("unsignedpd_enabled %d", is_unsignedpd_enabled); + } else { + GGMLHEXAGON_LOG_INFO("using Hexagon domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id)); + GGMLHEXAGON_LOG_INFO("unsignedpd_enabled %d", is_unsignedpd_enabled); + } + if (is_unsignedpd_enabled) { + if (remote_session_control) { + struct remote_rpc_control_unsigned_module data; + data.enable = 1; + data.domain = domain_id; + hexagon_error = remote_session_control(DSPRPC_CONTROL_UNSIGNED_MODULE, (void *)&data, sizeof(data)); + GGMLHEXAGON_LOG_DEBUG("remote_session_control returned %d for configuring unsigned PD 
success", hexagon_error); + if (AEE_SUCCESS != hexagon_error) { + GGMLHEXAGON_LOG_WARN("error 0x%x: remote_session_control failed", hexagon_error); + } + } else { + GGMLHEXAGON_LOG_DEBUG("unsigned PD not supported on this device"); + hexagon_error = AEE_EUNSUPPORTED; + GGMLHEXAGON_LOG_DEBUG("error 0x%x: remote_session_control interface is not supported on this device", hexagon_error); + } + } + + hexagon_error = ggmlhexagon_request_status_notifications(domain_id, (void *)STATUS_CONTEXT, ggmlhexagon_pd_status_notifier_callback); + if (AEE_SUCCESS != hexagon_error) { + if (AEE_EUNSUPPORTEDAPI != hexagon_error) { + GGMLHEXAGON_LOG_WARN("error 0x%x: hexagon_request_status_notifications failed", hexagon_error); + } + GGMLHEXAGON_LOG_WARN("error 0x%x: failed to compute on domain %d", hexagon_error, domain_id); + goto bail; + } + ggmlhexagon_set_priority(domain_id, 160); + + ggmlop_domain_uri_len = strlen(ggmlop_URI) + MAX_DOMAIN_NAMELEN; + ggmlop_domain_uri = (char *)malloc(ggmlop_domain_uri_len); + if (NULL == ggmlop_domain_uri) { + goto bail; + } + snprintf(ggmlop_domain_uri, ggmlop_domain_uri_len, "%s%s", ggmlop_URI, uri); + GGMLHEXAGON_LOG_DEBUG("ggmlop domain uri:%s", ggmlop_domain_uri); + hexagon_error = ggmlop_dsp_open(ggmlop_domain_uri, &ctx->ggmlop_handle); + if (AEE_SUCCESS == hexagon_error) { + if (ggmlhexagon_is_llamabench_running()) { + GGMLHEXAGON_LOG_VERBOSE("succeed to open domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id)); + GGMLHEXAGON_LOG_VERBOSE("only support offload GGML_OP_ADD and GGML_OP_MUL_MAT to cDSP currently"); + } else { + GGMLHEXAGON_LOG_INFO("succeed to open domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id)); + GGMLHEXAGON_LOG_INFO("only support offload GGML_OP_ADD and GGML_OP_MUL_MAT to cDSP currently"); + } + ggmlhexagon_probe_dspinfo(ctx); + //FIXME: re-use this function to pass thread_counts info to code on cDSP side before fully understand qidl mechanism + //ggmlop_dsp_setclocks(ctx->ggmlop_handle, HAP_DCVS_VCORNER_TURBO_PLUS, 40, 1, g_hexagon_appcfg.thread_counts); + //backward compatible with previous codes on cDSP side + ggmlop_dsp_setclocks(ctx->ggmlop_handle, HAP_DCVS_VCORNER_TURBO_PLUS, 40, g_hexagon_appcfg.mulmat_algotype, g_hexagon_appcfg.thread_counts); + ggmlhexagon_set_rpc_latency(ctx->ggmlop_handle, RPC_POLL_QOS, 100); + int result = ggmlhexagon_init_rpcmempool(ctx); + if (0 != result) { + GGMLHEXAGON_LOG_INFO("failed to init rpc mempool"); + goto bail; + } + } else { + GGMLHEXAGON_LOG_INFO("error 0x%x: failed to open domain %d(%s)", hexagon_error, domain_id, + ggmlhexagon_get_dsp_name(domain_id)); + goto bail; + } + + //make sure test-backend-ops get the correct backend name when hwaccel approach is 2(HWACCEL_CDSP) + memcpy(g_hexagon_mgr[ctx->device].name, "Hexagon-cDSP", strlen("Hexagon-cDSP")); + + if (NULL != ggmlop_domain_uri) { + free(ggmlop_domain_uri); + ggmlop_domain_uri = NULL; + } + return 0; + +bail: + if (ggmlop_domain_uri) { + free(ggmlop_domain_uri); + } + + ggmlhexagon_deinit_cdsp(ctx); + + return -1; +} + +static void ggmlhexagon_compute(ggml_backend_hexagon_context * ctx, struct ggml_tensor * op) { + //skip sanity check because already checked in other place + struct dsptensor dsptensor_0; + struct dsptensor dsptensor_1; + struct dsptensor dsptensor_2; + std::string op_name; + const char * ggml_opname = ggml_op_name(op->op); + ggmlhexagon_get_opkey_from_op(op, op_name); + + int hexagon_error = AEE_SUCCESS; + ggmlhexagon_op_func_t op_func = nullptr; + size_t input_tensor_count = 2; + + ggml_tensor * src0 
+
+static void ggmlhexagon_compute(ggml_backend_hexagon_context * ctx, struct ggml_tensor * op) {
+    //skip sanity check because it was already checked elsewhere
+    struct dsptensor dsptensor_0;
+    struct dsptensor dsptensor_1;
+    struct dsptensor dsptensor_2;
+    std::string op_name;
+    const char * ggml_opname = ggml_op_name(op->op);
+    ggmlhexagon_get_opkey_from_op(op, op_name);
+
+    int hexagon_error = AEE_SUCCESS;
+    ggmlhexagon_op_func_t op_func = nullptr;
+    size_t input_tensor_count = 2;
+
+    ggml_tensor * src0 = op->src[0];
+    ggml_tensor * src1 = op->src[1];
+    ggml_tensor * dst  = op;
+
+    int input_size = ggml_nbytes(src0);
+    if (nullptr != src1)
+        input_size += ggml_nbytes(src1);
+    hexagon_perf op_perf(op_name, ggml_opname, input_size, ggml_nbytes(dst));
+    op_perf.start();
+
+    input_tensor_count = ggmlhexagon_k_op_caps[ggmlhexagon_get_op_index(op)].input_param_count;
+    op_func = ggmlhexagon_k_op_caps[ggmlhexagon_get_op_index(op)].dsp_op_func;
+    if (nullptr == op_func) {
+        GGMLHEXAGON_LOG_DEBUG("op GGML_OP_%s and dsp func %s not supported on cDSP", ggml_op_name(op->op), ggmlhexagon_k_op_caps[ggmlhexagon_get_op_index(op)].hexagon_op_name);
+        return;
+    }
+
+    //FIXME: try to fully understand the technical details of qidl:
+    //       qidl is a binary tool that generates the complicated and hard-to-customize bridge-layer code
+    //       between the ARM AP and the cDSP. the mechanism in qidl/FastRPC is very similar to the mechanism in a TEE.
+    //       try to find a better/more efficient approach to exchange the necessary data between the ARM AP side
+    //       and the cDSP side. manually modifying the important data structure ggml_tensor in ggml.h makes no sense
+    //       and is not acceptable.
+    std::chrono::high_resolution_clock::time_point start_time = std::chrono::high_resolution_clock::now();
+    dsptensor_0.data     = src0->data;
+    dsptensor_0.data_len = ggml_nbytes(src0);
+    dsptensor_0.type     = src0->type;
+
+    dsptensor_0.ne[0] = src0->ne[0];
+    dsptensor_0.ne[1] = src0->ne[1];
+    dsptensor_0.ne[2] = src0->ne[2];
+    dsptensor_0.ne[3] = src0->ne[3];
+
+    dsptensor_0.nb[0] = src0->nb[0];
+    dsptensor_0.nb[1] = src0->nb[1];
+    dsptensor_0.nb[2] = src0->nb[2];
+    dsptensor_0.nb[3] = src0->nb[3];
+
+    if (2 == input_tensor_count) {
+        GGML_ASSERT(nullptr != src1);
+        dsptensor_1.data     = src1->data;
+        dsptensor_1.type     = src1->type;
+        dsptensor_1.data_len = ggml_nbytes(src1);
+
+        dsptensor_1.ne[0] = src1->ne[0];
+        dsptensor_1.ne[1] = src1->ne[1];
+        dsptensor_1.ne[2] = src1->ne[2];
+        dsptensor_1.ne[3] = src1->ne[3];
+
+        dsptensor_1.nb[0] = src1->nb[0];
+        dsptensor_1.nb[1] = src1->nb[1];
+        dsptensor_1.nb[2] = src1->nb[2];
+        dsptensor_1.nb[3] = src1->nb[3];
+    }
+
+    dsptensor_2.data     = dst->data;
+    dsptensor_2.data_len = ggml_nbytes(dst);
+    dsptensor_2.type     = dst->type;
+
+    dsptensor_2.ne[0] = dst->ne[0];
+    dsptensor_2.ne[1] = dst->ne[1];
+    dsptensor_2.ne[2] = dst->ne[2];
+    dsptensor_2.ne[3] = dst->ne[3];
+
+    dsptensor_2.nb[0] = dst->nb[0];
+    dsptensor_2.nb[1] = dst->nb[1];
+    dsptensor_2.nb[2] = dst->nb[2];
+    dsptensor_2.nb[3] = dst->nb[3];
+
+    memcpy(dsptensor_2.op_params, dst->op_params, GGML_MAX_OP_PARAMS / sizeof(int32_t));
+    std::chrono::high_resolution_clock::time_point end_time = std::chrono::high_resolution_clock::now();
+    std::chrono::nanoseconds duration = std::chrono::duration_cast<std::chrono::nanoseconds>(end_time - start_time);
+    GGMLHEXAGON_LOG_DEBUG("pack duration %llu ns", (unsigned long long)duration.count());
+
+    hexagon_error = op_func(ctx->ggmlop_handle, &dsptensor_0, &dsptensor_1, &dsptensor_2);
+    if (AEE_SUCCESS != hexagon_error) {
+        GGMLHEXAGON_LOG_WARN("ggmlop %s computation failed on cdsp", ggml_op_name(op->op));
+    }
+
+    op_perf.info();
+    return;
+}
+
+// =================================================================================================
+//  section-8: implementation of ggml-hexagon backend according to specification in ggml backend subsystem
+// =================================================================================================
+static bool ggmlhexagon_can_handle_op_through_cdsp(ggml_backend_dev_t dev, const struct ggml_tensor * op_tensor) {
+    ggml_backend_hexagon_context * ctx = (ggml_backend_hexagon_context 
*)dev->context; + GGML_UNUSED(ctx); + if (op_tensor->op == GGML_OP_NONE) { + return true; + } + + if (!ggmlhexagon_k_op_caps[ggmlhexagon_get_op_index(op_tensor)].supported) { + return false; + } + + const ggml_tensor * src0 = op_tensor->src[0]; + const ggml_tensor * src1 = op_tensor->src[1]; + const int src0_rank = ggml_n_dims(src0); + const int64_t ne00 = src0->ne[0]; + int src1_rank = 0; + if (nullptr != src1) { + src1_rank = ggml_n_dims(src1); + } + switch (op_tensor->op) { + case GGML_OP_ADD: + { + //TODO:workaround approach to fix HWACCEL_CDSP can't works in ASR inference and LLM inference + // with some LLM models in a standard Android APP + if (ne00 < 1024) { + return false; + } + + if (!ggml_are_same_shape(src0, src1)) { + return false; + } + return (src0->type == GGML_TYPE_F32) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); + } + case GGML_OP_MUL_MAT: + { + ggmlhexagon_dump_op_info(op_tensor); + if (src0_rank != src1_rank) + return false; + if (src0_rank != 2) + return false; + + if (1 == g_hexagon_appcfg.enable_q_mulmat) { + if (1 == g_hexagon_appcfg.enable_all_q_mulmat) { + return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && (src1->type == GGML_TYPE_F32); + } + + return (src0->type == GGML_TYPE_F32 + || src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q8_0 + || src0->type == GGML_TYPE_Q6_K || src0->type == GGML_TYPE_Q8_K + ) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); + } else { + return (src0->type == GGML_TYPE_F32) && (src1->type == GGML_TYPE_F32) && + (op_tensor->type == GGML_TYPE_F32); + } + } + case GGML_OP_SOFT_MAX:{ + if (!ggml_is_contiguous(op_tensor)) + return false; + if (!ggml_are_same_shape(src0, op_tensor)) + return false; + } + case GGML_OP_RMS_NORM: + case GGML_OP_POOL_2D: + { + + ggmlhexagon_dump_op_info(op_tensor); + } + default: + break; + } + return false; +} + +static bool ggmlhexagon_can_handle_op_through_qnn(ggml_backend_dev_t dev, const struct ggml_tensor * op_tensor) { + ggml_backend_hexagon_context * ctx = (ggml_backend_hexagon_context *)dev->context; + if (op_tensor->op == GGML_OP_NONE) { + return true; + } + + if (!ggmlqnn_k_op_caps[ggmlhexagon_get_op_index(op_tensor)].supported) { + return false; + } + + struct ggml_tensor * src0 = op_tensor->src[0]; + struct ggml_tensor * src1 = op_tensor->src[1]; + const int64_t ne00 = src0->ne[0]; + const int src0_rank = ggml_n_dims(src0); + int src1_rank = 0; + if (nullptr != src1) { + src1_rank = ggml_n_dims(src1); + } + + switch (op_tensor->op) { + case GGML_OP_ADD: + case GGML_OP_SUB: + { + if (!ggml_are_same_shape(src0, src1)) { + return false; + } + + if (ne00 < 32) + return false; + + return ggmlhexagon_same_types(ctx, op_tensor); + } + + case GGML_OP_DIV: + case GGML_OP_MUL: { + if (ctx->device == HEXAGON_BACKEND_QNNNPU) + return false; + + if (!ggml_are_same_shape(src0, src1)) { + return false; + } + + if ((src0_rank != 2) || (src1_rank != 2)) //TODO: 3D and 4D matrix mul + return false; + + return ggmlhexagon_same_types(ctx, op_tensor); + } + case GGML_OP_MUL_MAT: + { + ggmlhexagon_dump_op_info(op_tensor); + if (src0_rank != src1_rank) // make QNN SDK happy + return false; + + if (src0_rank != 2) { + // FIXME: there are some limitations for mulmat in QNN SDK: rank >= 2. 
+ return false; + } + + if (ctx->device == HEXAGON_BACKEND_QNNNPU) { + if (1 == g_hexagon_appcfg.enable_q_mulmat) { + return (src0->type == GGML_TYPE_F32 + || src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q8_0 + || src0->type == GGML_TYPE_Q6_K || src0->type == GGML_TYPE_Q8_K + ) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); + } else { + return (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && op_tensor->type == GGML_TYPE_F32); + } + } else { + if (1 == g_hexagon_appcfg.enable_q_mulmat) { + return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)) + && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); + } else { + return (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && op_tensor->type == GGML_TYPE_F32); + } + } + } + case GGML_OP_LOG: + { + if (ctx->device == HEXAGON_BACKEND_QNNNPU) + return false; + } + case GGML_OP_SQRT: + default: + return ggmlhexagon_same_types(ctx, op_tensor); + } +} + +static bool ggmlhexagon_compute_forward(ggml_backend_t backend, struct ggml_tensor * dst) { + ggmlqnn_op_func_t func = nullptr; + ggml_backend_hexagon_context * ctx = (ggml_backend_hexagon_context *)backend->context; + + if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { + ggmlhexagon_compute(ctx, dst); + return true; + } + + switch (dst->op) { + case GGML_OP_REPEAT: + ggmlqnn_compute_repeat(ctx, dst); + break; + case GGML_OP_GET_ROWS: + ggmlqnn_compute_get_rows(ctx, dst); + break; + case GGML_OP_DUP: + ggmlqnn_compute_dup(ctx, dst); + break; + case GGML_OP_ADD: + case GGML_OP_SUB: + case GGML_OP_MUL: + case GGML_OP_DIV: + case GGML_OP_SQRT: + case GGML_OP_LOG: + func = ggmlqnn_compute_elementwise; + break; + case GGML_OP_ACC: + ggmlqnn_compute_acc(ctx, dst); + break; + case GGML_OP_UNARY: + switch (ggml_get_unary_op(dst)) { + case GGML_UNARY_OP_GELU: + break; + case GGML_UNARY_OP_SILU: + break; + case GGML_UNARY_OP_GELU_QUICK: + break; + case GGML_UNARY_OP_TANH: + break; + case GGML_UNARY_OP_RELU: + break; + case GGML_UNARY_OP_HARDSIGMOID: + break; + case GGML_UNARY_OP_HARDSWISH: + break; + default: + return false; + } + break; + case GGML_OP_NORM: + ggmlqnn_compute_norm(ctx, dst); + break; + case GGML_OP_GROUP_NORM: + ggmlqnn_compute_group_norm(ctx, dst); + break; + case GGML_OP_CONCAT: + ggmlqnn_compute_concat(ctx, dst); + break; + case GGML_OP_UPSCALE: + ggmlqnn_compute_upsample_nearest2d(ctx, dst); + break; + case GGML_OP_PAD: + ggmlqnn_compute_pad(ctx, dst); + break; + case GGML_OP_ARANGE: + ggmlqnn_compute_arange(ctx, dst); + break; + case GGML_OP_TIMESTEP_EMBEDDING: + ggmlqnn_compute_timestep_embedding(ctx, dst); + break; + case GGML_OP_LEAKY_RELU: + ggmlqnn_compute_leaky_relu(ctx, dst); + break; + case GGML_OP_RMS_NORM: + ggmlqnn_compute_rms_norm(ctx, dst); + break; + case GGML_OP_MUL_MAT: + ggmlqnn_compute_mul_mat(ctx, dst); + break; + case GGML_OP_MUL_MAT_ID: + return false; + case GGML_OP_SCALE: + ggmlqnn_compute_scale(ctx, dst); + break; + case GGML_OP_SQR: + ggmlqnn_compute_sqr(ctx, dst); + break; + case GGML_OP_CLAMP: + ggmlqnn_compute_clamp(ctx, dst); + break; + case GGML_OP_CPY: + ggmlqnn_compute_cpy(ctx, dst); + break; + case GGML_OP_CONT: + ggmlqnn_compute_dup(ctx, dst); + break; + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + break; + case GGML_OP_SOFT_MAX: + ggmlqnn_compute_softmax(ctx, dst); + break; + case GGML_OP_ROPE: + ggmlqnn_compute_rope(ctx, dst); + break; + case GGML_OP_IM2COL: + ggmlqnn_compute_im2col(ctx, 
dst); + break; + case GGML_OP_POOL_2D: + ggmlqnn_compute_pool2d(ctx, dst); + break; + case GGML_OP_SUM_ROWS: + ggmlqnn_compute_sum_rows(ctx, dst); + break; + case GGML_OP_ARGSORT: + ggmlqnn_compute_argsort(ctx, dst); + break; + default: + return false; + } + + if (nullptr != func) + func(ctx, dst); + + return true; +} + +struct ggml_backend_hexagon_buffer_context { + ~ggml_backend_hexagon_buffer_context() { + if (buffer) { + if ((g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) { + //do nothing here because rpc mempool was used for HWACCEL_CDSP + } else { + ggml_aligned_free(buffer, 0); + } + } + } + + void * buffer = nullptr; + size_t buffer_size = 0; + + struct ggml_backend_hexagon_context * backend_ctx = nullptr; +}; + +static void ggml_backend_hexagon_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_hexagon_buffer_context * ctx = (ggml_backend_hexagon_buffer_context *)buffer->context; + delete ctx; +} + +static void * ggml_backend_hexagon_buffer_get_base(ggml_backend_buffer_t buffer) { + ggml_backend_hexagon_buffer_context * ctx = (ggml_backend_hexagon_buffer_context *)buffer->context; + return ctx->buffer; +} + +static enum ggml_status ggml_backend_hexagon_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + ggml_backend_hexagon_buffer_context * ctx = (ggml_backend_hexagon_buffer_context *)buffer->context; + GGML_UNUSED(tensor); + GGML_UNUSED(ctx); + return GGML_STATUS_SUCCESS; +} + +static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer, + ggml_tensor * tensor, const void * data, + size_t offset, size_t size) { + GGML_UNUSED(buffer); + + memcpy((char *)tensor->data + offset, data, size); +} + +static void ggml_backend_hexagon_buffer_memset_tensor(ggml_backend_buffer_t buffer, + struct ggml_tensor * tensor, + uint8_t value, size_t offset, size_t size) { + GGML_UNUSED(buffer); + memset((char *)tensor->data + offset, value, size); +} + +static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer, + const ggml_tensor * tensor, + void * data, size_t offset, size_t size) { + GGML_UNUSED(buffer); + memcpy(data, (const char *)tensor->data + offset, size); +} + +static bool ggml_backend_hexagon_buffer_cpy_tensor(ggml_backend_buffer_t buffer, + const struct ggml_tensor * src, + struct ggml_tensor * dst) { + GGML_UNUSED(buffer); + if (ggml_backend_buffer_is_host(src->buffer)) { + memcpy(dst->data, src->data, ggml_nbytes(src)); + return true; + } + + return false; +} + +static void ggml_backend_hexagon_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + ggml_backend_hexagon_buffer_context * ctx = (ggml_backend_hexagon_buffer_context *)buffer->context; + memset(ctx->buffer, value, ctx->buffer_size); +} + +static ggml_backend_buffer_i ggml_backend_hexagon_buffer_interface = { + /* .free_buffer = */ ggml_backend_hexagon_buffer_free_buffer, + /* .get_base = */ ggml_backend_hexagon_buffer_get_base, + /* .init_tensor = */ ggml_backend_hexagon_buffer_init_tensor, + /* .memset_tensor = */ ggml_backend_hexagon_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_hexagon_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_hexagon_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_hexagon_buffer_cpy_tensor, + /* .clear = */ ggml_backend_hexagon_buffer_clear, + /* .reset = */ nullptr, +}; + +static const char * ggml_backend_hexagon_buffer_type_name(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + if ((g_hexagon_appcfg.hwaccel_approach == 
HWACCEL_CDSP) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) {
+        return "hexagon-ion-buffer";
+    }
+
+    return "hexagon-normal-buffer";
+}
+
+static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer(
+        ggml_backend_buffer_type_t buft, size_t size) {
+    GGMLHEXAGON_LOG_DEBUG("enter %s", __func__);
+    struct ggml_backend_hexagon_context * ctx = static_cast<ggml_backend_hexagon_context *>(buft->context);
+    GGML_ASSERT(nullptr != ctx);
+    ggml_backend_hexagon_buffer_context * buffer_ctx = new ggml_backend_hexagon_buffer_context;
+
+    size_t size_page = 0;
+#if defined(__ANDROID__) || defined(__linux__)
+    size_page = sysconf(_SC_PAGESIZE);
+#else
+    SYSTEM_INFO systeminfo;
+    GetSystemInfo(&systeminfo);
+    size_page = systeminfo.dwPageSize;
+#endif
+    size_t size_aligned = size;
+    if (0 != (size_aligned % size_page)) {
+        size_aligned += (size_page - (size_aligned % size_page));
+    }
+
+    if ((HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) {
+        GGMLHEXAGON_LOG_DEBUG("device %d(%s)", ctx->device, ggml_backend_hexagon_get_devname(ctx->device));
+        GGML_ASSERT(nullptr != ctx->rpc_mempool);
+        GGMLHEXAGON_LOG_DEBUG("size %ld(%d MiB), rpc_mempool_usage %ld(%d MiB), rpc_mempool_len %ld(%d MiB)",
+                              size, size / SIZE_IN_MB, ctx->rpc_mempool_usage, ctx->rpc_mempool_usage / SIZE_IN_MB,
+                              ctx->rpc_mempool_len, ctx->rpc_mempool_len / SIZE_IN_MB);
+        GGML_ASSERT(size + ctx->rpc_mempool_usage <= ctx->rpc_mempool_len);
+        buffer_ctx->buffer = (static_cast<char *>(ctx->rpc_mempool)) + ctx->rpc_mempool_usage;
+        GGMLHEXAGON_LOG_DEBUG("buffer_ctx->buffer %p", buffer_ctx->buffer);
+        GGML_ASSERT(nullptr != buffer_ctx->buffer);
+        ctx->rpc_mempool_usage += size_aligned;
+    } else {
+        buffer_ctx->buffer = ggml_aligned_malloc(size_aligned);
+    }
+    buffer_ctx->buffer_size = size_aligned;
+    if (nullptr == buffer_ctx->buffer) {
+        GGMLHEXAGON_LOG_WARN("%s: failed to allocate %d MiB\n", __func__, size / SIZE_IN_MB);
+        return nullptr;
+    } else {
+        //GGMLHEXAGON_LOG_DEBUG("%s: succeeded to allocate %d MiB\n", __func__, size / SIZE_IN_MB);
+    }
+    GGMLHEXAGON_LOG_DEBUG("leave %s", __func__);
+    return ggml_backend_buffer_init(buft, ggml_backend_hexagon_buffer_interface, buffer_ctx, size);
+}
+
+/**
+ * @param buft   pointer to the buffer type context
+ * @return       alignment requirement in bytes
+ */
+static size_t ggml_backend_hexagon_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    GGML_UNUSED(buft);
+    if ((HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) {
+        return 128;
+    } else {
+        return 32;
+    }
+}
+
+static size_t ggml_backend_hexagon_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
+    struct ggml_backend_hexagon_context * ctx = static_cast<ggml_backend_hexagon_context *>(buft->context);
+    GGML_ASSERT(nullptr != ctx);
+    if ((HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) {
+        GGML_ASSERT(ctx->rpc_mempool_len > (8 * SIZE_IN_MB));
+        return ctx->rpc_mempool_len - (8 * SIZE_IN_MB);
+    } else {
+        //TODO: this is an experimental value for LLM models
+        return (1024 * SIZE_IN_MB);
+    }
+}
+
+static bool ggml_backend_buft_is_hexagon(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_hexagon_buffer_type_name;
+}
+
+static bool ggml_backend_hexagon_buffer_is_host(ggml_backend_buffer_type_t buft) {
+    struct ggml_backend_hexagon_context * ctx = static_cast<ggml_backend_hexagon_context *>(buft->context);
+    GGML_ASSERT(nullptr != ctx);
+    GGML_UNUSED(ctx);
+    return true;
+}
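ggml_backend_hexagon_buffer_type_alloc_buffer above rounds every request up to a whole number of pages before carving it out of the ION pool. A standalone sketch of that round-up (illustration only, not part of the patch):

#include <cassert>
#include <cstddef>
#include <cstdio>

// round a request up to a multiple of the page size, as the allocator above does
static size_t align_to_page(size_t size, size_t page) {
    if (size % page != 0) {
        size += page - (size % page);
    }
    return size;
}

int main() {
    assert(align_to_page(5000, 4096) == 8192);
    assert(align_to_page(8192, 4096) == 8192); // already-aligned sizes are unchanged
    printf("ok\n");
    return 0;
}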
ggml_backend_hexagon_name(ggml_backend_t backend) {
+    ggml_backend_hexagon_context * ctx = (ggml_backend_hexagon_context *)backend->context;
+    return g_hexagon_mgr[ctx->device].name;
+}
+
+static void ggml_backend_hexagon_free(ggml_backend_t backend) {
+    GGMLHEXAGON_LOG_DEBUG("enter %s", __func__);
+    ggml_backend_hexagon_context * ctx = (ggml_backend_hexagon_context *)backend->context;
+
+    qnn_instance * instance = (qnn_instance *)g_hexagon_mgr[ctx->device].instance;
+    if (nullptr != instance) {
+        for (auto & [graph_name, graph_res] : ctx->qnn_singlenode_graph_map) {
+            auto & graph_handle = std::get<0>(graph_res);
+            auto & ptensors     = std::get<1>(graph_res);
+            for (auto & tensor : ptensors) {
+                ggmlqnn_free_qnntensor(tensor);
+            }
+            GGML_UNUSED(graph_handle);
+            GGMLHEXAGON_LOG_DEBUG("graph handle %p", graph_handle);
+            GGMLHEXAGON_LOG_DEBUG("clean up graph:%s", graph_name.c_str());
+        }
+
+        ctx->qnn_singlenode_graph_map.clear();
+
+        instance->qnn_finalize();
+        delete instance;
+        g_hexagon_mgr[ctx->device].instance = nullptr;
+    }
+
+    if (nullptr != g_hexagon_mgr[ctx->device].backend) {
+        //print timestamp and dsp information before deinit of the cdsp, useful for troubleshooting
+        ggmlhexagon_print_running_timestamp(ctx);
+        if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) {
+            ggmlhexagon_deinit_cdsp(ctx);
+        }
+
+        delete backend;
+        g_hexagon_mgr[ctx->device].backend = nullptr;
+    }
+    GGMLHEXAGON_LOG_DEBUG("leave %s", __func__);
+}
+
+static enum ggml_status ggmlhexagon_backend_graph_compute_general(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    enum ggml_status result = GGML_STATUS_SUCCESS;
+    ggml_backend_hexagon_context * ctx = (ggml_backend_hexagon_context *)backend->context;
+    GGML_UNUSED(ctx);
+
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        ggml_tensor * node = cgraph->nodes[i];
+        if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE
+            || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW
+            || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+            continue;
+        }
+        bool ok = ggmlhexagon_compute_forward(backend, node);
+        if (!ok) {
+            GGMLHEXAGON_LOG_DEBUG("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+        }
+    }
+
+    return result;
+}
+
+static const char * ggml_backend_hexagon_device_get_name(ggml_backend_dev_t dev) {
+    struct ggml_backend_hexagon_context * ctx = static_cast<ggml_backend_hexagon_context *>(dev->context);
+    if (nullptr == ctx) {
+        GGMLHEXAGON_LOG_ERROR("ctx should not be null here, pls check why");
+        return "unknown";
+    }
+    return ctx->name;
+}
+
+static const char * ggml_backend_hexagon_device_get_description(ggml_backend_dev_t dev) {
+    GGMLHEXAGON_LOG_DEBUG("enter %s", __func__);
+    struct ggml_backend_hexagon_context * ctx = static_cast<ggml_backend_hexagon_context *>(dev->context);
+    static char hexagon_device_desc[GGMLHEXAGON_TMPBUF_LEN];
+    if (nullptr == ctx) {
+        GGMLHEXAGON_LOG_ERROR("ctx should not be null here, pls check why");
+        return "unknown";
+    }
+
+    if (0 == strncmp(ctx->name, "qnn-npu", 7)) {
+        const char * soc_info = ggmlhexagon_get_socmodel_desc(ctx->socinfo.soc_model);
+        const char * htp_arch = ggmlhexagon_get_htparch_desc(ctx->socinfo.htp_arch);
+        std::string dev_desc = std::string(ctx->desc)
+                + std::string(soc_info) + "_" + std::string(htp_arch)
+                + "," + std::string(ctx->socinfo.soc_desc);
+        //snprintf truncates safely if the composed description exceeds the buffer
+        snprintf(hexagon_device_desc, GGMLHEXAGON_TMPBUF_LEN, "%s", dev_desc.c_str());
+        return hexagon_device_desc;
+    } else {
+        return ctx->desc;
+    }
+}
+
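+//Caller-side view (illustrative sketch, assuming the generic ggml-backend
+//device API): the callback below is what ultimately services
+//    size_t free = 0, total = 0;
+//    ggml_backend_dev_memory(dev, &free, &total);
+//so both output parameters must always be written before returning.
+static void ggml_backend_hexagon_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t *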
total) {
+    struct ggml_backend_hexagon_context * ctx = static_cast<ggml_backend_hexagon_context *>(dev->context);
+    if ((nullptr == ctx) || (ctx->device > HEXAGON_BACKEND_GGML)) {
+        GGMLHEXAGON_LOG_ERROR("pls check params");
+        *free  = 0;
+        *total = 0;
+        return;
+    }
+
+    if (HEXAGON_BACKEND_QNNCPU == ctx->device || HEXAGON_BACKEND_GGML == ctx->device) {
+        *total = ggmlhexagon_get_system_total_memory_in_bytes();
+        *free  = ggmlhexagon_get_system_free_memory_in_bytes();
+    } else if (HEXAGON_BACKEND_QNNGPU == ctx->device) {
+        //TODO: probe GPU info in Qualcomm Adreno GPU
+        *total = ggmlhexagon_get_system_total_memory_in_bytes();
+        *free  = ggmlhexagon_get_system_free_memory_in_bytes();
+    } else if (HEXAGON_BACKEND_QNNNPU == ctx->device) {
+        size_t rpc_ion_memsize = 0;
+        size_t rpc_ion_usage   = 0;
+        GGML_ASSERT(nullptr != ctx->instance);
+        rpc_ion_memsize = ctx->instance->get_rpcmem_capacity();
+        rpc_ion_usage   = ctx->instance->get_rpcmem_usage();
+        *total = rpc_ion_memsize;
+        *free  = (rpc_ion_memsize - rpc_ion_usage);
+        GGMLHEXAGON_LOG_DEBUG("rpc memsize %zu MiB", rpc_ion_memsize / SIZE_IN_MB);
+        GGMLHEXAGON_LOG_DEBUG("rpc usage %zu MiB\n\n", rpc_ion_usage / SIZE_IN_MB);
+    } else if (HEXAGON_BACKEND_CDSP == ctx->device) {
+        size_t rpc_ion_memsize = ctx->rpc_mempool_capacity;
+        size_t rpc_ion_usage   = ctx->rpc_mempool_usage;
+        *total = rpc_ion_memsize;
+        *free  = (rpc_ion_memsize - rpc_ion_usage);
+        GGMLHEXAGON_LOG_DEBUG("rpc memsize %zu MiB", rpc_ion_memsize / SIZE_IN_MB);
+        GGMLHEXAGON_LOG_DEBUG("rpc usage %zu MiB\n\n", rpc_ion_usage / SIZE_IN_MB);
+    }
+}
+
+static enum ggml_backend_dev_type ggml_backend_hexagon_device_get_type(ggml_backend_dev_t dev) {
+    struct ggml_backend_hexagon_context * ctx = static_cast<ggml_backend_hexagon_context *>(dev->context);
+
+    if (HEXAGON_BACKEND_QNNCPU == ctx->device)
+        return GGML_BACKEND_DEVICE_TYPE_ACCEL;
+    else if (HEXAGON_BACKEND_QNNGPU == ctx->device)
+        return GGML_BACKEND_DEVICE_TYPE_ACCEL;
+    else if (HEXAGON_BACKEND_QNNNPU == ctx->device)
+        return GGML_BACKEND_DEVICE_TYPE_ACCEL;
+    else if (HEXAGON_BACKEND_CDSP == ctx->device)
+        return GGML_BACKEND_DEVICE_TYPE_GPU;
+    else
+        return GGML_BACKEND_DEVICE_TYPE_CPU;
+}
+
+static void ggml_backend_hexagon_device_get_props(ggml_backend_dev_t dev,
+                                                  struct ggml_backend_dev_props * props) {
+    props->name        = ggml_backend_hexagon_device_get_name(dev);
+    props->description = ggml_backend_hexagon_device_get_description(dev);
+    props->type        = ggml_backend_hexagon_device_get_type(dev);
+    ggml_backend_hexagon_device_get_memory(dev, &props->memory_free, &props->memory_total);
+    props->caps = {
+        /* .async                = */ false,
+        /* .host_buffer          = */ true,
+        /* .buffer_from_host_ptr = */ false,
+        /* .events               = */ false,
+    };
+
+    if ((HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) {
+        //don't use system memory in this scenario
+        props->caps.host_buffer = false;
+    }
+}
+
+static ggml_backend_t ggml_backend_hexagon_device_init_backend(ggml_backend_dev_t dev, const char * params) {
+    GGML_UNUSED(dev);
+    GGMLHEXAGON_LOG_DEBUG("enter %s\n", __func__);
+    int dev_index = 0;
+
+    //case-1: test-backend-ops or other similar scenarios: user code calls
+    //        ggml_backend_dev_init(dev, reinterpret_cast<const char *>(i)) directly
+    ggmlhexagon_load_cfg();
+    if (!ggmlhexagon_check_valid_appcfg()) {
+        return nullptr;
+    }
+
+    if (nullptr == params) {
+        GGMLHEXAGON_LOG_DEBUG("program specified param is nullptr");
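+        //no explicit request from the caller: fall back to the backend id from
+        //the config file (values <= 0 select the first/default device)
+        dev_index = (g_hexagon_appcfg.hexagon_backend > 0) ?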
g_hexagon_appcfg.hexagon_backend : 0;
+        if (dev_index >= GGML_HEXAGON_MAX_DEVICES) {
+            GGMLHEXAGON_LOG_INFO("assume the default ggml backend");
+            return nullptr;
+        }
+    } else {
+        GGMLHEXAGON_LOG_VERBOSE("program specified param is not nullptr");
+        //user's program calls ggml_backend_hexagon_device_init_backend directly
+        dev_index = (int)(intptr_t)params;
+        if (dev_index < 0) {
+            GGMLHEXAGON_LOG_VERBOSE("it shouldn't happen\n");
+            //test-thread-safety might be running at the moment, or an invalid value was passed from user's program
+            dev_index = HEXAGON_BACKEND_QNNCPU; //0
+        }
+        if (dev_index > GGML_HEXAGON_MAX_DEVICES) {
+            dev_index = HEXAGON_BACKEND_GGML; //4
+        }
+        g_hexagon_appcfg.hexagon_backend = dev_index;
+        GGMLHEXAGON_LOG_VERBOSE("program specified dev_index %d\n", dev_index);
+    }
+    GGMLHEXAGON_LOG_DEBUG("hexagon_backend=%d", dev_index);
+    ggml_backend_t hexagon_backend = ggml_backend_hexagon_init(dev_index, g_hexagon_appcfg.runtime_libpath);
+    GGMLHEXAGON_LOG_DEBUG("leave %s\n", __func__);
+
+    return hexagon_backend;
+}
+
+static ggml_backend_buffer_type_t ggml_backend_hexagon_buffer_type(size_t device_index) {
+    static std::mutex mutex;
+    std::lock_guard<std::mutex> lock(mutex);
+    GGMLHEXAGON_LOG_DEBUG("enter %s, device_index %zu", __func__, device_index);
+    if (device_index >= GGML_HEXAGON_MAX_DEVICES) {
+        GGMLHEXAGON_LOG_DEBUG("ggml_backend_hexagon_buffer_type error: device_index:%zu is out of range [0, %d]\n",
+                              device_index, GGML_HEXAGON_MAX_DEVICES - 1);
+        return nullptr;
+    }
+
+    if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) {
+        //cover the following special case:
+        //    toggling back and forth frequently between cDSP and ggml in a standard
+        //    Android app or in the same running process
+        if (device_index != (size_t)(g_hexagon_appcfg.hexagon_backend)) {
+            GGMLHEXAGON_LOG_INFO("device_index %zu, backend %d", device_index, g_hexagon_appcfg.hexagon_backend);
+            g_hexagon_appcfg.hexagon_backend = device_index;
+        }
+    }
+
+    static struct ggml_backend_buffer_type ggml_backend_hexagon_buffer_types[GGML_HEXAGON_MAX_DEVICES];
+    static bool ggml_backend_hexagon_buffer_type_initialized = false;
+    if (!ggml_backend_hexagon_buffer_type_initialized) {
+        for (int i = 0; i < GGML_HEXAGON_MAX_DEVICES; i++) {
+            ggml_backend_hexagon_buffer_types[i] = {
+                /* .iface = */ {
+                    /* .get_name       = */ ggml_backend_hexagon_buffer_type_name,
+                    /* .alloc_buffer   = */ ggml_backend_hexagon_buffer_type_alloc_buffer,
+                    /* .get_alignment  = */ ggml_backend_hexagon_buffer_type_get_alignment,
+                    /* .get_max_size   = */ ggml_backend_hexagon_buffer_type_get_max_size,
+                    /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes
+                    /* .is_host        = */ ggml_backend_hexagon_buffer_is_host
+                },
+                /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_hexagon_reg(), i),
+                /* .context = */ &g_hexagon_mgr[device_index],
+            };
+        }
+        ggml_backend_hexagon_buffer_type_initialized = true;
+    }
+
+    if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) {
+        GGML_ASSERT(HEXAGON_BACKEND_CDSP == g_hexagon_appcfg.hexagon_backend);
+        //FIXME: this is a workaround to cover the following special case:
+        //    toggling back and forth frequently between cDSP and ggml in a standard
+        //    Android app or in the same running process; there is an unknown issue
+        //    with this workaround when toggling back and forth frequently
+        int result = ggmlhexagon_init_dsp(&g_hexagon_mgr[HEXAGON_BACKEND_CDSP]);
+        if (0 != result) {
+            GGMLHEXAGON_LOG_INFO("init hexagon dsp failure");
+            return nullptr;
+        }
+    }
+
+    GGMLHEXAGON_LOG_DEBUG("leave %s", __func__);
+    return 
&ggml_backend_hexagon_buffer_types[device_index];
+}
+
+static const char * ggml_backend_hexagon_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
+    GGML_UNUSED(buft);
+    return "Hexagon_Host";
+}
+
+static const char * ggml_backend_hexagon_host_buffer_name(ggml_backend_buffer_t buffer) {
+    GGML_UNUSED(buffer);
+    return "Hexagon_Host";
+}
+
+static void ggml_backend_hexagon_host_buffer_free(ggml_backend_buffer_t buffer) {
+    if (0 == g_hexagon_appcfg.enable_pinned_memory) {
+        ggml_aligned_free(buffer->context, 0);
+    } else {
+        rpcmem_free(buffer->context);
+    }
+}
+
+static void * ggml_hexagon_host_malloc(ggml_backend_buffer_type_t buft, size_t size) {
+    if (0 == g_hexagon_appcfg.enable_pinned_memory) {
+        return ggml_aligned_malloc(size);
+    } else {
+        //TODO: there is no corresponding API in the existing Hexagon SDK, so try to
+        //      re-use the camera ion heap as pinned memory
+        return rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, ION_CAMERA_HEAP_ID | RPCMEM_TRY_MAP_STATIC, size);
+    }
+}
+
+static ggml_backend_buffer_t ggml_backend_hexagon_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    void * host_ptr = ggml_hexagon_host_malloc(buft, size);
+
+    if (nullptr == host_ptr) {
+        GGMLHEXAGON_LOG_INFO("failed to allocate host buffer");
+        //TODO: keep this assertion until a better approach is found to release the
+        //      "correct" host buffer in ggml_backend_hexagon_host_buffer_free
+        GGML_ASSERT(nullptr != host_ptr);
+        return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
+    } else {
+        GGMLHEXAGON_LOG_INFO("allocated host buffer %zu MiB", size / SIZE_IN_MB);
+    }
+
+    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(host_ptr, size);
+    buffer->buft = buft;
+    buffer->iface.free_buffer = ggml_backend_hexagon_host_buffer_free;
+
+    return buffer;
+}
+
+static ggml_backend_buffer_type_t ggml_backend_hexagon_host_buffer_type() {
+    static struct ggml_backend_buffer_type ggml_backend_hexagon_buffer_type_host = {
+        /* .iface = */ {
+            /* .get_name       = */ ggml_backend_hexagon_host_buffer_type_name,
+            /* .alloc_buffer   = */ ggml_backend_hexagon_host_buffer_type_alloc_buffer,
+            /* .get_alignment  = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+            /* .get_max_size   = */ nullptr,
+            /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
+            /* .is_host        = */ ggml_backend_cpu_buffer_type()->iface.is_host,
+        },
+        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_hexagon_reg(), 0),
+        /* .context = */ nullptr,
+    };
+
+    return &ggml_backend_hexagon_buffer_type_host;
+}
+
+static ggml_backend_buffer_type_t ggml_backend_hexagon_device_get_host_buffer_type(ggml_backend_dev_t dev) {
+    GGML_UNUSED(dev);
+    return ggml_backend_hexagon_host_buffer_type();
+}
+
+static ggml_backend_buffer_type_t ggml_backend_hexagon_device_get_buffer_type(ggml_backend_dev_t dev) {
+    ggml_backend_hexagon_context * ctx = (ggml_backend_hexagon_context *)dev->context;
+    return ggml_backend_hexagon_buffer_type(ctx->device);
+}
+
+static ggml_backend_buffer_t ggml_backend_hexagon_device_buffer_from_host_ptr(ggml_backend_dev_t dev,
+                                 void * ptr, size_t size, size_t max_tensor_size) {
+    return ggml_backend_cpu_buffer_from_ptr(ptr, size);
+
+    GGML_UNUSED(dev);
+    GGML_UNUSED(max_tensor_size);
+}
+
+static bool ggml_backend_hexagon_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+    if ((HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) {
+        if (ggml_backend_buft_is_hexagon(buft)) {
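+            //both contexts are ggml_backend_hexagon_context instances; the buffer
+            //type is only usable by this device when both refer to the same device id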
+            ggml_backend_hexagon_context * dev_ctx  = (ggml_backend_hexagon_context *)dev->context;
+            ggml_backend_hexagon_context * buft_ctx = (ggml_backend_hexagon_context *)buft->context;
+            return buft_ctx->device == dev_ctx->device;
+        }
+    }
+
+    return ggml_backend_buft_is_host(buft);
+}
+
+static struct ggml_backend_device_i ggml_backend_hexagon_device_interface = {
+    /* .get_name             = */ ggml_backend_hexagon_device_get_name,
+    /* .get_description      = */ ggml_backend_hexagon_device_get_description,
+    /* .get_memory           = */ ggml_backend_hexagon_device_get_memory,
+    /* .get_type             = */ ggml_backend_hexagon_device_get_type,
+    /* .get_props            = */ ggml_backend_hexagon_device_get_props,
+    /* .init_backend         = */ ggml_backend_hexagon_device_init_backend,
+    /* .get_buffer_type      = */ ggml_backend_hexagon_device_get_buffer_type,
+    /* .get_host_buffer_type = */ ggml_backend_hexagon_device_get_host_buffer_type,
+    /* .buffer_from_host_ptr = */ ggml_backend_hexagon_device_buffer_from_host_ptr,
+    /* .supports_op          = */ nullptr,
+    /* .supports_buft        = */ ggml_backend_hexagon_device_supports_buft,
+    /* .offload_op           = */ nullptr,
+    /* .event_new            = */ nullptr,
+    /* .event_free           = */ nullptr,
+    /* .event_synchronize    = */ nullptr,
+};
+
+static ggml_backend_i ggml_backend_hexagon_interface = {
+    /* .get_name           = */ ggml_backend_hexagon_name,
+    /* .free               = */ ggml_backend_hexagon_free,
+    /* .set_tensor_async   = */ nullptr,
+    /* .get_tensor_async   = */ nullptr,
+    /* .cpy_tensor_async   = */ nullptr,
+    /* .synchronize        = */ nullptr,
+    /* .graph_plan_create  = */ nullptr,
+    /* .graph_plan_free    = */ nullptr,
+    /* .graph_plan_update  = */ nullptr,
+    /* .graph_plan_compute = */ nullptr,
+    /* .graph_compute      = */ nullptr,
+    /* .event_record       = */ nullptr,
+    /* .event_wait         = */ nullptr,
+};
+
+//FIXME: this guid is not meaningful
+static ggml_guid_t ggml_backend_hexagon_guid() {
+    static ggml_guid guid = {
+        0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81,
+        0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09
+    };
+    return &guid;
+}
+
+bool ggml_backend_is_hexagon(ggml_backend_t backend) {
+    return backend != nullptr && ggml_guid_matches(backend->guid, ggml_backend_hexagon_guid());
+}
+
+static void ggml_backend_hexagon_set_n_threads(ggml_backend_t backend, int n_threads) {
+    GGML_ASSERT(ggml_backend_is_hexagon(backend));
+
+    struct ggml_backend_hexagon_context * ctx = (struct ggml_backend_hexagon_context *)backend->context;
+    ctx->n_threads = n_threads;
+}
+
+int ggml_backend_hexagon_get_device_count() {
+    if (g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) {
+        //there is only one backend device when g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP
+        return 1;
+    } else {
+        //QNN-CPU, QNN-GPU, QNN-NPU
+        return GGML_HEXAGON_MAX_DEVICES - 1;
+    }
+}
+
+struct ggml_backend_hexagon_reg_context {
+    std::vector<ggml_backend_dev_t> devices;
+};
+
+static const char * ggml_backend_hexagon_reg_get_name(ggml_backend_reg_t reg) {
+    GGML_UNUSED(reg);
+
+    //return the accurate backend name rather than "ggml-hexagon" so that NPU
+    //performance comparisons through llama-bench are clearer
+    if (HEXAGON_BACKEND_QNNNPU == g_hexagon_appcfg.hexagon_backend)
+        return "QNN-NPU";
+
+    if (HEXAGON_BACKEND_QNNGPU == g_hexagon_appcfg.hexagon_backend)
+        return "QNN-GPU";
+
+    if (HEXAGON_BACKEND_QNNCPU == g_hexagon_appcfg.hexagon_backend)
+        return "QNN-CPU";
+
+    if (HEXAGON_BACKEND_CDSP == g_hexagon_appcfg.hexagon_backend)
+        return "Hexagon-cDSP";
+
+    return "ggml";
+}
+
+static size_t ggml_backend_hexagon_reg_get_device_count(ggml_backend_reg_t reg) {
+    GGML_UNUSED(reg);
+    if 
(HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) {
+        GGML_ASSERT(g_hexagon_appcfg.hexagon_backend == HEXAGON_BACKEND_CDSP);
+        //there is only one backend device when g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP
+        return 1;
+    } else {
+        //QNN-CPU, QNN-GPU, QNN-NPU
+        return GGML_HEXAGON_MAX_DEVICES - 1;
+    }
+}
+
+static ggml_backend_dev_t ggml_backend_hexagon_reg_get_device(ggml_backend_reg_t reg, size_t index) {
+    GGMLHEXAGON_LOG_DEBUG("index %zu", index);
+    ggml_backend_hexagon_reg_context * ctx = (ggml_backend_hexagon_reg_context *)reg->context;
+    if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) {
+        GGML_ASSERT(g_hexagon_appcfg.hexagon_backend == HEXAGON_BACKEND_CDSP);
+        //there is only one backend device when g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP
+        return ctx->devices[0];
+    } else {
+        GGML_ASSERT(index < ctx->devices.size());
+        return ctx->devices[index];
+    }
+}
+
+static void * ggml_backend_hexagon_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
+    GGML_UNUSED(reg);
+
+    if (nullptr == name)
+        return nullptr;
+
+    const char * slot_name = "ggml_backend_set_n_threads";
+    if (0 == memcmp(name, slot_name, strlen(slot_name))) {
+        return (void *)ggml_backend_hexagon_set_n_threads;
+    }
+
+    return nullptr;
+}
+
+static const ggml_backend_reg_i ggml_backend_hexagon_reg_interface = {
+    /* .get_name         = */ ggml_backend_hexagon_reg_get_name,
+    /* .get_device_count = */ ggml_backend_hexagon_reg_get_device_count,
+    /* .get_device       = */ ggml_backend_hexagon_reg_get_device,
+    /* .get_proc_address = */ ggml_backend_hexagon_reg_get_proc_address,
+};
+
+ggml_backend_reg_t ggml_backend_hexagon_reg() {
+    static ggml_backend_reg reg;
+    //TODO: the existing code can't cover the following special case:
+    //    toggling back and forth frequently between QNN-NPU, cDSP and ggml in a
+    //    standard Android app or in the same running process;
+    //    supporting it is easy but would significantly increase the size of the APK
+    static bool initialized = false;
+    GGMLHEXAGON_LOG_DEBUG("enter %s", __func__);
+
+    //case-2: normal scenario, such as llama-cli or a UI application
+    ggmlhexagon_load_cfg();
+    if (!ggmlhexagon_check_valid_appcfg()) {
+        return nullptr;
+    }
+
+    {
+        static std::mutex mutex;
+        std::lock_guard<std::mutex> lock(mutex);
+        if (!initialized) {
+            ggml_backend_hexagon_reg_context * ctx = new ggml_backend_hexagon_reg_context;
+
+            for (int i = 0; i < ggml_backend_hexagon_get_device_count(); i++) {
+                if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) {
+                    ggml_backend_hexagon_device_interface.supports_op = ggmlhexagon_can_handle_op_through_cdsp;
+                } else {
+                    ggml_backend_hexagon_device_interface.supports_op = ggmlhexagon_can_handle_op_through_qnn;
+                }
+
+                if ((HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) {
+                    if (0 == g_hexagon_appcfg.enable_pinned_memory) {
+                        //don't use system memory in this scenario
+                        ggml_backend_hexagon_device_interface.get_host_buffer_type = nullptr;
+                    }
+                }
+
+                GGMLHEXAGON_LOG_DEBUG("create backend device for device %d", i);
+                ggml_backend_dev_t dev = new ggml_backend_device{
+                    /* .iface   = */ ggml_backend_hexagon_device_interface,
+                    /* .reg     = */ &reg,
+                    /* .context = */ &g_hexagon_mgr[i]
+                };
+                if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) {
+                    //there is only one backend device when g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP,
+                    //so its context is g_hexagon_mgr[HEXAGON_BACKEND_CDSP] rather than g_hexagon_mgr[0]
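+                    //(the registry exposes a single device in this mode, and g_hexagon_mgr
+                    // is indexed by backend id, so the CDSP slot must be used)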
+                    //attention here:
+                    dev->context = &g_hexagon_mgr[HEXAGON_BACKEND_CDSP];
+                }
+
+                ctx->devices.push_back(dev);
+
+                //initialize the cDSP rpc memory pool here because ggml's backend subsystem needs it
+                if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) {
+                    GGML_ASSERT(HEXAGON_BACKEND_CDSP == g_hexagon_appcfg.hexagon_backend);
+                    int result = ggmlhexagon_init_dsp(&g_hexagon_mgr[HEXAGON_BACKEND_CDSP]);
+                    if (0 != result) {
+                        GGMLHEXAGON_LOG_INFO("init hexagon dsp failure");
+                        return nullptr;
+                    }
+                }
+            }
+
+            reg = ggml_backend_reg {
+                /* .api_version = */ GGML_BACKEND_API_VERSION,
+                /* .iface       = */ ggml_backend_hexagon_reg_interface,
+                /* .context     = */ ctx
+            };
+        }
+
+        initialized = true;
+    }
+    GGMLHEXAGON_LOG_DEBUG("leave ggml_backend_hexagon_reg");
+
+    return &reg;
+}
+
+const char * ggml_backend_hexagon_get_devname(size_t dev_num) {
+    switch (dev_num) {
+        case HEXAGON_BACKEND_QNNCPU:
+            return "HEXAGON_BACKEND_QNN_CPU";
+        case HEXAGON_BACKEND_QNNGPU:
+            return "HEXAGON_BACKEND_QNN_GPU";
+        case HEXAGON_BACKEND_QNNNPU:
+            return "HEXAGON_BACKEND_QNN_NPU";
+        case HEXAGON_BACKEND_CDSP:
+            return "HEXAGON_BACKEND_CDSP";
+        case HEXAGON_BACKEND_GGML:
+            return "ggml"; //"fake" hexagon backend, used to compare performance between the hexagon backend and the default ggml backend
+        default:
+            return "unknown";
+    }
+}
+
+static qnn_instance * ggmlqnn_init_qnn_instance(size_t device, const char * qnn_lib_path) {
+    int result = 0;
+    GGMLHEXAGON_LOG_VERBOSE("device=%zu, hwaccel approach=%d(%s)", device, g_hexagon_appcfg.hwaccel_approach,
+                            ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach));
+
+    qnn_instance * instance = nullptr;
+    instance = new qnn_instance(qnn_lib_path, g_hexagon_mgr[device].lib, "");
+    result = instance->qnn_init(nullptr);
+    if (0 != result) {
+        GGMLHEXAGON_LOG_WARN("failed to init qnn subsystem with qnn backend %s\n",
+                             ggml_backend_hexagon_get_devname(device));
+        delete instance;
+        return nullptr;
+    }
+    qnn_interface qnn_interface = instance->get_qnn_interface();
+    if (!qnn_interface.is_loaded()) {
+        GGMLHEXAGON_LOG_WARN("qnn subsystem failure\n");
+        delete instance;
+        return nullptr;
+    }
+
+    std::string device_name = ggml_backend_hexagon_get_devname(device);
+    GGMLHEXAGON_LOG_VERBOSE("qnn device name %s", device_name.c_str());
+    g_hexagon_mgr[device].instance             = instance;
+    g_hexagon_mgr[device].raw_interface        = instance->get_qnn_raw_interface();
+    g_hexagon_mgr[device].raw_system_interface = instance->get_qnn_raw_system_interface();
+
+    return instance;
+}
+
+/**
+ *
+ * @param device          0: HEXAGON_BACKEND_QNNCPU 1: HEXAGON_BACKEND_QNNGPU 2: HEXAGON_BACKEND_QNNNPU 3: HEXAGON_BACKEND_CDSP 4: ggml
+ * @param runtime_libpath binary runtime library path, such as "/data/local/tmp/" on Android, or as specified in user's code
+ * @return                ggml backend instance on success, nullptr on failure
+ */
+ggml_backend_t ggml_backend_hexagon_init(size_t device, const char * runtime_libpath) {
+    GGMLHEXAGON_LOG_DEBUG("enter %s", __func__);
+    if (nullptr == runtime_libpath)
+        return nullptr;
+
+    //case-3: user's code calls ggml_backend_hexagon_init() directly
+    ggmlhexagon_load_cfg();
+    if (!ggmlhexagon_check_valid_appcfg()) {
+        return nullptr;
+    }
+
+    GGMLHEXAGON_LOG_DEBUG("device %zu", device);
+    GGMLHEXAGON_LOG_DEBUG("runtime libpath %s", runtime_libpath);
+    if (device >= GGML_HEXAGON_MAX_DEVICES) {
+        GGMLHEXAGON_LOG_ERROR("invalid device %zu", device);
+        return nullptr;
+    }
+
+    if (0 != memcmp(runtime_libpath, g_hexagon_appcfg.runtime_libpath, strlen(g_hexagon_appcfg.runtime_libpath))) {
+        //re-set the runtime libpath
+        ggmlhexagon_set_runtime_path(device, runtime_libpath);
+    }
+
+    if (nullptr != g_hexagon_mgr[device].backend) {
+        GGMLHEXAGON_LOG_DEBUG("backend %zu(%s) already loaded", device,
+                              ggml_backend_hexagon_get_devname(device));
+        GGMLHEXAGON_LOG_DEBUG("leave %s", __func__);
+        return g_hexagon_mgr[device].backend;
+    }
+
+    //don't initialize QNN when the hwaccel approach is to offload ggml ops to the Hexagon cDSP directly
+    if (HWACCEL_CDSP != g_hexagon_appcfg.hwaccel_approach) {
+        qnn_instance * instance = ggmlqnn_init_qnn_instance(device, runtime_libpath);
+        if (nullptr == instance)
+            return nullptr;
+    }
+    ggml_backend_hexagon_interface.graph_compute = ggmlhexagon_backend_graph_compute_general;
+    ggml_backend_t hexagon_backend = new ggml_backend{
+        /* .guid    = */ ggml_backend_hexagon_guid(),
+        /* .iface   = */ ggml_backend_hexagon_interface,
+        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_hexagon_reg(), device),
+        /* .context = */ &g_hexagon_mgr[device]
+    };
+
+    g_hexagon_mgr[device].backend = hexagon_backend;
+    if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) {
+        int result = ggmlhexagon_init_dsp(&g_hexagon_mgr[device]);
+        if (0 != result) {
+            GGMLHEXAGON_LOG_INFO("init hexagon dsp failure");
+            ggml_backend_hexagon_free(hexagon_backend);
+            return nullptr;
+        }
+    } else {
+        //get the full description of the SoC when the hwaccel approach is HWACCEL_QNN and the backend is HEXAGON_BACKEND_QNNNPU
+        GGMLHEXAGON_LOG_VERBOSE("device name %s", ggml_backend_hexagon_device_get_description(hexagon_backend->device));
+    }
+    GGMLHEXAGON_LOG_DEBUG("leave %s", __func__);
+
+    return hexagon_backend;
+}
+
+GGML_BACKEND_DL_IMPL(ggml_backend_hexagon_reg)
diff --git a/ggml/src/ggml-hexagon/kernels/Makefile b/ggml/src/ggml-hexagon/kernels/Makefile
new file mode 100755
index 0000000000000..7ac0ba5dbd754
--- /dev/null
+++ b/ggml/src/ggml-hexagon/kernels/Makefile
@@ -0,0 +1,53 @@
+#the following vars are already defined in CMakeLists.txt
+#HTP_ARCH_VERSION=v79
+#DEBUG_FLAG=-DNDEBUG -Wall
+#HEXAGON_SDK_PATH=/opt/qcom/Hexagon_SDK/6.2.0.1
+
+HEXAGON_COMPUTE=compute${HTP_ARCH_VERSION}
+HEXAGON_CC=${HEXAGON_SDK_PATH}/tools/HEXAGON_Tools/8.8.06/Tools/bin/hexagon-clang
+HEXAGON_CXX=${HEXAGON_SDK_PATH}/tools/HEXAGON_Tools/8.8.06/Tools/bin/hexagon-clang
+
+TARGET=libggmldsp-skel.so
+
+$(info HEXAGON_SDK_PATH:${HEXAGON_SDK_PATH})
+$(info HTP_ARCH_VERSION:${HTP_ARCH_VERSION})
+$(info DEBUG_FLAG:${DEBUG_FLAG})
+$(info HEXAGON_COMPUTE:${HEXAGON_COMPUTE})
+
+INCS=-I${HEXAGON_SDK_PATH}/incs -I${HEXAGON_SDK_PATH}/libs/qprintf/inc -I${HEXAGON_SDK_PATH}/incs/stddef -I${HEXAGON_SDK_PATH}/ipc/fastrpc/incs -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rpcmem/inc -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rtld/ship/inc -I${HEXAGON_SDK_PATH}/libs/atomic/inc -I${HEXAGON_SDK_PATH}/utils/sim_utils/inc -I${HEXAGON_SDK_PATH}/rtos/qurt/${HEXAGON_COMPUTE}/include/posix -I${HEXAGON_SDK_PATH}/rtos/qurt/${HEXAGON_COMPUTE}/include/qurt/
+
+CFLAGS=-m${HTP_ARCH_VERSION} -c -Ofast -Wall -Wstrict-prototypes -fno-zero-initialized-in-bss -fdata-sections -fpic -D__V_DYNAMIC__ -mhvx -mhvx-length=128B ${INCS} -fno-finite-math-only
+
+LDFLAGS=-m${HTP_ARCH_VERSION} -Wl,--defsym=ISDB_TRUSTED_FLAG=2 -Wl,--defsym=ISDB_SECURE_FLAG=2 -Wl,--no-threads -fpic -shared -Wl,-Bsymbolic -Wl,--wrap=malloc -Wl,--wrap=calloc -Wl,--wrap=free -Wl,--wrap=realloc -Wl,--wrap=memalign -lc -Wl,-soname=${TARGET}
+
+#SRCS = $(wildcard *.c)
+SRCS = ggml-dsp.c skel.c entry.c add.c mulmat.c
+OBJS = $(patsubst %.c, %.o, $(SRCS))
+OBJS += dot.o
+OBJS += worker_pool.o
+
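+#NOTE: dot.o is assembled from dot.S via the %.o:%.S pattern rule below;
+#      worker_pool.o is assumed to come from the worker_pool sources shipped
+#      with the Hexagon SDK (it is not built from a file in this directory)
+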
+ALL:$(OBJS) + ${HEXAGON_CC} ${LDFLAGS} -o ${TARGET} -Wl,--start-group ${OBJS} -Wl,--end-group + @ls -l ${TARGET} + /bin/cp -fv ${TARGET} ../../../../out/android/bin/ + /bin/cp -fv ${TARGET} ../../../../out/android/bin/libggmldsp-skel${HTP_ARCH_VERSION}.so + /bin/rm -f *.so + +%.o:%.c + @echo "${HEXAGON_CC} ${CFLAGS} ${DEBUG_FLAG} -D__FILENAME__=\"$<\" -o $@ -c $<" + ${HEXAGON_CC} ${CFLAGS} ${DEBUG_FLAG} -D__FILENAME__=\"$<\" -o $@ -c $< + @echo "\n" + +%.o:%.S + @echo "${HEXAGON_CC} ${CFLAGS} ${DEBUG_FLAG} -D__FILENAME__=\"$<\" -o $@ -c $<" + ${HEXAGON_CC} ${CFLAGS} ${DEBUG_FLAG} -D__FILENAME__=\"$<\" -o $@ -c $< + @echo "\n" + +%.o:%.cpp + @echo "${HEXAGON_CC} ${CFLAGS} ${DEBUG_FLAG} -D__FILENAME__=\"$<\" -o $@ -c $<" + ${HEXAGON_CC} ${CFLAGS} ${DEBUG_FLAG} -D__FILENAME__=\"$<\" -o $@ -c $< + @echo "\n" + +clean: + rm -f *.o + /bin/rm -f *.so diff --git a/ggml/src/ggml-hexagon/kernels/add.c b/ggml/src/ggml-hexagon/kernels/add.c new file mode 100644 index 0000000000000..25a2d73e23536 --- /dev/null +++ b/ggml/src/ggml-hexagon/kernels/add.c @@ -0,0 +1,143 @@ +#include "ggml-dsp.h" + +static inline void l2fetch(const void * p, uint32_t stride, + uint32_t width, uint32_t height, + uint32_t dir) { + uint64_t control = HEXAGON_V64_CREATE_H(dir, stride, width, height); + __asm__ __volatile__ (" l2fetch(%0,%1) " : :"r"(p),"r"(control)); +} + +static inline void ggmlhexagon_dsp_add_f32(const int n, float * GGML_RESTRICT z, const float * GGML_RESTRICT x, const float * GGML_RESTRICT y) { + HVX_Vector * va; + HVX_Vector * vb; + HVX_Vector * vc; + HVX_Vector qf32; + const size_t FLOATS_PER_VECTOR = 128 / sizeof(float); + const size_t block = n / FLOATS_PER_VECTOR; + const size_t left = n % FLOATS_PER_VECTOR; + const size_t blocks = block * FLOATS_PER_VECTOR; + + if ((((uintptr_t)z | (uintptr_t)x | (uintptr_t)y) % ALIGN_128_BYTE) != 0) { + GGMLHEXAGON_LOG_DEBUG("memaddress mismatch alignment 128 bytes z:%p x:%p y:%p", z, x, y); + for (size_t i = 0; i < n; ++i) + z[i] = x[i] + y[i]; + + return; + } + + va = (HVX_Vector *)x; + vb = (HVX_Vector *)y; + vc = (HVX_Vector *)z; + //unroll is better but need more carefully check for various cases and I think DSP also don't like branch predication + for (size_t i = 0; i < block; ++i) { + l2fetch(va + VLEN, VLEN, VLEN, 1, 0); + l2fetch(vb + VLEN, VLEN, VLEN, 1, 0); + //*vc++ = Q6_Vsf_vadd_VsfVsf(*va++, *vb++); + qf32 = Q6_Vqf32_vadd_VsfVsf(*va++, *vb++); + *vc++ = Q6_Vsf_equals_Vqf32(qf32); + } + + if (left > 0) { + for (size_t i = 0; i < left; ++i) + z[i + blocks] = x[i + blocks] + y[i + blocks]; + } +} + +static void ggml_compute_forward_add_f32( + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); + uint64_t start_time = ggml_time_us(); + + memcpy(dst->ne, src1->ne, 16); + memcpy(dst->nb, src1->nb, 16); + ggmlhexagon_dump_tensor(src0, 1); + ggmlhexagon_dump_tensor(src1, 1); + ggmlhexagon_dump_tensor(dst, 1); + + GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); + + const int rank = ggml_n_dims(src0); + if (1 == rank) { + //element-wise addition with vector + const size_t len = src0->ne[0]; + float * dst_ptr = (float *) (dst->data); + float * src0_ptr = (float *) (src0->data); + float * src1_ptr = (float *) (src1->data); + ggmlhexagon_dsp_add_f32(len, dst_ptr, src0_ptr, src1_ptr); + return; + } + + const int ith = 0; + const int nth = 1; + + const int nr = ggml_nrows(src0); + GGML_TENSOR_BINARY_OP_LOCALS + + GGML_ASSERT( nb0 == 
sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + const int dr = (nr + nth - 1)/nth; + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + if (nb10 == sizeof(float)) { + for (int ir = ir0; ir < ir1; ++ir) { + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int32_t i03 = ir/(ne02*ne01); + const int32_t i02 = (ir - i03*ne02*ne01)/ne01; + const int32_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int32_t i13 = i03 % ne13; + const int32_t i12 = i02 % ne12; + const int32_t i11 = i01 % ne11; + const int32_t nr0 = ne00 / ne10; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); + for (int32_t r = 0; r < nr0; ++r) { + ggmlhexagon_dsp_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); + } + } + } else { + // src1 is not contiguous + for (int ir = ir0; ir < ir1; ++ir) { + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int32_t i03 = ir/(ne02*ne01); + const int32_t i02 = (ir - i03*ne02*ne01)/ne01; + const int32_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int32_t i13 = i03 % ne13; + const int32_t i12 = i02 % ne12; + const int32_t i11 = i01 % ne11; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + + for (int32_t i0 = 0; i0 < ne0; ++i0) { + const int32_t i10 = i0 % ne10; + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10); + + dst_ptr[i0] = src0_ptr[i0] + *src1_ptr; + } + } + } + + uint64_t end_time = ggml_time_us(); + uint64_t duration = (end_time - start_time); + GGMLHEXAGON_LOG_DEBUG("duration %llu us", duration); +#if !GGMLHEXAGON_DEBUG + UNUSED(duration); +#endif + + GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); +} + +//FIXME: why failed with test-backend-ops when disable ion rpc mempool +int ggmlop_dsp_add(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGMLHEXAGON_LOG_DEBUG("enter %s\n", __func__); + ggml_compute_forward_add_f32(src0, src1, dst); + GGMLHEXAGON_LOG_DEBUG("leave %s\n", __func__); + return 0; +} diff --git a/ggml/src/ggml-hexagon/kernels/dot.S b/ggml/src/ggml-hexagon/kernels/dot.S new file mode 100755 index 0000000000000..2031a6001519b --- /dev/null +++ b/ggml/src/ggml-hexagon/kernels/dot.S @@ -0,0 +1,136 @@ +/**============================================================================= +@file + qhblas_f_vector_dot_af.S + +@brief + Calculates dot product of two input float vectors. + + Function prototype + + int32_t qhblas_f_vector_dot_af(float_a8_t *input_1, float_a8_t *input_2, float *output, uint32_t size); + + Reference C code + + int32_t qhblas_f_vector_dot_af(float_a8_t *input_1, float_a8_t *input_2, float *output, uint32_t size) + { + if ((input_1 == NULL) || (input_2 == NULL) || (output == NULL) || (size == 0)) + { + return -1; + } + + float dot = 0; + for (uint32_t i = 0; i < size; ++i) + { + dot += input_1[i] * input_2[i]; + } + + *output = dot; + return 0; + } + +Copyright (c) 2019 Qualcomm Technologies Incorporated. +All Rights Reserved. Qualcomm Proprietary and Confidential. 
+=============================================================================**/ + +/*============================================================================*/ + + .p2align 2 + .p2align 4,,15 + .global qhblas_f_vector_dot_af + .type qhblas_f_vector_dot_af, @function + +/*============================================================================*/ + +#define DC_PREFETCH_AHEAD 64 // number of bytes for DCFETCH +#define L2_PREFETCH_AHEAD 256 // number of bytes for L2FETCH +#define L2FETCH_CONFIG 0x0100FF00+(L2_PREFETCH_AHEAD/256) // [stride = 256 : width = 255 : height = bytes/256] +#define L2_PREFETCH_ELEMS L2_PREFETCH_AHEAD/8 // number of elements to prefetch with L2FETCH + +/*============================================================================*/ + +qhblas_f_vector_dot_af: +{ + p0 = !cmp.eq(r0,#0) // input_1 != NULL + p0 = !cmp.eq(r1,#0) // input_2 != NULL + p0 = !cmp.eq(r2,#0) // output != NULL + p0 = cmp.gtu(r3,#0) // size > 0 + if (!p0.new) jump:nt .L_ret +} +{ + r10 = #0 + r3 = lsr(r3,#1) // size / 2 + p1 = tstbit(r3,#0) // check for odd size + if(cmp.eq(r3.new,#0)) jump:nt .L_do_one +} +{ + r7:6 = #0 + r9:8 = #0 + r5 = add(r3,#7) // (size / 2) + 7 + p2 = cmp.gtu(r3,#L2_PREFETCH_ELEMS) // check whether we can do l2fetch +} +{ + r5 = lsr(r5,#3) // ceil(size / 2) + r14 = mux(p2,r3,#0) // set l2fetch counter +} +{ + r13:12 = combine(##L2FETCH_CONFIG,#8) // set l2fetch config and max number of iterations for .L_loop_do_two + loop1(.L_prefetch_loop_do_two,r5) +} + .falign +.L_prefetch_loop_do_two: +{ + dcfetch(r0+#DC_PREFETCH_AHEAD) // prefetch ahead for input_1 + r5 = min(r12,r3) // min(8, size / 2) +} +{ + dcfetch(r1+#DC_PREFETCH_AHEAD) // prefetch ahead for input_2 + loop0(.L_loop_do_two,r5) + p2 = cmp.eq(r3,r14) // check whether to do l2fetch + if (!p2.new) jump:t .L_loop_do_two +} +{ + r5 = add(r3,#-L2_PREFETCH_ELEMS) // number of elements left to prefetch ahead + r15 = add(r0,#L2_PREFETCH_AHEAD) // input_1 addr for l2fetch +} +{ + p2 = cmp.gtu(r5,#L2_PREFETCH_ELEMS) // check whether we can continue to do l2fetch + r15 = add(r1,#L2_PREFETCH_AHEAD) // input_2 addr for l2fetch + l2fetch(r15,r13) +} +{ + if (p2) r14 = add(r14,#-L2_PREFETCH_ELEMS) // adjust l2fetch counter + if (!p2) r14 = #0 // there are no more bytes left to prefetch ahead + l2fetch(r15,r13) +} + .falign +.L_loop_do_two: +{ + r7:6 = memd(r0++#8) + r9:8 = memd(r1++#8) + r10 += sfmpy(r7,r9) +} +{ + r10 += sfmpy(r6,r8) + r3 = add(r3,#-1) // adjust (size / 2) +}:endloop0:endloop1 +{ + r10 += sfmpy(r7,r9) + if (!p1) jump:nt .L_ret +} + .falign +.L_do_one: +{ + r4 = memw(r0) + r5 = memw(r1) +} +{ + r10 += sfmpy(r4,r5) +} + .falign +.L_ret: +{ + if (p0) memw(r2) = r10 + r0 = mux(p0,#0,#-1) + jumpr r31 +} + .size qhblas_f_vector_dot_af, .-qhblas_f_vector_dot_af diff --git a/ggml/src/ggml-hexagon/kernels/entry.c b/ggml/src/ggml-hexagon/kernels/entry.c new file mode 100644 index 0000000000000..8af93ea1d3082 --- /dev/null +++ b/ggml/src/ggml-hexagon/kernels/entry.c @@ -0,0 +1,115 @@ +#include "ggml-dsp.h" + +static int32 g_thread_counts = 1; + +int ggmlop_dsp_open(const char * uri, remote_handle64 * handle) { + void * tptr = NULL; + GGMLHEXAGON_LOG_DEBUG("uri %s", uri); + tptr = (void *)malloc(1); + GGML_ASSERT(NULL != tptr); + *handle = (remote_handle64)tptr; + + GGMLHEXAGON_LOG_DEBUG("api_version = 0x%x", qurt_api_version()); + GGMLHEXAGON_LOG_DEBUG("hvx units = 0x%d", qurt_hvx_get_units()); + qurt_arch_version_t vers; + qurt_sysenv_get_arch_version(&vers); + GGMLHEXAGON_LOG_DEBUG("arch_version=0x%x", 
vers.arch_version); + + qurt_sysenv_app_heap_t aheap; + qurt_sysenv_get_app_heap(&aheap); + GGMLHEXAGON_LOG_DEBUG("aheap.heap_base=0x%x, aheap.heap_limit=0x%x", aheap.heap_base, aheap.heap_limit); + + qurt_sysenv_max_hthreads_t mhwt; + qurt_sysenv_get_max_hw_threads(&mhwt); + GGMLHEXAGON_LOG_DEBUG("max hardware threads counts=%d", mhwt.max_hthreads); + g_thread_counts = mhwt.max_hthreads; + + return 0; +} + +int ggmlop_dsp_close(remote_handle64 handle) { + if (handle) + free((void*)handle); + + return 0; +} + +AEEResult ggmlop_dsp_setclocks(remote_handle64 handle, int32 power_level, int32 latency, int32 mulmat_algo, int32 thread_counts) { + GGMLHEXAGON_LOG_DEBUG("enter %s", __func__); + HAP_power_request_t request; + memset(&request, 0, sizeof(HAP_power_request_t)); + request.type = HAP_power_set_apptype; + request.apptype = HAP_POWER_COMPUTE_CLIENT_CLASS; + + GGMLHEXAGON_LOG_DEBUG("user specified thread_counts %d", thread_counts); + if (thread_counts > 1) + g_thread_counts = (thread_counts > g_thread_counts) ? g_thread_counts : thread_counts; + else + g_thread_counts = 1; + GGMLHEXAGON_LOG_DEBUG("real thread_counts %d", g_thread_counts); + + void * ggmop_ctx = (void*)(handle); + int retval = HAP_power_set(ggmop_ctx, &request); + if (retval) { + GGMLHEXAGON_LOG_DEBUG("failed first power vote"); + return AEE_EFAILED; + } + + //configure clocks & DCVS mode + memset(&request, 0, sizeof(HAP_power_request_t)); + request.type = HAP_power_set_DCVS_v2; + request.dcvs_v2.dcvs_enable = TRUE; + request.dcvs_v2.dcvs_params.target_corner = (HAP_dcvs_voltage_corner_t)power_level; + if (mulmat_algo) { + request.dcvs_v2.dcvs_params.min_corner = HAP_DCVS_VCORNER_DISABLE; + request.dcvs_v2.dcvs_params.max_corner = HAP_DCVS_VCORNER_DISABLE; + } else { + request.dcvs_v2.dcvs_params.min_corner = request.dcvs_v2.dcvs_params.target_corner; + request.dcvs_v2.dcvs_params.max_corner = request.dcvs_v2.dcvs_params.target_corner; + } + request.dcvs_v2.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE; + request.dcvs_v2.set_dcvs_params = TRUE; + request.dcvs_v2.set_latency = TRUE; + request.dcvs_v2.latency = latency; + retval = HAP_power_set(ggmop_ctx, &request); + if (retval) { + GGMLHEXAGON_LOG_DEBUG("failed to vote for performance mode"); + return AEE_EFAILED; + } + + memset(&request, 0, sizeof(HAP_power_request_t)); + request.type = HAP_power_set_HVX; + request.hvx.power_up = TRUE; + retval = HAP_power_set(ggmop_ctx, &request); + if (retval) { + GGMLHEXAGON_LOG_DEBUG("failed to vote for HVX power"); + return AEE_EFAILED; + } + GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); + return AEE_SUCCESS; +} + +// ================================================================================================= +// implementation of ggml-hexagon kernel, it's better to put every hexagon-kernel to a single file +// ================================================================================================= +int ggmlop_dsp_softmax(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) { + GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); + GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); + return 0; +} + +int ggmlop_dsp_rmsnorm(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) { + GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); + GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); + return 0; +} + +int ggmlop_dsp_pool2d(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) { + GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); + GGMLHEXAGON_LOG_DEBUG("leave 
%s", __func__ ); + return 0; +} + +int ggmlop_get_thread_counts(void) { + return g_thread_counts; +} diff --git a/ggml/src/ggml-hexagon/kernels/ggml-dsp.c b/ggml/src/ggml-hexagon/kernels/ggml-dsp.c new file mode 100644 index 0000000000000..b64209971a0dc --- /dev/null +++ b/ggml/src/ggml-hexagon/kernels/ggml-dsp.c @@ -0,0 +1,207 @@ +/* + * Copyright (c) 2025 The ggml authors + * + * Qualcomm Hexagon SDK and reference tech guides could be found at: + * https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools + * + * this single-source-file or self-contained file is implementation of ggml-dsp: + * - a customized tiny ggml running on Qualcomm Hexagon cDSP + * - ported from original ggml + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#include "ggml-dsp.h" + +void ggmlhexagon_log_internal(int level, const char *file, const char *func, int line, const char *format, ...) 
{ +#if !GGMLHEXAGON_DEBUG + return; +#endif + static char s_ggmlhexagon_log_internal_buf[GGMLHEXAGON_LOGBUF_LEN]; + va_list args; + va_start(args, format); + int len_prefix = snprintf(s_ggmlhexagon_log_internal_buf, GGMLHEXAGON_LOGBUF_LEN, "[%s, %d]: ", + func, line); + int len = vsnprintf(s_ggmlhexagon_log_internal_buf + len_prefix, + GGMLHEXAGON_LOGBUF_LEN - len_prefix, format, args); + if (len < (GGMLHEXAGON_LOGBUF_LEN - len_prefix)) { + FARF(ALWAYS, "%s\n", s_ggmlhexagon_log_internal_buf); + } + va_end(args); +} + +void ggmlhexagon_dump_tensor_elements(const ggml_tensor * tensor) { +#if !GGMLHEXAGON_DEBUG + return; +#endif + float value = 0; + char tmpbuf[GGMLHEXAGON_LOGBUF_LEN]; + size_t buflen = 0; + if (tensor->type == GGML_TYPE_F32) { + memset(tmpbuf, 0, GGMLHEXAGON_LOGBUF_LEN); + for (int h = 0; h < tensor->ne[3]; h++) { + for (int i = 0; i < tensor->ne[2]; i++) { + for (int j = 0; j < tensor->ne[1]; j++) { + for (int k = 0; k < tensor->ne[0]; k++) { + value = ((float *) tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] + + j * tensor->ne[0] + k]; + buflen += snprintf(tmpbuf + buflen, GGMLHEXAGON_LOGBUF_LEN - buflen, "%-4.2f\t", value); + } + buflen += snprintf(tmpbuf + buflen, GGMLHEXAGON_LOGBUF_LEN - buflen, "\n"); + } + } + } + GGMLHEXAGON_LOG_DEBUG("\n%s\n", tmpbuf); + } + + GGMLHEXAGON_LOG_DEBUG("\n"); +} + +void ggmlhexagon_dump_tensor(const ggml_tensor * tensor, int dump_tensor_data) { + GGMLHEXAGON_LOG_DEBUG("ne = %5d x %5d x %5d x %5d , nb = (%5zi, %5zi, %5zi, %5zi)\n", + tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], + tensor->nb[0], tensor->nb[1], tensor->nb[2], tensor->nb[3]); + + if ((1 == dump_tensor_data) && (ggml_nbytes(tensor) < 320)) { + ggmlhexagon_dump_tensor_elements(tensor); + } +} + +size_t ggml_row_size(enum ggml_type type, int64_t ne) { + return 4*ne; +} + +size_t ggml_nbytes(const struct ggml_tensor * tensor) { + size_t nbytes; + const size_t blck_size = 1; + if (blck_size == 1) { + nbytes = 4; + for (int i = 0; i < GGML_MAX_DIMS; ++i) { + nbytes += (tensor->ne[i] - 1)*tensor->nb[i]; + } + } else { + nbytes = tensor->ne[0]*tensor->nb[0]/blck_size; + for (int i = 1; i < GGML_MAX_DIMS; ++i) { + nbytes += (tensor->ne[i] - 1)*tensor->nb[i]; + } + } + + return nbytes; +} + +bool ggml_is_empty(const struct ggml_tensor * tensor) { + for (int i = 0; i < GGML_MAX_DIMS; ++i) { + if (tensor->ne[i] == 0) { + return true; + } + } + return false; +} + +bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return ggml_is_empty(t0) ? 
ggml_is_empty(t1) :
+            (t1->ne[0]%t0->ne[0] == 0) &&
+            (t1->ne[1]%t0->ne[1] == 0) &&
+            (t1->ne[2]%t0->ne[2] == 0) &&
+            (t1->ne[3]%t0->ne[3] == 0);
+}
+
+bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+    return
+        (t0->ne[0] == t1->ne[0]) &&
+        (t0->ne[1] == t1->ne[1]) &&
+        (t0->ne[2] == t1->ne[2]) &&
+        (t0->ne[3] == t1->ne[3]);
+}
+
+int64_t ggml_nrows(const struct ggml_tensor * tensor) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
+}
+
+bool ggml_is_transposed(const struct ggml_tensor * tensor) {
+    return tensor->nb[0] > tensor->nb[1];
+}
+
+bool ggml_is_contiguous_n(const struct ggml_tensor * tensor, int n) {
+    size_t next_nb = 4; //sizeof(float): this tiny ggml-dsp only supports fp32
+    if (tensor->ne[0] != 1 && tensor->nb[0] != next_nb) {
+        return false;
+    }
+    next_nb *= tensor->ne[0];
+    for (int i = 1; i < GGML_MAX_DIMS; i++) {
+        if (tensor->ne[i] != 1) {
+            if (i > n) {
+                if (tensor->nb[i] != next_nb) {
+                    return false;
+                }
+                next_nb *= tensor->ne[i];
+            } else {
+                // this dimension does not need to be contiguous
+                next_nb = tensor->ne[i]*tensor->nb[i];
+            }
+        }
+    }
+    return true;
+}
+
+int64_t ggml_nelements(const struct ggml_tensor * tensor) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
+}
+
+static bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) {
+    return ggml_is_contiguous_n(tensor, 0);
+}
+
+bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
+    return ggml_is_contiguous_0(tensor);
+}
+
+int ggml_n_dims(const struct ggml_tensor * tensor) {
+    for (int i = GGML_MAX_DIMS - 1; i >= 1; --i) {
+        if (tensor->ne[i] > 1) {
+            return i + 1;
+        }
+    }
+    return 1;
+}
+
+void ggml_abort(const char * file, int line, const char * fmt, ...) {
+    GGMLHEXAGON_LOG_DEBUG("enter ggml_abort");
+    abort();
+}
+
+static inline uint64 hexagon_perf_get_time_us(void) {
+    unsigned long long count;
+    asm volatile (" %0 = c31:30 " : "=r"(count));
+    //the c31:30 counter ticks at 19.2 MHz, so cycles * 10 / 192 yields microseconds
+    return (uint64)(count) * 10ull / 192ull;
+}
+
+int64_t ggml_time_ms(void) {
+    return hexagon_perf_get_time_us() / 1000;
+}
+
+int64_t ggml_time_us(void) {
+    return hexagon_perf_get_time_us();
+}
diff --git a/ggml/src/ggml-hexagon/kernels/ggml-dsp.h b/ggml/src/ggml-hexagon/kernels/ggml-dsp.h
new file mode 100644
index 0000000000000..103b46b8ee7fc
--- /dev/null
+++ b/ggml/src/ggml-hexagon/kernels/ggml-dsp.h
@@ -0,0 +1,168 @@
+#pragma once
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <string.h>
+#include <stdarg.h>
+
+#include "HAP_perf.h"
+#include "HAP_farf.h"
+#include "HAP_power.h"
+#include "HAP_vtcm_mgr.h"
+#include "HAP_compute_res.h"
+
+#include "qurt.h"
+#include "AEEStdErr.h"
+#include "hexagon_types.h"
+#include "hexagon_protos.h"
+
+#include "skel.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ggml_tensor dsptensor
+
+#define GGML_MAX_DIMS   4
+
+#define ALIGN_128_BYTE  128
+
+#define VLEN            128
+
+#define GGML_UNUSED(x)  (void)(x)
+
+#define UNUSED          GGML_UNUSED
+
+#define GGML_PAD(x, n)  (((x) + (n) - 1) & ~((n) - 1))
+
+#define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__)
+
+#define GGML_ASSERT(x)  if (!(x)) GGML_ABORT("GGML_ASSERT(%s) failed", #x)
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? 
(a) : (b)) + +#if UINTPTR_MAX == 0xFFFFFFFF +#define GGML_MEM_ALIGN 4 +#else +#define GGML_MEM_ALIGN 16 +#endif + +#define GGML_API extern + +#ifdef __cplusplus +// restrict not standard in C++ +# if defined(__GNUC__) +# define GGML_RESTRICT __restrict__ +# elif defined(__clang__) +# define GGML_RESTRICT __restrict +# elif defined(_MSC_VER) +# define GGML_RESTRICT __restrict +# else +# define GGML_RESTRICT +# endif +#else +# if defined (_MSC_VER) && (__STDC_VERSION__ < 201112L) +# define GGML_RESTRICT __restrict +# else +# define GGML_RESTRICT restrict +# endif +#endif + +#ifndef __cplusplus +#ifndef static_assert + #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) + #define static_assert(cond, msg) _Static_assert(cond, msg) + #else + #define static_assert(cond, msg) struct global_scope_noop_trick + #endif +#endif +#endif // __cplusplus + + +//NPU performance will be slower when enable GGMLHEXAGON_DEBUG +#ifdef NDEBUG +#define GGMLHEXAGON_DEBUG 0 +#else +#define GGMLHEXAGON_DEBUG 1 +#endif + +#define GGMLHEXAGON_LOGBUF_LEN 4096 +#define GGMLHEXAGON_TMPBUF_LEN 256 +#if GGMLHEXAGON_DEBUG +#define GGMLHEXAGON_LOG_DEBUG(...) ggmlhexagon_log_internal(GGMLHEXAGON_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#else +#define GGMLHEXAGON_LOG_DEBUG(...) +#endif + +#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \ + const type prefix##0 = (pointer)->array[0]; \ + GGML_UNUSED(prefix##0); +#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \ + GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \ + const type prefix##1 = (pointer)->array[1]; \ + GGML_UNUSED(prefix##1); +#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \ + GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \ + const type prefix##2 = (pointer)->array[2]; \ + GGML_UNUSED(prefix##2); +#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \ + GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \ + const type prefix##3 = (pointer)->array[3]; \ + GGML_UNUSED(prefix##3); + +#define GGML_TENSOR_UNARY_OP_LOCALS \ + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \ + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) + +#define GGML_TENSOR_BINARY_OP_LOCALS \ + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \ + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \ + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \ + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) + +#define GGML_TENSOR_BINARY_OP_LOCALS01 \ + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \ + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) + +enum ggmlhexagon_log_level { + GGMLHEXAGON_LOG_LEVEL_NONE = 0, + GGMLHEXAGON_LOG_LEVEL_DEBUG = 1, +}; + +enum ggml_type { + GGML_TYPE_F32 = 0, +}; + +typedef double ggml_float; + +GGML_API int64_t ggml_time_ms(void); +GGML_API int64_t ggml_time_us(void); + +GGML_API size_t ggml_nbytes(const struct ggml_tensor * tensor); +GGML_API int64_t ggml_nrows(const struct ggml_tensor * tensor); +GGML_API int ggml_n_dims(const struct ggml_tensor * tensor); +GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor); +GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...); +GGML_API bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1); +GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct 
ggml_tensor * t1);
+
+GGML_API void ggmlhexagon_dump_tensor_elements(const ggml_tensor * tensor);
+GGML_API void ggmlhexagon_dump_tensor(const ggml_tensor * tensor, int dump_tensor_data);
+GGML_API void ggmlhexagon_log_internal(int level, const char * file, const char * func, int line, const char * format, ...);
+
+GGML_API int  ggmlop_get_thread_counts(void);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/ggml/src/ggml-hexagon/kernels/mulmat.c b/ggml/src/ggml-hexagon/kernels/mulmat.c
new file mode 100644
index 0000000000000..f34b6f8b09b4e
--- /dev/null
+++ b/ggml/src/ggml-hexagon/kernels/mulmat.c
@@ -0,0 +1,288 @@
+#include "ggml-dsp.h"
+
+// 128 byte vectors
+#define VSIZE_BYTES 128
+#define VSIZE_WORDS (VSIZE_BYTES/4)
+
+union ui32f { int32_t i; float f; };
+
+// create a vector of floats from a float
+static __attribute__((always_inline)) HVX_Vector create_sfv_from_sf(float value) {
+    union ui32f cvt;
+    cvt.f = value;
+    HVX_Vector tmp = Q6_V_vsplat_R(cvt.i);
+    return tmp;
+}
+
+// create a vector of qf32's from a float
+static __attribute__((always_inline)) HVX_Vector create_qf32v_from_sf(float value) {
+    HVX_Vector tmp = Q6_Vqf32_vadd_Vqf32Vsf(Q6_V_vsplat_R(0), create_sfv_from_sf(value));
+    return tmp;
+}
+
+// convert a qf32 vector to a float vector
+static __attribute__((always_inline)) HVX_Vector convert_qf32v_to_fltv(HVX_Vector vect) {
+    HVX_Vector tmp = Q6_Vsf_equals_Vqf32(vect);
+    return tmp;
+}
+
+// get the lowest float from a vector of floats
+static __attribute__((always_inline)) float get_flt0_from_fltv(HVX_Vector vect) {
+    union ui32f cvt;
+    cvt.i = vect[0];
+    return cvt.f;
+}
+
+// get the lowest float from a vector of qf32's
+static __attribute__((always_inline)) float get_flt0_from_qf32v(HVX_Vector vect) {
+    union ui32f cvt;
+    HVX_Vector tmp = convert_qf32v_to_fltv(vect);
+    cvt.i = tmp[0];
+    return cvt.f;
+}
+
+static void vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x,
+                        size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+    // scalar
+    ggml_float sumf = 0.0;
+    for (int i = 0; i < n; ++i) {
+        sumf += (ggml_float) (x[i] * y[i]);
+    }
+    *s = sumf;
+}
+
+static void ggml_compute_forward_mul_mat_one_chunk(const ggml_tensor * src0, const ggml_tensor * src1,
+                                                   struct ggml_tensor * dst,
+                                                   const enum ggml_type type,
+                                                   const int32_t num_rows_per_vec_dot,
+                                                   const int32_t ir0_start, const int32_t ir0_end,
+                                                   const int32_t ir1_start, const int32_t ir1_end) {
+    ggmlhexagon_dump_tensor(src0, 0);
+    ggmlhexagon_dump_tensor(src1, 0);
+    ggmlhexagon_dump_tensor(dst, 0);
+
+    dst->ne[0] = src0->ne[1];
+    dst->ne[1] = src1->ne[1];
+    dst->ne[2] = src1->ne[2];
+    dst->ne[3] = src1->ne[3];
+
+    dst->nb[0] = 4;
+    dst->nb[1] = dst->nb[0] * dst->ne[0];
+    dst->nb[2] = dst->nb[1] * dst->ne[1];
+    dst->nb[3] = dst->nb[2] * dst->ne[2];
+    ggmlhexagon_dump_tensor(dst, 0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const bool src1_cont = ggml_is_contiguous(src1);
+
+    // broadcast factors
+    const int32_t r2 = ne12 / ne02;
+    const int32_t r3 = ne13 / ne03;
+
+    if (ir0_start >= ir0_end || ir1_start >= ir1_end) {
+        return;
+    }
+
+    const void * wdata    = src1->data;
+    const size_t row_size = 4 * ne10;
+
+    assert(ne12 % ne02 == 0);
+    assert(ne13 % ne03 == 0);
+
+    // block-tiling attempt
+    const int32_t blck_0 = 16;
+    const int32_t blck_1 = 16;
+
+    const size_t src1_col_stride = src1_cont ? row_size : nb11;
+
+    // attempt to reduce false-sharing (does not seem to make a difference)
+    // 16 * 2, accounting for mmla kernels
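+    // tmp holds one blck_0-wide strip of results per row pass: 16 results for
+    // up to 2 rows when an mmla-style kernel handles 2 rows at once, hence 32
+    float 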
+static void ggml_compute_forward_mul_mat_one_chunk(const ggml_tensor *src0, const ggml_tensor *src1,
+                                                   struct ggml_tensor *dst,
+                                                   const enum ggml_type type,
+                                                   const int32_t num_rows_per_vec_dot,
+                                                   const int32_t ir0_start, const int32_t ir0_end,
+                                                   const int32_t ir1_start, const int32_t ir1_end) {
+    ggmlhexagon_dump_tensor(src0, 0);
+    ggmlhexagon_dump_tensor(src1, 0);
+    ggmlhexagon_dump_tensor(dst, 0);
+
+    dst->ne[0] = src0->ne[1];
+    dst->ne[1] = src1->ne[1];
+    dst->ne[2] = src1->ne[2];
+    dst->ne[3] = src1->ne[3];
+
+    dst->nb[0] = 4;
+    dst->nb[1] = dst->nb[0] * dst->ne[0];
+    dst->nb[2] = dst->nb[1] * dst->ne[1];
+    dst->nb[3] = dst->nb[2] * dst->ne[2];
+    ggmlhexagon_dump_tensor(dst, 0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const bool src1_cont = ggml_is_contiguous(src1);
+
+    // broadcast factors
+    const int32_t r2 = ne12 / ne02;
+    const int32_t r3 = ne13 / ne03;
+
+    if (ir0_start >= ir0_end || ir1_start >= ir1_end) {
+        return;
+    }
+
+    const void * wdata = src1->data;
+    const size_t row_size = 4 * ne10;
+
+    assert(ne12 % ne02 == 0);
+    assert(ne13 % ne03 == 0);
+
+    // block-tiling attempt
+    const int32_t blck_0 = 16;
+    const int32_t blck_1 = 16;
+
+    const size_t src1_col_stride = src1_cont ? row_size : nb11;
+
+    // attempt to reduce false-sharing (does not seem to make a difference)
+    // 16 * 2, accounting for mmla kernels
+    float tmp[32];
+
+    for (int32_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) {
+        for (int32_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) {
+            for (int32_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ir1 += num_rows_per_vec_dot) {
+                const int32_t i13 = (ir1 / (ne12 * ne1));
+                const int32_t i12 = (ir1 - i13 * ne12 * ne1) / ne1;
+                const int32_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1);
+
+                // broadcast src0 into src1
+                const int32_t i03 = i13 / r3;
+                const int32_t i02 = i12 / r2;
+
+                const int32_t i1 = i11;
+                const int32_t i2 = i12;
+                const int32_t i3 = i13;
+
+                const char * src0_row = (const char*)src0->data + (0 + i02 * nb02 + i03 * nb03);
+
+                // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
+                // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
+                // the original src1 data pointer, so we should index using the indices directly
+                const char * src1_col = (const char*)wdata +
+                    (src1_cont
+                     ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size
+                     : (i11 * nb11 + i12 * nb12 + i13 * nb13));
+                float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3));
+
+                for (int32_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) {
+                    vec_dot_f32(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0),
+                                (float*)(src0_row + ir0 * nb01), (num_rows_per_vec_dot > 1 ? nb01 : 0),
+                                (float*)src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot);
+                }
+
+                for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) {
+                    memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (MIN(iir0 + blck_0, ir0_end) - iir0) * sizeof(float));
+                }
+            }
+        }
+    }
+}
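+// Worked instance of the chunk scheduling in ggmlop_dsp_mulmat_singlethread
+// below, with assumed sizes (not taken from the patch): nr0 = 40, nr1 = 20,
+// chunk_size = 16:
+//   nchunk0 = ceil(40/16) = 3, nchunk1 = ceil(20/16) = 2  -> 6 chunks
+//   dr0 = ceil(40/3) = 14 rows, dr1 = ceil(20/2) = 10 cols per chunk
+//   chunk c covers rows [14*(c%3), min(14*(c%3) + 14, 40)) and
+//                  cols [10*(c/3), min(10*(c/3) + 10, 20)),
+//   so e.g. chunk 5 is rows [28,40) x cols [10,20).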
+static int ggmlop_dsp_mulmat_singlethread(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGMLHEXAGON_LOG_DEBUG("enter %s", __func__);
+    ggmlhexagon_dump_tensor(src0, 0);
+    ggmlhexagon_dump_tensor(src1, 0);
+    ggmlhexagon_dump_tensor(dst, 0);
+
+    dst->ne[0] = src0->ne[1];
+    dst->ne[1] = src1->ne[1];
+    dst->ne[2] = src1->ne[2];
+    dst->ne[3] = src1->ne[3];
+
+    dst->nb[0] = 4;
+    dst->nb[1] = dst->nb[0] * dst->ne[0];
+    dst->nb[2] = dst->nb[1] * dst->ne[1];
+    dst->nb[3] = dst->nb[2] * dst->ne[2];
+    ggmlhexagon_dump_tensor(dst, 0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    int32_t const vec_dot_num_rows = 1;
+
+    GGML_ASSERT(ne0 == ne01);
+    GGML_ASSERT(ne1 == ne11);
+    GGML_ASSERT(ne2 == ne12);
+    GGML_ASSERT(ne3 == ne13);
+
+    // we don't support permuted src0 or src1
+    GGML_ASSERT(nb00 == 4);
+    GGML_ASSERT(nb10 == 4);
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+#if 0 // naive algorithm for fp32, can pass various cases in UT
+    {
+        //ggml_dump_tensor(src0);
+        //ggml_dump_tensor(src1);
+
+        float * a = (float*)src0->data;
+        float * b = (float*)src1->data;
+        float * c = (float*)dst->data;
+        int M = src0->ne[1];
+        int K = src0->ne[0];
+        int N = src1->ne[1];
+        float sum = 0;
+        for (int i = 0; i < M; i++) {
+            for (int j = 0; j < N; j++) {
+                sum = 0;
+                for (int h = 0; h < K; h++) {
+                    sum += a[i * K + h] * b[h * N + j];
+                }
+                c[i * N + j] = sum;
+            }
+        }
+        return 0;
+    }
+#endif
+
+    // This is the size of the first dimension of the result, so we can iterate that way.
+    // (see the ASSERTs above, these are the same numbers)
+    const int32_t nr0 = ne0;
+
+    // This is the size of the rest of the dimensions of the result
+    const int32_t nr1 = ne1 * ne2 * ne3;
+
+    // Now select a reasonable chunk size.
+    int chunk_size = 16;
+
+    // We need to step up the size if it's small
+    if (nr0 == 1 || nr1 == 1) {
+        chunk_size = 64;
+    }
+
+    // distribute the work across the inner or outer loop based on which one is larger
+    // the number of chunks in the 0/1 dim: CEIL(nr0/chunk_size), CEIL(nr1/chunk_size)
+    int32_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
+    int32_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;
+
+    // If the chunking is poor, scrap the whole plan and process everything as one chunk.
+    // (In the multi-threaded CPU backend this re-chunks by thread, which was measured to
+    // perform better on NUMA systems; see https://github.com/ggml-org/llama.cpp/pull/6915.
+    // In theory chunking should be just as useful on NUMA and non-NUMA systems, but testing disagreed.)
+    if (nchunk0 * nchunk1 < 4) {
+        nchunk0 = 1;
+        nchunk1 = 1;
+    }
+
+    // The number of elements in each chunk
+    const int32_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
+    const int32_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;
+
+    // single-threaded port of the CPU scheduler: walk every chunk sequentially
+    int current_chunk = 0;
+
+    while (current_chunk < nchunk0 * nchunk1) {
+        const int32_t ith0 = current_chunk % nchunk0;
+        const int32_t ith1 = current_chunk / nchunk0;
+
+        const int32_t ir0_start = dr0 * ith0;
+        const int32_t ir0_end = MIN(ir0_start + dr0, nr0);
+
+        const int32_t ir1_start = dr1 * ith1;
+        const int32_t ir1_end = MIN(ir1_start + dr1, nr1);
+
+        // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
+        int32_t num_rows_per_vec_dot = vec_dot_num_rows;
+
+        // these checks are needed to avoid crossing dim1 boundaries
+        // can be optimized, but the logic would become more complicated, so keeping it like this for simplicity
+        if ((nr0 % 2 != 0) || (ne11 % 2 != 0) || ((ir0_end - ir0_start) % 2 != 0) || ((ir1_end - ir1_start) % 2 != 0)) {
+            num_rows_per_vec_dot = 1;
+        }
+        ggml_compute_forward_mul_mat_one_chunk(src0, src1, dst, src0->type, num_rows_per_vec_dot,
+                                               ir0_start, ir0_end, ir1_start, ir1_end);
+
+        if (1 >= nchunk0 * nchunk1) {
+            break;
+        }
+        current_chunk++;
+    }
+
+    GGMLHEXAGON_LOG_DEBUG("leave %s", __func__);
+    return 0;
+}
+
+// TODO: the multi-threaded path is still a stub; it logs and returns without computing anything
+static int ggmlop_dsp_mulmat_multithread(remote_handle64 h, const struct dsptensor * src0, const struct dsptensor * src1, dsptensor * dst) {
+    GGMLHEXAGON_LOG_DEBUG("enter %s", __func__);
+    GGMLHEXAGON_LOG_DEBUG("leave %s", __func__);
+    return 0;
+}
+
+int ggmlop_dsp_mulmat(remote_handle64 h, const struct dsptensor * src0, const struct dsptensor * src1, dsptensor * dst) {
+    if (ggmlop_get_thread_counts() > 1) {
+        return ggmlop_dsp_mulmat_multithread(h, src0, src1, dst);
+    } else {
+        return ggmlop_dsp_mulmat_singlethread(h, src0, src1, dst);
+    }
+}
diff --git a/ggml/src/ggml-hexagon/kernels/skel.c b/ggml/src/ggml-hexagon/kernels/skel.c
new file mode 100644
index 0000000000000..b216d66a654ab
--- /dev/null
+++ b/ggml/src/ggml-hexagon/kernels/skel.c
@@ -0,0 +1,621 @@
+//qidl copyright
+//qidl nested=false
+#include "skel.h"
+
+#include
+#ifndef _WIN32
+#include "HAP_farf.h"
+#endif //_WIN32 for HAP_farf
+#ifndef _ALLOCATOR_H
+#define _ALLOCATOR_H
+
+#include
+#include
+
+typedef struct _heap _heap;
+struct
_heap { + _heap* pPrev; + const char* loc; + uint64_t buf; +}; + +typedef struct _allocator { + _heap* pheap; + uint8_t* stack; + uint8_t* stackEnd; + int nSize; +} _allocator; + +_ATTRIBUTE_UNUSED +static __inline int _heap_alloc(_heap** ppa, const char* loc, int size, void** ppbuf) { + _heap* pn = 0; + pn = MALLOC((size_t)size + sizeof(_heap) - sizeof(uint64_t)); + if(pn != 0) { + pn->pPrev = *ppa; + pn->loc = loc; + *ppa = pn; + *ppbuf = (void*)&(pn->buf); + return 0; + } else { + return -1; + } +} +#define _ALIGN_SIZE(x, y) (((x) + (y-1)) & ~(y-1)) + +_ATTRIBUTE_UNUSED +static __inline int _allocator_alloc(_allocator* me, + const char* loc, + int size, + unsigned int al, + void** ppbuf) { + if(size < 0) { + return -1; + } else if (size == 0) { + *ppbuf = 0; + return 0; + } + if((_ALIGN_SIZE((uintptr_t)me->stackEnd, al) + (size_t)size) < (uintptr_t)me->stack + (size_t)me->nSize) { + *ppbuf = (uint8_t*)_ALIGN_SIZE((uintptr_t)me->stackEnd, al); + me->stackEnd = (uint8_t*)_ALIGN_SIZE((uintptr_t)me->stackEnd, al) + size; + return 0; + } else { + return _heap_alloc(&me->pheap, loc, size, ppbuf); + } +} + +_ATTRIBUTE_UNUSED +static __inline void _allocator_deinit(_allocator* me) { + _heap* pa = me->pheap; + while(pa != 0) { + _heap* pn = pa; + const char* loc = pn->loc; + (void)loc; + pa = pn->pPrev; + FREE(pn); + } +} + +_ATTRIBUTE_UNUSED +static __inline void _allocator_init(_allocator* me, uint8_t* stack, int stackSize) { + me->stack = stack; + me->stackEnd = stack + stackSize; + me->nSize = stackSize; + me->pheap = 0; +} + + +#endif // _ALLOCATOR_H + +#ifndef SLIM_H +#define SLIM_H + +#include + +//a C data structure for the idl types that can be used to implement +//static and dynamic language bindings fairly efficiently. +// +//the goal is to have a minimal ROM and RAM footprint and without +//doing too many allocations. A good way to package these things seemed +//like the module boundary, so all the idls within one module can share +//all the type references. + + +#define PARAMETER_IN 0x0 +#define PARAMETER_OUT 0x1 +#define PARAMETER_INOUT 0x2 +#define PARAMETER_ROUT 0x3 +#define PARAMETER_INROUT 0x4 + +//the types that we get from idl +#define TYPE_OBJECT 0x0 +#define TYPE_INTERFACE 0x1 +#define TYPE_PRIMITIVE 0x2 +#define TYPE_ENUM 0x3 +#define TYPE_STRING 0x4 +#define TYPE_WSTRING 0x5 +#define TYPE_STRUCTURE 0x6 +#define TYPE_UNION 0x7 +#define TYPE_ARRAY 0x8 +#define TYPE_SEQUENCE 0x9 + +//these require the pack/unpack to recurse +//so it's a hint to those languages that can optimize in cases where +//recursion isn't necessary. 
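+//for example, an array of structures is tagged TYPE_COMPLEX_ARRAY =
+//(0x10 | TYPE_ARRAY) below, so a marshaller only has to test the 0x10 bit
+//to decide whether it must recurse into the element type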
+#define TYPE_COMPLEX_STRUCTURE (0x10 | TYPE_STRUCTURE) +#define TYPE_COMPLEX_UNION (0x10 | TYPE_UNION) +#define TYPE_COMPLEX_ARRAY (0x10 | TYPE_ARRAY) +#define TYPE_COMPLEX_SEQUENCE (0x10 | TYPE_SEQUENCE) + + +typedef struct Type Type; + +#define INHERIT_TYPE\ + int32_t nativeSize; /*in the simple case its the same as wire size and alignment*/\ + union {\ + struct {\ + const uintptr_t p1;\ + const uintptr_t p2;\ + } _cast;\ + struct {\ + uint32_t iid;\ + uint32_t bNotNil;\ + } object;\ + struct {\ + const Type *arrayType;\ + int32_t nItems;\ + } array;\ + struct {\ + const Type *seqType;\ + int32_t nMaxLen;\ + } seqSimple; \ + struct {\ + uint32_t bFloating;\ + uint32_t bSigned;\ + } prim; \ + const SequenceType* seqComplex;\ + const UnionType *unionType;\ + const StructType *structType;\ + int32_t stringMaxLen;\ + uint8_t bInterfaceNotNil;\ + } param;\ + uint8_t type;\ + uint8_t nativeAlignment\ + +typedef struct UnionType UnionType; +typedef struct StructType StructType; +typedef struct SequenceType SequenceType; +struct Type { + INHERIT_TYPE; +}; + +struct SequenceType { + const Type * seqType; + uint32_t nMaxLen; + uint32_t inSize; + uint32_t routSizePrimIn; + uint32_t routSizePrimROut; +}; + +//byte offset from the start of the case values for +//this unions case value array. it MUST be aligned +//at the alignment requrements for the descriptor +// +//if negative it means that the unions cases are +//simple enumerators, so the value read from the descriptor +//can be used directly to find the correct case +typedef union CaseValuePtr CaseValuePtr; +union CaseValuePtr { + const uint8_t* value8s; + const uint16_t* value16s; + const uint32_t* value32s; + const uint64_t* value64s; +}; + +//these are only used in complex cases +//so I pulled them out of the type definition as references to make +//the type smaller +struct UnionType { + const Type *descriptor; + uint32_t nCases; + const CaseValuePtr caseValues; + const Type * const *cases; + int32_t inSize; + int32_t routSizePrimIn; + int32_t routSizePrimROut; + uint8_t inAlignment; + uint8_t routAlignmentPrimIn; + uint8_t routAlignmentPrimROut; + uint8_t inCaseAlignment; + uint8_t routCaseAlignmentPrimIn; + uint8_t routCaseAlignmentPrimROut; + uint8_t nativeCaseAlignment; + uint8_t bDefaultCase; +}; + +struct StructType { + uint32_t nMembers; + const Type * const *members; + int32_t inSize; + int32_t routSizePrimIn; + int32_t routSizePrimROut; + uint8_t inAlignment; + uint8_t routAlignmentPrimIn; + uint8_t routAlignmentPrimROut; +}; + +typedef struct Parameter Parameter; +struct Parameter { + INHERIT_TYPE; + uint8_t mode; + uint8_t bNotNil; +}; + +#define SLIM_IFPTR32(is32,is64) (sizeof(uintptr_t) == 4 ? 
(is32) : (is64)) +#define SLIM_SCALARS_IS_DYNAMIC(u) (((u) & 0x00ffffff) == 0x00ffffff) + +typedef struct Method Method; +struct Method { + uint32_t uScalars; //no method index + int32_t primInSize; + int32_t primROutSize; + int maxArgs; + int numParams; + const Parameter * const *params; + uint8_t primInAlignment; + uint8_t primROutAlignment; +}; + +typedef struct Interface Interface; + +struct Interface { + int nMethods; + const Method * const *methodArray; + int nIIds; + const uint32_t *iids; + const uint16_t* methodStringArray; + const uint16_t* methodStrings; + const char* strings; +}; + + +#endif //SLIM_H + + +#ifndef _GGMLOP_SLIM_H +#define _GGMLOP_SLIM_H +#include + +#ifndef __QAIC_SLIM +#define __QAIC_SLIM(ff) ff +#endif +#ifndef __QAIC_SLIM_EXPORT +#define __QAIC_SLIM_EXPORT +#endif + +static const Type types[5]; +static const Type* const typeArrays[7] = {&(types[0]),&(types[1]),&(types[1]),&(types[0]),&(types[2]),&(types[0]),&(types[3])}; +static const StructType structTypes[1] = {{0x7,&(typeArrays[0]),0x70,0x4,0x6c,0x4,0x4,0x4}}; +static const Type types[5] = {{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4},{0x10,{{(const uintptr_t)&(types[0]),(const uintptr_t)0x4}}, 8,0x4},{0x40,{{(const uintptr_t)&(types[0]),(const uintptr_t)0x10}}, 8,0x4},{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)&(types[4]),(const uintptr_t)0x0}}, 9,SLIM_IFPTR32(0x4,0x8)},{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4}}; +static const Parameter parameters[6] = {{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)0x0,0}}, 4,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),3,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),0,0},{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4,0,0},{SLIM_IFPTR32(0x74,0x80),{{(const uintptr_t)&(structTypes[0]),0}}, 22,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x74,0x80),{{(const uintptr_t)&(structTypes[0]),0}}, 22,SLIM_IFPTR32(0x4,0x8),3,0}}; +static const Parameter* const parameterArrays[9] = {(&(parameters[4])),(&(parameters[4])),(&(parameters[5])),(&(parameters[3])),(&(parameters[3])),(&(parameters[3])),(&(parameters[0])),(&(parameters[1])),(&(parameters[2]))}; +static const Method methods[4] = {{REMOTE_SCALARS_MAKEX(0,0,0x2,0x0,0x0,0x1),0x4,0x0,2,2,(&(parameterArrays[6])),0x4,0x1},{REMOTE_SCALARS_MAKEX(0,0,0x0,0x0,0x1,0x0),0x0,0x0,1,1,(&(parameterArrays[8])),0x1,0x0},{REMOTE_SCALARS_MAKEX(0,0,0x1,0x0,0x0,0x0),0xc,0x0,3,3,(&(parameterArrays[3])),0x4,0x0},{REMOTE_SCALARS_MAKEX(0,0,0x3,0x2,0x0,0x0),0xe4,0x6c,3,3,(&(parameterArrays[0])),0x4,0x4}}; +static const Method* const methodArrays[8] = {&(methods[0]),&(methods[1]),&(methods[2]),&(methods[3]),&(methods[3]),&(methods[3]),&(methods[3]),&(methods[3])}; +static const char strings[167] = "dsp_setclocks\0dsp_rmsnorm\0dsp_softmax\0dcvs_enable\0power_level\0dsp_pool2d\0dsp_mulmat\0op_params\0dsp_add\0latency\0flags\0close\0src1\0data\0type\0src0\0open\0dst\0uri\0op\0nb\0ne\0h\0"; +static const uint16_t methodStrings[134] = 
{62,137,132,161,158,155,84,110,127,122,132,161,158,155,84,110,127,147,132,161,158,155,84,110,127,14,137,132,161,158,155,84,110,127,122,132,161,158,155,84,110,127,147,132,161,158,155,84,110,127,26,137,132,161,158,155,84,110,127,122,132,161,158,155,84,110,127,147,132,161,158,155,84,110,127,73,137,132,161,158,155,84,110,127,122,132,161,158,155,84,110,127,147,132,161,158,155,84,110,127,94,137,132,161,158,155,84,110,127,122,132,161,158,155,84,110,127,147,132,161,158,155,84,110,127,0,50,102,38,142,151,164,116,164}; +static const uint16_t methodStringsArrays[8] = {129,132,125,100,75,50,25,0}; +__QAIC_SLIM_EXPORT const Interface __QAIC_SLIM(ggmlop_slim) = {8,&(methodArrays[0]),0,0,&(methodStringsArrays [0]),methodStrings,strings}; +#endif //_GGMLOP_SLIM_H +extern int adsp_mmap_fd_getinfo(int, uint32_t *); +#ifdef __cplusplus +extern "C" { +#endif +_ATTRIBUTE_VISIBILITY uint32_t ggmldsp_skel_handle_invoke_qaic_version = 10048; +_ATTRIBUTE_VISIBILITY char ggmldsp_skel_handle_invoke_uri[79+1]="file:///libggmldsp-skel.so?ggmldsp_skel_handle_invoke&_modver=1.0&_idlver=0.0.1"; +static __inline int _skel_pack(_ATTRIBUTE_UNUSED remote_arg* _praROutPost, _ATTRIBUTE_UNUSED remote_arg* _ppraROutPost[1], _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[16], _ATTRIBUTE_UNUSED uint32_t _rout5[1], _ATTRIBUTE_UNUSED char* _rout6[1], _ATTRIBUTE_UNUSED uint32_t _rout6Len[1]) { + int _nErr = 0; + remote_arg* _praROutPostStart = _praROutPost; + remote_arg** _ppraROutPostStart = _ppraROutPost; + _ppraROutPost = &_praROutPost; + _COPY(_primROut, 0, _rout0, 0, 4); + _COPY(_primROut, 4, _rout1, 0, 16); + _COPY(_primROut, 20, _rout2, 0, 16); + _COPY(_primROut, 36, _rout3, 0, 4); + _COPY(_primROut, 40, _rout4, 0, 64); + _COPY(_primROut, 104, _rout5, 0, 4); + _ppraROutPostStart[0] += (_praROutPost - _praROutPostStart) +1; + return _nErr; +} +static __inline int _skel_unpack(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNUSED remote_arg* _praIn, _ATTRIBUTE_UNUSED remote_arg* _ppraIn[1], _ATTRIBUTE_UNUSED remote_arg* _praROut, _ATTRIBUTE_UNUSED remote_arg* _ppraROut[1], _ATTRIBUTE_UNUSED remote_arg* _praHIn, _ATTRIBUTE_UNUSED remote_arg* _ppraHIn[1], _ATTRIBUTE_UNUSED remote_arg* _praHROut, _ATTRIBUTE_UNUSED remote_arg* _ppraHROut[1], _ATTRIBUTE_UNUSED void* _primIn, _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[16], _ATTRIBUTE_UNUSED uint32_t _rout5[1], _ATTRIBUTE_UNUSED char* _rout6[1], _ATTRIBUTE_UNUSED uint32_t _rout6Len[1]) { + int _nErr = 0; + remote_arg* _praInStart = _praIn; + remote_arg** _ppraInStart = _ppraIn; + remote_arg* _praROutStart = _praROut; + remote_arg** _ppraROutStart = _ppraROut; + _ppraIn = &_praIn; + _ppraROut = &_praROut; + _COPY(_rout6Len, 0, _primIn, 0, 4); + _QAIC_ASSERT(_nErr, ((_praROut[0].buf.nLen / 4)) >= (size_t)(_rout6Len[0])); + _rout6[0] = _praROut[0].buf.pv; + _ppraInStart[0] += (_praIn - _praInStart) + 0; + _ppraROutStart[0] += (_praROut - _praROutStart) +1; + _QAIC_CATCH(_nErr) {} + return _nErr; +} +static __inline int _skel_unpack_1(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNUSED remote_arg* _praIn, _ATTRIBUTE_UNUSED remote_arg* _ppraIn[1], _ATTRIBUTE_UNUSED remote_arg* _praROut, _ATTRIBUTE_UNUSED remote_arg* _ppraROut[1], 
_ATTRIBUTE_UNUSED remote_arg* _praHIn, _ATTRIBUTE_UNUSED remote_arg* _ppraHIn[1], _ATTRIBUTE_UNUSED remote_arg* _praHROut, _ATTRIBUTE_UNUSED remote_arg* _ppraHROut[1], _ATTRIBUTE_UNUSED void* _primIn, _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _in0[1], _ATTRIBUTE_UNUSED uint32_t _in1[4], _ATTRIBUTE_UNUSED uint32_t _in2[4], _ATTRIBUTE_UNUSED uint32_t _in3[1], _ATTRIBUTE_UNUSED uint32_t _in4[16], _ATTRIBUTE_UNUSED uint32_t _in5[1], _ATTRIBUTE_UNUSED char* _in6[1], _ATTRIBUTE_UNUSED uint32_t _in6Len[1]) { + int _nErr = 0; + remote_arg* _praInStart = _praIn; + remote_arg** _ppraInStart = _ppraIn; + remote_arg* _praROutStart = _praROut; + remote_arg** _ppraROutStart = _ppraROut; + _ppraIn = &_praIn; + _ppraROut = &_praROut; + _COPY(_in0, 0, _primIn, 0, 4); + _COPY(_in1, 0, _primIn, 4, 16); + _COPY(_in2, 0, _primIn, 20, 16); + _COPY(_in3, 0, _primIn, 36, 4); + _COPY(_in4, 0, _primIn, 40, 64); + _COPY(_in5, 0, _primIn, 104, 4); + _COPY(_in6Len, 0, _primIn, 108, 4); + _QAIC_ASSERT(_nErr, ((_praIn[0].buf.nLen / 4)) >= (size_t)(_in6Len[0])); + _in6[0] = _praIn[0].buf.pv; + _ppraInStart[0] += (_praIn - _praInStart) + 1; + _ppraROutStart[0] += (_praROut - _praROutStart) +0; + _QAIC_CATCH(_nErr) {} + return _nErr; +} +static __inline int _skel_method(int (*_pfn)(remote_handle64, const dsptensor*, const dsptensor*, dsptensor*), remote_handle64 _h, uint32_t _sc, remote_arg* _pra) { + remote_arg* _praEnd = 0; + uintptr_t _in0[SLIM_IFPTR32(29, 16)] = {0}; + uintptr_t _in1[SLIM_IFPTR32(29, 16)] = {0}; + uintptr_t _rout2[SLIM_IFPTR32(29, 16)] = {0}; + uint32_t* _primIn= 0; + int _numIn[1] = {0}; + uint32_t* _primROut= 0; + int _numInH[1] = {0}; + int _numROut[1] = {0}; + remote_arg* _praIn = 0; + remote_arg* _praROut = 0; + remote_arg* _praROutPost = 0; + remote_arg** _ppraROutPost = &_praROutPost; + _allocator _al[1] = {{0}}; + remote_arg** _ppraIn = &_praIn; + remote_arg** _ppraROut = &_praROut; + remote_arg* _praHIn = 0; + remote_arg** _ppraHIn = &_praHIn; + remote_arg* _praHROut = 0; + remote_arg** _ppraHROut = &_praHROut; + int _nErr = 0; + _praEnd = ((_pra + REMOTE_SCALARS_INBUFS(_sc)) + REMOTE_SCALARS_OUTBUFS(_sc) + REMOTE_SCALARS_INHANDLES(_sc) + REMOTE_SCALARS_OUTHANDLES(_sc)); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_INBUFS(_sc)>=1); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_OUTBUFS(_sc)>=1); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_INHANDLES(_sc)==0); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_OUTHANDLES(_sc)==0); + _QAIC_ASSERT(_nErr, (_pra + ((1 + 1) + (((0 + 0) + 0) + 0))) <= _praEnd); + _numIn[0] = (REMOTE_SCALARS_INBUFS(_sc) - 1); + _QAIC_ASSERT(_nErr, _pra[0].buf.nLen >= 228); + _primIn = _pra[0].buf.pv; + _QAIC_ASSERT(_nErr, _pra[(_numIn[0] + 1)].buf.nLen >= 108); + _primROut = _pra[(_numIn[0] + 1)].buf.pv; + _numInH[0] = REMOTE_SCALARS_INHANDLES(_sc); + _numROut[0] = REMOTE_SCALARS_OUTBUFS(_sc); + _praIn = (_pra + 1); + _praROut = (_praIn + _numIn[0] + 1); + _praROutPost = _praROut; + _allocator_init(_al, 0, 0); + if(_praHIn == 0) + { + _praHIn = ((_praROut + _numROut[0]) + 1); + } + if(_praHROut == 0) + (_praHROut = _praHIn + _numInH[0] + 0); + _TRY(_nErr, _skel_unpack_1(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 0), 0, (uint32_t*)&(((uint32_t*)_in0)[0]), (uint32_t*)&(((uint32_t*)_in0)[1]), (uint32_t*)&(((uint32_t*)_in0)[5]), (uint32_t*)&(((uint32_t*)_in0)[9]), (uint32_t*)&(((uint32_t*)_in0)[10]), (uint32_t*)&(((uint32_t*)_in0)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_in0)[27]), (char**)&(((uint64_t*)_in0)[14])), 
SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in0)[28]), (uint32_t*)&(((uint32_t*)_in0)[30])))); + _TRY(_nErr, _skel_unpack_1(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 112), 0, (uint32_t*)&(((uint32_t*)_in1)[0]), (uint32_t*)&(((uint32_t*)_in1)[1]), (uint32_t*)&(((uint32_t*)_in1)[5]), (uint32_t*)&(((uint32_t*)_in1)[9]), (uint32_t*)&(((uint32_t*)_in1)[10]), (uint32_t*)&(((uint32_t*)_in1)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_in1)[27]), (char**)&(((uint64_t*)_in1)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in1)[28]), (uint32_t*)&(((uint32_t*)_in1)[30])))); + _TRY(_nErr, _skel_unpack(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 224), ((char*)_primROut + 0), (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint32_t*)&(((uint32_t*)_rout2)[1]), (uint32_t*)&(((uint32_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[9]), (uint32_t*)&(((uint32_t*)_rout2)[10]), (uint32_t*)&(((uint32_t*)_rout2)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[27]), (char**)&(((uint64_t*)_rout2)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[28]), (uint32_t*)&(((uint32_t*)_rout2)[30])))); + _TRY(_nErr, _pfn(_h, (const dsptensor*)_in0, (const dsptensor*)_in1, (dsptensor*)_rout2)); + _TRY(_nErr, _skel_pack((_praROutPost + 0), _ppraROutPost, ((char*)_primROut + 0), (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint32_t*)&(((uint32_t*)_rout2)[1]), (uint32_t*)&(((uint32_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[9]), (uint32_t*)&(((uint32_t*)_rout2)[10]), (uint32_t*)&(((uint32_t*)_rout2)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[27]), (char**)&(((uint64_t*)_rout2)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[28]), (uint32_t*)&(((uint32_t*)_rout2)[30])))); + _QAIC_CATCH(_nErr) {} + _allocator_deinit(_al); + return _nErr; +} +static __inline int _skel_method_1(int (*_pfn)(remote_handle64, int32, int32, int32, int32), remote_handle64 _h, uint32_t _sc, remote_arg* _pra) { + remote_arg* _praEnd = 0; + uint32_t _in0[1] = {0}; + uint32_t _in1[1] = {0}; + uint32_t _in2[1] = {0}; + uint32_t _in3[1] = {0}; + uint32_t* _primIn= 0; + int _nErr = 0; + _praEnd = ((_pra + REMOTE_SCALARS_INBUFS(_sc)) + REMOTE_SCALARS_OUTBUFS(_sc) + REMOTE_SCALARS_INHANDLES(_sc) + REMOTE_SCALARS_OUTHANDLES(_sc)); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_INBUFS(_sc)==1); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_OUTBUFS(_sc)==0); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_INHANDLES(_sc)==0); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_OUTHANDLES(_sc)==0); + _QAIC_ASSERT(_nErr, (_pra + ((1 + 0) + (((0 + 0) + 0) + 0))) <= _praEnd); + _QAIC_ASSERT(_nErr, _pra[0].buf.nLen >= 12); + _primIn = _pra[0].buf.pv; + _COPY(_in0, 0, _primIn, 0, 4); + _COPY(_in1, 0, _primIn, 4, 4); + _COPY(_in2, 0, _primIn, 8, 4); + _COPY(_in3, 0, _primIn, 12, 4); + _TRY(_nErr, _pfn(_h, (int32)*_in0, (int32)*_in1, (int32)*_in2, (int32)*_in3)); + _QAIC_CATCH(_nErr) {} + return _nErr; +} +static __inline int _skel_method_2(int (*_pfn)(remote_handle64), uint32_t _sc, remote_arg* _pra) { + remote_arg* _praEnd = 0; + remote_handle64 _in0[1] = {0}; + remote_arg* _praRHandleIn = _pra + REMOTE_SCALARS_INBUFS(_sc) + REMOTE_SCALARS_OUTBUFS(_sc); + int _nErr = 0; + _praEnd = ((_pra + REMOTE_SCALARS_INBUFS(_sc)) + REMOTE_SCALARS_OUTBUFS(_sc) + REMOTE_SCALARS_INHANDLES(_sc) + REMOTE_SCALARS_OUTHANDLES(_sc)); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_INBUFS(_sc)==0); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_OUTBUFS(_sc)==0); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_INHANDLES(_sc)==1); + 
_QAIC_ASSERT(_nErr, REMOTE_SCALARS_OUTHANDLES(_sc)==0);
+ _QAIC_ASSERT(_nErr, (_pra + ((0 + 0) + (((1 + 0) + 0) + 0))) <= _praEnd);
+ _COPY(_in0, 0, &(_praRHandleIn[0].h64), 0, sizeof(remote_handle64));
+ _TRY(_nErr, _pfn((remote_handle64)*_in0));
+ _QAIC_CATCH(_nErr) {}
+ return _nErr;
+}
+// parses "major.minor.patch" from both strings; *result is 1 when the stub
+// version is not newer than the skel version, and -1 otherwise
+static __inline int _compare_versions(char* stub_ver, char* skel_ver, int* result) {
+ unsigned long int major_stub = 0, minor_stub = 0, patch_stub = 0;
+ unsigned long int major_skel = 0, minor_skel = 0, patch_skel = 0;
+ char *saveptr1 = NULL;
+ char *token1 = NULL;
+ char *saveptr2 = NULL;
+ char *token2 = NULL;
+ int i = 0;
+ for (i=0, token1 = strtok_r(stub_ver, ".", &saveptr1); i<3 && token1 != NULL; i++, token1 = strtok_r(NULL, ".", &saveptr1))
+ {
+  unsigned long int tn = strtoul(token1, NULL, 10);
+  if (tn > 999)
+  {
+   *result = -1;
+   return 0;
+  }
+  else
+  {
+   if(i==0) major_stub = tn;
+   if(i==1) minor_stub = tn;
+   if(i==2) patch_stub = tn;
+  }
+ }
+ for (i=0, token2 = strtok_r(skel_ver, ".", &saveptr2); i<3 && token2 != NULL; i++, token2 = strtok_r(NULL, ".", &saveptr2))
+ {
+  unsigned long int tn = strtoul(token2, NULL, 10);
+  if (tn > 999)
+  {
+   *result = -1;
+   return 0;
+  }
+  else
+  {
+   if(i==0) major_skel = tn;
+   if(i==1) minor_skel = tn;
+   if(i==2) patch_skel = tn;
+  }
+ }
+ if(major_stub < major_skel)
+ {
+  *result = 1;
+  return 0;
+ }
+ else if(major_stub == major_skel)
+ {
+  if((minor_stub < minor_skel) || (minor_stub == minor_skel && patch_skel >= patch_stub))
+  {
+   *result = 1;
+   return 0;
+  }
+ }
+ *result = -1;
+ return 0;
+}
+static __inline int _stub_skel_version_check(char* _in0, int* resVal) {
+ int _nErr = 0;
+ char* p = strstr(_in0, "_idlver=");
+ if(!p)
+ {
+  *resVal = -1;
+  return 0;
+ }
+ p += 8;
+ int i=0, len=0, comVer=0, num_delimit=0, updtInxStub=0, updtInxSkel=0;
+ for(i=0; i<strlen(p); i++)
+ {
+  if(num_delimit > 2)
+  {
+   *resVal = -1;
+   return 0;
+  }
+  if ((p[i]>='0' && p[i]<='9') || (p[i]=='.'))
+  {
+   len++;
+   if(p[i]=='.')
+   {
+    num_delimit++;
+   }
+  }
+  else if(p[i]=='&')
+  {
+   break;
+  }
+  else
+  {
+   *resVal = -1;
+   return 0;
+  }
+ }
+ char* stubVer = (char*)MALLOC(len+1);
+ _QAIC_ASSERT(_nErr, stubVer!=NULL);
+ for(i=0; i<len; i++)
+ {
+  if ((p[i]>='0' && p[i]<='9') || (p[i]=='.'))
+  {
+   stubVer[updtInxStub] = p[i];
+   updtInxStub++;
+  }
+  else if(p[i]=='&')
+  {
+   break;
+  }
+ }
+ stubVer[len] = '\0';
+ char* skelVer = (char*)MALLOC(strlen(IDL_VERSION)+1);
+ _QAIC_ASSERT(_nErr, skelVer!=NULL);
+ for(i=0; i< strlen(IDL_VERSION); i++)
+ {
+  skelVer[updtInxSkel] = IDL_VERSION[i];
+  updtInxSkel++;
+ }
+ skelVer[strlen(IDL_VERSION)] = '\0';
+ _TRY(_nErr, _compare_versions(stubVer, skelVer, &comVer));
+ *resVal = 0;
+ if (comVer==-1)
+ {
+  *resVal = -1;
+ }
+ FREE(stubVer);
+ FREE(skelVer);
+ _QAIC_CATCH(_nErr) {}
+ return 0;
+}
+static __inline int _skel_method_3(int (*_pfn)(const char*, remote_handle64*), uint32_t _sc, remote_arg* _pra) {
+ remote_arg* _praEnd = 0;
+ char* _in0[1] = {0};
+ uint32_t _in0Len[1] = {0};
+ remote_handle64 _rout1[1] = {0};
+ uint32_t* _primIn = 0;
+ remote_arg* _praRHandleROut = _pra + REMOTE_SCALARS_INBUFS(_sc) + REMOTE_SCALARS_OUTBUFS(_sc) + REMOTE_SCALARS_INHANDLES(_sc);
+ remote_arg* _praIn = 0;
+ int _nErr = 0;
+ _praEnd = ((_pra + REMOTE_SCALARS_INBUFS(_sc)) + REMOTE_SCALARS_OUTBUFS(_sc) + REMOTE_SCALARS_INHANDLES(_sc) + REMOTE_SCALARS_OUTHANDLES(_sc));
+ _QAIC_ASSERT(_nErr, REMOTE_SCALARS_INBUFS(_sc)==2);
+ _QAIC_ASSERT(_nErr, REMOTE_SCALARS_OUTBUFS(_sc)==0);
+ _QAIC_ASSERT(_nErr, REMOTE_SCALARS_INHANDLES(_sc)==0);
+ _QAIC_ASSERT(_nErr, REMOTE_SCALARS_OUTHANDLES(_sc)==1);
+ _QAIC_ASSERT(_nErr, (_pra + ((2 + 0) + (((0 + 1) + 0) + 0))) <= _praEnd);
+ _QAIC_ASSERT(_nErr, _pra[0].buf.nLen >= 4);
+ _primIn = _pra[0].buf.pv;
+ _COPY(_in0Len, 0, _primIn, 0, 4);
+ _praIn = (_pra + 1);
+ _QAIC_ASSERT(_nErr,
((_praIn[0].buf.nLen / 1)) >= (size_t)(_in0Len[0])); + _in0[0] = _praIn[0].buf.pv; + _QAIC_ASSERT(_nErr, (_in0Len[0] > 0) && (_in0[0][(_in0Len[0] - 1)] == 0)); + int resVal; + _TRY(_nErr, _stub_skel_version_check(*_in0, &resVal)); + if(resVal==-1) + { + return AEE_ESTUBSKELVERMISMATCH; + } + _TRY(_nErr, _pfn((const char*)*_in0, (remote_handle64*)_rout1)); + _COPY(&(_praRHandleROut[0].h64), 0, _rout1, 0, sizeof(remote_handle64)); + _QAIC_CATCH(_nErr) {} + return _nErr; +} +__QAIC_SKEL_EXPORT int __QAIC_SKEL(ggmldsp_skel_handle_invoke)(remote_handle64 _h, uint32_t _sc, remote_arg* _pra) __QAIC_SKEL_ATTRIBUTE { + switch(REMOTE_SCALARS_METHOD(_sc)){ + case 0: + return _skel_method_3(__QAIC_IMPL(ggmlop_dsp_open), _sc, _pra); + case 1: + return _skel_method_2(__QAIC_IMPL(ggmlop_dsp_close), _sc, _pra); + case 2: + return _skel_method_1(__QAIC_IMPL(ggmlop_dsp_setclocks), _h, _sc, _pra); + case 3: + return _skel_method(__QAIC_IMPL(ggmlop_dsp_add), _h, _sc, _pra); + case 4: + return _skel_method(__QAIC_IMPL(ggmlop_dsp_mulmat), _h, _sc, _pra); + case 5: + return _skel_method(__QAIC_IMPL(ggmlop_dsp_softmax), _h, _sc, _pra); + case 6: + return _skel_method(__QAIC_IMPL(ggmlop_dsp_rmsnorm), _h, _sc, _pra); + case 7: + return _skel_method(__QAIC_IMPL(ggmlop_dsp_pool2d), _h, _sc, _pra); + } + return AEE_EUNSUPPORTED; +} diff --git a/ggml/src/ggml-hexagon/kernels/skel.h b/ggml/src/ggml-hexagon/kernels/skel.h new file mode 100644 index 0000000000000..f77e8101d14df --- /dev/null +++ b/ggml/src/ggml-hexagon/kernels/skel.h @@ -0,0 +1,285 @@ +#ifndef _SKEL_H +#define _SKEL_H +//qidl copyright +//qidl nested=false +#include +#include +#include +#include + + +#ifndef __QAIC_HEADER +#define __QAIC_HEADER(ff) ff +#endif //__QAIC_HEADER + +#ifndef __QAIC_HEADER_EXPORT +#define __QAIC_HEADER_EXPORT +#endif // __QAIC_HEADER_EXPORT + +#ifndef __QAIC_HEADER_ATTRIBUTE +#define __QAIC_HEADER_ATTRIBUTE +#endif // __QAIC_HEADER_ATTRIBUTE + +#ifndef __QAIC_IMPL +#define __QAIC_IMPL(ff) ff +#endif //__QAIC_IMPL + +#ifndef __QAIC_IMPL_EXPORT +#define __QAIC_IMPL_EXPORT +#endif // __QAIC_IMPL_EXPORT + +#ifndef __QAIC_IMPL_ATTRIBUTE +#define __QAIC_IMPL_ATTRIBUTE +#endif // __QAIC_IMPL_ATTRIBUTE +#ifndef _QAIC_ENV_H +#define _QAIC_ENV_H + +#include +#ifdef _WIN32 +#include "qtest_stdlib.h" +#else +#define MALLOC malloc +#define FREE free +#endif + +#ifdef __GNUC__ +#ifdef __clang__ +#pragma GCC diagnostic ignored "-Wunknown-pragmas" +#else +#pragma GCC diagnostic ignored "-Wpragmas" +#endif +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wunused-function" +#endif + +#ifndef _ATTRIBUTE_UNUSED + +#ifdef _WIN32 +#define _ATTRIBUTE_UNUSED +#else +#define _ATTRIBUTE_UNUSED __attribute__ ((unused)) +#endif + +#endif // _ATTRIBUTE_UNUSED + +#ifndef _ATTRIBUTE_VISIBILITY + +#ifdef _WIN32 +#define _ATTRIBUTE_VISIBILITY +#else +#define _ATTRIBUTE_VISIBILITY __attribute__ ((visibility("default"))) +#endif + +#endif // _ATTRIBUTE_VISIBILITY + +#ifndef __QAIC_REMOTE +#define __QAIC_REMOTE(ff) ff +#endif //__QAIC_REMOTE + +#ifndef __QAIC_HEADER +#define __QAIC_HEADER(ff) ff +#endif //__QAIC_HEADER + +#ifndef __QAIC_HEADER_EXPORT +#define __QAIC_HEADER_EXPORT +#endif // __QAIC_HEADER_EXPORT + +#ifndef __QAIC_HEADER_ATTRIBUTE +#define __QAIC_HEADER_ATTRIBUTE +#endif // __QAIC_HEADER_ATTRIBUTE + +#ifndef __QAIC_IMPL +#define __QAIC_IMPL(ff) ff +#endif //__QAIC_IMPL + +#ifndef __QAIC_IMPL_EXPORT +#define __QAIC_IMPL_EXPORT +#endif // __QAIC_IMPL_EXPORT + 
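+// the __QAIC_* wrappers above and below are build-time customization hooks:
+// predefining one renames every matching generated symbol without editing
+// this file, e.g. (hypothetical prefix) compiling skel.c with
+// -D'__QAIC_IMPL(ff)=my_##ff' makes the skel dispatcher call
+// my_ggmlop_dsp_open() and friends instead of ggmlop_dsp_open()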
+#ifndef __QAIC_IMPL_ATTRIBUTE +#define __QAIC_IMPL_ATTRIBUTE +#endif // __QAIC_IMPL_ATTRIBUTE + +#ifndef __QAIC_STUB +#define __QAIC_STUB(ff) ff +#endif //__QAIC_STUB + +#ifndef __QAIC_STUB_EXPORT +#define __QAIC_STUB_EXPORT +#endif // __QAIC_STUB_EXPORT + +#ifndef __QAIC_STUB_ATTRIBUTE +#define __QAIC_STUB_ATTRIBUTE +#endif // __QAIC_STUB_ATTRIBUTE + +#ifndef __QAIC_SKEL +#define __QAIC_SKEL(ff) ff +#endif //__QAIC_SKEL__ + +#ifndef __QAIC_SKEL_EXPORT +#define __QAIC_SKEL_EXPORT +#endif // __QAIC_SKEL_EXPORT + +#ifndef __QAIC_SKEL_ATTRIBUTE +#define __QAIC_SKEL_ATTRIBUTE +#endif // __QAIC_SKEL_ATTRIBUTE + +#ifdef __QAIC_DEBUG__ + #ifndef __QAIC_DBG_PRINTF__ + #include + #define __QAIC_DBG_PRINTF__( ee ) do { printf ee ; } while(0) + #endif +#else + #define __QAIC_DBG_PRINTF__( ee ) (void)0 +#endif + + +#define _OFFSET(src, sof) ((void*)(((char*)(src)) + (sof))) + +#define _COPY(dst, dof, src, sof, sz) \ + do {\ + struct __copy { \ + char ar[sz]; \ + };\ + *(struct __copy*)_OFFSET(dst, dof) = *(struct __copy*)_OFFSET(src, sof);\ + } while (0) + +#define _COPYIF(dst, dof, src, sof, sz) \ + do {\ + if(_OFFSET(dst, dof) != _OFFSET(src, sof)) {\ + _COPY(dst, dof, src, sof, sz); \ + } \ + } while (0) + +_ATTRIBUTE_UNUSED +static __inline void _qaic_memmove(void* dst, void* src, int size) { + int i = 0; + for(i = 0; i < size; ++i) { + ((char*)dst)[i] = ((char*)src)[i]; + } +} + +#define _MEMMOVEIF(dst, src, sz) \ + do {\ + if(dst != src) {\ + _qaic_memmove(dst, src, sz);\ + } \ + } while (0) + + +#define _ASSIGN(dst, src, sof) \ + do {\ + dst = OFFSET(src, sof); \ + } while (0) + +#define _STD_STRLEN_IF(str) (str == 0 ? 0 : strlen(str)) + +#include "AEEStdErr.h" + +#ifdef _WIN32 +#define _QAIC_FARF(level, msg, ...) (void)0 +#else +#define _QAIC_FARF(level, msg, ...) (void)0 +#endif //_WIN32 for _QAIC_FARF + +#define _TRY(ee, func) \ + do { \ + if (AEE_SUCCESS != ((ee) = func)) {\ + __QAIC_DBG_PRINTF__((__FILE__ ":%d:error:%d:%s\n", __LINE__, (int)(ee),#func));\ + goto ee##bail;\ + } \ + } while (0) + +#define _TRY_FARF(ee, func) \ + do { \ + if (AEE_SUCCESS != ((ee) = func)) {\ + goto ee##farf##bail;\ + } \ + } while (0) + +#define _QAIC_CATCH(exception) exception##bail: if (exception != AEE_SUCCESS) + +#define _CATCH_FARF(exception) exception##farf##bail: if (exception != AEE_SUCCESS) + +#define _QAIC_ASSERT(nErr, ff) _TRY(nErr, 0 == (ff) ? AEE_EBADPARM : AEE_SUCCESS) + +#ifdef __QAIC_DEBUG__ +#define _QAIC_ALLOCATE(nErr, pal, size, alignment, pv) _TRY(nErr, _allocator_alloc(pal, __FILE_LINE__, size, alignment, (void**)&pv));\ + _QAIC_ASSERT(nErr,pv || !(size)) +#else +#define _QAIC_ALLOCATE(nErr, pal, size, alignment, pv) _TRY(nErr, _allocator_alloc(pal, 0, size, alignment, (void**)&pv));\ + _QAIC_ASSERT(nErr,pv || !(size)) +#endif + + +#endif // _QAIC_ENV_H + +#ifdef __cplusplus +extern "C" { +#endif +#if !defined(__QAIC_STRING1_OBJECT_DEFINED__) && !defined(__STRING1_OBJECT__) +#define __QAIC_STRING1_OBJECT_DEFINED__ +#define __STRING1_OBJECT__ +typedef struct _cstring1_s { + char* data; + int dataLen; +} _cstring1_t; + +#endif /* __QAIC_STRING1_OBJECT_DEFINED__ */ +/// Enabling stub-skel mismatch check feature in the auto-gen files. +/// Please refer to the IDL documentation for more details on the feature. +/// It is fully supported only on Kailua and later targets. 
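+/// For example, the stub advertises itself with a uri of the form
+/// "file:///libggmldsp-skel.so?ggmldsp_skel_handle_invoke&_modver=1.0&_idlver=0.0.1"
+/// (see ggmldsp_skel_handle_invoke_uri in skel.c); on open, _stub_skel_version_check()
+/// parses the _idlver key and fails the call with AEE_ESTUBSKELVERMISMATCH when the
+/// stub's idl version is newer than this skel's IDL_VERSION below.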
+#define IDL_VERSION "0.0.1" +typedef struct dsptensor dsptensor; +struct dsptensor { + int32_t type; + int32_t ne[4]; + int32_t nb[4]; + int32_t op; + int32_t op_params[16]; + int32_t flags; + void * data; + int data_len; +}; +/** + * Opens the handle in the specified domain. If this is the first + * handle, this creates the session. Typically this means opening + * the device, aka open("/dev/adsprpc-smd"), then calling ioctl + * device APIs to create a PD on the DSP to execute our code in, + * then asking that PD to dlopen the .so and dlsym the skel function. + * + * @param uri, _URI"&_dom=aDSP" + * _URI is a QAIC generated uri, or + * "file:///?_skel_handle_invoke&_modver=1.0" + * If the _dom parameter is not present, _dom=DEFAULT is assumed + * but not forwarded. + * Reserved uri keys: + * [0]: first unamed argument is the skel invoke function + * _dom: execution domain name, _dom=mDSP/aDSP/DEFAULT + * _modver: module version, _modver=1.0 + * _*: any other key name starting with an _ is reserved + * Unknown uri keys/values are forwarded as is. + * @param h, resulting handle + * @retval, 0 on success + */ +__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_open)(const char* uri, remote_handle64* h) __QAIC_HEADER_ATTRIBUTE; +/** + * Closes a handle. If this is the last handle to close, the session + * is closed as well, releasing all the allocated resources. + + * @param h, the handle to close + * @retval, 0 on success, should always succeed + */ +__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_close)(remote_handle64 h) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT AEEResult __QAIC_HEADER(ggmlop_dsp_setclocks)(remote_handle64 _h, int32 power_level, int32 latency, int32 mulmat_algotype, int32 thread_counts) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_add)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_mulmat)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_softmax)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_rmsnorm)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_pool2d)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; + +#ifdef __cplusplus +} +#endif +#endif //_SKEL_H diff --git a/ggml/src/ggml-hexagon/kernels/stub.c b/ggml/src/ggml-hexagon/kernels/stub.c new file mode 100644 index 0000000000000..7936c43cd6d77 --- /dev/null +++ b/ggml/src/ggml-hexagon/kernels/stub.c @@ -0,0 +1,463 @@ +//qidl copyright +//qidl nested=false +#include "skel.h" +#include +#ifndef _WIN32 +#include "HAP_farf.h" +#include +#endif //_WIN32 for HAP_farf +#ifndef _ALLOCATOR_H +#define _ALLOCATOR_H + +#include +#include + +typedef struct _heap _heap; +struct _heap { + _heap* pPrev; + const char* loc; + uint64_t buf; +}; + +typedef struct _allocator { + _heap* pheap; + uint8_t* stack; + uint8_t* stackEnd; + int nSize; +} _allocator; + +_ATTRIBUTE_UNUSED +static __inline int _heap_alloc(_heap** ppa, const char* loc, int size, void** ppbuf) { + _heap* pn = 0; + pn = MALLOC((size_t)size + sizeof(_heap) - sizeof(uint64_t)); + if(pn != 0) { + pn->pPrev = *ppa; + pn->loc = loc; + *ppa = pn; + 
*ppbuf = (void*)&(pn->buf); + return 0; + } else { + return -1; + } +} +#define _ALIGN_SIZE(x, y) (((x) + (y-1)) & ~(y-1)) + +_ATTRIBUTE_UNUSED +static __inline int _allocator_alloc(_allocator* me, + const char* loc, + int size, + unsigned int al, + void** ppbuf) { + if(size < 0) { + return -1; + } else if (size == 0) { + *ppbuf = 0; + return 0; + } + if((_ALIGN_SIZE((uintptr_t)me->stackEnd, al) + (size_t)size) < (uintptr_t)me->stack + (size_t)me->nSize) { + *ppbuf = (uint8_t*)_ALIGN_SIZE((uintptr_t)me->stackEnd, al); + me->stackEnd = (uint8_t*)_ALIGN_SIZE((uintptr_t)me->stackEnd, al) + size; + return 0; + } else { + return _heap_alloc(&me->pheap, loc, size, ppbuf); + } +} + +_ATTRIBUTE_UNUSED +static __inline void _allocator_deinit(_allocator* me) { + _heap* pa = me->pheap; + while(pa != 0) { + _heap* pn = pa; + const char* loc = pn->loc; + (void)loc; + pa = pn->pPrev; + FREE(pn); + } +} + +_ATTRIBUTE_UNUSED +static __inline void _allocator_init(_allocator* me, uint8_t* stack, int stackSize) { + me->stack = stack; + me->stackEnd = stack + stackSize; + me->nSize = stackSize; + me->pheap = 0; +} + + +#endif // _ALLOCATOR_H + +#ifndef SLIM_H +#define SLIM_H + +#include + +//a C data structure for the idl types that can be used to implement +//static and dynamic language bindings fairly efficiently. +// +//the goal is to have a minimal ROM and RAM footprint and without +//doing too many allocations. A good way to package these things seemed +//like the module boundary, so all the idls within one module can share +//all the type references. + + +#define PARAMETER_IN 0x0 +#define PARAMETER_OUT 0x1 +#define PARAMETER_INOUT 0x2 +#define PARAMETER_ROUT 0x3 +#define PARAMETER_INROUT 0x4 + +//the types that we get from idl +#define TYPE_OBJECT 0x0 +#define TYPE_INTERFACE 0x1 +#define TYPE_PRIMITIVE 0x2 +#define TYPE_ENUM 0x3 +#define TYPE_STRING 0x4 +#define TYPE_WSTRING 0x5 +#define TYPE_STRUCTURE 0x6 +#define TYPE_UNION 0x7 +#define TYPE_ARRAY 0x8 +#define TYPE_SEQUENCE 0x9 + +//these require the pack/unpack to recurse +//so it's a hint to those languages that can optimize in cases where +//recursion isn't necessary. +#define TYPE_COMPLEX_STRUCTURE (0x10 | TYPE_STRUCTURE) +#define TYPE_COMPLEX_UNION (0x10 | TYPE_UNION) +#define TYPE_COMPLEX_ARRAY (0x10 | TYPE_ARRAY) +#define TYPE_COMPLEX_SEQUENCE (0x10 | TYPE_SEQUENCE) + + +typedef struct Type Type; + +#define INHERIT_TYPE\ + int32_t nativeSize; /*in the simple case its the same as wire size and alignment*/\ + union {\ + struct {\ + const uintptr_t p1;\ + const uintptr_t p2;\ + } _cast;\ + struct {\ + uint32_t iid;\ + uint32_t bNotNil;\ + } object;\ + struct {\ + const Type *arrayType;\ + int32_t nItems;\ + } array;\ + struct {\ + const Type *seqType;\ + int32_t nMaxLen;\ + } seqSimple; \ + struct {\ + uint32_t bFloating;\ + uint32_t bSigned;\ + } prim; \ + const SequenceType* seqComplex;\ + const UnionType *unionType;\ + const StructType *structType;\ + int32_t stringMaxLen;\ + uint8_t bInterfaceNotNil;\ + } param;\ + uint8_t type;\ + uint8_t nativeAlignment\ + +typedef struct UnionType UnionType; +typedef struct StructType StructType; +typedef struct SequenceType SequenceType; +struct Type { + INHERIT_TYPE; +}; + +struct SequenceType { + const Type * seqType; + uint32_t nMaxLen; + uint32_t inSize; + uint32_t routSizePrimIn; + uint32_t routSizePrimROut; +}; + +//byte offset from the start of the case values for +//this unions case value array. 
it MUST be aligned +//at the alignment requrements for the descriptor +// +//if negative it means that the unions cases are +//simple enumerators, so the value read from the descriptor +//can be used directly to find the correct case +typedef union CaseValuePtr CaseValuePtr; +union CaseValuePtr { + const uint8_t* value8s; + const uint16_t* value16s; + const uint32_t* value32s; + const uint64_t* value64s; +}; + +//these are only used in complex cases +//so I pulled them out of the type definition as references to make +//the type smaller +struct UnionType { + const Type *descriptor; + uint32_t nCases; + const CaseValuePtr caseValues; + const Type * const *cases; + int32_t inSize; + int32_t routSizePrimIn; + int32_t routSizePrimROut; + uint8_t inAlignment; + uint8_t routAlignmentPrimIn; + uint8_t routAlignmentPrimROut; + uint8_t inCaseAlignment; + uint8_t routCaseAlignmentPrimIn; + uint8_t routCaseAlignmentPrimROut; + uint8_t nativeCaseAlignment; + uint8_t bDefaultCase; +}; + +struct StructType { + uint32_t nMembers; + const Type * const *members; + int32_t inSize; + int32_t routSizePrimIn; + int32_t routSizePrimROut; + uint8_t inAlignment; + uint8_t routAlignmentPrimIn; + uint8_t routAlignmentPrimROut; +}; + +typedef struct Parameter Parameter; +struct Parameter { + INHERIT_TYPE; + uint8_t mode; + uint8_t bNotNil; +}; + +#define SLIM_IFPTR32(is32,is64) (sizeof(uintptr_t) == 4 ? (is32) : (is64)) +#define SLIM_SCALARS_IS_DYNAMIC(u) (((u) & 0x00ffffff) == 0x00ffffff) + +typedef struct Method Method; +struct Method { + uint32_t uScalars; //no method index + int32_t primInSize; + int32_t primROutSize; + int maxArgs; + int numParams; + const Parameter * const *params; + uint8_t primInAlignment; + uint8_t primROutAlignment; +}; + +typedef struct Interface Interface; + +struct Interface { + int nMethods; + const Method * const *methodArray; + int nIIds; + const uint32_t *iids; + const uint16_t* methodStringArray; + const uint16_t* methodStrings; + const char* strings; +}; + + +#endif //SLIM_H + + +#ifndef _GGMLOP_SLIM_H +#define _GGMLOP_SLIM_H +#include + +#ifndef __QAIC_SLIM +#define __QAIC_SLIM(ff) ff +#endif +#ifndef __QAIC_SLIM_EXPORT +#define __QAIC_SLIM_EXPORT +#endif + +static const Type types[5]; +static const Type* const typeArrays[7] = {&(types[0]),&(types[1]),&(types[1]),&(types[0]),&(types[2]),&(types[0]),&(types[3])}; +static const StructType structTypes[1] = {{0x7,&(typeArrays[0]),0x70,0x4,0x6c,0x4,0x4,0x4}}; +static const Type types[5] = {{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4},{0x10,{{(const uintptr_t)&(types[0]),(const uintptr_t)0x4}}, 8,0x4},{0x40,{{(const uintptr_t)&(types[0]),(const uintptr_t)0x10}}, 8,0x4},{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)&(types[4]),(const uintptr_t)0x0}}, 9,SLIM_IFPTR32(0x4,0x8)},{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4}}; +static const Parameter parameters[6] = {{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)0x0,0}}, 4,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),3,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),0,0},{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4,0,0},{SLIM_IFPTR32(0x74,0x80),{{(const uintptr_t)&(structTypes[0]),0}}, 22,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x74,0x80),{{(const uintptr_t)&(structTypes[0]),0}}, 22,SLIM_IFPTR32(0x4,0x8),3,0}}; +static const Parameter* const parameterArrays[9] = 
{(&(parameters[4])),(&(parameters[4])),(&(parameters[5])),(&(parameters[3])),(&(parameters[3])),(&(parameters[3])),(&(parameters[0])),(&(parameters[1])),(&(parameters[2]))}; +static const Method methods[4] = {{REMOTE_SCALARS_MAKEX(0,0,0x2,0x0,0x0,0x1),0x4,0x0,2,2,(&(parameterArrays[6])),0x4,0x1},{REMOTE_SCALARS_MAKEX(0,0,0x0,0x0,0x1,0x0),0x0,0x0,1,1,(&(parameterArrays[8])),0x1,0x0},{REMOTE_SCALARS_MAKEX(0,0,0x1,0x0,0x0,0x0),0xc,0x0,3,3,(&(parameterArrays[3])),0x4,0x0},{REMOTE_SCALARS_MAKEX(0,0,0x3,0x2,0x0,0x0),0xe4,0x6c,3,3,(&(parameterArrays[0])),0x4,0x4}}; +static const Method* const methodArrays[8] = {&(methods[0]),&(methods[1]),&(methods[2]),&(methods[3]),&(methods[3]),&(methods[3]),&(methods[3]),&(methods[3])}; +static const char strings[167] = "dsp_setclocks\0dsp_rmsnorm\0dsp_softmax\0dcvs_enable\0power_level\0dsp_pool2d\0dsp_mulmat\0op_params\0dsp_add\0latency\0flags\0close\0src1\0data\0type\0src0\0open\0dst\0uri\0op\0nb\0ne\0h\0"; +static const uint16_t methodStrings[134] = {62,137,132,161,158,155,84,110,127,122,132,161,158,155,84,110,127,147,132,161,158,155,84,110,127,14,137,132,161,158,155,84,110,127,122,132,161,158,155,84,110,127,147,132,161,158,155,84,110,127,26,137,132,161,158,155,84,110,127,122,132,161,158,155,84,110,127,147,132,161,158,155,84,110,127,73,137,132,161,158,155,84,110,127,122,132,161,158,155,84,110,127,147,132,161,158,155,84,110,127,94,137,132,161,158,155,84,110,127,122,132,161,158,155,84,110,127,147,132,161,158,155,84,110,127,0,50,102,38,142,151,164,116,164}; +static const uint16_t methodStringsArrays[8] = {129,132,125,100,75,50,25,0}; +__QAIC_SLIM_EXPORT const Interface __QAIC_SLIM(ggmlop_slim) = {8,&(methodArrays[0]),0,0,&(methodStringsArrays [0]),methodStrings,strings}; +#endif //_GGMLOP_SLIM_H + + +#ifdef __cplusplus +extern "C" { +#endif +__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_open)(const char* uri, remote_handle64* h) __QAIC_STUB_ATTRIBUTE { + return __QAIC_REMOTE(remote_handle64_open)(uri, h); +} +__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_close)(remote_handle64 h) __QAIC_STUB_ATTRIBUTE { + return __QAIC_REMOTE(remote_handle64_close)(h); +} +static __inline int _stub_method(remote_handle64 _handle, uint32_t _mid, uint32_t _in0[1], uint32_t _in1[1], uint32_t _in2[1], uint32_t _in3[1]) { + remote_arg _pra[1] = {0}; + uint32_t _primIn[4]= {0}; + int _nErr = 0; + _pra[0].buf.pv = (void*)_primIn; + _pra[0].buf.nLen = sizeof(_primIn); + _COPY(_primIn, 0, _in0, 0, 4); + _COPY(_primIn, 4, _in1, 0, 4); + _COPY(_primIn, 8, _in2, 0, 4); + _COPY(_primIn, 12,_in3, 0, 4); + _TRY_FARF(_nErr, __QAIC_REMOTE(remote_handle64_invoke)(_handle, REMOTE_SCALARS_MAKEX(0, _mid, 1, 0, 0, 0), _pra)); + _CATCH_FARF(_nErr) { + _QAIC_FARF(RUNTIME_ERROR, "ERROR 0x%x: handle=0x%"PRIx64", scalar=0x%x, method ID=%d: %s failed\n", _nErr , _handle, REMOTE_SCALARS_MAKEX(0, _mid, 1, 0, 0, 0), _mid, __func__); + } + return _nErr; +} +__QAIC_STUB_EXPORT AEEResult __QAIC_STUB(ggmlop_dsp_setclocks)(remote_handle64 _handle, int32 power_level, int32 latency, int32 mulmat_algotype, int32 threads) __QAIC_STUB_ATTRIBUTE { + uint32_t _mid = 2; + return _stub_method(_handle, _mid, (uint32_t*)&power_level, (uint32_t*)&latency, (uint32_t*)&mulmat_algotype, (uint32_t*)&threads); +} +static __inline int _stub_unpack(_ATTRIBUTE_UNUSED remote_arg* _praROutPost, _ATTRIBUTE_UNUSED remote_arg* _ppraROutPost[1], _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], 
_ATTRIBUTE_UNUSED uint32_t _rout4[16], _ATTRIBUTE_UNUSED uint32_t _rout5[1], _ATTRIBUTE_UNUSED char* _rout6[1], _ATTRIBUTE_UNUSED uint32_t _rout6Len[1]) { + int _nErr = 0; + remote_arg* _praROutPostStart = _praROutPost; + remote_arg** _ppraROutPostStart = _ppraROutPost; + _ppraROutPost = &_praROutPost; + _COPY(_rout0, 0, _primROut, 0, 4); + _COPY(_rout1, 0, _primROut, 4, 16); + _COPY(_rout2, 0, _primROut, 20, 16); + _COPY(_rout3, 0, _primROut, 36, 4); + _COPY(_rout4, 0, _primROut, 40, 64); + _COPY(_rout5, 0, _primROut, 104, 4); + _ppraROutPostStart[0] += (_praROutPost - _praROutPostStart) +1; + return _nErr; +} +static __inline int _stub_pack(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNUSED remote_arg* _praIn, _ATTRIBUTE_UNUSED remote_arg* _ppraIn[1], _ATTRIBUTE_UNUSED remote_arg* _praROut, _ATTRIBUTE_UNUSED remote_arg* _ppraROut[1], _ATTRIBUTE_UNUSED remote_arg* _praHIn, _ATTRIBUTE_UNUSED remote_arg* _ppraHIn[1], _ATTRIBUTE_UNUSED remote_arg* _praHROut, _ATTRIBUTE_UNUSED remote_arg* _ppraHROut[1], _ATTRIBUTE_UNUSED void* _primIn, _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[16], _ATTRIBUTE_UNUSED uint32_t _rout5[1], _ATTRIBUTE_UNUSED char* _rout6[1], _ATTRIBUTE_UNUSED uint32_t _rout6Len[1]) { + int _nErr = 0; + remote_arg* _praInStart = _praIn; + remote_arg** _ppraInStart = _ppraIn; + remote_arg* _praROutStart = _praROut; + remote_arg** _ppraROutStart = _ppraROut; + _ppraIn = &_praIn; + _ppraROut = &_praROut; + _COPY(_primIn, 0, _rout6Len, 0, 4); + _praROut[0].buf.pv = _rout6[0]; + _praROut[0].buf.nLen = (4 * _rout6Len[0]); + _ppraInStart[0] += (_praIn - _praInStart) + 0; + _ppraROutStart[0] += (_praROut - _praROutStart) +1; + return _nErr; +} +static __inline int _stub_pack_1(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNUSED remote_arg* _praIn, _ATTRIBUTE_UNUSED remote_arg* _ppraIn[1], _ATTRIBUTE_UNUSED remote_arg* _praROut, _ATTRIBUTE_UNUSED remote_arg* _ppraROut[1], _ATTRIBUTE_UNUSED remote_arg* _praHIn, _ATTRIBUTE_UNUSED remote_arg* _ppraHIn[1], _ATTRIBUTE_UNUSED remote_arg* _praHROut, _ATTRIBUTE_UNUSED remote_arg* _ppraHROut[1], _ATTRIBUTE_UNUSED void* _primIn, _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _in0[1], _ATTRIBUTE_UNUSED uint32_t _in1[4], _ATTRIBUTE_UNUSED uint32_t _in2[4], _ATTRIBUTE_UNUSED uint32_t _in3[1], _ATTRIBUTE_UNUSED uint32_t _in4[16], _ATTRIBUTE_UNUSED uint32_t _in5[1], _ATTRIBUTE_UNUSED char* _in6[1], _ATTRIBUTE_UNUSED uint32_t _in6Len[1]) { + int _nErr = 0; + remote_arg* _praInStart = _praIn; + remote_arg** _ppraInStart = _ppraIn; + remote_arg* _praROutStart = _praROut; + remote_arg** _ppraROutStart = _ppraROut; + _ppraIn = &_praIn; + _ppraROut = &_praROut; + _COPY(_primIn, 0, _in0, 0, 4); + _COPY(_primIn, 4, _in1, 0, 16); + _COPY(_primIn, 20, _in2, 0, 16); + _COPY(_primIn, 36, _in3, 0, 4); + _COPY(_primIn, 40, _in4, 0, 64); + _COPY(_primIn, 104, _in5, 0, 4); + _COPY(_primIn, 108, _in6Len, 0, 4); + _praIn[0].buf.pv = (void*) _in6[0]; + _praIn[0].buf.nLen = (4 * _in6Len[0]); + _ppraInStart[0] += (_praIn - _praInStart) + 1; + _ppraROutStart[0] += (_praROut - _praROutStart) +0; + return _nErr; +} +static __inline void _count(int _numIn[1], int _numROut[1], int _numInH[1], int _numROutH[1], _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t 
_rout4[16], _ATTRIBUTE_UNUSED uint32_t _rout5[1], _ATTRIBUTE_UNUSED char* _rout6[1], _ATTRIBUTE_UNUSED uint32_t _rout6Len[1]) { + _numIn[0] += 0; + _numROut[0] += 1; + _numInH[0] += 0; + _numROutH[0] += 0; +} +static __inline void _count_1(int _numIn[1], int _numROut[1], int _numInH[1], int _numROutH[1], _ATTRIBUTE_UNUSED uint32_t _in0[1], _ATTRIBUTE_UNUSED uint32_t _in1[4], _ATTRIBUTE_UNUSED uint32_t _in2[4], _ATTRIBUTE_UNUSED uint32_t _in3[1], _ATTRIBUTE_UNUSED uint32_t _in4[16], _ATTRIBUTE_UNUSED uint32_t _in5[1], _ATTRIBUTE_UNUSED char* _in6[1], _ATTRIBUTE_UNUSED uint32_t _in6Len[1]) { + _numIn[0] += 1; + _numROut[0] += 0; + _numInH[0] += 0; + _numROutH[0] += 0; +} +static __inline int _stub_method_1(remote_handle64 _handle, uint32_t _mid, uintptr_t _in0[SLIM_IFPTR32(29, 16)], uintptr_t _in1[SLIM_IFPTR32(29, 16)], uintptr_t _rout2[SLIM_IFPTR32(29, 16)]) { + remote_arg* _pra = 0; + int _numIn[1] = {0}; + int _numROut[1] = {0}; + int _numInH[1] = {0}; + int _numROutH[1] = {0}; + _allocator _al[1] = {{0}}; + uint32_t _primIn[57]= {0}; + uint32_t _primROut[27]= {0}; + remote_arg* _praIn = 0; + remote_arg* _praROut = 0; + remote_arg* _praROutPost = 0; + remote_arg** _ppraROutPost = &_praROutPost; + remote_arg** _ppraIn = &_praIn; + remote_arg** _ppraROut = &_praROut; + remote_arg* _praHIn = 0; + remote_arg** _ppraHIn = &_praHIn; + remote_arg* _praHROut = 0; + remote_arg** _ppraHROut = &_praHROut; + int _nErr = 0; + _numIn[0] = 0; + _numROut[0] = 0; + _numInH[0] = 0; + _numROutH[0] = 0; + _count_1(_numIn, _numROut, _numInH, _numROutH, (uint32_t*)&(((uint32_t*)_in0)[0]), (uint32_t*)&(((uint32_t*)_in0)[1]), (uint32_t*)&(((uint32_t*)_in0)[5]), (uint32_t*)&(((uint32_t*)_in0)[9]), (uint32_t*)&(((uint32_t*)_in0)[10]), (uint32_t*)&(((uint32_t*)_in0)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_in0)[27]), (char**)&(((uint64_t*)_in0)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in0)[28]), (uint32_t*)&(((uint32_t*)_in0)[30]))); + _count_1(_numIn, _numROut, _numInH, _numROutH, (uint32_t*)&(((uint32_t*)_in1)[0]), (uint32_t*)&(((uint32_t*)_in1)[1]), (uint32_t*)&(((uint32_t*)_in1)[5]), (uint32_t*)&(((uint32_t*)_in1)[9]), (uint32_t*)&(((uint32_t*)_in1)[10]), (uint32_t*)&(((uint32_t*)_in1)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_in1)[27]), (char**)&(((uint64_t*)_in1)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in1)[28]), (uint32_t*)&(((uint32_t*)_in1)[30]))); + _count(_numIn, _numROut, _numInH, _numROutH, (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint32_t*)&(((uint32_t*)_rout2)[1]), (uint32_t*)&(((uint32_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[9]), (uint32_t*)&(((uint32_t*)_rout2)[10]), (uint32_t*)&(((uint32_t*)_rout2)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[27]), (char**)&(((uint64_t*)_rout2)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[28]), (uint32_t*)&(((uint32_t*)_rout2)[30]))); + if(_numIn[0]>=255){ + return AEE_EUNSUPPORTED; + } + if(_numROut[0]>=255){ + return AEE_EUNSUPPORTED; + } + _allocator_init(_al, 0, 0); + _QAIC_ALLOCATE(_nErr, _al, ((((((((_numIn[0] + _numROut[0]) + _numInH[0]) + _numROutH[0]) + 1) + 1) + 0) + 0) * sizeof(_pra[0])), 4, _pra); + _QAIC_ASSERT(_nErr, _pra); + _pra[0].buf.pv = (void*)_primIn; + _pra[0].buf.nLen = sizeof(_primIn); + _pra[(_numIn[0] + 1)].buf.pv = (void*)_primROut; + _pra[(_numIn[0] + 1)].buf.nLen = sizeof(_primROut); + _praIn = (_pra + 1); + _praROut = (_praIn + _numIn[0] + 1); + _praROutPost = _praROut; + if(_praHIn == 0) + { + _praHIn = ((_praROut + _numROut[0]) + 1); + } + if(_praHROut == 0) + (_praHROut = _praHIn + _numInH[0] + 0); + 
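+ // wire layout implied by the offsets below: _primIn (228 bytes, see
+ // uint32_t _primIn[57] above) carries two 112-byte dsptensor headers at
+ // byte offsets 0 and 112 (type, ne[4], nb[4], op, op_params[16], flags,
+ // data_len) plus dst's 4-byte data length at offset 224; dst's own
+ // 108-byte header comes back in _primROut. the scalars word passed to
+ // remote_handle64_invoke() packs the method id _mid together with these
+ // buffer counts, and the skel side recovers it with REMOTE_SCALARS_METHOD().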
_TRY(_nErr, _stub_pack_1(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 0), 0, (uint32_t*)&(((uint32_t*)_in0)[0]), (uint32_t*)&(((uint32_t*)_in0)[1]), (uint32_t*)&(((uint32_t*)_in0)[5]), (uint32_t*)&(((uint32_t*)_in0)[9]), (uint32_t*)&(((uint32_t*)_in0)[10]), (uint32_t*)&(((uint32_t*)_in0)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_in0)[27]), (char**)&(((uint64_t*)_in0)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in0)[28]), (uint32_t*)&(((uint32_t*)_in0)[30])))); + _TRY(_nErr, _stub_pack_1(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 112), 0, (uint32_t*)&(((uint32_t*)_in1)[0]), (uint32_t*)&(((uint32_t*)_in1)[1]), (uint32_t*)&(((uint32_t*)_in1)[5]), (uint32_t*)&(((uint32_t*)_in1)[9]), (uint32_t*)&(((uint32_t*)_in1)[10]), (uint32_t*)&(((uint32_t*)_in1)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_in1)[27]), (char**)&(((uint64_t*)_in1)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in1)[28]), (uint32_t*)&(((uint32_t*)_in1)[30])))); + _TRY(_nErr, _stub_pack(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 224), ((char*)_primROut + 0), (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint32_t*)&(((uint32_t*)_rout2)[1]), (uint32_t*)&(((uint32_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[9]), (uint32_t*)&(((uint32_t*)_rout2)[10]), (uint32_t*)&(((uint32_t*)_rout2)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[27]), (char**)&(((uint64_t*)_rout2)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[28]), (uint32_t*)&(((uint32_t*)_rout2)[30])))); + _QAIC_ASSERT(_nErr, (_numInH[0] + 0) <= 15); + _QAIC_ASSERT(_nErr, (_numROutH[0] + 0) <= 15); + _TRY_FARF(_nErr, __QAIC_REMOTE(remote_handle64_invoke)(_handle, REMOTE_SCALARS_MAKEX(0, _mid, (_numIn[0] + 1), (_numROut[0] + 1), (_numInH[0] + 0), (_numROutH[0] + 0)), _pra)); + _TRY(_nErr, _stub_unpack((_praROutPost + 0), _ppraROutPost, ((char*)_primROut + 0), (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint32_t*)&(((uint32_t*)_rout2)[1]), (uint32_t*)&(((uint32_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[9]), (uint32_t*)&(((uint32_t*)_rout2)[10]), (uint32_t*)&(((uint32_t*)_rout2)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[27]), (char**)&(((uint64_t*)_rout2)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[28]), (uint32_t*)&(((uint32_t*)_rout2)[30])))); + _QAIC_CATCH(_nErr) {} + _CATCH_FARF(_nErr) { + _QAIC_FARF(RUNTIME_ERROR, "ERROR 0x%x: handle=0x%"PRIx64", scalar=0x%x, method ID=%d: %s failed\n", _nErr , _handle, REMOTE_SCALARS_MAKEX(0, _mid, (_numIn[0] + 1), (_numROut[0] + 1), (_numInH[0] + 0), (_numROutH[0] + 0)), _mid, __func__); + } + _allocator_deinit(_al); + return _nErr; +} +__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_add)(remote_handle64 _handle, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_STUB_ATTRIBUTE { + uint32_t _mid = 3; + return _stub_method_1(_handle, _mid, (uintptr_t*)src0, (uintptr_t*)src1, (uintptr_t*)dst); +} +__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_mulmat)(remote_handle64 _handle, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_STUB_ATTRIBUTE { + uint32_t _mid = 4; + return _stub_method_1(_handle, _mid, (uintptr_t*)src0, (uintptr_t*)src1, (uintptr_t*)dst); +} +__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_softmax)(remote_handle64 _handle, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_STUB_ATTRIBUTE { + uint32_t _mid = 5; + return _stub_method_1(_handle, 
_mid, (uintptr_t*)src0, (uintptr_t*)src1, (uintptr_t*)dst);
+}
+__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_rmsnorm)(remote_handle64 _handle, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_STUB_ATTRIBUTE {
+  uint32_t _mid = 6;
+  return _stub_method_1(_handle, _mid, (uintptr_t*)src0, (uintptr_t*)src1, (uintptr_t*)dst);
+}
+__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_pool2d)(remote_handle64 _handle, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_STUB_ATTRIBUTE {
+  uint32_t _mid = 7;
+  return _stub_method_1(_handle, _mid, (uintptr_t*)src0, (uintptr_t*)src1, (uintptr_t*)dst);
+}
diff --git a/ggml/src/ggml-hexagon/kernels/worker_pool.cpp b/ggml/src/ggml-hexagon/kernels/worker_pool.cpp
new file mode 100755
index 0000000000000..8186edcf18a95
--- /dev/null
+++ b/ggml/src/ggml-hexagon/kernels/worker_pool.cpp
@@ -0,0 +1,475 @@
+/**=============================================================================
+
+@file
+   worker_pool.cpp
+
+@brief
+   Utility providing a multi-priority thread worker pool for
+   multi-threaded computer vision (or other compute) applications.
+
+Copyright (c) 2019-2020 Qualcomm Technologies Incorporated.
+All Rights Reserved. Qualcomm Proprietary and Confidential.
+
+Export of this technology or software is regulated by the U.S.
+Government. Diversion contrary to U.S. law prohibited.
+
+All ideas, data and information contained in or disclosed by
+this document are confidential and proprietary information of
+Qualcomm Technologies Incorporated and all rights therein are expressly reserved.
+By accepting this material the recipient agrees that this material
+and the information contained therein are held in confidence and in
+trust and will not be used, copied, reproduced in whole or in part,
+nor its contents revealed in any manner to others without the express
+written permission of Qualcomm Technologies Incorporated.
+
+=============================================================================**/
+
+/*===========================================================================
+    INCLUDE FILE
+===========================================================================*/
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include "worker_pool.h"
+
+#ifndef _DEBUG
+#define _DEBUG
+#endif
+#include "HAP_farf.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+#include "qurt.h"
+#include "hexagon_protos.h"
+
+void worker_pool_constructor(void) __attribute__((constructor));
+void worker_pool_destructor(void) __attribute__((destructor));
+
+#ifdef __cplusplus
+}
+#endif
+
+/*===========================================================================
+    DEFINE
+===========================================================================*/
+#define WORKER_THREAD_STACK_SZ (2 * 16384)
+#define WORKER_KILL_SIGNAL     31                    // signal to kill the worker threads
+#define NUM_JOB_SLOTS          (MAX_NUM_WORKERS + 1) // max queued jobs, slightly more than the number of workers
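+
+// Typical client usage of this pool (illustrative sketch only; my_job and
+// njobs are placeholders, not part of this file):
+//
+//     static void my_job(void* data) {
+//         worker_synctoken_t* token = (worker_synctoken_t*) data;
+//         // ... per-job work ...
+//         worker_pool_synctoken_jobdone(token);   // each job must release the token
+//     }
+//
+//     worker_pool_context_t ctx = NULL;
+//     worker_pool_init(&ctx);
+//     worker_synctoken_t token;
+//     worker_pool_synctoken_init(&token, njobs);
+//     for (unsigned int i = 0; i < njobs; i++) {
+//         worker_pool_job_t job = {my_job, &token};
+//         worker_pool_submit(ctx, job);
+//     }
+//     worker_pool_synctoken_wait(&token);          // returns once all njobs called jobdone
+//     worker_pool_deinit(&ctx);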
+#define LOWEST_USABLE_QURT_PRIO 254
+
+/*===========================================================================
+    TYPEDEF
+===========================================================================*/
+// internal structure kept in thread-local storage per instance of worker pool
+typedef struct
+{
+    qurt_anysignal_t empty_jobs;               // available job nodes
+    qurt_anysignal_t queued_jobs;              // jobs that are waiting for a worker
+    qurt_mutex_t empty_jobs_mutex;             // mutex for multiple threads trying to send a job
+    qurt_mutex_t queued_jobs_mutex;            // mutex for multiple threads trying to acquire a job
+    unsigned int job_queue_mask;               // mask for job queue nodes
+    unsigned int num_workers;                  // number of workers in this pool
+    worker_pool_job_t job[NUM_JOB_SLOTS];      // list of job descriptors
+    qurt_thread_t thread[MAX_NUM_WORKERS];     // thread IDs of the workers
+    void * stack[MAX_NUM_WORKERS];             // thread stack pointers
+} worker_pool_t;
+
+// internal structure containing OS primitives to sync caller with all its spawned jobs.
+typedef union
+{
+    worker_synctoken_t raw;
+    struct
+    {
+        unsigned int atomic_countdown;
+        unsigned int reserved;                 // reserved to align next element to 8 bytes
+        qurt_sem_t sem;
+    } sync;
+} internal_synctoken_t;
+
+/*===========================================================================
+    GLOBAL VARIABLES (per PD)
+===========================================================================*/
+// initialized in constructor
+unsigned int num_workers = 1;
+unsigned int num_hvx128_contexts = 0;
+
+/*===========================================================================
+    STATIC VARIABLES
+===========================================================================*/
+
+static worker_pool_context_t static_context = NULL;
+
+/*===========================================================================
+    LOCAL FUNCTION
+===========================================================================*/
+// the main workloop for each of the worker threads.
+static void worker_pool_main(void* context)
+{
+    // local pointer to owning pool's context
+    worker_pool_t *me = (worker_pool_t *) context;
+
+    // some local vars to reduce dereferencing inside loop
+    qurt_anysignal_t *signal = &me->queued_jobs;
+    unsigned int mask = me->job_queue_mask;
+    qurt_mutex_t *mutex = &me->queued_jobs_mutex;
+
+    while(1)
+    {
+        qurt_mutex_lock(mutex); // mutex only allows 1 thread to wait on signal at a time. QuRT restriction.
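+        // Claim protocol: wake on any raised bit in the mask, pick the lowest
+        // set bit (Q6_R_ct0_R counts trailing zeros), copy the job descriptor,
+        // recycle the slot to empty_jobs, then run the callback outside the lock.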
+        (void) qurt_anysignal_wait(signal, mask);                            // wait for a job
+        unsigned int sig_rx = Q6_R_ct0_R(mask & qurt_anysignal_get(signal)); // count trailing 0's to choose flagged job
+        if (sig_rx < NUM_JOB_SLOTS)                                          // if real job
+        {
+            worker_pool_job_t job = me->job[sig_rx];                         // local copy of job descriptor
+            (void) qurt_anysignal_clear(signal, (1 << sig_rx));              // clear the queued job signal
+            (void) qurt_anysignal_set(&me->empty_jobs, (1 << sig_rx));       // send node back to empty list
+            qurt_mutex_unlock(mutex);                                        // unlock the mutex
+            job.fptr(job.dptr);                                              // issue the callback
+        }
+        else if (WORKER_KILL_SIGNAL == sig_rx)
+        {
+            // don't clear the kill signal, leave it for all the workers to see, and exit
+            qurt_mutex_unlock(mutex);
+            qurt_thread_exit(0);
+        }
+        else
+        {
+            FARF(HIGH, "Worker pool received invalid job %d", sig_rx);
+            qurt_mutex_unlock(mutex);
+        }
+    }
+}
+
+void worker_pool_constructor()
+{
+    FARF(HIGH, "In worker_pool constructor");
+    qurt_sysenv_max_hthreads_t num_threads;
+    if (QURT_EOK != qurt_sysenv_get_max_hw_threads(&num_threads))
+    {
+        num_workers = MAX_NUM_WORKERS; // couldn't get the number of HW threads from QuRT, default to MAX_NUM_WORKERS
+        FARF(HIGH, "Failed to get number of threads. Defaulting to %u", num_workers);
+    }
+    else
+    {
+        num_workers = num_threads.max_hthreads;
+    }
+
+    /* Verify that the number of hw threads isn't greater than the max supported number of hw threads.
+       Max threads is used as a constant value for array sizes. */
+    if (num_workers > MAX_NUM_WORKERS)
+    {
+        num_workers = MAX_NUM_WORKERS;
+        FARF(HIGH, "Limiting number of threads to maximum supported value %u", num_workers);
+    }
+
+    num_hvx128_contexts = (qurt_hvx_get_units() >> 8) & 0xFF;
+
+    /* initialize static worker_pool for clients who pass NULL as context. */
+    if (worker_pool_init(&static_context) != AEE_SUCCESS)
+    {
+        FARF(ERROR, "Could not initialize default worker pool");
+    }
+}
+
+AEEResult worker_pool_init_with_stack_size(worker_pool_context_t *context, int stack_size)
+{
+    int nErr = 0;
+
+    if (stack_size <= 0)
+    {
+        FARF(ERROR, "Stack size must be positive");
+        return AEE_EBADPARM;
+    }
+
+    if (NULL == context)
+    {
+        FARF(ERROR, "NULL context passed to worker_pool_init().");
+        return AEE_EBADPARM;
+    }
+
+    // Allocations
+    int size = (stack_size * num_workers) + (sizeof(worker_pool_t));
+    unsigned char *mem_blob = (unsigned char*)malloc(size);
+    if (!mem_blob)
+    {
+        FARF(ERROR, "Could not allocate memory for worker pool!");
+        return AEE_ENOMEMORY;
+    }
+
+    worker_pool_t *me = (worker_pool_t *)(mem_blob + stack_size * num_workers);
+
+    // name for the first worker, useful in debugging threads
+    char name[19];
+    snprintf(name, 12, "0x%8x:", (int)me);
+    strcat(name, "worker0");
+    me->num_workers = num_workers;
+    // initializations
+    for (unsigned int i = 0; i < me->num_workers; i++)
+    {
+        me->stack[i] = NULL;
+        me->thread[i] = 0;
+    }
+
+    // initialize job queue
+    qurt_anysignal_init(&(me->queued_jobs));
+    qurt_anysignal_init(&(me->empty_jobs));
+    qurt_mutex_init(&(me->empty_jobs_mutex));
+    qurt_mutex_init(&(me->queued_jobs_mutex));
+    me->job_queue_mask = (1 << NUM_JOB_SLOTS) - 1;                    // set a bit for each job node, number of job nodes = num_workers + 1
+    (void) qurt_anysignal_set(&(me->empty_jobs), me->job_queue_mask); // fill the empty pool.
+    me->job_queue_mask |= (1 << WORKER_KILL_SIGNAL);                  // add the kill signal to the mask.
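+    // The kill bit is part of the mask the workers wait on, but it is only
+    // raised in worker_pool_deinit(); since workers never clear it, setting
+    // it once wakes and terminates every worker in turn.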
+
+    // launch the workers
+    qurt_thread_attr_t attr;
+    qurt_thread_attr_init(&attr);
+
+    for (unsigned int i = 0; i < me->num_workers; i++)
+    {
+        // set up stack
+        me->stack[i] = mem_blob;
+        mem_blob += stack_size;
+        qurt_thread_attr_set_stack_addr(&attr, me->stack[i]);
+        qurt_thread_attr_set_stack_size(&attr, stack_size);
+
+        // set up name
+        qurt_thread_attr_set_name(&attr, name);
+        name[17] = (name[17] + 1);
+        // name threads context:worker0, context:worker1, ... (recycle at 9, but num threads should be less than that anyway)
+        if (name[17] > '9') name[17] = '0';
+        // set up priority - by default, match the creating thread's prio
+        int prio = qurt_thread_get_priority(qurt_thread_get_id());
+
+        // If the creating thread's priority is below 64, launch the static worker pool at priority 64 instead.
+        if (context == &static_context && prio < 64) prio = 64;
+
+        if (prio < 1) prio = 1;
+        if (prio > LOWEST_USABLE_QURT_PRIO) prio = LOWEST_USABLE_QURT_PRIO;
+
+        qurt_thread_attr_set_priority(&attr, prio);
+
+        // launch
+        nErr = qurt_thread_create(&(me->thread[i]), &attr, worker_pool_main, (void *)me);
+        if (nErr)
+        {
+            FARF(ERROR, "Could not launch worker threads!");
+            worker_pool_deinit((worker_pool_context_t*)&me);
+            return AEE_EQURTTHREADCREATE;
+        }
+    }
+    *context = (worker_pool_context_t*)me;
+    return AEE_SUCCESS;
+}
+
+AEEResult worker_pool_init(worker_pool_context_t *context)
+{
+    return worker_pool_init_with_stack_size(context, WORKER_THREAD_STACK_SZ);
+}
+
+
+// clean up worker pool
+void worker_pool_deinit(worker_pool_context_t *context)
+{
+    worker_pool_t *me = (worker_pool_t*)*context;
+
+    // if no worker pool exists, there is nothing to clean up.
+    if (NULL == me)
+    {
+        return;
+    }
+
+    // de-initializations
+    (void) qurt_anysignal_set(&(me->empty_jobs), (1 << WORKER_KILL_SIGNAL));  // notify to stop new jobs.
+    (void) qurt_anysignal_set(&(me->queued_jobs), (1 << WORKER_KILL_SIGNAL)); // kill worker pool.
+    for (unsigned int i = 0; i < me->num_workers; i++)                        // wait for workers to die
+    {
+        if (me->thread[i])
+        {
+            int status;
+            (void) qurt_thread_join(me->thread[i], &status);
+        }
+    }
+
+    // release resources
+    qurt_mutex_destroy(&(me->empty_jobs_mutex));
+    qurt_mutex_destroy(&(me->queued_jobs_mutex));
+    qurt_anysignal_destroy(&(me->queued_jobs));
+    qurt_anysignal_destroy(&(me->empty_jobs));
+    // free allocated memory (stacks and pool struct were allocated as a single buffer starting at stack[0])
+    if (me->stack[0]) free(me->stack[0]);
+    // Assign NULL to the freed context so that further references to it fail.
+    *context = NULL;
+}
+
+// submit a job to the pool.
+AEEResult worker_pool_submit(worker_pool_context_t context, worker_pool_job_t job)
+{
+    worker_pool_t *me = (worker_pool_t*)context;
+
+    // if NULL is passed as worker_pool_context, try to use default static worker_pool
+    if (NULL == me)
+    {
+        if (static_context == NULL)
+        {
+            FARF(HIGH, "No default static worker pool found");
+            return AEE_ERESOURCENOTFOUND;
+        }
+        FARF(MEDIUM, "Using default static worker pool");
+        me = (worker_pool_t*)static_context;
+    }
+
+    // if a worker thread tries to submit a job, call it in-context to avoid recursion deadlock.
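+    // (Running such a job inline avoids the case where every worker blocks
+    // waiting on jobs that only the workers themselves could execute.)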
+ unsigned int i; + qurt_thread_t id = qurt_thread_get_id(); + for (i = 0; i < me->num_workers; i++) + { + if (id == me->thread[i]) + { + job.fptr(job.dptr); // issue the callback in caller's context + return AEE_SUCCESS; + } + } + + // local vars to reduce dereferencing + qurt_mutex_t *mutex = &me->empty_jobs_mutex; + qurt_anysignal_t *signal = &me->empty_jobs; + unsigned int mask = me->job_queue_mask; + + qurt_mutex_lock(mutex); // lock empty queue + (void) qurt_anysignal_wait(signal, mask); // wait for an empty job node + unsigned int bitfield = qurt_anysignal_get(signal); + + // check if pool is being killed and return early + if (bitfield & (1 << WORKER_KILL_SIGNAL)) + { + qurt_mutex_unlock(mutex); + return AEE_ENOMORE; + } + + // send the job to the queue. + unsigned int sig_rx = Q6_R_ct0_R(mask & bitfield); // count trailing 0's to find first avail node + me->job[sig_rx] = job; // copy job descriptor + (void) qurt_anysignal_clear(signal, (1 << sig_rx)); // clear the empty job node flag + (void) qurt_anysignal_set(&me->queued_jobs, (1 << sig_rx)); // notify of pending job + qurt_mutex_unlock(mutex); // unlock the mutex + + return 0; +} + +void worker_pool_destructor() +{ + FARF(HIGH, "In worker_pool destructor"); + + worker_pool_deinit(&static_context); +} + +/*=========================================================================== + GLOBAL FUNCTION +===========================================================================*/ +// initialize a synctoken - caller will wait on the synctoken and each job will release it. +// caller wakes when all jobs have released. +void worker_pool_synctoken_init(worker_synctoken_t *token, unsigned int njobs) +{ + // cast input to usable struct + internal_synctoken_t *internal_token = (internal_synctoken_t *) token; + + // initialize atomic counter and semaphore + internal_token->sync.atomic_countdown = njobs; + qurt_sem_init_val(&internal_token->sync.sem, 0); +} + +// worker job responsible for calling this function to count down completed jobs. +void worker_pool_synctoken_jobdone(worker_synctoken_t *token) +{ + // cast input to usable struct + internal_synctoken_t *internal_token = (internal_synctoken_t *) token; + + // count down atomically, and raise semaphore if last job. + if (0 == worker_pool_atomic_dec_return(&internal_token->sync.atomic_countdown)) + { + (void) qurt_sem_up(&internal_token->sync.sem); + } +} + +// job submitter waits on this function for all jobs to complete. +void worker_pool_synctoken_wait(worker_synctoken_t *token) +{ + // cast input to usable struct + internal_synctoken_t *internal_token = (internal_synctoken_t *) token; + + // Wait for all jobs to finish and raise the semaphore + (void) qurt_sem_down(&internal_token->sync.sem); + + // clean up the semaphore + (void) qurt_sem_destroy(&internal_token->sync.sem); +} + +AEEResult worker_pool_set_thread_priority(worker_pool_context_t context, unsigned int prio) +{ + worker_pool_t *me = (worker_pool_t*)context; + + // if no worker pool exists, return error. 
+    if (NULL == me)
+    {
+        return AEE_ENOMORE;
+    }
+
+    int result = AEE_SUCCESS;
+    if (prio < 1) prio = 1;
+    if (prio > LOWEST_USABLE_QURT_PRIO) prio = LOWEST_USABLE_QURT_PRIO;
+    for (unsigned int i = 0; i < me->num_workers; i++)
+    {
+        int res = qurt_thread_set_priority(me->thread[i], (unsigned short)prio);
+        if (0 != res)
+        {
+            result = AEE_EBADPARM;
+            FARF(ERROR, "QURT failed to set priority of thread %d, ERROR = %d", me->thread[i], res);
+        }
+    }
+    return result;
+}
+
+AEEResult worker_pool_retrieve_threadID(worker_pool_context_t context, unsigned int* threadIDs) {
+
+    worker_pool_t *me = (worker_pool_t*)context;
+    if (me == NULL)
+    {
+        FARF(ERROR, "Context NULL in RetrieveThreadID");
+        return AEE_EBADPARM;
+    }
+
+    for (unsigned int i = 0; i < me->num_workers; i++)
+    {
+        threadIDs[i] = me->thread[i];
+        FARF(MEDIUM, "Inside RetrieveThreadID threadIDs[%d] is %d", i, threadIDs[i]);
+    }
+    return AEE_SUCCESS;
+}
+
+
+AEEResult worker_pool_get_thread_priority(worker_pool_context_t context, unsigned int *prio)
+{
+    worker_pool_t *me = (worker_pool_t*)context;
+
+    // if NULL is passed as context, share static_context's priority.
+    if (NULL == me)
+    {
+        if (static_context == NULL)
+            return AEE_ENOMORE;
+        FARF(HIGH, "Using default static worker pool");
+        me = (worker_pool_t*)static_context;
+    }
+
+    int priority = qurt_thread_get_priority(me->thread[0]);
+    if (priority > 0)
+    {
+        *prio = priority;
+        return 0;
+    }
+    else
+    {
+        *prio = 0;
+        return AEE_EBADSTATE;
+    }
+}
diff --git a/ggml/src/ggml-hexagon/kernels/worker_pool.h b/ggml/src/ggml-hexagon/kernels/worker_pool.h
new file mode 100755
index 0000000000000..701cbf6215f43
--- /dev/null
+++ b/ggml/src/ggml-hexagon/kernels/worker_pool.h
@@ -0,0 +1,329 @@
+#ifndef WORKER_H
+#define WORKER_H
+
+/**=============================================================================
+
+@file
+   worker_pool.h
+
+@brief
+   Utility providing a thread worker pool for multi-threaded computer vision
+   (or other compute) applications.
+
+Copyright (c) 2019-2020 Qualcomm Technologies Incorporated.
+All Rights Reserved. Qualcomm Proprietary and Confidential.
+
+Export of this technology or software is regulated by the U.S.
+Government. Diversion contrary to U.S. law prohibited.
+
+All ideas, data and information contained in or disclosed by
+this document are confidential and proprietary information of
+Qualcomm Technologies Incorporated and all rights therein are expressly reserved.
+By accepting this material the recipient agrees that this material
+and the information contained therein are held in confidence and in
+trust and will not be used, copied, reproduced in whole or in part,
+nor its contents revealed in any manner to others without the express
+written permission of Qualcomm Technologies Incorporated.
+
+=============================================================================**/
+//==============================================================================
+// Defines
+//==============================================================================
+/// Macro marking functions as visible when built into a shared library.
+#define WORKERPOOL_API __attribute__ ((visibility ("default")))
+
+//==============================================================================
+// Include Files
+//==============================================================================
+
+#include <AEEStdDef.h>
+#include <AEEStdErr.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*===========================================================================
+    TYPEDEF
+===========================================================================*/
+/// signature of callbacks to be invoked by worker threads
+typedef void (*worker_callback_t)(void*);
+
+/// Typedef of worker_pool context
+typedef void* worker_pool_context_t;
+
+/// descriptor for requested callback
+typedef struct
+{
+    /// function pointer
+    worker_callback_t fptr;
+    /// data pointer
+    void* dptr;
+} worker_pool_job_t;
+
+/// opaque client view of synchronization token for job submitter and workers. Internals hidden in implementation.
+typedef struct
+{
+    /// opaque array to store synchronization token for job
+    unsigned int dummy[8]; // large enough to hold a counter and a semaphore
+} worker_synctoken_t __attribute__((aligned(8)));
+
+/*===========================================================================
+    CONSTANTS
+===========================================================================*/
+/// Maximum supported number of worker threads.
+#define MAX_NUM_WORKERS 8
+/// Number of workers
+WORKERPOOL_API extern unsigned int num_workers;
+/// Maximum number of HVX 128-byte units available
+WORKERPOOL_API extern unsigned int num_hvx128_contexts;
+
+//==============================================================================
+// Declarations
+//==============================================================================
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Initialize a worker pool. Should be called by each control thread that
+///   requires its own worker pool.
+///
+///
+/// @param *context
+///   pointer to worker_pool_context_t variable.
+///
+/// @return
+///   0 - success.
+///   any other value - failure.
+//---------------------------------------------------------------------------
+WORKERPOOL_API AEEResult
+worker_pool_init(worker_pool_context_t *context);
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Initialize a worker pool with a custom stack size for the worker threads.
+///   Should be called by each control thread that requires its own worker pool.
+///
+///
+/// @param *context
+///   pointer to worker_pool_context_t variable.
+/// @param stack_size
+///   stack size of each worker thread.
+///
+/// @return
+///   0 - success.
+///   any other value - failure.
+//---------------------------------------------------------------------------
+WORKERPOOL_API AEEResult
+worker_pool_init_with_stack_size(worker_pool_context_t *context, int stack_size);
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Kill worker threads and release worker pool resources. Must be called
+///   when the pool owner no longer requires the pool.
+///
+///
+/// @param *context
+///   worker_pool_context_t.
+///
+//---------------------------------------------------------------------------
+WORKERPOOL_API void
+worker_pool_deinit(worker_pool_context_t *context);
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Function to determine if there is an established worker pool available to
+///   the calling thread.
+///   This is an optional call: if no pool is available but jobs are submitted
+///   anyway, everything still works, with jobs running in the client's context
+///   (instead of a worker's context).
+///
+///
+/// @param context
+///   worker_pool_context_t.
+///
+/// @return
+///   0 - no worker pool available.
+///   any other value - worker pool available.
+//---------------------------------------------------------------------------
+WORKERPOOL_API AEEResult
+worker_pool_available(worker_pool_context_t context);
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Submit a job to the worker pool.
+///
+///
+/// @param context
+///   worker pool context where job is to be submitted.
+///
+/// @param job
+///   callback function pointer and data.
+///
+/// @return
+///   0 - success.
+///   any other value - failure.
+//---------------------------------------------------------------------------
+WORKERPOOL_API AEEResult
+worker_pool_submit(worker_pool_context_t context, worker_pool_job_t job);
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Initialize a synchronization token for job submitter and workers to use.
+///   Each worker callback must be given access to the token to release it, and
+///   the job submitter will wait for all jobs to release the token. Internals
+///   are hidden from the client.
+///
+///
+/// @param token
+///   pointer to the synctoken structure.
+///
+/// @param njobs
+///   number of jobs that will be releasing the token
+//---------------------------------------------------------------------------
+WORKERPOOL_API void
+worker_pool_synctoken_init(worker_synctoken_t *token, unsigned int njobs);
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Needs to be called by the worker in the callback before exiting. The
+///   token must be available to the callback via the data pointer given
+///   to the callback during job submission.
+///
+///
+/// @param token
+///   pointer to the synctoken structure held by the job submitter
+//---------------------------------------------------------------------------
+WORKERPOOL_API void
+worker_pool_synctoken_jobdone(worker_synctoken_t *token);
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Job submitter calls this function after submitting all jobs to await
+///   their completion.
+///
+///
+/// @param token
+///   pointer to the synctoken structure
+//---------------------------------------------------------------------------
+WORKERPOOL_API void
+worker_pool_synctoken_wait(worker_synctoken_t *token);
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Set the thread priority of the worker threads. The specified priority will
+///   be applied to all threads in the default worker pool. The threads
+///   that service boosted and background job requests will also be adjusted to be relative
+///   to the new default thread priority.
+///
+///
+/// @param context
+///   worker pool context whose workers' priorities are to be changed.
+///
+/// @param prio
+///   desired priority. 1 is the highest priority allowed. 255 is the lowest priority allowed.
+///
+/// @return
+///   0 - success.
+///   any other value - failure.
+//---------------------------------------------------------------------------
+WORKERPOOL_API AEEResult
+worker_pool_set_thread_priority(worker_pool_context_t context, unsigned int prio);
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Query the thread priority of the default worker threads. This will return
+///   the current priority for one of the workers, which are all created
+///   with the same priority. If a user callback has changed one or more worker
+///   threads independently, there is no guarantee on which worker's priority is
+///   returned by this function.
+///
+///
+/// @param context
+///   worker pool context whose workers' priority is queried.
+///
+/// @param prio
+///   pointer where the current priority is written. 1 is the highest priority
+///   allowed. 255 is the lowest priority allowed.
+///
+/// @return
+///   0 - success.
+///   any other value - failure.
+//---------------------------------------------------------------------------
+WORKERPOOL_API AEEResult
+worker_pool_get_thread_priority(worker_pool_context_t context, unsigned int *prio);
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Utility inline to atomically increment a variable. Useful in
+///   synchronizing jobs among worker threads, in cases where all
+///   job-related info can be determined by the job number.
+///
+///
+/// @param target
+///   pointer to the variable being incremented
+///
+/// @return
+///   the value after incrementing
+//---------------------------------------------------------------------------
+static inline unsigned int
+worker_pool_atomic_inc_return(unsigned int *target)
+{
+    unsigned int result;
+    __asm__ __volatile__(
+        "1: %0 = memw_locked(%2)\n"
+        "   %0 = add(%0, #1)\n"
+        "   memw_locked(%2, p0) = %0\n"
+        "   if !p0 jump 1b\n"
+        : "=&r" (result), "+m" (*target)
+        : "r" (target)
+        : "p0");
+    return result;
+}
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Utility inline to atomically decrement a variable.
+///
+///
+/// @param target
+///   pointer to the variable being decremented
+///
+/// @return
+///   the value after decrementing
+//---------------------------------------------------------------------------
+static inline unsigned int
+worker_pool_atomic_dec_return(unsigned int *target)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1: %0 = memw_locked(%2)\n"
+        "   %0 = add(%0, #-1)\n"
+        "   memw_locked(%2, p0) = %0\n"
+        "   if !p0 jump 1b\n"
+        : "=&r" (result), "+m" (*target)
+        : "r" (target)
+        : "p0");
+    return result;
+}
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Queries and returns the thread IDs of all the active threads in the worker pool.
+///
+///
+/// @param context
+///   worker pool context whose workers' IDs are requested.
+///
+/// @param threadIDs
+///   pointer to an array created by the user where the thread IDs will be written.
+///
+/// @return
+///   0 - success.
+/// 0E - Invalid parameter +//--------------------------------------------------------------------------- +WORKERPOOL_API AEEResult +worker_pool_retrieve_threadID(worker_pool_context_t context, unsigned int* threadIDs); +#ifdef __cplusplus +} +#endif + +#endif // #ifndef WORKER_H diff --git a/ggml/src/ggml-hip/CMakeLists.txt b/ggml/src/ggml-hip/CMakeLists.txt index 1fe8fe3b8d079..e29df98560e07 100644 --- a/ggml/src/ggml-hip/CMakeLists.txt +++ b/ggml/src/ggml-hip/CMakeLists.txt @@ -113,6 +113,10 @@ if (GGML_HIP_ROCWMMA_FATTN) add_compile_definitions(GGML_HIP_ROCWMMA_FATTN) endif() +if (GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 OR ${hip_VERSION} VERSION_GREATER_EQUAL 7.0) + add_compile_definitions(GGML_HIP_ROCWMMA_FATTN_GFX12) +endif() + if (NOT GGML_CUDA_FA) add_compile_definitions(GGML_CUDA_NO_FA) endif() diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 89b59d9aadc7e..57761644f431a 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -32,6 +32,8 @@ extern "C" { #endif +void ggml_print_backtrace(void); + #ifndef MIN # define MIN(a, b) ((a) < (b) ? (a) : (b)) #endif @@ -315,203 +317,81 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1); GGML_API void * ggml_aligned_malloc(size_t size); GGML_API void ggml_aligned_free(void * ptr, size_t size); -// FP16 to FP32 conversion - -// 16-bit float -// on Arm, we use __fp16 -// on x86, we use uint16_t -// -// for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616 -// for MUSA compilers , we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843 -// -#if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__) - #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) - #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) - - #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) - - static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - __fp16 tmp; - memcpy(&tmp, &h, sizeof(ggml_fp16_t)); - return (float)tmp; - } - - static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { - ggml_fp16_t res; - __fp16 tmp = f; - memcpy(&res, &tmp, sizeof(ggml_fp16_t)); - return res; - } - -#elif defined(__F16C__) - - #ifdef _MSC_VER - #define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x))) - #define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0) - #else - #define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x) - #define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0) - #endif - -#elif defined(__POWER9_VECTOR__) - - #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) - #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) - /* the inline asm below is about 12% faster than the lookup method */ - #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) - #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) - - static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - float f; - double d; - __asm__( - "mtfprd %0,%2\n" - "xscvhpdp %0,%0\n" - "frsp %1,%0\n" : - /* temp */ "=d"(d), - /* out */ "=f"(f): - /* in */ "r"(h)); - return f; - } - - static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { - double d; - ggml_fp16_t r; - __asm__( /* xscvdphp can work on double or single precision */ - "xscvdphp %0,%2\n" - "mffprd %1,%0\n" : - /* temp */ "=d"(d), - /* out */ "=r"(r): - /* in */ "f"(f)); - return r; - } - -#elif defined(__riscv) && defined(__riscv_zfhmin) +// FP16 <-> FP32 +// 
ref: https://github.com/Maratyszcza/FP16 - static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - float f; - __asm__( - "fmv.h.x %[f], %[h]\n\t" - "fcvt.s.h %[f], %[f]" - : [f] "=&f" (f) - : [h] "r" (h) - ); - return f; - } +static inline float fp32_from_bits(uint32_t w) { + union { + uint32_t as_bits; + float as_value; + } fp32; + fp32.as_bits = w; + return fp32.as_value; +} - static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { - ggml_fp16_t res; - __asm__( - "fcvt.h.s %[f], %[f]\n\t" - "fmv.x.h %[h], %[f]" - : [h] "=&r" (res) - : [f] "f" (f) - ); - return res; - } +static inline uint32_t fp32_to_bits(float f) { + union { + float as_value; + uint32_t as_bits; + } fp32; + fp32.as_value = f; + return fp32.as_bits; +} - #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) - #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) - #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) - #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) +static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { + const uint32_t w = (uint32_t) h << 16; + const uint32_t sign = w & UINT32_C(0x80000000); + const uint32_t two_w = w + w; + const uint32_t exp_offset = UINT32_C(0xE0) << 23; +#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L) + const float exp_scale = 0x1.0p-112f; #else + const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); +#endif + const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; - // FP16 <-> FP32 - // ref: https://github.com/Maratyszcza/FP16 - - static inline float fp32_from_bits(uint32_t w) { - union { - uint32_t as_bits; - float as_value; - } fp32; - fp32.as_bits = w; - return fp32.as_value; - } - - static inline uint32_t fp32_to_bits(float f) { - union { - float as_value; - uint32_t as_bits; - } fp32; - fp32.as_value = f; - return fp32.as_bits; - } - - static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - const uint32_t w = (uint32_t) h << 16; - const uint32_t sign = w & UINT32_C(0x80000000); - const uint32_t two_w = w + w; - - const uint32_t exp_offset = UINT32_C(0xE0) << 23; - #if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L) - const float exp_scale = 0x1.0p-112f; - #else - const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); - #endif - const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; - - const uint32_t magic_mask = UINT32_C(126) << 23; - const float magic_bias = 0.5f; - const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; - - const uint32_t denormalized_cutoff = UINT32_C(1) << 27; - const uint32_t result = sign | - (two_w < denormalized_cutoff ? 
fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); - return fp32_from_bits(result); - } + const uint32_t magic_mask = UINT32_C(126) << 23; + const float magic_bias = 0.5f; + const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; - static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { - #if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L) - const float scale_to_inf = 0x1.0p+112f; - const float scale_to_zero = 0x1.0p-110f; - #else - const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); - const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); - #endif - float base = (fabsf(f) * scale_to_inf) * scale_to_zero; - - const uint32_t w = fp32_to_bits(f); - const uint32_t shl1_w = w + w; - const uint32_t sign = w & UINT32_C(0x80000000); - uint32_t bias = shl1_w & UINT32_C(0xFF000000); - if (bias < UINT32_C(0x71000000)) { - bias = UINT32_C(0x71000000); - } + const uint32_t denormalized_cutoff = UINT32_C(1) << 27; + const uint32_t result = sign | + (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); + return fp32_from_bits(result); +} - base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; - const uint32_t bits = fp32_to_bits(base); - const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); - const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); - const uint32_t nonsign = exp_bits + mantissa_bits; - return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); +static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { +#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L) + const float scale_to_inf = 0x1.0p+112f; + const float scale_to_zero = 0x1.0p-110f; +#else + const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); + const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); +#endif + float base = (fabsf(f) * scale_to_inf) * scale_to_zero; + + const uint32_t w = fp32_to_bits(f); + const uint32_t shl1_w = w + w; + const uint32_t sign = w & UINT32_C(0x80000000); + uint32_t bias = shl1_w & UINT32_C(0xFF000000); + if (bias < UINT32_C(0x71000000)) { + bias = UINT32_C(0x71000000); } - #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) - #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) - -#endif // defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__) - -// precomputed f32 table for f16 (256 KB) -// defined in ggml.c, initialized in ggml_init() -GGML_API float ggml_table_f32_f16[1 << 16]; - -// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32, -// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON. -// This is also true for POWER9. 
-#if !defined(GGML_FP16_TO_FP32) -inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { - uint16_t s; - memcpy(&s, &f, sizeof(uint16_t)); - return ggml_table_f32_f16[s]; + base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; + const uint32_t bits = fp32_to_bits(base); + const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); + const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); + const uint32_t nonsign = exp_bits + mantissa_bits; + return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); } -#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) -#endif +#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) +#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) -#if !defined(GGML_FP32_TO_FP16) +#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) -#endif /** * Converts brain16 to float32. diff --git a/ggml/src/ggml-metal/CMakeLists.txt b/ggml/src/ggml-metal/CMakeLists.txt index e222327809c31..77187efc1756d 100644 --- a/ggml/src/ggml-metal/CMakeLists.txt +++ b/ggml/src/ggml-metal/CMakeLists.txt @@ -44,21 +44,22 @@ if (GGML_METAL_EMBED_LIBRARY) set(METALLIB_SOURCE_EMBED_TMP "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal.tmp") add_custom_command( - OUTPUT ${METALLIB_EMBED_ASM} + OUTPUT "${METALLIB_EMBED_ASM}" COMMAND echo "Embedding Metal library" - COMMAND sed -e '/__embed_ggml-common.h__/r ${METALLIB_COMMON}' -e '/__embed_ggml-common.h__/d' < ${METALLIB_SOURCE} > ${METALLIB_SOURCE_EMBED_TMP} - COMMAND sed -e '/\#include \"ggml-metal-impl.h\"/r ${METALLIB_IMPL}' -e '/\#include \"ggml-metal-impl.h\"/d' < ${METALLIB_SOURCE_EMBED_TMP} > ${METALLIB_SOURCE_EMBED} - COMMAND echo ".section __DATA,__ggml_metallib" > ${METALLIB_EMBED_ASM} - COMMAND echo ".globl _ggml_metallib_start" >> ${METALLIB_EMBED_ASM} - COMMAND echo "_ggml_metallib_start:" >> ${METALLIB_EMBED_ASM} - COMMAND echo ".incbin \\\"${METALLIB_SOURCE_EMBED}\\\"" >> ${METALLIB_EMBED_ASM} - COMMAND echo ".globl _ggml_metallib_end" >> ${METALLIB_EMBED_ASM} - COMMAND echo "_ggml_metallib_end:" >> ${METALLIB_EMBED_ASM} + COMMAND sed -e "/__embed_ggml-common.h__/r ${METALLIB_COMMON}" -e "/__embed_ggml-common.h__/d" < "${METALLIB_SOURCE}" > "${METALLIB_SOURCE_EMBED_TMP}" + COMMAND sed -e "/\#include \"ggml-metal-impl.h\"/r ${METALLIB_IMPL}" -e "/\#include \"ggml-metal-impl.h\"/d" < "${METALLIB_SOURCE_EMBED_TMP}" > "${METALLIB_SOURCE_EMBED}" + COMMAND echo ".section __DATA,__ggml_metallib" > "${METALLIB_EMBED_ASM}" + COMMAND echo ".globl _ggml_metallib_start" >> "${METALLIB_EMBED_ASM}" + COMMAND echo "_ggml_metallib_start:" >> "${METALLIB_EMBED_ASM}" + COMMAND echo .incbin "\"${METALLIB_SOURCE_EMBED}\"" >> "${METALLIB_EMBED_ASM}" + COMMAND echo ".globl _ggml_metallib_end" >> "${METALLIB_EMBED_ASM}" + COMMAND echo "_ggml_metallib_end:" >> "${METALLIB_EMBED_ASM}" DEPENDS ../ggml-common.h ggml-metal.metal ggml-metal-impl.h COMMENT "Generate assembly for embedded Metal library" + VERBATIM ) - target_sources(ggml-metal PRIVATE ${METALLIB_EMBED_ASM}) + target_sources(ggml-metal PRIVATE "${METALLIB_EMBED_ASM}") else() if (GGML_METAL_SHADER_DEBUG) # custom command to do the following: diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index f78e7eee553b6..d8d30cc0b41ca 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -48,22 +48,28 @@ int mtl_device_ref_count; id mtl_library; + NSLock * mtl_lock; + bool 
has_simdgroup_reduction; bool has_simdgroup_mm; bool has_residency_sets; bool has_bfloat; bool use_bfloat; + size_t max_size; + char name[128]; } g_ggml_ctx_dev_main = { /*.mtl_device =*/ nil, /*.mtl_device_ref_count =*/ 0, /*.mtl_library =*/ nil, + /*.mtl_lock =*/ nil, /*.has_simdgroup_reduction =*/ false, /*.has_simdgroup_mm =*/ false, /*.has_residency_sets =*/ false, /*.has_bfloat =*/ false, /*.use_bfloat =*/ false, + /*.max_size =*/ 0, /*.name =*/ "", }; @@ -71,6 +77,10 @@ static id ggml_backend_metal_device_acq(struct ggml_backend_metal_device_context * ctx) { assert(ctx != NULL); + if (ctx->mtl_lock == nil) { + ctx->mtl_lock = [[NSLock alloc] init]; + } + if (ctx->mtl_device == nil) { ctx->mtl_device = MTLCreateSystemDefaultDevice(); } @@ -94,6 +104,8 @@ ctx->use_bfloat = false; #endif + ctx->max_size = ctx->mtl_device.maxBufferLength; + strncpy(ctx->name, [[ctx->mtl_device name] UTF8String], sizeof(ctx->name) - 1); } @@ -110,6 +122,11 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte ctx->mtl_device_ref_count--; if (ctx->mtl_device_ref_count == 0) { + if (ctx->mtl_lock) { + [ctx->mtl_lock release]; + ctx->mtl_lock = nil; + } + if (ctx->mtl_library) { [ctx->mtl_library release]; ctx->mtl_library = nil; @@ -194,11 +211,14 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte GGML_METAL_KERNEL_TYPE_RWKV_WKV6_F32, GGML_METAL_KERNEL_TYPE_RWKV_WKV7_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32_C4, GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_C4, GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW, GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4, GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16, GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_C4, GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW, GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4, GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_BF16, @@ -498,6 +518,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte GGML_METAL_KERNEL_TYPE_COS, GGML_METAL_KERNEL_TYPE_NEG, GGML_METAL_KERNEL_TYPE_SUM_ROWS, + GGML_METAL_KERNEL_TYPE_MEAN, GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32, GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32, GGML_METAL_KERNEL_TYPE_ARGMAX, @@ -976,7 +997,7 @@ @implementation GGMLMetalClass struct ggml_backend_metal_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_context)); struct ggml_backend_metal_device_context * ctx_dev = dev->context; - id device = ggml_backend_metal_device_acq(ctx_dev); + id device = ctx_dev->mtl_device; GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]); @@ -990,9 +1011,16 @@ @implementation GGMLMetalClass ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT); // load library - if (ctx_dev->mtl_library == nil) { - ctx_dev->mtl_library = ggml_metal_load_library(device, ctx_dev->use_bfloat); + { + [ctx_dev->mtl_lock lock]; + + if (ctx_dev->mtl_library == nil) { + ctx_dev->mtl_library = ggml_metal_load_library(device, ctx_dev->use_bfloat); + } + + [ctx_dev->mtl_lock unlock]; } + id metal_library = ctx_dev->mtl_library; if (metal_library == nil) { GGML_LOG_ERROR("%s: error: metal library is nil\n", __func__); @@ -1150,11 +1178,14 @@ @implementation GGMLMetalClass GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RWKV_WKV6_F32, rwkv_wkv6_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RWKV_WKV7_F32, rwkv_wkv7_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32, mul_mv_f32_f32, 
has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32_C4, mul_mv_f32_f32_c4, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32, mul_mv_bf16_f32, has_simdgroup_reduction && use_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_C4, mul_mv_bf16_f32_c4, use_bfloat); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW, mul_mv_bf16_f32_1row, has_simdgroup_reduction && use_bfloat); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4, mul_mv_bf16_f32_l4, has_simdgroup_reduction && use_bfloat); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_BF16, mul_mv_bf16_bf16, has_simdgroup_reduction && use_bfloat); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32, mul_mv_f16_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_C4, mul_mv_f16_f32_c4, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW, mul_mv_f16_f32_1row, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4, mul_mv_f16_f32_l4, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16, mul_mv_f16_f16, has_simdgroup_reduction); @@ -1454,6 +1485,7 @@ @implementation GGMLMetalClass GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS, cos, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NEG, neg, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MEAN, mean, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGMAX, argmax, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32, pool_2d_avg_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32, pool_2d_max_f32, true); @@ -1653,6 +1685,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex case GGML_OP_LOG: return false; // TODO: implement case GGML_OP_SUM_ROWS: + case GGML_OP_MEAN: case GGML_OP_SOFT_MAX: case GGML_OP_GROUP_NORM: return has_simdgroup_reduction && ggml_is_contiguous(op->src[0]); @@ -2400,11 +2433,31 @@ static bool ggml_metal_encode_node( [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; case GGML_OP_SUM_ROWS: + case GGML_OP_MEAN: { GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type)); - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUM_ROWS].pipeline; + id pipeline = nil; + + switch (dst->op) { + case GGML_OP_SUM_ROWS: + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUM_ROWS].pipeline; + break; + case GGML_OP_MEAN: + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MEAN].pipeline; + break; + default: + GGML_ABORT("fatal error"); + } + + int nth = 32; // SIMD width + + while (nth < ne00 && nth < (int) pipeline.maxTotalThreadsPerThreadgroup) { + nth *= 2; + } + nth = MIN(nth, (int) pipeline.maxTotalThreadsPerThreadgroup); + nth = MIN(nth, ne00); ggml_metal_kargs_sum_rows args = { /*.ne00 =*/ ne00, @@ -2434,11 +2487,12 @@ static bool ggml_metal_encode_node( }; [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&args length:sizeof(args) atIndex:2]; + [encoder setBytes:&args length:sizeof(args) atIndex:0]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; - [encoder 
dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; case GGML_OP_SOFT_MAX: { @@ -3063,14 +3117,23 @@ static bool ggml_metal_encode_node( nsg = 1; nr0 = 1; nr1 = 4; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32].pipeline; + if (ne00 == 4) { + nr0 = 32; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32_C4].pipeline; + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32].pipeline; + } } break; case GGML_TYPE_F16: { nsg = 1; nr0 = 1; if (src1t == GGML_TYPE_F32) { - if (ne11 * ne12 < 4) { + if (ne00 == 4) { + nr0 = 32; + nr1 = 4; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_C4].pipeline; + } else if (ne11 * ne12 < 4) { pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW].pipeline; } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) { pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4].pipeline; @@ -3089,7 +3152,11 @@ static bool ggml_metal_encode_node( nsg = 1; nr0 = 1; if (src1t == GGML_TYPE_F32) { - if (ne11 * ne12 < 4) { + if (ne00 == 4) { + nr0 = 32; + nr1 = 4; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_C4].pipeline; + } else if (ne11 * ne12 < 4) { pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW].pipeline; } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) { pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4].pipeline; @@ -3733,6 +3800,7 @@ static bool ggml_metal_encode_node( nth *= 2; } + nth = MIN(nth, (int) pipeline.maxTotalThreadsPerThreadgroup); nth = MIN(nth, ne00/4); ggml_metal_kargs_rms_norm args = { @@ -3769,6 +3837,7 @@ static bool ggml_metal_encode_node( nth *= 2; } + nth = MIN(nth, (int) pipeline.maxTotalThreadsPerThreadgroup); nth = MIN(nth, ne00/4); ggml_metal_kargs_l2_norm args = { @@ -3841,6 +3910,7 @@ static bool ggml_metal_encode_node( nth *= 2; } + nth = MIN(nth, (int) pipeline.maxTotalThreadsPerThreadgroup); nth = MIN(nth, ne00/4); ggml_metal_kargs_norm args = { @@ -4766,6 +4836,8 @@ static bool ggml_metal_encode_node( GGML_ASSERT(nqptg % 8 == 0); GGML_ASSERT(ncpsg % 32 == 0); + const int is_q = ggml_is_quantized(src1->type) ? 
1 : 0;
+
     // 2*(2*ncpsg + nqptg)*(nsg)
     // ncpsg soft_max values + ncpsg mask values + a diagonal scaling matrix (in float)
     //
@@ -4773,7 +4845,7 @@
     // the shared memory needed for the simdgroups to load the KV cache
     // each thread loads (dequantizes) 16 head elements, there are 32 threads in th SG
     //
-#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(ne00 + 2*(2*ncpsg + nqptg)*(nsg)) + 16*32*(nsg))*(sizeof(float)/2), 16))
+#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(2*ne00 + 2*(2*ncpsg + nqptg)*(nsg)) + is_q*(16*32*(nsg)))*(sizeof(float)/2), 16))

     int64_t nsgmax = 2;

@@ -4810,9 +4882,9 @@
             // and store the soft_max values and the mask
             //
             // ne00*(nsg)
-            // each simdgroup has a full f16 head vector in shared mem to accumulate results
+            // each simdgroup has a full f32 head vector in shared mem to accumulate results
             //
-#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(GGML_PAD(ne00, 128) + 4*ncpsg*(nsg)) + ne20*(nsg))*(sizeof(float)/2), 16))
+#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(GGML_PAD(ne00, 128) + 4*ncpsg*(nsg)) + 2*ne20*(nsg))*(sizeof(float)/2), 16))

             int64_t nsgmax = 2;

             while (true) {
@@ -4925,8 +4997,39 @@
                 default: GGML_ABORT("not implemented");
             }

+            GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);
+
+            // TODO: support
+            //const int32_t nk00 = ne00/ggml_blck_size(dst->type);
+            const int32_t nk00 = ne00;
+
+            int nth = 32; // SIMD width
+
+            while (nth < nk00 && nth < (int) pipeline.maxTotalThreadsPerThreadgroup) {
+                nth *= 2;
+            }
+
+            nth = MIN(nth, (int) pipeline.maxTotalThreadsPerThreadgroup);
+
+            // when rows are small, we can batch them together in a single threadgroup
+            int nrptg = 1;
+
+            // TODO: relax this constraint in the future
+            if (ggml_blck_size(src0->type) == 1 && ggml_blck_size(dst->type) == 1) {
+                if (nth > nk00) {
+                    nrptg = (nth + nk00 - 1)/nk00;
+                    nth   = nk00;
+
+                    if (nrptg*nth > (int) pipeline.maxTotalThreadsPerThreadgroup) {
+                        nrptg--;
+                    }
+                }
+            }
+
+            nth = MIN(nth, nk00);
+
             ggml_metal_kargs_cpy args = {
-                /*.ne00 =*/ ne00,
+                /*.ne00 =*/ nk00,
                 /*.ne01 =*/ ne01,
                 /*.ne02 =*/ ne02,
                 /*.ne03 =*/ ne03,
@@ -4949,11 +5052,7 @@
             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
             [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];

-            GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);
-            int nth = MIN(1024, ne00/ggml_blck_size(src0->type));
-
-            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-
+            [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nrptg - 1)/nrptg, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, nrptg, 1)];
         } break;
     case GGML_OP_SET:
         {
@@ -5259,7 +5358,6 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
     }

     ggml_backend_metal_buffer_rset_free(ctx);
-    ggml_backend_metal_device_rel(buffer->buft->device->context);

     if (ctx->owned) {
 #if TARGET_OS_OSX
@@ -5368,7 +5466,10 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
     }

     struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)buft->device->context;
-    id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
+
+    GGML_ASSERT(ctx_dev->mtl_device != nil);
+
+    id<MTLDevice> device = ctx_dev->mtl_device;

     ctx->all_data = ggml_metal_host_malloc(size_aligned);
     ctx->all_size = size_aligned;
@@ -5391,14 +5492,12 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
     if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) {
         GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
         free(ctx);
-        ggml_backend_metal_device_rel(ctx_dev);
         return NULL;
     }

     if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
         GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
         free(ctx);
-        ggml_backend_metal_device_rel(ctx_dev);
         return NULL;
     }

@@ -5409,17 +5508,14 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba

 static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
     return 32;
+
+    GGML_UNUSED(buft);
 }

 static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
-    id<MTLDevice> device = ggml_backend_metal_device_acq(buft->device->context);
-    const size_t max_size = device.maxBufferLength;
-    ggml_backend_metal_device_rel(buft->device->context);
+    const size_t max_size = ((struct ggml_backend_metal_device_context *)buft->device->context)->max_size;

     return max_size;
-
-    GGML_UNUSED(buft);
 }

 static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
@@ -5492,7 +5588,10 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
     }

     struct ggml_backend_metal_device_context * ctx_dev = &g_ggml_ctx_dev_main;
-    id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
+
+    GGML_ASSERT(ctx_dev->mtl_device != nil);
+
+    id<MTLDevice> device = ctx_dev->mtl_device;

     // the buffer fits into the max buffer size allowed by the device
     if (size_aligned <= device.maxBufferLength) {
@@ -5548,7 +5647,6 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
     if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
         GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
         free(ctx);
-        ggml_backend_metal_device_rel(ctx_dev);
         return NULL;
     }

@@ -5564,10 +5662,8 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
 }

 static void ggml_backend_metal_free(ggml_backend_t backend) {
-    struct ggml_backend_metal_context        * ctx     = backend->context;
-    struct ggml_backend_metal_device_context * ctx_dev = backend->device->context;
+    struct ggml_backend_metal_context * ctx = backend->context;

-    ggml_backend_metal_device_rel(ctx_dev);
     ggml_metal_free(ctx);

     free(backend);
@@ -5707,6 +5803,8 @@ bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) {
     struct ggml_backend_metal_device_context * ctx_dev = backend->device->context;

+    GGML_ASSERT(ctx_dev->mtl_device != nil);
+
     return [ctx_dev->mtl_device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
 }

@@ -5726,10 +5824,7 @@ void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) {
 }

 static const char * ggml_backend_metal_device_get_description(ggml_backend_dev_t dev) {
-    // acq/rel just to populate ctx->name in case it hasn't been done yet
     struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)dev->context;
-    ggml_backend_metal_device_acq(ctx_dev);
-    ggml_backend_metal_device_rel(ctx_dev);

     return ctx_dev->name;
 }

@@ -5737,12 +5832,10 @@ void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) {
 static void ggml_backend_metal_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
     if (@available(macOS 10.12, iOS 16.0, *)) {
         struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)dev->context;
-        id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
+        id<MTLDevice> device = ctx_dev->mtl_device;

         *total = device.recommendedMaxWorkingSetSize;
         *free  = *total - device.currentAllocatedSize;
-
-        ggml_backend_metal_device_rel(ctx_dev);
     } else {
         *free = 1;
         *total = 1;
@@ -5820,7 +5913,10 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back
     }

     struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)dev->context;
-    id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
+
+    GGML_ASSERT(ctx_dev->mtl_device != nil);
+
+    id<MTLDevice> device = ctx_dev->mtl_device;

     // the buffer fits into the max buffer size allowed by the device
     if (size_aligned <= device.maxBufferLength) {
@@ -5876,7 +5972,6 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back
     if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
         GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
         free(ctx);
-        ggml_backend_metal_device_rel(ctx_dev);
         return NULL;
     }

@@ -5890,8 +5985,9 @@ static bool ggml_backend_metal_device_supports_op(ggml_backend_dev_t dev, const
 }

 static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
-    return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name ||
-           buft->iface.get_name == ggml_backend_metal_buffer_from_ptr_type_get_name;
+    return
+        buft->iface.get_name == ggml_backend_metal_buffer_type_get_name ||
+        buft->iface.get_name == ggml_backend_metal_buffer_from_ptr_type_get_name;

     GGML_UNUSED(dev);
 }

@@ -5976,8 +6072,19 @@ static ggml_backend_dev_t ggml_backend_metal_reg_device_get(ggml_backend_reg_t r
     /* .get_proc_address = */ ggml_backend_metal_get_proc_address,
 };

+// called upon program exit
+static void ggml_metal_cleanup(void) {
+    ggml_backend_metal_device_rel(&g_ggml_ctx_dev_main);
+}
+
+// TODO: make thread-safe
 ggml_backend_reg_t ggml_backend_metal_reg(void) {
-    // TODO: make this thread-safe somehow?
+    ggml_backend_metal_device_acq(&g_ggml_ctx_dev_main);
+
+    // register cleanup callback
+    // TODO: not ideal, but not sure if there is a better way to do this in Objective-C
+    atexit(ggml_metal_cleanup);
+
     {
         g_ggml_backend_metal_reg = (struct ggml_backend_reg) {
             /* .api_version = */ GGML_BACKEND_API_VERSION,
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 59899550ed38c..5f004a856bde6 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -993,31 +993,61 @@ kernel void kernel_neg(
     dst[tpig] = -src0[tpig];
 }

+template <bool norm>
 kernel void kernel_sum_rows(
+        constant ggml_metal_kargs_sum_rows & args,
         device const float * src0,
         device       float * dst,
-        constant ggml_metal_kargs_sum_rows & args,
-        uint3 tpig[[thread_position_in_grid]]) {
-    int64_t i3 = tpig.z;
-    int64_t i2 = tpig.y;
-    int64_t i1 = tpig.x;
+        threadgroup  float * shmem_f32 [[threadgroup(0)]],
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
+        ushort  sgitg[[simdgroup_index_in_threadgroup]],
+        ushort  tiisg[[thread_index_in_simdgroup]],
+        ushort3   ntg[[threads_per_threadgroup]]) {
+    int64_t i3 = tgpig.z;
+    int64_t i2 = tgpig.y;
+    int64_t i1 = tgpig.x;

     if (i3 >= args.ne03 || i2 >= args.ne02 || i1 >= args.ne01) {
         return;
     }

+    if (sgitg == 0) {
+        shmem_f32[tiisg] = 0.0f;
+    }
+
     device const float * src_row = (device const float *) ((device const char *) src0 + i1*args.nb01 + i2*args.nb02 + i3*args.nb03);
     device       float * dst_row = (device       float *) ((device       char *) dst  + i1*args.nb1  + i2*args.nb2  + i3*args.nb3);

-    float row_sum = 0;
+    float sumf = 0;
+
+    for (int64_t i0 = tpitg.x; i0 < args.ne00; i0 += ntg.x) {
+        sumf += src_row[i0];
+    }
+
+    sumf = simd_sum(sumf);
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);

-    for (int64_t i0 = 0; i0 < args.ne00; i0++) {
-        row_sum += src_row[i0];
+    if (tiisg == 0) {
+        shmem_f32[sgitg] = sumf;
     }

-    dst_row[0] = row_sum;
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    sumf = shmem_f32[tiisg];
+    sumf = simd_sum(sumf);
+
+    if (tpitg.x == 0) {
+        dst_row[0] = norm ? sumf / args.ne00 : sumf;
+    }
 }

+typedef decltype(kernel_sum_rows<false>) kernel_sum_rows_t;
+
+template [[host_name("kernel_sum_rows")]] kernel kernel_sum_rows_t kernel_sum_rows<false>;
+template [[host_name("kernel_mean")]]     kernel kernel_sum_rows_t kernel_sum_rows<true>;
+
 template <typename T>
 kernel void kernel_soft_max(
         device const char * src0,
@@ -2502,6 +2532,70 @@ template [[host_name("kernel_mul_mv_bf16_f32")]] kernel mul_mv_t kernel_mul_mv<bfloat, bfloat4, float, float4>;
 template [[host_name("kernel_mul_mv_bf16_bf16")]] kernel mul_mv_t kernel_mul_mv<bfloat, bfloat4, bfloat, bfloat4>;
 #endif

+template <typename T04, typename T14, typename args_t>
+void kernel_mul_mv_c4_impl(
+        args_t args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3  tgpig,
+        ushort tiisg) {
+    const int r0 = tgpig.x*32 + tiisg;
+    const int rb = tgpig.y*N_MV_T_T;
+    const int im = tgpig.z;
+
+    if (r0 >= args.ne01) {
+        return;
+    }
+
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
+
+    const uint64_t offset0 = r0*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+
+    device const T04 * x = (device const T04 *) (src0 + offset0);
+
+    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1;
+
+    for (int row = 0; row < N_MV_T_T; ++row) {
+        int r1 = rb + row;
+        if (r1 >= args.ne11) {
+            break;
+        }
+
+        const uint64_t offset1 = r1*args.nb11 + (i12)*args.nb12 + (i13)*args.nb13;
+
+        device const T14 * y = (device const T14 *) (src1 + offset1);
+
+        dst_f32[(uint64_t)r1*args.ne0 + r0] = dot((float4) x[0], (float4) y[0]);
+    }
+}
+
+template <typename T04, typename T14>
+kernel void kernel_mul_mv_c4(
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]]) {
+    kernel_mul_mv_c4_impl<T04, T14, constant ggml_metal_kargs_mul_mv &>(
+        args,
+        src0,
+        src1,
+        dst,
+        tgpig,
+        tiisg);
+}
+
+typedef decltype(kernel_mul_mv_c4<float4, float4>) mul_mv_c4_t;
+
+template [[host_name("kernel_mul_mv_f32_f32_c4")]]  kernel mul_mv_c4_t kernel_mul_mv_c4<float4, float4>;
+template [[host_name("kernel_mul_mv_f16_f32_c4")]]  kernel mul_mv_c4_t kernel_mul_mv_c4<half4, float4>;
+#if defined(GGML_METAL_USE_BF16)
+template [[host_name("kernel_mul_mv_bf16_f32_c4")]] kernel mul_mv_c4_t kernel_mul_mv_c4<bfloat4, float4>;
+#endif
+
 template<typename T, typename T4>
 kernel void kernel_mul_mv_1row(
         constant ggml_metal_kargs_mul_mv & args,
@@ -3328,14 +3422,12 @@ kernel void kernel_flash_attn_ext(
     constexpr short NW = N_SIMDWIDTH;
     constexpr short SH = (2*C + Q); // shared memory per simdgroup (s_t == float)

-    const short TS = nsg*SH;    // shared memory size per query in (s_t == float)
-    const short T  = DK + 2*TS; // shared memory size per query in (half)
+    const short TS = nsg*SH;      // shared memory size per query in (s_t == float)
+    const short T  = 2*DK + 2*TS; // shared memory size per query in (half)

-    threadgroup q_t  * sq  = (threadgroup q_t  *) (shmem_f16 +              0*DK); // holds the query data
-    threadgroup q4_t * sq4 = (threadgroup q4_t *) (shmem_f16 +              0*DK); // same as above but in q4_t
-    threadgroup o_t  * so  = (threadgroup o_t  *) (shmem_f16 +              0*DK); // reuse query data for accumulation
-    threadgroup o4_t * so4 = (threadgroup o4_t *) (shmem_f16 +              0*DK); // same as above but in o4_t
-    threadgroup s_t  * ss  = (threadgroup s_t  *) (shmem_f16 + 2*sgitg*SH + Q*DK); // scratch buffer for attention, mask and diagonal matrix
+    threadgroup q_t  * sq  = (threadgroup q_t  *) (shmem_f16 +                0*DK); // holds the query data
+    threadgroup q4_t * sq4 = (threadgroup q4_t *) (shmem_f16 +                0*DK); // same as above but in q4_t
+    threadgroup s_t  * ss  = (threadgroup s_t  *) (shmem_f16 + 2*sgitg*SH + 2*Q*DK); // scratch buffer for attention, mask and diagonal
matrix threadgroup k_t * sk = (threadgroup k_t *) (shmem_f16 + sgitg*(4*16*KV) + Q*T); // scratch buffer to load K in shared memory threadgroup k4x4_t * sk4x4 = (threadgroup k4x4_t *) (shmem_f16 + sgitg*(4*16*KV) + Q*T); // same as above but in k4x4_t @@ -3354,7 +3446,7 @@ kernel void kernel_flash_attn_ext( if (iq1 + j < args.ne01) { sq4[j*DK4 + i] = (q4_t) q4[i]; } else { - sq4[j*DK4 + i] = (q4_t) 0.0f; + sq4[j*DK4 + i] = 0; } } } @@ -3548,20 +3640,20 @@ kernel void kernel_flash_attn_ext( // O = diag(ms)*O { - s8x8_t mm; - simdgroup_load(mm, ss + 2*C, TS, 0, false); + s8x8_t ms; + simdgroup_load(ms, ss + 2*C, TS, 0, false); #pragma unroll(DV8) for (short i = 0; i < DV8; ++i) { - simdgroup_multiply(lo[i], mm, lo[i]); + simdgroup_multiply(lo[i], ms, lo[i]); } } // O = O + (Q*K^T)*V { for (short cc = 0; cc < C/8; ++cc) { - s8x8_t ms; - simdgroup_load(ms, ss + 8*cc, TS, 0, false); + s8x8_t vs; + simdgroup_load(vs, ss + 8*cc, TS, 0, false); if (is_same::value) { // we can read directly from global memory @@ -3572,7 +3664,7 @@ kernel void kernel_flash_attn_ext( v8x8_t mv; simdgroup_load(mv, pv + i*8, args.nb21/sizeof(v_t), 0, false); // TODO: use ne20 - simdgroup_multiply_accumulate(lo[i], ms, mv, lo[i]); + simdgroup_multiply_accumulate(lo[i], vs, mv, lo[i]); } } else { for (short ii = 0; ii < DV16; ii += 4) { @@ -3593,10 +3685,10 @@ kernel void kernel_flash_attn_ext( v8x8_t mv; simdgroup_load(mv, sv + 16*k + 0*8, 4*16, 0, false); - simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], ms, mv, lo[2*(ii + k) + 0]); + simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], vs, mv, lo[2*(ii + k) + 0]); simdgroup_load(mv, sv + 16*k + 1*8, 4*16, 0, false); - simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], ms, mv, lo[2*(ii + k) + 1]); + simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], vs, mv, lo[2*(ii + k) + 1]); } } else { if (ii + tx < DV16) { @@ -3611,10 +3703,10 @@ kernel void kernel_flash_attn_ext( v8x8_t mv; simdgroup_load(mv, sv + 16*k + 0*8, 4*16, 0, false); - simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], ms, mv, lo[2*(ii + k) + 0]); + simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], vs, mv, lo[2*(ii + k) + 0]); simdgroup_load(mv, sv + 16*k + 1*8, 4*16, 0, false); - simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], ms, mv, lo[2*(ii + k) + 1]); + simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], vs, mv, lo[2*(ii + k) + 1]); } } } @@ -3624,93 +3716,89 @@ kernel void kernel_flash_attn_ext( } // these are needed for reducing the results from the simdgroups (reuse the ss buffer) - for (short j = 0; j < Q; ++j) { - if (tiisg == 0) { - ss[j*TS + 0] = S[j]; - ss[j*TS + 1] = M[j]; - } + for (short j = tiisg; j < Q; j += NW) { + ss[j*TS + 0] = S[j]; + ss[j*TS + 1] = M[j]; } } - // reduce the warps sequentially - for (ushort sg = 1; sg < nsg; ++sg) { - float S = { 0.0f }; - float M = { -__FLT_MAX__/2 }; + threadgroup_barrier(mem_flags::mem_threadgroup); - threadgroup_barrier(mem_flags::mem_threadgroup); + threadgroup float * so = (threadgroup float *) (shmem_f16 + 0*DK); // reuse query data for accumulation + threadgroup float4 * so4 = (threadgroup float4 *) (shmem_f16 + 0*DK); - // each simdgroup stores its output to shared memory, reusing sq - if (sgitg == sg) { - for (short i = 0; i < DV8; ++i) { - simdgroup_store(lo[i], so + i*8, DV, 0, false); - } + // store result to shared memory in F32 + if (sgitg == 0) { + for (short i = 0; i < DV8; ++i) { + //simdgroup_store(lo[i], so + i*8, DV, 0, false); + simdgroup_float8x8 t(1.0f); + simdgroup_multiply(t, lo[i], t); + simdgroup_store(t, so + i*8, DV, 0, 
false); } + } - threadgroup_barrier(mem_flags::mem_threadgroup); + threadgroup_barrier(mem_flags::mem_threadgroup); - // the first simdgroup accumulates the results from the other simdgroups - if (sgitg == 0) { - for (short j = 0; j < Q; ++j) { - const float S0 = ss[j*TS + 0]; - const float S1 = ss[j*TS + sg*SH + 0]; + // reduce the warps sequentially + for (ushort sg = 1; sg < nsg; ++sg) { + if (sgitg == sg) { + for (short j = tiisg; j < Q; j += NW) { + const float S0 = ss[j*TS - 1*SH + 0]; + const float S1 = ss[j*TS + 0]; - const float M0 = ss[j*TS + 1]; - const float M1 = ss[j*TS + sg*SH + 1]; + const float M0 = ss[j*TS - 1*SH + 1]; + const float M1 = ss[j*TS + 1]; - M = max(M0, M1); + const float M = max(M0, M1); - const float ms0 = exp(M0 - M); - const float ms1 = exp(M1 - M); + float ms0 = exp(M0 - M); + float ms1 = exp(M1 - M); - S = S0*ms0 + S1*ms1; + const float S = S0*ms0 + S1*ms1; - if (tiisg == 0) { - ss[j*TS + 0] = S; - ss[j*TS + 1] = M; + ss[j*TS + 0] = S; + ss[j*TS + 1] = M; - ss[j*TS + 2*C + j ] = ms0; - ss[j*TS + 2*C + j + sg*SH] = ms1; - } + ss[j*TS + 2*C + j - 1*SH] = ms0; + ss[j*TS + 2*C + j ] = ms1; } + //simdgroup_barrier(mem_flags::mem_threadgroup); + // O_0 = diag(ms0)*O_0 + diag(ms1)*O_1 { s8x8_t ms0; s8x8_t ms1; - simdgroup_load(ms0, ss + 2*C, TS, 0, false); - simdgroup_load(ms1, ss + 2*C + sg*SH, TS, 0, false); + simdgroup_load(ms0, ss + 2*C - 1*SH, TS, 0, false); + simdgroup_load(ms1, ss + 2*C, TS, 0, false); #pragma unroll(DV8) for (short i = 0; i < DV8; ++i) { - o8x8_t t; + simdgroup_float8x8 t; simdgroup_load (t, so + i*8, DV, 0, false); - simdgroup_multiply(t, ms1, t); + simdgroup_multiply(t, ms0, t); - simdgroup_multiply_accumulate(lo[i], ms0, lo[i], t); + simdgroup_multiply_accumulate(t, ms1, lo[i], t); + simdgroup_store(t, so + i*8, DV, 0, false); } } } - } - // store result to shared memory (reuse sq) - if (sgitg == 0) { - for (short i = 0; i < DV8; ++i) { - simdgroup_store(lo[i], so + i*8, DV, 0, false); - } + threadgroup_barrier(mem_flags::mem_threadgroup); } - device float4 * dst4 = (device float4 *) dst; + threadgroup s_t * sf = (threadgroup s_t *) (shmem_f16 + 2*(nsg-1)*SH + 2*Q*DK); // final rescale with 1/S and store to global memory - if (sgitg == 0) { - for (short j = 0; j < Q && iq1 + j < args.ne01; ++j) { - const float S = ss[j*TS + 0]; + for (short j = sgitg; j < Q && iq1 + j < args.ne01; j += nsg) { + const float S = 1.0f/sf[j*TS + 0]; - for (short i = tiisg; i < DV4; i += NW) { - dst4[((uint64_t)iq3*args.ne2*args.ne1 + iq2 + (uint64_t)(iq1 + j)*args.ne1)*DV4 + i] = (float4) so4[j*DV4 + i]/S; - } + device float4 * dst4 = (device float4 *) dst + ((uint64_t)iq3*args.ne2*args.ne1 + iq2 + (uint64_t)(iq1 + j)*args.ne1)*DV4; + + for (short i = tiisg; i < DV4; i += NW) { + dst4[i] = (float4) so4[j*DV4 + i]*S; } } } @@ -3719,12 +3807,22 @@ kernel void kernel_flash_attn_ext( // template to be able to explore different combinations // #define FA_TYPES \ - half, half4, simdgroup_half8x8, \ - half, half4x4, simdgroup_half8x8, \ - half, half4x4, simdgroup_half8x8, \ - float, simdgroup_float8x8, \ - float, simdgroup_float8x8, \ - half, half4, simdgroup_half8x8 + float, float4, simdgroup_float8x8, \ + half, half4x4, simdgroup_half8x8, \ + half, half4x4, simdgroup_half8x8, \ + float, simdgroup_float8x8, \ + float, simdgroup_float8x8, \ + half, half4, simdgroup_half8x8 + //float, float4, simdgroup_float8x8 + +#define FA_TYPES_BF \ + bfloat, bfloat4, simdgroup_bfloat8x8, \ + bfloat, bfloat4x4, simdgroup_bfloat8x8, \ + bfloat, bfloat4x4, simdgroup_bfloat8x8, \ 
+ float, simdgroup_float8x8, \ + float, simdgroup_float8x8, \ + half, half4, simdgroup_half8x8 + //float, float4, simdgroup_float8x8 typedef decltype(kernel_flash_attn_ext) flash_attn_ext_t; @@ -3739,15 +3837,15 @@ template [[host_name("kernel_flash_attn_ext_f16_h256")]] kernel flash_at template [[host_name("kernel_flash_attn_ext_f16_hk576_hv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; #if defined(GGML_METAL_USE_BF16) -template [[host_name("kernel_flash_attn_ext_bf16_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_bf16_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_bf16_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_bf16_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_bf16_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_bf16_h192")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_bf16_hk192_hv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_bf16_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_bf16_hk576_hv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_bf16_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_bf16_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_bf16_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_bf16_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_bf16_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_bf16_h192")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_bf16_hk192_hv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_bf16_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_bf16_hk576_hv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; #endif template [[host_name("kernel_flash_attn_ext_q4_0_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; @@ -3801,6 +3899,7 @@ template [[host_name("kernel_flash_attn_ext_q8_0_h256")]] kernel flash_at template [[host_name("kernel_flash_attn_ext_q8_0_hk576_hv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; #undef FA_TYPES +#undef FA_TYPES_BF template< typename q4_t, // query types in shared memory @@ -3847,12 +3946,12 @@ kernel void kernel_flash_attn_ext_vec( const short T = DK + nsg*SH; // shared memory size per query in (half) - //threadgroup q_t * sq = (threadgroup q_t *) (shmem_f16 + 0*DK); // holds the query data - threadgroup q4_t * sq4 = (threadgroup q4_t *) (shmem_f16 + 0*DK); // same as above but in q4_t - threadgroup s_t * ss = (threadgroup s_t *) (shmem_f16 + sgitg*SH + Q*DK); // scratch buffer for attention - threadgroup s4_t * ss4 = (threadgroup s4_t *) (shmem_f16 + sgitg*SH + Q*DK); // same as above but in s4_t - threadgroup float * sm = (threadgroup float *) (shmem_f16 + sgitg*SH + 2*C + Q*DK); // scratch buffer for mask - threadgroup o4_t * sr4 = (threadgroup o4_t *) (shmem_f16 + sgitg*DV + Q*T); // scratch buffer for the results + 
//threadgroup q_t * sq = (threadgroup q_t *) (shmem_f16 + 0*DK); // holds the query data + threadgroup q4_t * sq4 = (threadgroup q4_t *) (shmem_f16 + 0*DK); // same as above but in q4_t + threadgroup s_t * ss = (threadgroup s_t *) (shmem_f16 + sgitg*SH + Q*DK); // scratch buffer for attention + threadgroup s4_t * ss4 = (threadgroup s4_t *) (shmem_f16 + sgitg*SH + Q*DK); // same as above but in s4_t + threadgroup float * sm = (threadgroup float *) (shmem_f16 + sgitg*SH + 2*C + Q*DK); // scratch buffer for mask + threadgroup o4_t * sr4 = (threadgroup o4_t *) (shmem_f16 + 2*sgitg*DV + Q*T); // scratch buffer for the results // store the result for all queries in local memory (the O matrix from the paper) o4_t lo[DV4/NL]; @@ -4157,7 +4256,7 @@ kernel void kernel_flash_attn_ext_vec( half4, \ float, \ float, float4, \ - half4 + float4 typedef decltype(kernel_flash_attn_ext_vec) flash_attn_ext_vec_t; @@ -4271,11 +4370,16 @@ kernel void kernel_cpy( device const char * src0, device char * dst, uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], ushort3 tpitg[[thread_position_in_threadgroup]], - ushort3 ntg[[threads_per_threadgroup]]) { + ushort3 tptg[[threads_per_threadgroup]]) { const int i03 = tgpig[2]; const int i02 = tgpig[1]; - const int i01 = tgpig[0]; + const int i01 = tgpig[0]*tptg.y + tiitg/tptg.x; + + if (i01 >= args.ne01) { + return; + } const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00; @@ -4286,7 +4390,7 @@ kernel void kernel_cpy( device T1 * dst_data = (device T1 *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); - for (int64_t i00 = tpitg.x; i00 < args.ne00; i00 += ntg.x) { + for (int64_t i00 = tiitg%tptg.x; i00 < args.ne00; i00 += tptg.x) { device const T0 * src = (device T0 *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00); dst_data[i00] = (T1) src[0]; } diff --git a/ggml/src/ggml-musa/mudnn.cuh b/ggml/src/ggml-musa/mudnn.cuh index a63be5755c79c..c30128561e810 100644 --- a/ggml/src/ggml-musa/mudnn.cuh +++ b/ggml/src/ggml-musa/mudnn.cuh @@ -1,7 +1,7 @@ #pragma once -#include "../include/ggml.h" -#include "../ggml-cuda/common.cuh" +#include "ggml-cuda/common.cuh" +#include "ggml.h" // Asynchronously copies data from src tensor to dst tensor using the provided context. // Returns a musaError_t indicating success or failure. 
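Note on the CPY launch-parameter change above: the encoder now batches several short rows into one threadgroup, and the kernel recovers its row index as i01 = tgpig[0]*tptg.y + tiitg/tptg.x. The following minimal, self-contained C sketch mirrors that host-side selection outside of Metal; the function name pick_cpy_launch and the max_threads parameter are illustrative stand-ins (the real code reads pipeline.maxTotalThreadsPerThreadgroup), and the block-size guard from the patch is assumed to hold (ggml_blck_size == 1 for both src and dst):

    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    // Mirrors the nth/nrptg selection in the GGML_OP_CPY path: grow the
    // threadgroup width in powers of two up to the row width nk00, then,
    // when rows are shorter than a threadgroup, pack nrptg rows per group.
    static void pick_cpy_launch(int nk00, int max_threads, int *nth_out, int *nrptg_out) {
        int nth = 32; // SIMD width

        while (nth < nk00 && nth < max_threads) {
            nth *= 2;
        }

        nth = MIN(nth, max_threads);

        // when rows are small, batch several of them into a single threadgroup
        int nrptg = 1;

        if (nth > nk00) {
            nrptg = (nth + nk00 - 1)/nk00;
            nth   = nk00;

            if (nrptg*nth > max_threads) {
                nrptg--;
            }
        }

        *nth_out   = MIN(nth, nk00);
        *nrptg_out = nrptg;
    }

    int main(void) {
        int nth, nrptg;
        pick_cpy_launch(/*nk00=*/10, /*max_threads=*/1024, &nth, &nrptg);
        // a 10-element row yields nth = 10 threads and nrptg = 4 rows per
        // threadgroup, so the grid along x shrinks to (ne01 + 3)/4 groups
        printf("nth = %d, nrptg = %d\n", nth, nrptg);
        return 0;
    }

With nrptg > 1 the encoder dispatches (ne01 + nrptg - 1)/nrptg threadgroups of nth x nrptg threads, which is why the kernel guards with if (i01 >= args.ne01) for the tail group.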
diff --git a/ggml/src/ggml-opencl/CMakeLists.txt b/ggml/src/ggml-opencl/CMakeLists.txt
index 9f930c70b7bb4..0e2a419649cea 100644
--- a/ggml/src/ggml-opencl/CMakeLists.txt
+++ b/ggml/src/ggml-opencl/CMakeLists.txt
@@ -80,6 +80,7 @@ set(GGML_OPENCL_KERNELS
     mul_mv_q4_0_f32_1d_8x_flat
     mul_mv_q4_0_f32_1d_16x_flat
     mul_mv_q6_k
+    mul_mv_id_q4_0_f32_8x_flat
     mul
     norm
     relu
@@ -95,6 +96,12 @@ set(GGML_OPENCL_KERNELS
     sub
     sum_rows
     transpose
+    concat
+    tsembd
+    upscale
+    tanh
+    pad
+    repeat
 )

 foreach (K ${GGML_OPENCL_KERNELS})
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index 5dbe97ab2477d..96e8a8588dcb8 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -231,6 +231,71 @@ static ggml_cl_compiler_version get_adreno_cl_compiler_version(const char *drive
     return { type, major, minor, patch };
 }

+// Profiling
+struct ProfilingInfo {
+    std::string op_name;
+    std::string kernel_name;
+
+    cl_kernel kernel;
+    cl_event evt;
+
+    cl_ulong cmd_queued;
+    cl_ulong cmd_submit;
+    cl_ulong cmd_start;
+    cl_ulong cmd_end;
+    cl_ulong overhead_start;
+    cl_ulong overhead_end;
+    // For the times below, see spec for clGetEventProfilingInfo
+    // The time kernel spent in cmd queue - SUBMIT - QUEUED
+    cl_ulong cmd_queued_duration_ns;
+    // The time kernel spent for submission - START - SUBMIT
+    cl_ulong cmd_submit_duration_ns;
+    // Kernel execution time in nanoseconds - END - START
+    cl_ulong cmd_duration_ns;
+    // The time for the kernel to complete - COMPLETE - END
+    cl_ulong cmd_complete_duration_ns;
+    // Total time to finish the kernel - COMPLETE - QUEUED
+    cl_ulong cmd_total_duration_ns;
+    // Global and local work sizes.
+    size_t global_size[3];
+    size_t local_size[3];
+    // Op output size.
+    size_t output_size[4];
+};
+
+static void populateProfilingInfo(
+        ProfilingInfo& info, cl_event evt, cl_kernel kernel, cl_uint work_dim,
+        size_t global_size[3], size_t local_size[3],
+        const ggml_tensor * tensor) {
+    info.op_name = tensor->name;
+    info.kernel  = kernel;
+    info.evt     = evt;
+
+    // 0 means not specified, e.g., 2D workgroup, or NULL for driver to choose
+    info.local_size[0] = 0;
+    info.local_size[1] = 0;
+    info.local_size[2] = 0;
+
+    info.global_size[0] = 0;
+    info.global_size[1] = 0;
+    info.global_size[2] = 0;
+
+    if (local_size) {
+        for (cl_uint i = 0; i < work_dim; ++i) {
+            info.local_size[i] = local_size[i];
+        }
+    }
+
+    for (cl_uint i = 0; i < work_dim; ++i) {
+        info.global_size[i] = global_size[i];
+    }
+
+    info.output_size[0] = tensor->ne[0];
+    info.output_size[1] = tensor->ne[1];
+    info.output_size[2] = tensor->ne[2];
+    info.output_size[3] = tensor->ne[3];
+}
+
 struct ggml_backend_opencl_context;

 // backend device context
@@ -254,6 +319,8 @@ struct ggml_backend_opencl_device_context {

 // backend context
 struct ggml_backend_opencl_context {
+    int ref_count;
+
     cl_device_id device;
     std::string device_name;

@@ -315,6 +382,13 @@ struct ggml_backend_opencl_context {
     cl_program program_softmax_4_f16;
     cl_program program_argsort_f32_i32;
     cl_program program_sum_rows_f32;
+    cl_program program_repeat;
+    cl_program program_pad;
+    cl_program program_tanh;
+    cl_program program_upscale;
+    cl_program program_concat;
+    cl_program program_tsembd;
+    cl_program program_mul_mv_id_q4_0_f32_8x_flat;

     cl_kernel kernel_add, kernel_add_row;
     cl_kernel kernel_mul, kernel_mul_row;
@@ -351,6 +425,118 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_im2col_f32, kernel_im2col_f16;
     cl_kernel kernel_argsort_f32_i32;
     cl_kernel kernel_sum_rows_f32;
+    cl_kernel kernel_repeat;
+    cl_kernel kernel_pad;
+    cl_kernel kernel_tanh_f32_nd;
+    cl_kernel kernel_tanh_f16_nd;
+    cl_kernel kernel_upscale;
+    cl_kernel kernel_upscale_bilinear;
+    cl_kernel kernel_concat_f32_contiguous;
+    cl_kernel kernel_concat_f32_non_contiguous;
+    cl_kernel kernel_timestep_embedding;
+    cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
+
+    std::vector<ProfilingInfo> profiling_info;
+
+    void write_profiling_info() {
+        FILE * fperf = fopen("cl_profiling.csv", "w");
+        if (!fperf) {
+            GGML_LOG_ERROR("Failed to open cl_profiling.csv\n");
+            return;
+        }
+
+        // Populate profiling info
+        for (ProfilingInfo & info : profiling_info) {
+            cl_ulong cmd_queued;
+            cl_ulong cmd_submit;
+            cl_ulong cmd_start;
+            cl_ulong cmd_end;
+            cl_ulong cmd_complete;
+
+            CL_CHECK(clWaitForEvents(1, &info.evt));
+            CL_CHECK(clGetEventProfilingInfo(
+                info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL));
+            CL_CHECK(clGetEventProfilingInfo(
+                info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL));
+            CL_CHECK(clGetEventProfilingInfo(
+                info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL));
+            CL_CHECK(clGetEventProfilingInfo(
+                info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL));
+            CL_CHECK(clGetEventProfilingInfo(
+                info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL));
+            CL_CHECK(clReleaseEvent(info.evt));
+
+            char kernel_name[512];
+            CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME,
+                sizeof(kernel_name), kernel_name, NULL));
+            info.kernel_name = kernel_name;
+
+            info.cmd_queued = cmd_queued;
+            info.cmd_submit = cmd_submit;
+            info.cmd_start  = cmd_start;
+            info.cmd_end    = cmd_end;
+
+            info.cmd_queued_duration_ns   = cmd_submit   - cmd_queued;
+            info.cmd_submit_duration_ns   = cmd_start    - cmd_submit;
+            info.cmd_duration_ns          = cmd_end      - cmd_start;
+            info.cmd_complete_duration_ns = cmd_complete - cmd_end;
+            info.cmd_total_duration_ns    = cmd_complete - cmd_queued;
+        }
+
+        // Dump a csv
+        float total_kernel_time = 0;
+        fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n");
+        for (const ProfilingInfo & info : profiling_info) {
+            total_kernel_time += info.cmd_duration_ns/1.e6f;
+            fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
+                info.op_name.c_str(), info.kernel_name.c_str(),
+                info.cmd_queued_duration_ns/1.e6f,
+                info.cmd_submit_duration_ns/1.e6f,
+                info.cmd_duration_ns/1.e6f,
+                info.cmd_complete_duration_ns/1.e6f,
+                info.cmd_total_duration_ns/1.e6f,
+                info.global_size[0], info.global_size[1], info.global_size[2],
+                info.local_size[0], info.local_size[1], info.local_size[2],
+                info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
+        }
+        fclose(fperf);
+
+        GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time);
+
+        // Dump a simple chrome trace
+        FILE* ftrace = fopen("cl_trace.json", "w");
+        if (!ftrace) {
+            GGML_LOG_ERROR("Failed to open cl_trace.json\n");
+            return;
+        }
+
+        fprintf(ftrace, "[\n");
+        for (const ProfilingInfo & info : profiling_info) {
+            fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
+                info.kernel_name.c_str(), info.cmd_queued/1000);
+            fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
+                info.kernel_name.c_str(), info.cmd_submit/1000);
+
+            fprintf(ftrace,
"{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n", + info.kernel_name.c_str(), info.cmd_start/1000); + fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n", + info.kernel_name.c_str(), info.cmd_end/1000); + } + fclose(ftrace); + } + + void enqueue_ndrange_kernel(cl_kernel kernel, cl_uint work_dim, size_t *global_work_size, size_t *local_work_size, const ggml_tensor * tensor) { +#ifdef GGML_OPENCL_PROFILING + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + + profiling_info.emplace_back(); + populateProfilingInfo(profiling_info.back(), evt, kernel, work_dim, global_work_size, local_work_size, tensor); +#else + GGML_UNUSED(tensor); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL)); +#endif + } #ifdef GGML_OPENCL_USE_ADRENO_KERNELS // Transpose kernels @@ -378,46 +564,19 @@ struct ggml_backend_opencl_context { cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096; cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096; #endif // GGML_OPENCL_USE_ADRENO_KERNELS -}; - -// All registered devices with a default device in the front. -static std::vector g_ggml_backend_opencl_devices; -// Profiling + void free() { + ref_count--; + if (ref_count == 0) { #ifdef GGML_OPENCL_PROFILING -struct ProfilingInfo { - std::string op_name; - std::string kernel_name; - - cl_kernel kernel; - cl_event evt; - - cl_ulong cmd_queued; - cl_ulong cmd_submit; - cl_ulong cmd_start; - cl_ulong cmd_end; - cl_ulong overhead_start; - cl_ulong overhead_end; - // For the times below, see spec for clGetEventProfilingInfo - // The time kernel spent in cmd queue - SUBMIT - QUEUED - cl_ulong cmd_queued_duration_ns; - // The time kernel spent for submission - START - SUBMIT - cl_ulong cmd_submit_duration_ns; - // Kernel execution time in nanoseconds - END - START - cl_ulong cmd_duration_ns; - // The time for the kernel to complete - COMPLETE - END - cl_ulong cmd_complete_duration_ns; - // Total time to finish the kernel - COMPELTE - QUEUED - cl_ulong cmd_total_duration_ns; - // Global and local work sizes. - size_t global_size[3]; - size_t local_size[3]; - // Op output size. - size_t output_size[4]; + write_profiling_info(); +#endif + } + } }; -std::vector g_profiling_info; -#endif +// All registered devices with a default device in the front. +static std::vector g_ggml_backend_opencl_devices; inline std::string read_file(const std::string &path) { std::ifstream ifs(path); @@ -1097,6 +1256,166 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve GGML_LOG_CONT("."); } + // repeat + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "repeat.cl.h" + }; +#else + const std::string kernel_src = read_file("repeat.cl"); +#endif + if (!kernel_src.empty()) { + backend_ctx->program_repeat = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); + CL_CHECK((backend_ctx->kernel_repeat = clCreateKernel(backend_ctx->program_repeat, "kernel_repeat", &err), err)); + GGML_LOG_CONT("."); + } else { + GGML_LOG_WARN("ggml_opencl: repeat kernel source not found or empty. 
Repeat operations will not be available.\n"); + backend_ctx->program_repeat = nullptr; + backend_ctx->kernel_repeat = nullptr; + } + } + + // pad + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "pad.cl.h" + }; +#else + const std::string kernel_src = read_file("pad.cl"); +#endif + if (!kernel_src.empty()) { + backend_ctx->program_pad = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); + CL_CHECK((backend_ctx->kernel_pad = clCreateKernel(backend_ctx->program_pad, "kernel_pad", &err), err)); + GGML_LOG_CONT("."); + } else { + GGML_LOG_WARN("ggml_opencl: pad kernel source not found or empty. Pad operations will not be available.\n"); + backend_ctx->program_pad = nullptr; + backend_ctx->kernel_pad = nullptr; + } + } + + // tanh + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "tanh.cl.h" + }; +#else + const std::string kernel_src = read_file("tanh.cl"); +#endif + if (!kernel_src.empty()) { + backend_ctx->program_tanh = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); + CL_CHECK((backend_ctx->kernel_tanh_f32_nd = clCreateKernel(backend_ctx->program_tanh, "kernel_tanh_f32_nd", &err), err)); + CL_CHECK((backend_ctx->kernel_tanh_f16_nd = clCreateKernel(backend_ctx->program_tanh, "kernel_tanh_f16_nd", &err), err)); + GGML_LOG_CONT("."); + } else { + GGML_LOG_WARN("ggml_opencl: tanh kernel source not found or empty. Tanh operation will not be available.\n"); + backend_ctx->program_tanh = nullptr; + backend_ctx->kernel_tanh_f32_nd = nullptr; + backend_ctx->kernel_tanh_f16_nd = nullptr; + } + } + + // upscale + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "upscale.cl.h" + }; +#else + const std::string kernel_src = read_file("upscale.cl"); +#endif + if (!kernel_src.empty()) { + backend_ctx->program_upscale = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); + CL_CHECK((backend_ctx->kernel_upscale = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale", &err), err)); + if (backend_ctx->program_upscale) { + cl_int err_bilinear; + backend_ctx->kernel_upscale_bilinear = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale_bilinear", &err_bilinear); + if (err_bilinear != CL_SUCCESS) { + GGML_LOG_WARN("ggml_opencl: kernel_upscale_bilinear not found in upscale.cl. Bilinear upscale will not be available. Error: %d\n", err_bilinear); + backend_ctx->kernel_upscale_bilinear = nullptr; + } + } else { + backend_ctx->kernel_upscale_bilinear = nullptr; + } + GGML_LOG_CONT("."); + } else { + GGML_LOG_WARN("ggml_opencl: upscale kernel source not found or empty. 
Upscale operations will not be available.\n"); + backend_ctx->program_upscale = nullptr; + backend_ctx->kernel_upscale = nullptr; + backend_ctx->kernel_upscale_bilinear = nullptr; + } + } + + // concat + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "concat.cl.h" + }; +#else + + const std::string kernel_src = read_file("concat.cl"); +#endif + if (!kernel_src.empty()) { + backend_ctx->program_concat = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); + + CL_CHECK((backend_ctx->kernel_concat_f32_contiguous = clCreateKernel(backend_ctx->program_concat, "kernel_concat_f32_contiguous", &err), err)); + CL_CHECK((backend_ctx->kernel_concat_f32_non_contiguous = clCreateKernel(backend_ctx->program_concat, "kernel_concat_f32_non_contiguous", &err), err)); + GGML_LOG_CONT("."); + } else { + GGML_LOG_WARN("ggml_opencl: concat kernel source not found or empty. Concat operations will not be available.\n"); + backend_ctx->program_concat = nullptr; + backend_ctx->kernel_concat_f32_contiguous = nullptr; + backend_ctx->kernel_concat_f32_non_contiguous = nullptr; + } + } + + // timestep_embedding + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "tsembd.cl.h" + }; +#else + + const std::string kernel_src = read_file("tsembd.cl"); +#endif + if (!kernel_src.empty()) { + backend_ctx->program_tsembd = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); + CL_CHECK((backend_ctx->kernel_timestep_embedding = clCreateKernel(backend_ctx->program_tsembd, "kernel_timestep_embedding", &err), err)); + GGML_LOG_CONT("."); + } else { + GGML_LOG_WARN("ggml_opencl: timestep_embedding kernel source not found or empty. This op will not be available.\n"); + backend_ctx->program_tsembd = nullptr; + backend_ctx->kernel_timestep_embedding = nullptr; + } + } + + // mul_mv_id_q4_0_f32_8x_flat + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "mul_mv_id_q4_0_f32_8x_flat.cl.h" + }; +#else + const std::string kernel_src = read_file("mul_mv_id_q4_0_f32_8x_flat.cl"); +#endif + backend_ctx->program_mul_mv_id_q4_0_f32_8x_flat = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); + + CL_CHECK((backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat = clCreateKernel(backend_ctx->program_mul_mv_id_q4_0_f32_8x_flat, "kernel_mul_mv_id_q4_0_f32_8x_flat", &err), err)); + GGML_LOG_CONT("."); + } + // Adreno kernels #ifdef GGML_OPENCL_USE_ADRENO_KERNELS // transpose @@ -1492,6 +1811,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { backend_ctx->device = dev_ctx->device; backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN; + // ref_count get increased in ggml_backend_opencl_device_init + // This function is also used to retrieve backend context, so we don't want + // to increase ref_count for each call. 
We only want to increase ref_count + // when the associated device is initialized + backend_ctx->ref_count = 0; + if (strstr(dev_ctx->device_name.c_str(), "Adreno") || strstr(dev_ctx->device_name.c_str(), "Qualcomm") || strstr(dev_ctx->device_version.c_str(), "Adreno")) { @@ -1664,93 +1989,22 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { return dev_ctx->backend_ctx; } -static void ggml_cl2_free(void) { -#ifdef GGML_OPENCL_PROFILING - FILE * fperf = fopen("cl_profiling.csv", "w"); - if (!fperf) { - GGML_LOG_ERROR("Failed to open cl_profiling.csv\n"); - return; - } +static void ggml_cl2_free(ggml_backend_t backend) { + ggml_backend_opencl_context * ctx = (ggml_backend_opencl_context *) backend->context; + ctx->free(); - // Populate profiling info - for (ProfilingInfo & info : g_profiling_info) { - cl_ulong cmd_queued; - cl_ulong cmd_submit; - cl_ulong cmd_start; - cl_ulong cmd_end; - cl_ulong cmd_complete; - - CL_CHECK(clWaitForEvents(1, &info.evt)); - CL_CHECK(clGetEventProfilingInfo( - info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL)); - CL_CHECK(clGetEventProfilingInfo( - info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL)); - CL_CHECK(clGetEventProfilingInfo( - info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL)); - CL_CHECK(clGetEventProfilingInfo( - info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL)); - CL_CHECK(clGetEventProfilingInfo( - info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL)); - CL_CHECK(clReleaseEvent(info.evt)); - - char kernel_name[512]; - CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME, - sizeof(kernel_name), kernel_name, NULL)); - info.kernel_name = kernel_name; - - info.cmd_queued = cmd_queued; - info.cmd_submit = cmd_submit; - info.cmd_start = cmd_start; - info.cmd_end = cmd_end; - - info.cmd_queued_duration_ns = cmd_submit - cmd_queued; - info.cmd_submit_duration_ns = cmd_start - cmd_submit; - info.cmd_duration_ns = cmd_end - cmd_start; - info.cmd_complete_duration_ns = cmd_complete - cmd_end; - info.cmd_total_duration_ns = cmd_complete - cmd_queued; - } - - // Dump a csv - float total_kernel_time = 0; - fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n"); - for (const ProfilingInfo & info : g_profiling_info) { - total_kernel_time += info.cmd_duration_ns/1.e6f; - fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n", - info.op_name.c_str(), info.kernel_name.c_str(), - info.cmd_queued_duration_ns/1.e6f, - info.cmd_submit_duration_ns/1.e6f, - info.cmd_duration_ns/1.e6f, - info.cmd_complete_duration_ns/1.e6f, - info.cmd_total_duration_ns/1.e6f, - info.global_size[0], info.global_size[1], info.global_size[2], - info.local_size[0], info.local_size[1], info.local_size[2], - info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]); - } - fclose(fperf); - - GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time); - - // Dump a simple chrome trace - FILE* ftrace = fopen("cl_trace.json", "w"); - if (!ftrace) { - GGML_LOG_ERROR("Failed to open cl_trace.json\n"); - return; + // The CL context is shared by all backends, release it if all backends have been released + bool should_release_opencl = true; + for (auto device : g_ggml_backend_opencl_devices) { + ggml_backend_opencl_device_context * ctx_dev = 
(ggml_backend_opencl_device_context *) device.context;
+        if (ctx_dev->backend_ctx->ref_count > 0) {
+            should_release_opencl = false;
+        }
     }

-    fprintf(ftrace, "[\n");
-    for (const ProfilingInfo & info : g_profiling_info) {
-        fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
-            info.kernel_name.c_str(), info.cmd_queued/1000);
-        fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
-            info.kernel_name.c_str(), info.cmd_submit/1000);
-
-        fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
-            info.kernel_name.c_str(), info.cmd_start/1000);
-        fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
-            info.kernel_name.c_str(), info.cmd_end/1000);
+    if (should_release_opencl) {
+        CL_CHECK(clReleaseContext(ctx->context));
     }
-    fclose(ftrace);
-#endif
 }

//------------------------------------------------------------------------------
@@ -1834,9 +2088,7 @@ static const char * ggml_backend_opencl_name(ggml_backend_t backend) {
 }

 static void ggml_backend_opencl_free(ggml_backend_t backend) {
-    ggml_cl2_free();
-
-    GGML_UNUSED(backend);
+    ggml_cl2_free(backend);
 }

 static void ggml_backend_opencl_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
@@ -1863,7 +2115,12 @@ static bool ggml_backend_opencl_cpy_tensor_async(ggml_backend_t backend, const g
 }

 static void ggml_backend_opencl_synchronize(ggml_backend_t backend) {
-    GGML_UNUSED(backend);
+    auto * backend_ctx = static_cast<ggml_backend_opencl_context *>(backend->context);
+
+    cl_event evt;
+    CL_CHECK(clEnqueueBarrierWithWaitList(backend_ctx->queue, 0, nullptr, &evt));
+    CL_CHECK(clWaitForEvents(1, &evt));
+    CL_CHECK(clReleaseEvent(evt));
 }

 // Syncronizes the 'backend_ctx's device with others so that commands
@@ -1976,9 +2233,12 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
             case GGML_UNARY_OP_SILU:
             case GGML_UNARY_OP_RELU:
             case GGML_UNARY_OP_GELU_QUICK:
-                return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
+                return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
             case GGML_UNARY_OP_SIGMOID:
                 return ggml_is_contiguous(op->src[0]);
+            case GGML_UNARY_OP_TANH:
+                return (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
+                       (op->src[0]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16);
             default:
                 return false;
         }
@@ -1988,6 +2248,17 @@
         case GGML_OP_NORM:
         case GGML_OP_RMS_NORM:
             return true;
+        case GGML_OP_REPEAT:
+            return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; // Assuming F32 for now, can be expanded
+        case GGML_OP_PAD:
+            return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32 &&
+                   op->src[0]->ne[3] == 1 && op->ne[3] == 1;
+        case GGML_OP_UPSCALE:
+            return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
+        case GGML_OP_CONCAT:
+            return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
+        case GGML_OP_TIMESTEP_EMBEDDING:
+            return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
         case GGML_OP_GROUP_NORM:
             return ggml_is_contiguous(op->src[0]);
         case GGML_OP_MUL_MAT:
@@ -2000,6 +2271,13 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]); } return false; + case GGML_OP_MUL_MAT_ID: + if (op->src[0]->type == GGML_TYPE_Q4_0) { + if (op->src[1]->type == GGML_TYPE_F32) { + return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]); + } + } + return false; case GGML_OP_RESHAPE: case GGML_OP_VIEW: case GGML_OP_PERMUTE: @@ -2052,7 +2330,7 @@ static ggml_backend_i ggml_backend_opencl_i = { /* .set_tensor_async = */ NULL, /* ggml_backend_opencl_set_tensor_async */ /* .get_tensor_async = */ NULL, /* ggml_backend_opencl_get_tensor_async */ /* .cpy_tensor_async = */ NULL, /* ggml_backend_opencl_cpy_tensor_async */ - /* .synchronize = */ NULL, /* ggml_backend_opencl_synchronize */ + /* .synchronize = */ ggml_backend_opencl_synchronize, /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, /* .graph_plan_update = */ NULL, @@ -2696,6 +2974,8 @@ static void ggml_backend_opencl_device_get_props(ggml_backend_dev_t dev, struct static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, const char * params) { ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(dev); + // Getting a new reference to the backend, increase ref_count + backend_ctx->ref_count++; ggml_backend_t backend = new ggml_backend { /* .guid = */ ggml_backend_opencl_guid(), @@ -2956,31 +3236,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso #define dump_tensor(tensor) #endif -//------------------------------------------------------------------------------ -// Profiling utility -//------------------------------------------------------------------------------ -#ifdef GGML_OPENCL_PROFILING -static void populateProfilingInfo( - ProfilingInfo& info, cl_event evt, cl_kernel kernel, - size_t global_size[3], size_t local_size[3], - const ggml_tensor * tensor) { - info.op_name = tensor->name; - info.kernel = kernel; - info.evt = evt; - - info.local_size[0] = local_size[0]; - info.local_size[1] = local_size[1]; - info.local_size[2] = local_size[2]; - info.global_size[0] = global_size[0]; - info.global_size[1] = global_size[1]; - info.global_size[2] = global_size[2]; - info.output_size[0] = tensor->ne[0]; - info.output_size[1] = tensor->ne[1]; - info.output_size[2] = tensor->ne[2]; - info.output_size[3] = tensor->ne[3]; -} -#endif - //------------------------------------------------------------------------------ // Ops //------------------------------------------------------------------------------ @@ -3024,7 +3279,6 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c const cl_ulong nb2 = dst ? 
dst->nb[2] : 0; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; @@ -3068,15 +3322,7 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c size_t global_work_size[] = {(size_t)ne10, (size_t)ne11, 1}; size_t local_work_size[] = {1, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -3118,7 +3364,6 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const const cl_ulong nb3 = dst ? dst->nb[3] : 0; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; @@ -3193,29 +3438,13 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. } -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); } else { unsigned int nth = MIN(64, ne0); size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {nth, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } } @@ -3258,7 +3487,6 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const const cl_ulong nb3 = dst ? dst->nb[3] : 0; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; @@ -3333,29 +3561,13 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. 
} -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); } else { unsigned int nth = MIN(64, ne0); size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {nth, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } } @@ -3395,7 +3607,6 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const const cl_ulong nb3 = dst->nb[3]; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; @@ -3458,29 +3669,13 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } else { unsigned int nth = MIN(64, ne0); size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {nth, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } } @@ -3520,7 +3715,6 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const const cl_ulong nb3 = dst->nb[3]; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; @@ -3583,29 +3777,13 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)n, 1, 1}; size_t 
local_work_size[] = {64, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } else { unsigned int nth = MIN(64, ne0); size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {nth, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } } @@ -3618,7 +3796,6 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const UNUSED(src1); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -3645,15 +3822,7 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -3665,7 +3834,6 @@ static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, UNUSED(src1); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -3692,15 +3860,7 @@ static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor 
* src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -3712,7 +3872,6 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const UNUSED(src1); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -3744,15 +3903,7 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. } -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); } static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -3764,7 +3915,6 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const UNUSED(src1); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -3789,15 +3939,7 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. } -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); } static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -3809,7 +3951,6 @@ static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, co UNUSED(src1); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -3841,15 +3982,7 @@ static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, co local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. 
} -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); } static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -3861,7 +3994,6 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons UNUSED(src1); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -3893,15 +4025,7 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. } -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); } static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -3913,7 +4037,6 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const UNUSED(src1); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -3954,15 +4077,7 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)nth, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -3974,7 +4089,6 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c UNUSED(src1); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; //ggml_backend_opencl_device_context * dev_ctx = // (ggml_backend_opencl_device_context *)backend->device->context; @@ -4038,15 +4152,7 @@ static void 
ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c // This is local memory - the size depends on subgroup size. CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth/sgs, NULL)); -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -4058,7 +4164,6 @@ static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, UNUSED(src1); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -4097,15 +4202,487 @@ static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, size_t global_work_size[] = {(size_t)n_groups*sgs, 1, 1}; size_t local_work_size[] = {(size_t)sgs, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); +} - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif +static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0); + GGML_ASSERT(src0->extra); + GGML_ASSERT(dst); + GGML_ASSERT(dst->extra); + + UNUSED(src1); + + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; + + ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; + ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; + + cl_ulong offset0_abs = extra0->offset + src0->view_offs; + cl_ulong offsetd_abs = extrad->offset + dst->view_offs; + + cl_kernel kernel; + if (dst->type == GGML_TYPE_F32) { + kernel = backend_ctx->kernel_tanh_f32_nd; + } else if (dst->type == GGML_TYPE_F16) { + kernel = backend_ctx->kernel_tanh_f16_nd; + } else { + GGML_ASSERT(false && "Unsupported type for ggml_cl_tanh"); + } + GGML_ASSERT(kernel != nullptr); + + const int ne00 = src0->ne[0]; const int ne01 = src0->ne[1]; const int ne02 = src0->ne[2]; const int ne03 = src0->ne[3]; + const cl_ulong nb00 = src0->nb[0]; const cl_ulong nb01 = src0->nb[1]; const cl_ulong nb02 = src0->nb[2]; const cl_ulong nb03 = src0->nb[3]; + + const int ne10 = dst->ne[0]; const int ne11 = dst->ne[1]; const int ne12 = dst->ne[2]; const int ne13 = dst->ne[3]; + const cl_ulong nb10 = dst->nb[0]; const cl_ulong nb11 = dst->nb[1]; const cl_ulong nb12 = dst->nb[2]; const cl_ulong nb13 = dst->nb[3]; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_abs)); + 
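/* args 2-3 bind the dst buffer and its absolute offset; args 4-11 pass the src shape (ne00..ne03) and byte strides (nb00..nb03), args 12-19 the dst shape and strides, so the _nd kernels can address non-contiguous views */ +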
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd_abs)); + + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),&nb02)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong),&nb03)); + + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10)); + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne11)); + CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne12)); + CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne13)); + CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong),&nb10)); + CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong),&nb11)); + CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong),&nb12)); + CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong),&nb13)); + + size_t global_work_size[3]; + if (ne10 == 0 || ne11 == 0 || ne12 == 0 || ne13 == 0) { // Handle case of 0 elements + return; + } + global_work_size[0] = (size_t)ne10; + global_work_size[1] = (size_t)ne11; + global_work_size[2] = (size_t)ne12; + + size_t lws0 = 16, lws1 = 4, lws2 = 1; + if (ne10 < 16) lws0 = ne10; + if (ne11 < 4) lws1 = ne11; + if (ne12 < 1) lws2 = ne12 > 0 ? ne12 : 1; + + while (lws0 * lws1 * lws2 > 256 && lws0 > 1) lws0 /= 2; + while (lws0 * lws1 * lws2 > 256 && lws1 > 1) lws1 /= 2; + while (lws0 * lws1 * lws2 > 256 && lws2 > 1) lws2 /= 2; + + + size_t local_work_size[] = {lws0, lws1, lws2}; + + size_t* local_work_size_ptr = local_work_size; + if (!backend_ctx->non_uniform_workgroups) { + if (global_work_size[0] % local_work_size[0] != 0 || + global_work_size[1] % local_work_size[1] != 0 || + global_work_size[2] % local_work_size[2] != 0) { + local_work_size_ptr = NULL; + } + } + if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return; + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); +} + +static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1_shape_def, ggml_tensor * dst) { + GGML_ASSERT(src0); + GGML_ASSERT(src0->extra); + GGML_ASSERT(dst); + GGML_ASSERT(dst->extra); + GGML_ASSERT(dst->type == src0->type); + + UNUSED(src1_shape_def); + + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; + + if (backend_ctx->kernel_repeat == nullptr) { + GGML_LOG_WARN("%s: repeat kernel not available, skipping OpenCL execution.\n", __func__); + return; + } + + ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra; + ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra; + + cl_ulong off_src0 = extra_src0->offset + src0->view_offs; + cl_ulong off_dst = extra_dst->offset + dst->view_offs; + + const int src0_ne0 = src0->ne[0]; const int src0_ne1 = src0->ne[1]; const int src0_ne2 = src0->ne[2]; const int src0_ne3 = src0->ne[3]; + const cl_ulong src0_nb0 = src0->nb[0]; const cl_ulong src0_nb1 = src0->nb[1]; const cl_ulong src0_nb2 = src0->nb[2]; const cl_ulong src0_nb3 = src0->nb[3]; + + const int dst_ne0 = dst->ne[0]; const int dst_ne1 = dst->ne[1]; const int dst_ne2 = dst->ne[2]; const int dst_ne3 = dst->ne[3]; + const cl_ulong dst_nb0 = dst->nb[0]; const cl_ulong 
dst_nb1 = dst->nb[1]; const cl_ulong dst_nb2 = dst->nb[2]; const cl_ulong dst_nb3 = dst->nb[3]; + + cl_kernel kernel = backend_ctx->kernel_repeat; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra_dst->data_device)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_ulong), &off_src0)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &src0_ne0)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &src0_ne1)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &src0_ne2)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &src0_ne3)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &src0_nb0)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &src0_nb1)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &src0_nb2)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &src0_nb3)); + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &dst_ne0)); + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &dst_ne1)); + CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &dst_ne2)); + CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &dst_ne3)); + CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &dst_nb0)); + CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &dst_nb1)); + CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &dst_nb2)); + CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &dst_nb3)); + + size_t gws0 = dst_ne1 > 0 ? (size_t)dst_ne1 : 1; + size_t gws1 = dst_ne2 > 0 ? (size_t)dst_ne2 : 1; + size_t gws2 = dst_ne3 > 0 ? (size_t)dst_ne3 : 1; + + size_t global_work_size[] = { gws0, gws1, gws2 }; + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst); +} + +static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) { + GGML_ASSERT(src0); + GGML_ASSERT(src0->extra); + GGML_ASSERT(dst); + GGML_ASSERT(dst->extra); + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); + + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; + + if (backend_ctx->kernel_pad == nullptr) { + GGML_LOG_WARN("%s: pad kernel not available, skipping OpenCL execution.\n", __func__); + return; + } + + ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra; + ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra; + + cl_ulong off_src0 = extra_src0->offset + src0->view_offs; + cl_ulong off_dst = extra_dst->offset + dst->view_offs; + + const int s_ne0 = src0->ne[0]; + const int s_ne1 = src0->ne[1]; + const int s_ne2 = src0->ne[2]; + + const int d_ne0 = dst->ne[0]; + const int d_ne1 = dst->ne[1]; + const int d_ne2 = dst->ne[2]; + + cl_kernel kernel = backend_ctx->kernel_pad; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra_dst->data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &s_ne0)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &s_ne1)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &s_ne2)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &d_ne0)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &d_ne1)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &d_ne2)); + + size_t 
lws0 = 64; + size_t gws0 = (( (size_t)d_ne0 + lws0 - 1 ) / lws0) * lws0; + + size_t global_work_size[] = { gws0, (size_t)d_ne1, (size_t)d_ne2 }; + size_t local_work_size[] = { lws0, 1, 1 }; + + size_t * local_work_size_ptr = local_work_size; + if (d_ne0 % lws0 != 0 && !backend_ctx->non_uniform_workgroups) { + local_work_size_ptr = nullptr; + } + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); +} + +static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) { + GGML_ASSERT(src0); + GGML_ASSERT(src0->extra); + GGML_ASSERT(dst); + GGML_ASSERT(dst->extra); + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; + + const ggml_scale_mode mode = (ggml_scale_mode) ggml_get_op_params_i32(dst, 0); + cl_kernel kernel = nullptr; + + if (mode == GGML_SCALE_MODE_NEAREST) { + kernel = backend_ctx->kernel_upscale; + if (kernel == nullptr) { + GGML_LOG_WARN("%s: nearest upscale kernel not available, skipping OpenCL execution.\n", __func__); + return; + } + } else if (mode == GGML_SCALE_MODE_BILINEAR) { + kernel = backend_ctx->kernel_upscale_bilinear; + if (kernel == nullptr) { + GGML_LOG_WARN("%s: bilinear upscale kernel not available, skipping OpenCL execution.\n", __func__); + return; + } + } else { + GGML_LOG_WARN("%s: unsupported upscale mode %d, skipping OpenCL execution.\n", __func__, mode); + return; + } + + ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra; + ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra; + + cl_ulong off_src0 = extra_src0->offset + src0->view_offs; + cl_ulong off_dst = extra_dst->offset + dst->view_offs; + + const cl_ulong nb00 = src0->nb[0]; + const cl_ulong nb01 = src0->nb[1]; + const cl_ulong nb02 = src0->nb[2]; + const cl_ulong nb03 = src0->nb[3]; + + const int ne00_src = src0->ne[0]; + const int ne01_src = src0->ne[1]; + + const int ne10_dst = dst->ne[0]; + const int ne11_dst = dst->ne[1]; + const int ne12_dst = dst->ne[2]; + const int ne13_dst = dst->ne[3]; + + const float sf0 = (float)dst->ne[0] / src0->ne[0]; + const float sf1 = (float)dst->ne[1] / src0->ne[1]; + const float sf2 = (float)dst->ne[2] / src0->ne[2]; + const float sf3 = (float)dst->ne[3] / src0->ne[3]; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra_dst->data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &nb00)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb01)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb02)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb03)); + + if (mode == GGML_SCALE_MODE_NEAREST) { + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne10_dst)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne11_dst)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12_dst)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne13_dst)); + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float), &sf0)); + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(float), &sf1)); + CL_CHECK(clSetKernelArg(kernel, 14, sizeof(float), &sf2)); + CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float), &sf3)); + } else if (mode == GGML_SCALE_MODE_BILINEAR) { + CL_CHECK(clSetKernelArg(kernel, 
8, sizeof(int), &ne00_src)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01_src)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10_dst)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11_dst)); + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12_dst)); + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13_dst)); + CL_CHECK(clSetKernelArg(kernel, 14, sizeof(float), &sf0)); + CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float), &sf1)); + CL_CHECK(clSetKernelArg(kernel, 16, sizeof(float), &sf2)); + CL_CHECK(clSetKernelArg(kernel, 17, sizeof(float), &sf3)); + } + + + size_t dst_total_elements = (size_t)ne10_dst * ne11_dst * ne12_dst * ne13_dst; + if (dst_total_elements == 0) { + return; + } + size_t global_work_size[] = { dst_total_elements, 1, 1 }; + size_t local_work_size_pref = 256; + size_t local_work_size[] = { MIN(local_work_size_pref, dst_total_elements), 1, 1}; + + size_t * local_work_size_ptr = local_work_size; + if (dst_total_elements % local_work_size[0] != 0 && !backend_ctx->non_uniform_workgroups) { + local_work_size_ptr = nullptr; + } + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); +} + +static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0); + GGML_ASSERT(src0->extra); + GGML_ASSERT(src1); + GGML_ASSERT(src1->extra); + GGML_ASSERT(dst); + GGML_ASSERT(dst->extra); + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; + cl_command_queue queue = backend_ctx->queue; + + if (backend_ctx->kernel_concat_f32_contiguous == nullptr || backend_ctx->kernel_concat_f32_non_contiguous == nullptr) { + GGML_LOG_WARN("%s: concat kernels not available, skipping OpenCL execution.\n", __func__); + return; + } + + ggml_tensor_extra_cl * extra0_cl = (ggml_tensor_extra_cl *)src0->extra; + ggml_tensor_extra_cl * extra1_cl = (ggml_tensor_extra_cl *)src1->extra; + ggml_tensor_extra_cl * extrad_cl = (ggml_tensor_extra_cl *)dst->extra; + + cl_ulong off_src0 = extra0_cl->offset + src0->view_offs; + cl_ulong off_src1 = extra1_cl->offset + src1->view_offs; + cl_ulong off_dst = extrad_cl->offset + dst->view_offs; + + const int32_t dim = ((const int32_t *) dst->op_params)[0]; + GGML_ASSERT(dim >= 0 && dim <= 3); + + if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) { + if (dim == 3) { + + size_t nbytes_src0 = ggml_nbytes(src0); + size_t nbytes_src1 = ggml_nbytes(src1); + + CL_CHECK(clEnqueueCopyBuffer(queue, extra0_cl->data_device, extrad_cl->data_device, + off_src0, off_dst, nbytes_src0, 0, NULL, NULL)); + CL_CHECK(clEnqueueCopyBuffer(queue, extra1_cl->data_device, extrad_cl->data_device, + off_src1, off_dst + nbytes_src0, nbytes_src1, 0, NULL, NULL)); + } else { + + cl_kernel kernel = backend_ctx->kernel_concat_f32_contiguous; + size_t global_work_size[3]; + + for (int i3 = 0; i3 < dst->ne[3]; ++i3) { + cl_ulong current_off_src0 = off_src0 + (i3 * src0->nb[3]); + cl_ulong current_off_src1 = off_src1 + (i3 * src1->nb[3]); + cl_ulong current_off_dst = off_dst + (i3 * dst->nb[3]); + + int d_ne00 = src0->ne[0]; int d_ne01 = src0->ne[1]; int d_ne02 = src0->ne[2]; + int d_ne10 = src1->ne[0]; int d_ne11 = src1->ne[1]; int d_ne12 = src1->ne[2]; + int d_ne0 = dst->ne[0]; int d_ne1 = dst->ne[1]; int d_ne2 = dst->ne[2]; + + 
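/* the contiguous kernel is launched once per i3 slice: args 6-14 carry the per-slice 3D shapes of src0, src1 and dst, and arg 15 the concat dimension */ +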
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_cl->data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &current_off_src0)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1_cl->data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &current_off_src1)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad_cl->data_device)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &current_off_dst)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &d_ne00)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &d_ne01)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &d_ne02)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &d_ne10)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &d_ne11)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &d_ne12)); + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &d_ne0)); + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &d_ne1)); + CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &d_ne2)); + CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &dim)); + + global_work_size[0] = d_ne0; + global_work_size[1] = d_ne1; + global_work_size[2] = d_ne2; + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst); + } + } + } else { + cl_kernel kernel = backend_ctx->kernel_concat_f32_non_contiguous; + + long ne00 = src0->ne[0], ne01 = src0->ne[1], ne02 = src0->ne[2], ne03 = src0->ne[3]; + cl_ulong nb00 = src0->nb[0], nb01 = src0->nb[1], nb02 = src0->nb[2], nb03 = src0->nb[3]; + + cl_ulong nb10 = src1->nb[0], nb11 = src1->nb[1], nb12 = src1->nb[2], nb13 = src1->nb[3]; + + long d_ne0 = dst->ne[0], d_ne1 = dst->ne[1], d_ne2 = dst->ne[2], d_ne3 = dst->ne[3]; + cl_ulong d_nb0 = dst->nb[0], d_nb1 = dst->nb[1], d_nb2 = dst->nb[2], d_nb3 = dst->nb[3]; + + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_cl->data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1_cl->data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_src1)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad_cl->data_device)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &off_dst)); + + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(long), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(long), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(long), &ne02)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(long), &ne03)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01)); + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02)); + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03)); + + CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10)); + CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11)); + CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12)); + CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13)); + + CL_CHECK(clSetKernelArg(kernel, 18, sizeof(long), &d_ne0)); + CL_CHECK(clSetKernelArg(kernel, 19, sizeof(long), &d_ne1)); + CL_CHECK(clSetKernelArg(kernel, 20, sizeof(long), &d_ne2)); + CL_CHECK(clSetKernelArg(kernel, 21, sizeof(long), &d_ne3)); + CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &d_nb0)); + CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_ulong), &d_nb1)); + CL_CHECK(clSetKernelArg(kernel, 24, sizeof(cl_ulong), &d_nb2)); + CL_CHECK(clSetKernelArg(kernel, 25, sizeof(cl_ulong), &d_nb3)); + CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), 
&dim)); + + size_t global_work_size_nc[] = { d_ne1 > 0 ? (size_t)d_ne1 : 1, + d_ne2 > 0 ? (size_t)d_ne2 : 1, + d_ne3 > 0 ? (size_t)d_ne3 : 1 }; + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size_nc, NULL, dst); + } +} + +static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) { + GGML_ASSERT(src0); + GGML_ASSERT(src0->extra); + GGML_ASSERT(dst); + GGML_ASSERT(dst->extra); + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; + + if (backend_ctx->kernel_timestep_embedding == nullptr) { + GGML_LOG_WARN("%s: timestep_embedding kernel not available, skipping OpenCL execution.\n", __func__); + return; + } + + ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra; + ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra; + + cl_ulong off_src0 = extra_src0->offset + src0->view_offs; + cl_ulong off_dst = extra_dst->offset + dst->view_offs; + + const int logical_dim = dst->op_params[0]; + const int max_period = dst->op_params[1]; + const int dst_nb1_bytes = dst->nb[1]; + + cl_kernel kernel = backend_ctx->kernel_timestep_embedding; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra_dst->data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &dst_nb1_bytes)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &logical_dim)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &max_period)); + + size_t gws0 = (size_t)(((logical_dim + 1) / 2) + 1); + + size_t gws1 = (size_t)src0->ne[0]; + + size_t global_work_size[] = {gws0, gws1, 1}; + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst); } static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -4120,7 +4697,6 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co const enum ggml_type src1t = src1 ? 
src1->type : GGML_TYPE_COUNT; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; @@ -4325,15 +4901,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co static_cast<size_t>(padded_height_B) }; - #ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size_t, local_size_t, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_size_t, local_size_t, dst); - #else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size_t, local_size_t, 0, NULL, NULL)); - #endif + backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_size_t, local_size_t, dst); } else { // no need to transpose B in other cases // create an image for B from sub_buffer @@ -4455,16 +5023,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co // enqueue kernel with profiling // <--------------------------------------------> // - #ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); - // enqueue kernel without profiling - #else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); - #endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); // <--------------------------------------------> // // deallocate sub buffers and images @@ -4544,15 +5103,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co global_work_size[2] = (size_t)ne12*ne13; } -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); return; } #else // GGML_OPENCL_SOA_Q @@ -4782,15 +5333,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co size_t global_work_size[] = {(size_t)(ne01 + ndst-1)/ndst*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } else if (src0t == GGML_TYPE_Q4_K) { GGML_ASSERT(false && "not implemented"); } else if (src0t == GGML_TYPE_Q3_K) { @@ -4801,31 +5344,136 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co 
size_t global_work_size[] = {(size_t)(ne01+1)/2*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } else { int64_t ny = (ne11 + nrows - 1)/nrows; size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + } +} - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); +static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0); + GGML_ASSERT(src0->extra); + GGML_ASSERT(src1); + GGML_ASSERT(src1->extra); + GGML_ASSERT(dst); + GGML_ASSERT(dst->extra); + + const ggml_tensor * src2 = dst->src[2]; + GGML_ASSERT(src2); + GGML_ASSERT(src2->extra); + + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; + + ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; + ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra; + ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; + + cl_ulong offset1 = extra1->offset + src1->view_offs; + cl_ulong offset2 = extra2->offset + src2->view_offs; + cl_ulong offsetd = extrad->offset + dst->view_offs; + +#ifdef GGML_OPENCL_SOA_Q + ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra; #endif + + const int ne00 = src0->ne[0]; + const int ne01 = src0->ne[1]; + const int ne02 = src0->ne[2]; + const int ne03 = src0->ne[3]; + + const cl_ulong nb00 = src0->nb[0]; + const cl_ulong nb02 = src0->nb[2]; + + const int ne10 = src1->ne[0]; + const int ne11 = src1->ne[1]; + const int ne12 = src1->ne[2]; + const int ne13 = src1->ne[3]; + + const cl_ulong nb11 = src1->nb[1]; + const cl_ulong nb12 = src1->nb[2]; + + const int ne20 = src2->ne[0]; + const int ne21 = src2->ne[1]; + + const cl_ulong nb21 = src2->nb[1]; + + const int ne0 = dst->ne[0]; + const int ne1 = dst->ne[1]; + + const int r2 = ne12/ne02; + const int r3 = ne13/ne03; + const int dst_rows = ne20*ne21; // ne20 = n_used_experts, ne21 = n_rows + + GGML_ASSERT(ne00 == ne10); + + int sgs = 32; // subgroup size + int nsg = 1; // number of subgroups + int nrows = 1; // number of rows in src1 + int ndst = 4; // number of values produced by each subgroup + + cl_kernel kernel; + + // subgroup mat vec + switch (src0->type) { + case GGML_TYPE_Q4_0: { + kernel = backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat; + + if (backend_ctx->gpu_family == INTEL) { + sgs = 16; + nsg = 1; + ndst = 8; + } else if (backend_ctx->gpu_family == ADRENO) { + sgs = 64; + nsg = 1; 
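+ /* Adreno: 64-wide subgroups (a "half" wave); each subgroup still produces 8 dst rows */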
+ ndst = 8; + } else { + GGML_ASSERT(false && "TODO: Unknown GPU"); + } + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb00)); + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02)); + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10)); + CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne11)); + CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne12)); + CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb11)); + CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb12)); + CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne20)); + CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &ne21)); + CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb21)); + CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &ne0)); + CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne1)); + CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r2)); + CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &r3)); + + break; + } + default: + GGML_ASSERT(false && "not implemented"); } + + int _ne1 = 1; + int ne123 = dst_rows; + + size_t global_work_size[] = {(size_t)(ne01+ndst*nsg-1)/(ndst*nsg)*sgs, (size_t)(_ne1+nrows-1)/nrows*nsg, (size_t)ne123}; + size_t local_work_size[] = {(size_t)sgs, (size_t)nsg, 1}; + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -4838,7 +5486,6 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons GGML_ASSERT(ggml_is_contiguous(src0)); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; float scale; memcpy(&scale, dst->op_params, sizeof(scale)); @@ -4867,15 +5514,7 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. } -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); } static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -4912,7 +5551,6 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const const enum ggml_type src1t = src1 ? 
src1->type : GGML_TYPE_COUNT; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; @@ -4977,15 +5615,7 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)nth, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, src1); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src1); } static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -5008,7 +5638,6 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr const int ne02 = src0 ? src0->ne[2] : 0; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -5032,15 +5661,7 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr size_t global_work_size[] = {(size_t)ne00*ne01*ne02/8, 1, 1}; size_t local_work_size[] = {64, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } else { kernel = backend_ctx->kernel_diag_mask_inf; @@ -5060,15 +5681,7 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. 
} -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); } } @@ -5088,7 +5701,6 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c } ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -5168,15 +5780,7 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)nth, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -5188,7 +5792,6 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const GGML_ASSERT(dst->extra); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; @@ -5354,15 +5957,7 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)nth, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -5377,7 +5972,6 @@ static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, con GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -5446,15 +6040,7 @@ static void 
ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, con size_t global_work_size[] = {(size_t)num_blocks*256, (size_t)OH, (size_t)batch*IC}; size_t local_work_size[] = {256, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -5469,7 +6055,6 @@ static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, co GGML_ASSERT(ggml_is_contiguous(src0)); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -5501,15 +6086,7 @@ static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, co size_t global_work_size[] = {(size_t)ne00_padded, (size_t)nrows, (size_t)1}; size_t local_work_size[] = {(size_t)ne00_padded, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -5523,7 +6100,6 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c GGML_ASSERT(ggml_is_contiguous(src0)); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -5564,15 +6140,7 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c size_t global_work_size[] = {(size_t)ne01, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)64, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } //------------------------------------------------------------------------------ @@ -5667,6 +6235,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor } func = ggml_cl_sigmoid; break; + case GGML_UNARY_OP_TANH: + if (!any_on_device) { + 
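/* no operand resident on the device: defer this op to another backend */ +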
return false; + } + func = ggml_cl_tanh; + break; default: return false; } break; @@ -5694,12 +6268,48 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor } func = ggml_cl_group_norm; break; + case GGML_OP_REPEAT: + if (!any_on_device) { + return false; + } + func = ggml_cl_repeat; + break; + case GGML_OP_PAD: + if (!any_on_device) { + return false; + } + ggml_cl_pad(backend, tensor->src[0], tensor); + return true; + case GGML_OP_UPSCALE: + if (!any_on_device) { + return false; + } + ggml_cl_upscale(backend, tensor->src[0], tensor); + return true; + case GGML_OP_CONCAT: + if (!any_on_device) { + return false; + } + func = ggml_cl_concat; + break; + case GGML_OP_TIMESTEP_EMBEDDING: + if (!any_on_device) { + return false; + } + ggml_cl_timestep_embedding(backend, tensor->src[0], tensor); + return true; case GGML_OP_MUL_MAT: if (!any_on_device && !ggml_cl_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) { return false; } func = ggml_cl_mul_mat; break; + case GGML_OP_MUL_MAT_ID: + if (!any_on_device) { + return false; + } + func = ggml_cl_mul_mat_id; + break; case GGML_OP_SCALE: if (!any_on_device) { return false; diff --git a/ggml/src/ggml-opencl/kernels/concat.cl b/ggml/src/ggml-opencl/kernels/concat.cl new file mode 100644 index 0000000000000..132758469c6fa --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/concat.cl @@ -0,0 +1,109 @@ +kernel void kernel_concat_f32_contiguous( + global const char * p_src0, ulong off_src0, + global const char * p_src1, ulong off_src1, + global char * p_dst, ulong off_dst, + int d_ne00, int d_ne01, int d_ne02, // src0->ne[0..2] for the slice + int d_ne10, int d_ne11, int d_ne12, // src1->ne[0..2] for the slice (d_ne1X must match d_ne0X on non-concat axes) + int d_ne0, int d_ne1, int d_ne2, // dst->ne[0..2] for the slice + int dim +) { + global const float * src0 = (global const float*)((global char*)p_src0 + off_src0); + global const float * src1 = (global const float*)((global char*)p_src1 + off_src1); + global float * dst = (global float*)((global char*)p_dst + off_dst); + + int i0 = get_global_id(0); // Index along dst's 0th dimension + int i1 = get_global_id(1); // Index along dst's 1st dimension + int i2 = get_global_id(2); // Index along dst's 2nd dimension + + if (i0 >= d_ne0 || i1 >= d_ne1 || i2 >= d_ne2) { + return; + } + + ulong dst_idx = (ulong)i2 * d_ne0 * d_ne1 + (ulong)i1 * d_ne0 + i0; + ulong src_idx; + + if (dim == 0) { + if (i0 < d_ne00) { // Data from src0 + src_idx = (ulong)i2 * d_ne00 * d_ne01 + (ulong)i1 * d_ne00 + i0; + dst[dst_idx] = src0[src_idx]; + } else { // Data from src1 + src_idx = (ulong)i2 * d_ne10 * d_ne11 + (ulong)i1 * d_ne10 + (i0 - d_ne00); + dst[dst_idx] = src1[src_idx]; + } + } else if (dim == 1) { + if (i1 < d_ne01) { // Data from src0 + src_idx = (ulong)i2 * d_ne00 * d_ne01 + (ulong)i1 * d_ne00 + i0; + dst[dst_idx] = src0[src_idx]; + } else { // Data from src1 + src_idx = (ulong)i2 * d_ne10 * d_ne11 + (ulong)(i1 - d_ne01) * d_ne10 + i0; + dst[dst_idx] = src1[src_idx]; + } + } else if (dim == 2) { + if (i2 < d_ne02) { // Data from src0 + src_idx = (ulong)i2 * d_ne00 * d_ne01 + (ulong)i1 * d_ne00 + i0; + dst[dst_idx] = src0[src_idx]; + } else { // Data from src1 + + src_idx = (ulong)(i2 - d_ne02) * d_ne10 * d_ne11 + (ulong)i1 * d_ne10 + i0; + dst[dst_idx] = src1[src_idx]; + } + } +} + +kernel void kernel_concat_f32_non_contiguous( + global const char * p_src0, ulong off_src0, + global const char * p_src1, ulong off_src1, + global char * p_dst, ulong off_dst, + + long ne00, long ne01, long 
ne02, long ne03, + ulong nb00, ulong nb01, ulong nb02, ulong nb03, + + ulong nb10, ulong nb11, ulong nb12, ulong nb13, // Strides for src1 + + long d_ne0, long d_ne1, long d_ne2, long d_ne3, + ulong d_nb0, ulong d_nb1, ulong d_nb2, ulong d_nb3, + int dim +) { + global const char * src0_base = p_src0 + off_src0; + global const char * src1_base = p_src1 + off_src1; + global char * dst_base = p_dst + off_dst; + + long current_i1 = get_global_id(0); // Index for dst_dim_1 + long current_i2 = get_global_id(1); // Index for dst_dim_2 + long current_i3 = get_global_id(2); // Index for dst_dim_3 + + if (current_i1 >= d_ne1 || current_i2 >= d_ne2 || current_i3 >= d_ne3) { + return; + } + + global const float * x_val_ptr; + global float * y_val_ptr; + + for (long current_i0 = 0; current_i0 < d_ne0; ++current_i0) { + bool use_src0; + long s_i0 = current_i0, s_i1 = current_i1, s_i2 = current_i2, s_i3 = current_i3; + + if (dim == 0) { + use_src0 = (current_i0 < ne00); + if (!use_src0) { s_i0 = current_i0 - ne00; } + } else if (dim == 1) { + use_src0 = (current_i1 < ne01); + if (!use_src0) { s_i1 = current_i1 - ne01; } + } else if (dim == 2) { + use_src0 = (current_i2 < ne02); + if (!use_src0) { s_i2 = current_i2 - ne02; } + } else { // dim == 3 + use_src0 = (current_i3 < ne03); + if (!use_src0) { s_i3 = current_i3 - ne03; } + } + + if (use_src0) { + x_val_ptr = (global const float *)(src0_base + (ulong)s_i3*nb03 + (ulong)s_i2*nb02 + (ulong)s_i1*nb01 + (ulong)s_i0*nb00); + } else { + x_val_ptr = (global const float *)(src1_base + (ulong)s_i3*nb13 + (ulong)s_i2*nb12 + (ulong)s_i1*nb11 + (ulong)s_i0*nb10); + } + + y_val_ptr = (global float *)(dst_base + (ulong)current_i3*d_nb3 + (ulong)current_i2*d_nb2 + (ulong)current_i1*d_nb1 + (ulong)current_i0*d_nb0); + *y_val_ptr = *x_val_ptr; + } +} diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl new file mode 100644 index 0000000000000..7ccf41efbe918 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl @@ -0,0 +1,283 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +#ifdef cl_intel_subgroups +#pragma OPENCL EXTENSION cl_intel_subgroups : enable +#else +#pragma OPENCL EXTENSION cl_khr_subgroups : enable +#endif + +#ifdef cl_intel_required_subgroup_size +#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable +#define INTEL_GPU 1 +#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16))) +#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32))) +#elif defined(cl_qcom_reqd_sub_group_size) +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable +#define ADRENO_GPU 1 +#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half"))) +#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full"))) +#endif + +#define QK4_0 32 + +typedef char int8_t; +typedef uchar uint8_t; +typedef short int16_t; +typedef ushort uint16_t; +typedef int int32_t; +typedef uint uint32_t; + +//------------------------------------------------------------------------------ +// block_q4_0 +//------------------------------------------------------------------------------ +struct block_q4_0 +{ + half d; + uint8_t qs[QK4_0 / 2]; +}; + +// This function requires the original shuffled weights. +// As a reminder, the original weights are shuffled so that (q[0], q[16]) are +// packed together in a byte, so are (q[1], q[17]) and so on. 
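+// Illustrative scalar reference (not used by this kernel): with that shuffle,
+// byte k of qs packs q[k] in its low nibble and q[k+16] in its high nibble,
+// so a plain dequantization of one block is
+//   out[k]      = d * ((qs[k] & 0x0F) - 8);
+//   out[k + 16] = d * ((qs[k] >> 4)   - 8);
+// The flat dot product below gets the same result without per-value shifts:
+// the y values are pre-scaled by 1, 1/256, 1/16 and 1/4096 so each masked
+// 16-bit load already carries the right magnitude, and the constant -8 offset
+// is folded into the single sumy * -8.f term.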
+inline float block_q_4_0_dot_y_flat( + global uchar * x, + global half * dh, + float sumy, + float16 yl, + int il +) { + float d = *dh; + global ushort * qs = ((global ushort *)x + il/2); + float acc = 0.f; + + acc += yl.s0 * (qs[0] & 0x000F); + acc += yl.s1 * (qs[0] & 0x0F00); + acc += yl.s8 * (qs[0] & 0x00F0); + acc += yl.s9 * (qs[0] & 0xF000); + + acc += yl.s2 * (qs[1] & 0x000F); + acc += yl.s3 * (qs[1] & 0x0F00); + acc += yl.sa * (qs[1] & 0x00F0); + acc += yl.sb * (qs[1] & 0xF000); + + acc += yl.s4 * (qs[2] & 0x000F); + acc += yl.s5 * (qs[2] & 0x0F00); + acc += yl.sc * (qs[2] & 0x00F0); + acc += yl.sd * (qs[2] & 0xF000); + + acc += yl.s6 * (qs[3] & 0x000F); + acc += yl.s7 * (qs[3] & 0x0F00); + acc += yl.se * (qs[3] & 0x00F0); + acc += yl.sf * (qs[3] & 0xF000); + + return d * (sumy * -8.f + acc); +} + +// +// This variant outputs 8 values. +// +#undef N_DST +#undef N_SIMDGROUP +#undef N_SIMDWIDTH + +#ifdef INTEL_GPU +#define N_DST 8 // each SIMD group works on 8 rows +#define N_SIMDGROUP 1 // number of SIMD groups in a thread group +#define N_SIMDWIDTH 16 // subgroup size +#elif defined (ADRENO_GPU) +#define N_DST 8 +#define N_SIMDGROUP 1 +#define N_SIMDWIDTH 64 +#endif + +inline void mul_vec_q_n_f32_8x_flat( + global char * src0_q, + global half * src0_d, + global float * src1, + global float * dst, + int ne00, + int ne01, + int ne02, + int ne10, + int ne12, + int ne0, + int ne1, + int r2, + int r3 +) { + const ulong nb = ne00/QK4_0; + + int r0 = get_group_id(0); + int r1 = get_group_id(1); + int im = 0; + + int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST; + + int i12 = im%ne12; + int i13 = im/ne12; + + // The number of scales is the same as the number of blocks. + ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + // Each block contains QK4_0/2 uchars, hence offset for qs is as follows. 
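+    // Illustrative check (hypothetical sizes): ne00 = 4096 gives nb = 128
+    // blocks per row, so row first_row starts at scale index first_row*128
+    // and at byte index first_row*128*16; i.e. offset0_q is offset0_d scaled
+    // by the QK4_0/2 = 16 bytes each block occupies in src0_q.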
+ ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2; + + global uchar * x = (global uchar *) src0_q + offset0_q; + global half * d = (global half *) src0_d + offset0_d; + global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1; + + float16 yl; + float8 sumf = 0.f; + + int ix = get_sub_group_local_id()/2; + int il = 8*(get_sub_group_local_id()%2); + + global float * yb = y + ix*QK4_0 + il; + + for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) { + float sumy = 0.f; + + sumy += yb[0]; + sumy += yb[1]; + sumy += yb[2]; + sumy += yb[3]; + sumy += yb[4]; + sumy += yb[5]; + sumy += yb[6]; + sumy += yb[7]; + + sumy += yb[16]; + sumy += yb[17]; + sumy += yb[18]; + sumy += yb[19]; + sumy += yb[20]; + sumy += yb[21]; + sumy += yb[22]; + sumy += yb[23]; + + yl.s0 = yb[0]; + yl.s1 = yb[1]/256.f; + + yl.s2 = yb[2]; + yl.s3 = yb[3]/256.f; + + yl.s4 = yb[4]; + yl.s5 = yb[5]/256.f; + + yl.s6 = yb[6]; + yl.s7 = yb[7]/256.f; + + yl.s8 = yb[16]/16.f; + yl.s9 = yb[17]/4096.f; + + yl.sa = yb[18]/16.f; + yl.sb = yb[19]/4096.f; + + yl.sc = yb[20]/16.f; + yl.sd = yb[21]/4096.f; + + yl.se = yb[22]/16.f; + yl.sf = yb[23]/4096.f; + + sumf.s0 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 0*nb*QK4_0/2, d + ib + 0*nb, sumy, yl, il); + sumf.s1 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 1*nb*QK4_0/2, d + ib + 1*nb, sumy, yl, il); + sumf.s2 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 2*nb*QK4_0/2, d + ib + 2*nb, sumy, yl, il); + sumf.s3 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 3*nb*QK4_0/2, d + ib + 3*nb, sumy, yl, il); + + sumf.s4 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 4*nb*QK4_0/2, d + ib + 4*nb, sumy, yl, il); + sumf.s5 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 5*nb*QK4_0/2, d + ib + 5*nb, sumy, yl, il); + sumf.s6 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 6*nb*QK4_0/2, d + ib + 6*nb, sumy, yl, il); + sumf.s7 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 7*nb*QK4_0/2, d + ib + 7*nb, sumy, yl, il); + + yb += QK4_0 * (N_SIMDWIDTH/2); + } + + float8 tot = (float8)( + sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1), + sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3), + sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5), + sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7) + ); + + if (get_sub_group_local_id() == 0) { + if (first_row + 0 < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0; + } + if (first_row + 1 < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1; + } + if (first_row + 2 < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2; + } + if (first_row + 3 < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3; + } + + if (first_row + 4 < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4; + } + if (first_row + 5 < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5; + } + if (first_row + 6 < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6; + } + if (first_row + 7 < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7; + } + } +} + +#ifdef INTEL_GPU +REQD_SUBGROUP_SIZE_16 +#elif defined (ADRENO_GPU) +REQD_SUBGROUP_SIZE_64 +#endif +kernel void kernel_mul_mv_id_q4_0_f32_8x_flat( + global char * src0_q, + global half * src0_d, + global float * src1, + ulong offset1, + global char * src2, + ulong offset2, + global float * dst, + ulong offsetd, + int ne00, + int ne01, + int ne02, + ulong nb00, + ulong nb02, + int ne10, + int ne11, + int ne12, + ulong nb11, + ulong nb12, + int ne20, + int ne21, + ulong nb21, + int ne0, + int ne1, + int r2, + int r3 +) { + 
src1 = (global float *)((global char *)src1 + offset1); + src2 = (global char *)((global char *)src2 + offset2); + dst = (global float *)((global char *)dst + offsetd); + + const int iid1 = get_group_id(2)/ne20; + const int idx = get_group_id(2)%ne20; + + const int i02 = ((global int *)(src2 + iid1*nb21))[idx]; + + const int i11 = idx%ne11; + const int i12 = iid1; + + const int i1 = idx; + const int i2 = i12; + + global char * src0_q_cur = src0_q + (i02*nb02/nb00)*(QK4_0/2); + global half * src0_d_cur = src0_d + (i02*nb02/nb00); + global float * src1_cur = (global float *)((global char *) src1 + i11*nb11 + i12*nb12); + global float * dst_cur = dst + i1*ne0 + i2*ne1*ne0; + + mul_vec_q_n_f32_8x_flat(src0_q_cur, src0_d_cur, src1_cur, dst_cur, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3); +} diff --git a/ggml/src/ggml-opencl/kernels/pad.cl b/ggml/src/ggml-opencl/kernels/pad.cl new file mode 100644 index 0000000000000..747fa7febcc74 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/pad.cl @@ -0,0 +1,30 @@ +kernel void kernel_pad( + global const void * src0_ptr, + ulong src0_offset, + global void * dst_ptr, + ulong dst_offset, + int s_ne0, int s_ne1, int s_ne2, + int d_ne0, int d_ne1, int d_ne2 +) { + global const float * src0 = (global const float *)((global const char *)src0_ptr + src0_offset); + global float * dst = (global float *)((global char *)dst_ptr + dst_offset); + + int nidx = get_global_id(0); + int idx_d1 = get_group_id(1); + int idx_d2 = get_group_id(2); + + if (nidx >= d_ne0) { + return; + } + + int dst_el_offset = nidx + idx_d1 * d_ne0 + idx_d2 * d_ne0 * d_ne1; + + bool in_src_bounds = (nidx < s_ne0) && (idx_d1 < s_ne1) && (idx_d2 < s_ne2); + + if (in_src_bounds) { + int src_el_offset = nidx + idx_d1 * s_ne0 + idx_d2 * s_ne0 * s_ne1; + dst[dst_el_offset] = src0[src_el_offset]; + } else { + dst[dst_el_offset] = 0.0f; + } +} diff --git a/ggml/src/ggml-opencl/kernels/repeat.cl b/ggml/src/ggml-opencl/kernels/repeat.cl new file mode 100644 index 0000000000000..079498f5ab947 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/repeat.cl @@ -0,0 +1,39 @@ +kernel void kernel_repeat( + global const char * src0_data_in, + global char * dst_data_in, + ulong src0_offset, + ulong dst_offset, + int src0_ne0, int src0_ne1, int src0_ne2, int src0_ne3, + ulong src0_nb0, ulong src0_nb1, ulong src0_nb2, ulong src0_nb3, + int dst_ne0, int dst_ne1, int dst_ne2, int dst_ne3, + ulong dst_nb0, ulong dst_nb1, ulong dst_nb2, ulong dst_nb3 +) { + global const char * src0_data = src0_data_in + src0_offset; + global char * dst_data = dst_data_in + dst_offset; + + const int d3 = get_global_id(2); + const int d2 = get_global_id(1); + const int d1 = get_global_id(0); + + if (d3 >= dst_ne3 || d2 >= dst_ne2 || d1 >= dst_ne1) { + return; + } + + const int s3 = d3 % src0_ne3; + const int s2 = d2 % src0_ne2; + const int s1 = d1 % src0_ne1; + + const global char * p_src0_slice = src0_data + (ulong)s3*src0_nb3 + (ulong)s2*src0_nb2 + (ulong)s1*src0_nb1; + global char * p_dst_slice = dst_data + (ulong)d3*dst_nb3 + (ulong)d2*dst_nb2 + (ulong)d1*dst_nb1; + + for (int d0 = 0; d0 < dst_ne0; ++d0) { + // Determine source index for dimension 0 based on tiling/broadcasting. 
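+        // Worked example (illustrative): with src0_ne0 = 2 and dst_ne0 = 6,
+        // d0 = 0..5 maps to s0 = 0,1,0,1,0,1, i.e. the source row is tiled
+        // three times along dimension 0, as GGML_OP_REPEAT requires.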
+ const int s0 = d0 % src0_ne0; + + const global char * restrict current_src_el_ptr = p_src0_slice + (ulong)s0*src0_nb0; + global char * restrict current_dst_el_ptr = p_dst_slice + (ulong)d0*dst_nb0; + for (int k = 0; k < src0_nb0; ++k) { + current_dst_el_ptr[k] = current_src_el_ptr[k]; + } + } +} diff --git a/ggml/src/ggml-opencl/kernels/tanh.cl b/ggml/src/ggml-opencl/kernels/tanh.cl new file mode 100644 index 0000000000000..d9da86b148921 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/tanh.cl @@ -0,0 +1,63 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +#ifdef cl_intel_required_subgroup_size +#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable +#define INTEL_GPU 1 +#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16))) +#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32))) +#elif defined(cl_qcom_reqd_sub_group_size) +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable +#define ADRENO_GPU 1 +#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half"))) +#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full"))) +#endif + +kernel void kernel_tanh_f32_nd( + global void * p_src0_base, ulong off_src0_abs, + global void * p_dst_base, ulong off_dst_abs, + int ne00, int ne01, int ne02, int ne03, + ulong nb00, ulong nb01, ulong nb02, ulong nb03, + int ne10, int ne11, int ne12, int ne13, + ulong nb10, ulong nb11, ulong nb12, ulong nb13 +) { + int i0 = get_global_id(0); + int i1 = get_global_id(1); + int i2 = get_global_id(2); + + if (i0 < ne10 && i1 < ne11 && i2 < ne12) { + for (int i3 = 0; i3 < ne13; ++i3) { + ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03; + global const float *src_val_ptr = (global const float *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor); + + ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13; + global float *dst_val_ptr = (global float *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor); + + *dst_val_ptr = tanh(*src_val_ptr); + } + } +} + +kernel void kernel_tanh_f16_nd( + global void * p_src0_base, ulong off_src0_abs, + global void * p_dst_base, ulong off_dst_abs, + int ne00, int ne01, int ne02, int ne03, + ulong nb00, ulong nb01, ulong nb02, ulong nb03, + int ne10, int ne11, int ne12, int ne13, + ulong nb10, ulong nb11, ulong nb12, ulong nb13 +) { + int i0 = get_global_id(0); + int i1 = get_global_id(1); + int i2 = get_global_id(2); + + if (i0 < ne10 && i1 < ne11 && i2 < ne12) { + for (int i3 = 0; i3 < ne13; ++i3) { + ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03; + global const half *src_val_ptr = (global const half *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor); + + ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13; + global half *dst_val_ptr = (global half *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor); + + *dst_val_ptr = tanh(*src_val_ptr); + } + } +} diff --git a/ggml/src/ggml-opencl/kernels/tsembd.cl b/ggml/src/ggml-opencl/kernels/tsembd.cl new file mode 100644 index 0000000000000..4b1107f70ba7a --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/tsembd.cl @@ -0,0 +1,48 @@ +kernel void kernel_timestep_embedding( + global const void * p_timesteps, + ulong off_timesteps, + global void * p_dst, + ulong off_dst, + int dst_nb1_bytes, + int logical_dim, + int max_period +) { + int local_i; + int 
local_j; + int local_half_dim; + float local_timestep_val; + float local_freq; + float local_arg; + global float * local_embed_data_ptr; + global const float * local_timesteps_input_ptr; + global float * local_dst_output_base_ptr; + + local_timesteps_input_ptr = (global const float *)((global char *)p_timesteps + off_timesteps); + local_dst_output_base_ptr = (global float *)((global char *)p_dst + off_dst); + + local_i = get_global_id(1); + local_j = get_global_id(0); + + local_half_dim = logical_dim / 2; + local_embed_data_ptr = (global float *)((global char *)local_dst_output_base_ptr + local_i * dst_nb1_bytes); + + if (logical_dim % 2 != 0 && local_j == ((logical_dim + 1) / 2)) { + local_embed_data_ptr[logical_dim] = 0.0f; + } + + if (local_j >= local_half_dim) { + return; + } + + local_timestep_val = local_timesteps_input_ptr[local_i]; + + if (local_half_dim == 0) { + local_freq = 1.0f; + } else { + local_freq = exp(-log((float)max_period) * (float)local_j / (float)local_half_dim); + } + + local_arg = local_timestep_val * local_freq; + local_embed_data_ptr[local_j] = cos(local_arg); + local_embed_data_ptr[local_j + local_half_dim] = sin(local_arg); +} diff --git a/ggml/src/ggml-opencl/kernels/upscale.cl b/ggml/src/ggml-opencl/kernels/upscale.cl new file mode 100644 index 0000000000000..219d31dbb9248 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/upscale.cl @@ -0,0 +1,121 @@ +kernel void kernel_upscale( + global const void * p_src0, + ulong off_src0, + global void * p_dst, + ulong off_dst, + ulong nb00, + ulong nb01, + ulong nb02, + ulong nb03, + int ne10, + int ne11, + int ne12, + int ne13, + float sf0, + float sf1, + float sf2, + float sf3 +) { + global const char * src_base = (global const char *)p_src0 + off_src0; + global float * dst_base = (global float *)((global char *)p_dst + off_dst); + + int index = get_global_id(0); + int dst_total_elements = ne10 * ne11 * ne12 * ne13; + + if (index >= dst_total_elements) { + return; + } + + int i10 = index % ne10; + int i11 = (index / ne10) % ne11; + int i12 = (index / (ne10 * ne11)) % ne12; + int i13 = index / (ne10 * ne11 * ne12); + + int i00 = (int)(i10 / sf0); + int i01 = (int)(i11 / sf1); + int i02 = (int)(i12 / sf2); + int i03 = (int)(i13 / sf3); + + ulong offset_src_element = (ulong)i03 * nb03 + (ulong)i02 * nb02 + (ulong)i01 * nb01 + (ulong)i00 * nb00; + global const float * src_element_ptr = (global const float *)(src_base + offset_src_element); + + dst_base[index] = *src_element_ptr; +} + +kernel void kernel_upscale_bilinear( + global const void * p_src0, + ulong off_src0, + global void * p_dst, + ulong off_dst, + ulong nb00, + ulong nb01, + ulong nb02, + ulong nb03, + int ne00_src, + int ne01_src, + int ne10_dst, + int ne11_dst, + int ne12_dst, + int ne13_dst, + float sf0, + float sf1, + float sf2, + float sf3 +) { + global const char * src_base = (global const char *)p_src0 + off_src0; + global float * dst_base = (global float *)((global char *)p_dst + off_dst); + + int index = get_global_id(0); + int dst_total_elements = ne10_dst * ne11_dst * ne12_dst * ne13_dst; + + if (index >= dst_total_elements) { + return; + } + + int i10_dst = index % ne10_dst; + int i11_dst = (index / ne10_dst) % ne11_dst; + int i12_dst = (index / (ne10_dst * ne11_dst)) % ne12_dst; + int i13_dst = index / (ne10_dst * ne11_dst * ne12_dst); + + int i02_src = (int)(i12_dst / sf2); + int i03_src = (int)(i13_dst / sf3); + + const float pixel_offset = 0.5f; + + float y_src_f = ((float)i11_dst + pixel_offset) / sf1 - pixel_offset; + long y0_src = 
(long)floor(y_src_f); + long y1_src = y0_src + 1; + + y0_src = max(0L, min(y0_src, (long)ne01_src - 1)); + y1_src = max(0L, min(y1_src, (long)ne01_src - 1)); + + float dy = y_src_f - (float)y0_src; + dy = max(0.0f, min(dy, 1.0f)); + + float x_src_f = ((float)i10_dst + pixel_offset) / sf0 - pixel_offset; + long x0_src = (long)floor(x_src_f); + long x1_src = x0_src + 1; + + x0_src = max(0L, min(x0_src, (long)ne00_src - 1)); + x1_src = max(0L, min(x1_src, (long)ne00_src - 1)); + + float dx = x_src_f - (float)x0_src; + dx = max(0.0f, min(dx, 1.0f)); + + global const float * p_a = (global const float *)(src_base + (ulong)x0_src * nb00 + (ulong)y0_src * nb01 + (ulong)i02_src * nb02 + (ulong)i03_src * nb03); + global const float * p_b = (global const float *)(src_base + (ulong)x1_src * nb00 + (ulong)y0_src * nb01 + (ulong)i02_src * nb02 + (ulong)i03_src * nb03); + global const float * p_c = (global const float *)(src_base + (ulong)x0_src * nb00 + (ulong)y1_src * nb01 + (ulong)i02_src * nb02 + (ulong)i03_src * nb03); + global const float * p_d = (global const float *)(src_base + (ulong)x1_src * nb00 + (ulong)y1_src * nb01 + (ulong)i02_src * nb02 + (ulong)i03_src * nb03); + + const float val_a = *p_a; + const float val_b = *p_b; + const float val_c = *p_c; + const float val_d = *p_d; + + float result = val_a * (1.0f - dx) * (1.0f - dy) + + val_b * dx * (1.0f - dy) + + val_c * (1.0f - dx) * dy + + val_d * dx * dy; + + dst_base[index] = result; +} diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 84ec6dfe31bfc..e389a46dbed87 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -2425,8 +2425,6 @@ void dequantize_row_iq1_m(const block_iq1_m * GGML_RESTRICT x, float * GGML_REST } } -static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; - void dequantize_row_iq4_nl(const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK4_NL == 0); const int64_t nb = k / QK4_NL; diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp index 4f0abb5a60f48..f468f796d5773 100644 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp @@ -53,6 +53,9 @@ struct socket_t { } }; +// macro for nicer error messages on server crash +#define RPC_STATUS_ASSERT(x) if (!(x)) GGML_ABORT("Remote RPC server crashed or returned malformed response") + // all RPC structures must be packed #pragma pack(push, 1) // ggml_tensor is serialized into rpc_tensor @@ -425,7 +428,7 @@ static bool send_rpc_cmd(const std::shared_ptr & sock, enum rpc_cmd cm static bool check_server_version(const std::shared_ptr & sock) { rpc_msg_hello_rsp response; bool status = send_rpc_cmd(sock, RPC_CMD_HELLO, nullptr, 0, &response, sizeof(response)); - GGML_ASSERT(status); + RPC_STATUS_ASSERT(status); if (response.major != RPC_PROTO_MAJOR_VERSION || response.minor > RPC_PROTO_MINOR_VERSION) { fprintf(stderr, "RPC server version mismatch: %d.%d.%d\n", response.major, response.minor, response.patch); return false; @@ -481,7 +484,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context; rpc_msg_free_buffer_req request = {ctx->remote_ptr}; bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0); - GGML_ASSERT(status); + RPC_STATUS_ASSERT(status); delete ctx; } @@ -493,7 +496,7 @@ static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t 
buffer) { rpc_msg_buffer_get_base_req request = {ctx->remote_ptr}; rpc_msg_buffer_get_base_rsp response; bool status = send_rpc_cmd(ctx->sock, RPC_CMD_BUFFER_GET_BASE, &request, sizeof(request), &response, sizeof(response)); - GGML_ASSERT(status); + RPC_STATUS_ASSERT(status); ctx->base_ptr = reinterpret_cast(response.base_ptr); return ctx->base_ptr; } @@ -545,7 +548,7 @@ static enum ggml_status ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_ request.tensor = serialize_tensor(tensor); bool status = send_rpc_cmd(ctx->sock, RPC_CMD_INIT_TENSOR, &request, sizeof(request), nullptr, 0); - GGML_ASSERT(status); + RPC_STATUS_ASSERT(status); } return GGML_STATUS_SUCCESS; } @@ -560,7 +563,7 @@ static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggm request.hash = fnv_hash((const uint8_t*)data, size); rpc_msg_set_tensor_hash_rsp response; bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR_HASH, &request, sizeof(request), &response, sizeof(response)); - GGML_ASSERT(status); + RPC_STATUS_ASSERT(status); if (response.result) { // the server has the same data, no need to send it return; @@ -573,7 +576,7 @@ static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggm memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset)); memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), data, size); bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR, input.data(), input.size()); - GGML_ASSERT(status); + RPC_STATUS_ASSERT(status); } static void ggml_backend_rpc_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { @@ -583,7 +586,7 @@ static void ggml_backend_rpc_buffer_get_tensor(ggml_backend_buffer_t buffer, con request.offset = offset; request.size = size; bool status = send_rpc_cmd(ctx->sock, RPC_CMD_GET_TENSOR, &request, sizeof(request), data, size); - GGML_ASSERT(status); + RPC_STATUS_ASSERT(status); } static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { @@ -601,7 +604,7 @@ static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con request.dst = serialize_tensor(dst); rpc_msg_copy_tensor_rsp response; bool status = send_rpc_cmd(ctx->sock, RPC_CMD_COPY_TENSOR, &request, sizeof(request), &response, sizeof(response)); - GGML_ASSERT(status); + RPC_STATUS_ASSERT(status); return response.result; } @@ -609,7 +612,7 @@ static void ggml_backend_rpc_buffer_clear(ggml_backend_buffer_t buffer, uint8_t ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context; rpc_msg_buffer_clear_req request = {ctx->remote_ptr, value}; bool status = send_rpc_cmd(ctx->sock, RPC_CMD_BUFFER_CLEAR, &request, sizeof(request), nullptr, 0); - GGML_ASSERT(status); + RPC_STATUS_ASSERT(status); } static ggml_backend_buffer_i ggml_backend_rpc_buffer_interface = { @@ -635,7 +638,7 @@ static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_back rpc_msg_alloc_buffer_rsp response; auto sock = get_socket(buft_ctx->endpoint); bool status = send_rpc_cmd(sock, RPC_CMD_ALLOC_BUFFER, &request, sizeof(request), &response, sizeof(response)); - GGML_ASSERT(status); + RPC_STATUS_ASSERT(status); if (response.remote_ptr != 0) { ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft, ggml_backend_rpc_buffer_interface, @@ -650,7 +653,7 @@ static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_back static size_t get_alignment(const std::shared_ptr & sock) { 
rpc_msg_get_alignment_rsp response; bool status = send_rpc_cmd(sock, RPC_CMD_GET_ALIGNMENT, nullptr, 0, &response, sizeof(response)); - GGML_ASSERT(status); + RPC_STATUS_ASSERT(status); return response.alignment; } @@ -662,7 +665,7 @@ static size_t ggml_backend_rpc_buffer_type_get_alignment(ggml_backend_buffer_typ static size_t get_max_size(const std::shared_ptr & sock) { rpc_msg_get_max_size_rsp response; bool status = send_rpc_cmd(sock, RPC_CMD_GET_MAX_SIZE, nullptr, 0, &response, sizeof(response)); - GGML_ASSERT(status); + RPC_STATUS_ASSERT(status); return response.max_size; } @@ -683,7 +686,7 @@ static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend_buffer_ty rpc_msg_get_alloc_size_rsp response; bool status = send_rpc_cmd(sock, RPC_CMD_GET_ALLOC_SIZE, &request, sizeof(request), &response, sizeof(response)); - GGML_ASSERT(status); + RPC_STATUS_ASSERT(status); return response.alloc_size; } else { @@ -761,7 +764,7 @@ static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, g rpc_msg_graph_compute_rsp response; auto sock = get_socket(rpc_ctx->endpoint); bool status = send_rpc_cmd(sock, RPC_CMD_GRAPH_COMPUTE, input.data(), input.size(), &response, sizeof(response)); - GGML_ASSERT(status); + RPC_STATUS_ASSERT(status); return (enum ggml_status)response.result; } @@ -835,7 +838,7 @@ bool ggml_backend_is_rpc(ggml_backend_t backend) { static void get_device_memory(const std::shared_ptr & sock, size_t * free, size_t * total) { rpc_msg_get_device_memory_rsp response; bool status = send_rpc_cmd(sock, RPC_CMD_GET_DEVICE_MEMORY, nullptr, 0, &response, sizeof(response)); - GGML_ASSERT(status); + RPC_STATUS_ASSERT(status); *free = response.free_mem; *total = response.total_mem; } diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt index a2e26124802b2..efd78b912cc65 100644 --- a/ggml/src/ggml-sycl/CMakeLists.txt +++ b/ggml/src/ggml-sycl/CMakeLists.txt @@ -13,7 +13,7 @@ elseif(SUPPORTS_SYCL) If you expected the oneAPI Release compiler, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh") else() - message(FATAL_ERROR, "C++ compiler lacks SYCL support.") + message(FATAL_ERROR "C++ compiler lacks SYCL support.") endif() message(STATUS "SYCL found") #todo: AOT @@ -142,7 +142,7 @@ else() FetchContent_Declare( ONEMATH GIT_REPOSITORY https://github.com/uxlfoundation/oneMath.git - GIT_TAG c255b1b4c41e2ee3059455c1f96a965d6a62568a + GIT_TAG 8efe85f5aaebb37f1d8c503b7af66315feabf142 ) FetchContent_MakeAvailable(ONEMATH) # Create alias to match with find_package targets name @@ -170,7 +170,7 @@ else() target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_NVIDIA) elseif (GGML_SYCL_TARGET STREQUAL "AMD") if (NOT GGML_SYCL_DEVICE_ARCH) - message(ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.") + message(FATAL_ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.") endif() target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_rocblas) target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa") diff --git a/ggml/src/ggml-sycl/binbcast.cpp b/ggml/src/ggml-sycl/binbcast.cpp index 0a3883ae1eda5..741630dba342c 100644 --- a/ggml/src/ggml-sycl/binbcast.cpp +++ b/ggml/src/ggml-sycl/binbcast.cpp @@ -225,9 +225,9 @@ struct bin_bcast_sycl { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) * - sycl::range<3>(1, 1, block_size), + sycl_parallel_for( + stream, + 
sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) * sycl::range<3>(1, 1, block_size), sycl::range<3>(1, 1, block_size)), [=](sycl::nd_item<3> item_ct1) { k_bin_bcast_unravel( @@ -246,9 +246,8 @@ struct bin_bcast_sycl { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { k_bin_bcast(src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3, ne10, ne11, ne12, ne13, s1, s2, s3, s01, s02, s03, s11, s12, s13, diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 15ee9dc69d149..4e7449d06ecfe 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -149,8 +149,6 @@ typedef sycl::float2 dfloat2; #define MMVQ_MAX_BATCH_SIZE 8 -static const int8_t kvalues_iq4nl[16]={-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; - static int g_all_sycl_device_count = -1; static bool g_ggml_backend_sycl_buffer_type_initialized = false; @@ -201,7 +199,7 @@ struct sycl_device_info { // size_t smpb; // max. shared memory per block bool vmm; // virtual memory support size_t total_vram; - sycl_hw_info hw_info; + //sycl_hw_info hw_info; \\ device id and aarch, currently not used optimize_feature opt_feature; }; @@ -288,29 +286,6 @@ struct ggml_tensor_extra_gpu { void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector streams={}); -inline optimize_feature check_gpu_optimize_feature(syclex::architecture &arch) { - optimize_feature opt; - - opt.reorder = - (arch == syclex::architecture::intel_gpu_dg1 || - arch == syclex::architecture::intel_gpu_acm_g10 || - arch == syclex::architecture::intel_gpu_acm_g11 || - arch == syclex::architecture::intel_gpu_acm_g12 || - arch == syclex::architecture::intel_gpu_pvc || - arch == syclex::architecture::intel_gpu_pvc_vg || - arch == syclex::architecture::intel_gpu_mtl_u || - arch == syclex::architecture::intel_gpu_mtl_s || - arch == syclex::architecture::intel_gpu_mtl_h || - arch == syclex::architecture::intel_gpu_arl_u || - arch == syclex::architecture::intel_gpu_arl_s || - arch == syclex::architecture::intel_gpu_arl_h || - arch == syclex::architecture::intel_gpu_bmg_g21 || - arch == syclex::architecture::intel_gpu_lnl_m - ); - - return opt; -} - namespace sycl_ex = sycl::ext::oneapi::experimental; struct ggml_backend_sycl_context { int device; @@ -515,9 +490,9 @@ constexpr size_t ceil_div(const size_t m, const size_t n) { bool gpu_has_xmx(sycl::device &dev); -template void debug_print_array(const std::string & prefix, const T array[N]) { +template std::string debug_get_array_str(const std::string & prefix, const T array[N]) { if (LIKELY(!g_ggml_sycl_debug)) { - return; + return ""; } std::stringstream ss; ss << prefix << "=["; @@ -528,29 +503,26 @@ template void debug_print_array(const std::string & prefix, con ss << array[N - 1]; } ss << "]"; - GGML_SYCL_DEBUG("%s", ss.str().c_str()); + return ss.str(); } -inline void debug_print_tensor(const std::string & prefix, const ggml_tensor * tensor, - const std::string & suffix = "") { - if (LIKELY(!g_ggml_sycl_debug)) { - return; - } - GGML_SYCL_DEBUG("%s=", prefix.c_str()); +inline std::string debug_get_tensor_str(const std::string &prefix, + const ggml_tensor *tensor, const std::string &suffix = "") { + std::stringstream ss; + if (LIKELY(!g_ggml_sycl_debug)) { return ss.str(); } + ss << prefix.c_str() << "="; if 
(tensor) { - GGML_SYCL_DEBUG("'%s':type=%s", tensor->name, ggml_type_name(tensor->type)); - debug_print_array(";ne", tensor->ne); - debug_print_array(";nb", tensor->nb); - if (!ggml_is_contiguous(tensor)) { - GGML_SYCL_DEBUG(";strided"); - } - if (ggml_is_permuted(tensor)) { - GGML_SYCL_DEBUG(";permuted"); - } + ss << "'" << tensor->name << "':type=" << ggml_type_name(tensor->type); + ss << debug_get_array_str(";ne", tensor->ne); + ss << debug_get_array_str(";nb", tensor->nb); + + if (!ggml_is_contiguous(tensor)) { ss << ";strided"; } + if (ggml_is_permuted(tensor)) { ss << ";permuted"; } } else { - GGML_SYCL_DEBUG("nullptr"); + ss << "nullptr"; } - GGML_SYCL_DEBUG("%s", suffix.c_str()); + ss << suffix; + return ss.str(); } // Use scope_op_debug_print to log operations coming from running a model @@ -566,10 +538,10 @@ struct scope_op_debug_print { return; } GGML_SYCL_DEBUG("[SYCL][OP] call %s%s:", func.data(), func_suffix.data()); - debug_print_tensor(" dst", dst); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(" dst", dst).c_str()); if (dst) { for (std::size_t i = 0; i < num_src; ++i) { - debug_print_tensor("\tsrc" + std::to_string(i), dst->src[i]); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str("\tsrc" + std::to_string(i), dst->src[i]).c_str()); } } GGML_SYCL_DEBUG("%s\n", suffix.data()); diff --git a/ggml/src/ggml-sycl/concat.cpp b/ggml/src/ggml-sycl/concat.cpp index 7aa91c861d583..3501484a14611 100644 --- a/ggml/src/ggml-sycl/concat.cpp +++ b/ggml/src/ggml-sycl/concat.cpp @@ -89,33 +89,24 @@ static void concat_f32_sycl(const float *x, const float *y, float *dst, sycl::range<3> gridDim(ne2, ne1, num_blocks); switch (dim) { case 0: - stream->parallel_for( - sycl::nd_range<3>(gridDim * - sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - concat_f32_dim0(x, y, dst, ne0, ne00, item_ct1); - }); - break; + sycl_parallel_for(stream, + sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { concat_f32_dim0(x, y, dst, ne0, ne00, item_ct1); }); + break; case 1: - stream->parallel_for( - sycl::nd_range<3>(gridDim * - sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - concat_f32_dim1(x, y, dst, ne0, ne01, item_ct1); - }); - break; + sycl_parallel_for(stream, + sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { concat_f32_dim1(x, y, dst, ne0, ne01, item_ct1); }); + break; // dim >=2 will be dispatched to the default path default: - stream->parallel_for( - sycl::nd_range<3>(gridDim * - sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - concat_f32_dim2(x, y, dst, ne0, ne02, item_ct1); - }); - break; + sycl_parallel_for(stream, + sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { concat_f32_dim2(x, y, dst, ne0, ne02, item_ct1); }); + break; } } @@ -129,33 +120,29 @@ static void concat_f32_sycl_non_cont( int64_t ne2, int64_t ne3, uint64_t nb0, uint64_t nb1, uint64_t nb2, uint64_t nb3, int32_t dim) { sycl::range<3> gridDim(ne3, ne2, ne1); - stream->parallel_for( - sycl::nd_range<3>(gridDim, sycl::range<3>(1, 1, 1)), - [=](sycl::nd_item<3> item_ct1) { - int64_t 
i3 = item_ct1.get_group(0); - int64_t i2 = item_ct1.get_group(1); - int64_t i1 = item_ct1.get_group(2); + sycl_parallel_for(stream, sycl::nd_range<3>(gridDim, sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) { + int64_t i3 = item_ct1.get_group(0); + int64_t i2 = item_ct1.get_group(1); + int64_t i1 = item_ct1.get_group(2); - int64_t o[4] = {0, 0, 0, 0}; - o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03)); + int64_t o[4] = { 0, 0, 0, 0 }; + o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03)); - const float *x; + const float * x; - for (int i0 = item_ct1.get_local_id(2); i0 < ne0; - i0 += item_ct1.get_local_range(2)) { + for (int i0 = item_ct1.get_local_id(2); i0 < ne0; i0 += item_ct1.get_local_range(2)) { if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { - x = (const float *)(src0 + (i3)*nb03 + (i2)*nb02 + (i1)*nb01 + - (i0)*nb00); + x = (const float *) (src0 + (i3) *nb03 + (i2) *nb02 + (i1) *nb01 + (i0) *nb00); } else { - x = (const float *)(src1 + (i3 - o[3]) * nb13 + (i2 - o[2]) * nb12 + - (i1 - o[1]) * nb11 + (i0 - o[0]) * nb10); + x = (const float *) (src1 + (i3 - o[3]) * nb13 + (i2 - o[2]) * nb12 + (i1 - o[1]) * nb11 + + (i0 - o[0]) * nb10); } float *y = (float *)(dst + i3 * nb3 + i2 * nb2 + i1 * nb1 + i0 * nb0); *y = *x; - } - }); + } + }); } void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { diff --git a/ggml/src/ggml-sycl/conv.cpp b/ggml/src/ggml-sycl/conv.cpp index 475bd34a25d56..c2f991e8d64a7 100644 --- a/ggml/src/ggml-sycl/conv.cpp +++ b/ggml/src/ggml-sycl/conv.cpp @@ -59,16 +59,10 @@ static void conv_transpose_1d_f32_f32_sycl( const int num_blocks = (output_size + SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE - 1) / SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE; const sycl::range<3> block_dims(1, 1, SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE); const sycl::range<3> block_nums(1, 1, num_blocks); - stream->parallel_for( - sycl::nd_range<3>( - block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - conv_transpose_1d_kernel( - s0, output_size, - src0_ne0, src0_ne1, src0_ne2, - src1_ne0, dst_ne0, - src0, src1, dst, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { + conv_transpose_1d_kernel(s0, output_size, src0_ne0, src0_ne1, src0_ne2, src1_ne0, dst_ne0, src0, src1, dst, + item_ct1); + }); } void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { diff --git a/ggml/src/ggml-sycl/convert.cpp b/ggml/src/ggml-sycl/convert.cpp index 75bac98e5fb64..0ef567122dddb 100644 --- a/ggml/src/ggml-sycl/convert.cpp +++ b/ggml/src/ggml-sycl/convert.cpp @@ -33,14 +33,11 @@ static void dequantize_block_sycl(const void *__restrict__ vx, { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for( - sycl::nd_range<3>( - sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block(vx, y, k, item_ct1); - }); + sycl_parallel_for( + stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { dequantize_block(vx, y, k, item_ct1); }); } } @@ -53,24 +50,18 @@ static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int64_t k, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - 
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 64), - sycl::range<3>(1, 1, 64)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q2_K(vx, y, item_ct1); - }); + sycl_parallel_for( + stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)), + [=](sycl::nd_item<3> item_ct1) { dequantize_block_q2_K(vx, y, item_ct1); }); } #else { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q2_K(vx, y, item_ct1); - }); + sycl_parallel_for( + stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { dequantize_block_q2_K(vx, y, item_ct1); }); } #endif @@ -85,24 +76,18 @@ static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int64_t k, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 64), - sycl::range<3>(1, 1, 64)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q3_K(vx, y, item_ct1); - }); + sycl_parallel_for( + stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)), + [=](sycl::nd_item<3> item_ct1) { dequantize_block_q3_K(vx, y, item_ct1); }); } #else { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q3_K(vx, y, item_ct1); - }); + sycl_parallel_for( + stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { dequantize_block_q3_K(vx, y, item_ct1); }); } #endif } @@ -116,12 +101,9 @@ static void dequantize_row_q4_0_sycl(const void *vx, dst_t *y, const int64_t k, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q4_0(vx, y, nb32, item_ct1); - }); + sycl_parallel_for( + stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { dequantize_block_q4_0(vx, y, nb32, item_ct1); }); } } @@ -135,13 +117,12 @@ static void dequantize_row_q4_0_sycl_reorder(const void *vx, dst_t *y, const int int constexpr WARP_K = WARP_SIZE * QK4_0; const int n_warp = (k + WARP_K - 1) / WARP_K; GGML_ASSERT(k % 2 == 0); - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, n_warp) * - sycl::range<3>(1, 1, WARP_SIZE), - sycl::range<3>(1, 1, WARP_SIZE)), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]]{ - dequantize_block_q4_0_reorder(vx, y, k, item_ct1); - }); - + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, n_warp) * sycl::range<3>(1, 1, WARP_SIZE), + sycl::range<3>(1, 1, WARP_SIZE)), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + dequantize_block_q4_0_reorder(vx, y, k, item_ct1); + }); } template @@ -153,12 +134,9 @@ static void dequantize_row_q4_1_sycl(const void *vx, dst_t *y, const int64_t k, 
dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q4_1(vx, y, nb32, item_ct1); - }); + sycl_parallel_for( + stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { dequantize_block_q4_1(vx, y, nb32, item_ct1); }); } } @@ -171,14 +149,13 @@ static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int64_t k, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor scale_local_acc(sycl::range<1>(12), cgh); - cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q4_K(vx, y, get_pointer(scale_local_acc), item_ct1); - }); + sycl_parallel_for( + cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q4_K(vx, y, get_pointer(scale_local_acc), item_ct1); + }); }); } } @@ -191,13 +168,13 @@ static void dequantize_row_q4_K_sycl_reorder(const void * vx, dst_t * y, const i dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); - stream->submit([&](sycl::handler & cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor scale_local_acc(sycl::range<1>(12), cgh); - cgh.parallel_for(sycl::nd_range<1>(sycl::range<1>(global_size), sycl::range<1>(local_size)), - [=](sycl::nd_item<1> item_ct1) { - dequantize_block_q4_K_reorder(vx, y, get_pointer(scale_local_acc), item_ct1, nb); - }); + sycl_parallel_for<1>(cgh, sycl::nd_range<1>(sycl::range<1>(global_size), sycl::range<1>(local_size)), + [=](sycl::nd_item<1> item_ct1) { + dequantize_block_q4_K_reorder(vx, y, get_pointer(scale_local_acc), item_ct1, nb); + }); }); } @@ -210,24 +187,18 @@ static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int64_t k, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 64), - sycl::range<3>(1, 1, 64)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q5_K(vx, y, item_ct1); - }); + sycl_parallel_for( + stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)), + [=](sycl::nd_item<3> item_ct1) { dequantize_block_q5_K(vx, y, item_ct1); }); } #else { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q5_K(vx, y, item_ct1); - }); + sycl_parallel_for( + stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { dequantize_block_q5_K(vx, y, item_ct1); }); } #endif @@ -242,29 +213,34 @@ static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int64_t k, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 64), - sycl::range<3>(1, 1, 64)), - [=](sycl::nd_item<3> item_ct1) { - 
dequantize_block_q6_K(vx, y, item_ct1); - }); + sycl_parallel_for( + stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)), + [=](sycl::nd_item<3> item_ct1) { dequantize_block_q6_K(vx, y, item_ct1); }); } #else { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q6_K(vx, y, item_ct1); - }); + sycl_parallel_for( + stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { dequantize_block_q6_K(vx, y, item_ct1); }); } #endif } +template +static void dequantize_row_q6_K_sycl_reorder(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr stream) { + const int64_t nb = k / QK_K; + + dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); + + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)), + [=](sycl::nd_item<3> item_ct1) { dequantize_block_q6_K_reorder(vx, y, item_ct1, nb); }); +} + template static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int64_t k, dpct::queue_ptr stream) { @@ -273,15 +249,10 @@ static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int64_t k, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_iq1_s( - vx, y, item_ct1, iq1s_grid_gpu - ); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq1_s(vx, y, item_ct1, iq1s_grid_gpu); }); }); } } @@ -294,15 +265,10 @@ static void dequantize_row_iq1_m_sycl(const void *vx, dst_t *y, const int64_t k, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_iq1_m( - vx, y, item_ct1, iq1s_grid_gpu - ); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq1_m(vx, y, item_ct1, iq1s_grid_gpu); }); }); } } @@ -315,15 +281,12 @@ static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int64_t dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_iq2_xxs( - vx, y, item_ct1, iq2xxs_grid, - ksigns_iq2xs, kmask_iq2xs); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq2_xxs(vx, y, item_ct1, iq2xxs_grid, ksigns_iq2xs, 
kmask_iq2xs); + }); }); } } @@ -336,15 +299,12 @@ static void dequantize_row_iq2_xs_sycl(const void *vx, dst_t *y, const int64_t k dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_iq2_xs( - vx, y, item_ct1, iq2xs_grid, - ksigns_iq2xs, kmask_iq2xs); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq2_xs(vx, y, item_ct1, iq2xs_grid, ksigns_iq2xs, kmask_iq2xs); + }); }); } } @@ -357,13 +317,10 @@ static void dequantize_row_iq2_s_sycl(const void *vx, dst_t *y, const int64_t k, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_iq2_s(vx, y, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq2_s(vx, y, item_ct1); }); }); } } @@ -377,15 +334,12 @@ static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int64_t dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_iq3_xxs( - vx, y, item_ct1, iq3xxs_grid, - ksigns_iq2xs, kmask_iq2xs); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq3_xxs(vx, y, item_ct1, iq3xxs_grid, ksigns_iq2xs, kmask_iq2xs); + }); }); } } @@ -398,14 +352,10 @@ static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int64_t k, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_iq3_s( - vx, y, item_ct1, kmask_iq2xs, iq3s_grid); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq3_s(vx, y, item_ct1, kmask_iq2xs, iq3s_grid); }); }); } } @@ -421,14 +371,11 @@ static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int64_t k dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_iq4_xs(vx, y, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for( + cgh, + 
sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq4_xs(vx, y, item_ct1); }); }); } #endif @@ -442,14 +389,11 @@ static void dequantize_row_iq4_nl_sycl(const void *vx, dst_t *y, const int64_t k dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_iq4_nl(vx, y, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for( + cgh, + sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq4_nl(vx, y, item_ct1); }); }); } } @@ -530,7 +474,11 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) { case GGML_TYPE_Q5_K: return dequantize_row_q5_K_sycl; case GGML_TYPE_Q6_K: - return dequantize_row_q6_K_sycl; + if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) { + return dequantize_row_q6_K_sycl_reorder; + } else { + return dequantize_row_q6_K_sycl; + } case GGML_TYPE_IQ1_S: return dequantize_row_iq1_s_sycl; case GGML_TYPE_IQ1_M: @@ -587,7 +535,11 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) { case GGML_TYPE_Q5_K: return dequantize_row_q5_K_sycl; case GGML_TYPE_Q6_K: - return dequantize_row_q6_K_sycl; + if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) { + return dequantize_row_q6_K_sycl_reorder; + } else { + return dequantize_row_q6_K_sycl; + } case GGML_TYPE_IQ1_S: return dequantize_row_iq1_s_sycl; case GGML_TYPE_IQ1_M: diff --git a/ggml/src/ggml-sycl/cpy.cpp b/ggml/src/ggml-sycl/cpy.cpp index 44487c25646d6..1ffd7f1226724 100644 --- a/ggml/src/ggml-sycl/cpy.cpp +++ b/ggml/src/ggml-sycl/cpy.cpp @@ -1,8 +1,12 @@ #include "cpy.hpp" #include +#include #include "dequantize.hpp" +#include "ggml-sycl/common.hpp" +#include "ggml-sycl/presets.hpp" +#include "ggml.h" static __dpct_inline__ int best_index_int8(int n, const int8_t * val, float x) { if (x <= val[0]) { @@ -116,6 +120,15 @@ static void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) { } } +/* quantized type same copy */ +template +static void cpy_blck_q_q(const char * cxi, char * cdsti) { + const T * xi = (const T *) cxi; + T * dsti = (T *) cdsti; + *dsti = *xi; +} + + static void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) { float * cdstf = (float *) (cdsti); @@ -311,6 +324,34 @@ template static void cpy_blck_q_f32(const } } + +template +static void cpy_q_q(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02, + const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11, + const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, + const sycl::nd_item<3> & item_ct1) { + const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2)) * qk; + + if (i >= ne) { + return; + } + + const int i03 = i / (ne00 * ne01 * ne02); + const int i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01); + const int i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00) / ne00; + const int i00 = i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00 - i01 * ne00; + const int x_offset = (i00 / qk) * nb00 + i01 * nb01 + i02 * nb02 + i03 
* nb03; + + + const int i13 = i / (ne10 * ne11 * ne12); + const int i12 = (i - i13 * ne10 * ne11 * ne12) / (ne10 * ne11); + const int i11 = (i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11) / ne10; + const int i10 = i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11 - i11 * ne10; + const int dst_offset = (i10 / qk) * nb10 + i11 * nb11 + i12 * nb12 + i13 * nb13; + + cpy_blck_q_q(cx + x_offset, cdst + dst_offset); +} + template static void cpy_f32_q(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11, @@ -322,6 +363,7 @@ static void cpy_f32_q(const char * cx, char * cdst, const int ne, const int ne00 return; } + const int i03 = i / (ne00 * ne01 * ne02); const int i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01); const int i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00) / ne00; @@ -371,7 +413,8 @@ static void ggml_cpy_f16_f32_sycl(const char * cx, char * cdst, const int ne, co { dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); - stream->parallel_for( + sycl_parallel_for( + stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) { @@ -389,7 +432,8 @@ static void ggml_cpy_f32_f32_sycl(const char * cx, char * cdst, const int ne, co { dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); - stream->parallel_for( + sycl_parallel_for( + stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) { @@ -407,7 +451,8 @@ static void ggml_cpy_f32_f16_sycl(const char * cx, char * cdst, const int ne, co { dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); - stream->parallel_for( + sycl_parallel_for( + stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) { @@ -423,11 +468,11 @@ static void ggml_cpy_f32_q8_0_sycl(const char * cx, char * cdst, const int ne, c const int nb12, const int nb13, queue_ptr stream) { GGML_ASSERT(ne % QK8_0 == 0); const int num_blocks = ne / QK8_0; - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, - ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, + ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); + }); } static void ggml_cpy_q8_0_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, @@ -435,11 +480,11 @@ static void ggml_cpy_q8_0_f32_sycl(const char * cx, char * cdst, const int ne, c const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, queue_ptr stream) { const int num_blocks = ne; - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), - [=](sycl::nd_item<3> item_ct1) { - cpy_q_f32(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, - ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); - }); + 
sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + cpy_q_f32(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, + ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); + }); } static void ggml_cpy_f32_q4_0_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, @@ -448,11 +493,11 @@ static void ggml_cpy_f32_q4_0_sycl(const char * cx, char * cdst, const int ne, c const int nb12, const int nb13, queue_ptr stream) { GGML_ASSERT(ne % QK4_0 == 0); const int num_blocks = ne / QK4_0; - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, - ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, + ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); + }); } static void ggml_cpy_q4_0_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, @@ -460,8 +505,9 @@ static void ggml_cpy_q4_0_f32_sycl(const char * cx, char * cdst, const int ne, c const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, queue_ptr stream) { const int num_blocks = ne; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { cpy_q_f32, QK4_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); @@ -474,11 +520,11 @@ static void ggml_cpy_f32_q4_1_sycl(const char * cx, char * cdst, const int ne, c const int nb12, const int nb13, queue_ptr stream) { GGML_ASSERT(ne % QK4_1 == 0); const int num_blocks = ne / QK4_1; - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, - ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, + ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); + }); } static void ggml_cpy_q4_1_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, @@ -486,8 +532,9 @@ static void ggml_cpy_q4_1_f32_sycl(const char * cx, char * cdst, const int ne, c const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, queue_ptr stream) { const int num_blocks = ne; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { cpy_q_f32, QK4_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); @@ -500,11 +547,11 @@ static void ggml_cpy_f32_q5_0_sycl(const char * cx, char * cdst, const int ne, 
c const int nb12, const int nb13, queue_ptr stream) { GGML_ASSERT(ne % QK5_0 == 0); const int num_blocks = ne / QK5_0; - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, - ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, + ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); + }); } static void ggml_cpy_q5_0_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, @@ -512,8 +559,9 @@ static void ggml_cpy_q5_0_f32_sycl(const char * cx, char * cdst, const int ne, c const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, queue_ptr stream) { const int num_blocks = ne; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { cpy_q_f32, QK5_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); @@ -526,11 +574,11 @@ static void ggml_cpy_f32_q5_1_sycl(const char * cx, char * cdst, const int ne, c const int nb12, const int nb13, queue_ptr stream) { GGML_ASSERT(ne % QK5_1 == 0); const int num_blocks = ne / QK5_1; - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, - ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, + ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); + }); } static void ggml_cpy_q5_1_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, @@ -538,8 +586,9 @@ static void ggml_cpy_q5_1_f32_sycl(const char * cx, char * cdst, const int ne, c const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, queue_ptr stream) { const int num_blocks = ne; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { cpy_q_f32, QK5_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); @@ -552,11 +601,11 @@ static void ggml_cpy_f32_iq4_nl_sycl(const char * cx, char * cdst, const int ne, const int nb12, const int nb13, queue_ptr stream) { GGML_ASSERT(ne % QK4_NL == 0); const int num_blocks = ne / QK4_NL; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) { - cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, - ne12, nb10, nb11, nb12, nb13, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), 
sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, + ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); + }); } static void ggml_cpy_f16_f16_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, @@ -567,7 +616,8 @@ static void ggml_cpy_f16_f16_sycl(const char * cx, char * cdst, const int ne, co { dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); - stream->parallel_for( + sycl_parallel_for( + stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) { @@ -586,7 +636,8 @@ static void ggml_cpy_i16_i16_sycl(const char * cx, char * cdst, const int ne, co // dpct::has_capability_or_fail(stream->get_device(), // {sycl::aspect::fp16}); - stream->parallel_for( + sycl_parallel_for( + stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) { @@ -605,7 +656,8 @@ static void ggml_cpy_i32_i32_sycl(const char * cx, char * cdst, const int ne, co // dpct::has_capability_or_fail(stream->get_device(), // {sycl::aspect::fp16}); - stream->parallel_for( + sycl_parallel_for( + stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) { @@ -615,10 +667,85 @@ static void ggml_cpy_i32_i32_sycl(const char * cx, char * cdst, const int ne, co } } +static void ggml_cpy_q8_0_q8_0(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + cpy_q_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, + ne12, nb10, nb11, nb12, nb13, item_ct1); + }); +} + + +static void ggml_cpy_q5_0_q5_0(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + cpy_q_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, + ne12, nb10, nb11, nb12, nb13, item_ct1); + }); +} + + +static void ggml_cpy_q5_1_q5_1(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE); + + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, 
num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + cpy_q_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, + ne12, nb10, nb11, nb12, nb13, item_ct1); + }); +} + + +static void ggml_cpy_q4_0_q4_0(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + cpy_q_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, + ne12, nb10, nb11, nb12, nb13, item_ct1); + }); +} + + +static void ggml_cpy_q4_1_q4_1(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + + const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + cpy_q_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, + ne12, nb10, nb11, nb12, nb13, item_ct1); + }); +} + void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1) try { // Unlike other operators ggml_sycl_cpy takes 2 distinct tensors instead of a dst ggml_tensor and rely on its src field - scope_op_debug_print scope_dbg_print(__func__, src1, /*num_src=*/0, - std::string(" src0 type=") + ggml_type_name(src0->type)); + scope_op_debug_print scope_dbg_print(__func__, src1, /*num_src=*/0, debug_get_tensor_str("\tsrc0", src0)); const int64_t ne = ggml_nelements(src0); GGML_ASSERT(ne == ggml_nelements(src1)); @@ -632,8 +759,10 @@ void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, co char * src0_ddc = (char *) src0->data; char * src1_ddc = (char *) src1->data; - - if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { + if ((src0->type == src1->type) && (ggml_is_contiguous(src0) && ggml_is_contiguous(src1))) { + GGML_SYCL_DEBUG("%s: memcpy path\n", __func__); + main_stream->memcpy(src1_ddc, src0_ddc, ggml_nbytes(src0)); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { ggml_cpy_f32_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) { @@ -684,6 +813,16 @@ void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, co } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) { ggml_cpy_f32_iq4_nl_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_Q8_0) { + ggml_cpy_q8_0_q8_0(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, 
nb13, main_stream); + } else if (src0->type == GGML_TYPE_Q5_0 && src1->type == GGML_TYPE_Q5_0) { + ggml_cpy_q5_0_q5_0(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_Q5_1) { + ggml_cpy_q5_1_q5_1(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_Q4_0 && src1->type == GGML_TYPE_Q4_0) { + ggml_cpy_q4_0_q4_0(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_Q4_1 && src1->type == GGML_TYPE_Q4_1) { + ggml_cpy_q4_1_q4_1(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); } else { GGML_LOG_ERROR("%s: unsupported type combination (%s to %s)\n", __func__, ggml_type_name(src0->type), ggml_type_name(src1->type)); diff --git a/ggml/src/ggml-sycl/dequantize.hpp b/ggml/src/ggml-sycl/dequantize.hpp index 64e92f73f26c8..540539bb22381 100644 --- a/ggml/src/ggml-sycl/dequantize.hpp +++ b/ggml/src/ggml-sycl/dequantize.hpp @@ -538,6 +538,38 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri #endif } +template +static void dequantize_block_q6_K_reorder(const void * __restrict__ vx, dst_t * __restrict__ yy, + const sycl::nd_item<3> & item_ct1, int64_t n_blocks) { + const int64_t ib = item_ct1.get_group(2); + + const int64_t tid = item_ct1.get_local_id(2); + const int64_t ip = tid / 32; // ip is 0 or 1 + const int64_t il = tid - 32 * ip; // 0...32 + const int64_t is = 8 * ip + il / 16; + + const uint8_t * base_ptr = static_cast(vx); + const auto ql_offset = ib * (QK_K / 2); + const auto qh_offset = (QK_K / 2) * n_blocks + (QK_K / 4) * ib; + const auto base_scales_offset = (QK_K / 2) * n_blocks + (QK_K / 4) * n_blocks + (QK_K / 16) * ib; + const auto base_d_offset = ((QK_K / 2) + (QK_K / 4) + (QK_K / 16)) * n_blocks; + const uint8_t * ql_ptr = base_ptr + ql_offset; + const uint8_t * qh_ptr = base_ptr + qh_offset; + const uint8_t * scales_ptr = base_ptr + base_scales_offset; + const ggml_half * d = (const ggml_half *) (base_ptr + base_d_offset) + ib; + + dst_t * y = yy + ib * QK_K + 128 * ip + il; + + const uint8_t * ql = ql_ptr + 64 * ip + il; + const uint8_t qh = *(qh_ptr + 32 * ip + il); + const int8_t * sc = reinterpret_cast(scales_ptr + is); + + y[0] = *d * sc[0] * ((int8_t) ((ql[0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32); + y[32] = *d * sc[2] * ((int8_t) ((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32); + y[64] = *d * sc[4] * ((int8_t) ((ql[0] >> 4) | (((qh >> 4) & 3) << 4)) - 32); + y[96] = *d * sc[6] * ((int8_t) ((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32); +} + template static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy, const sycl::nd_item<3> &item_ct1, diff --git a/ggml/src/ggml-sycl/dmmv.cpp b/ggml/src/ggml-sycl/dmmv.cpp index 4f2760110c212..70579c0c3be11 100644 --- a/ggml/src/ggml-sycl/dmmv.cpp +++ b/ggml/src/ggml-sycl/dmmv.cpp @@ -208,12 +208,10 @@ static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - dequantize_mul_mat_vec<1, 1, convert_f16>(vx, y, dst, 
ncols, - nrows, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + dequantize_mul_mat_vec<1, 1, convert_f16>(vx, y, dst, ncols, nrows, item_ct1); + }); } } @@ -877,12 +875,11 @@ static void dequantize_mul_mat_vec_q4_0_sycl_reorder(const void *vx, const dfloa dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - dequantize_mul_mat_vec_reorder( - vx, y, dst, ncols, nrows, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + dequantize_mul_mat_vec_reorder(vx, y, dst, ncols, + nrows, item_ct1); + }); } } @@ -900,12 +897,10 @@ static void dequantize_mul_mat_vec_q4_0_sycl(const void *vx, const dfloat *y, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - dequantize_mul_mat_vec( - vx, y, dst, ncols, nrows, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + dequantize_mul_mat_vec(vx, y, dst, ncols, nrows, item_ct1); + }); } } @@ -921,12 +916,10 @@ static void dequantize_mul_mat_vec_q4_1_sycl(const void *vx, const dfloat *y, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - dequantize_mul_mat_vec( - vx, y, dst, ncols, nrows, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + dequantize_mul_mat_vec(vx, y, dst, ncols, nrows, item_ct1); + }); } } @@ -942,12 +935,10 @@ static void dequantize_mul_mat_vec_q5_0_sycl(const void *vx, const dfloat *y, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - dequantize_mul_mat_vec( - vx, y, dst, ncols, nrows, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + dequantize_mul_mat_vec(vx, y, dst, ncols, nrows, item_ct1); + }); } } @@ -963,12 +954,10 @@ static void dequantize_mul_mat_vec_q5_1_sycl(const void *vx, const dfloat *y, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - dequantize_mul_mat_vec( - vx, y, dst, ncols, nrows, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + dequantize_mul_mat_vec(vx, y, dst, ncols, nrows, item_ct1); + }); } } @@ -984,12 +973,10 @@ static void dequantize_mul_mat_vec_q8_0_sycl(const 
void *vx, const dfloat *y, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - dequantize_mul_mat_vec( - vx, y, dst, ncols, nrows, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + dequantize_mul_mat_vec(vx, y, dst, ncols, nrows, item_ct1); + }); } } @@ -1002,11 +989,10 @@ static void dequantize_mul_mat_vec_q2_K_sycl(const void *vx, const float *y, const int block_num_y = (nrows + ny - 1) / ny; const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { - dequantize_mul_mat_vec_q2_k(vx, y, dst, ncols, nrows, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { + dequantize_mul_mat_vec_q2_k(vx, y, dst, ncols, nrows, item_ct1); + }); } static void dequantize_mul_mat_vec_q3_K_sycl(const void *vx, const float *y, @@ -1018,11 +1004,10 @@ static void dequantize_mul_mat_vec_q3_K_sycl(const void *vx, const float *y, const int block_num_y = (nrows + ny - 1) / ny; const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { - dequantize_mul_mat_vec_q3_k(vx, y, dst, ncols, nrows, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { + dequantize_mul_mat_vec_q3_k(vx, y, dst, ncols, nrows, item_ct1); + }); } static void dequantize_mul_mat_vec_q4_K_sycl(const void *vx, const float *y, @@ -1034,11 +1019,10 @@ static void dequantize_mul_mat_vec_q4_K_sycl(const void *vx, const float *y, const int block_num_y = (nrows + ny - 1) / ny; const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { - dequantize_mul_mat_vec_q4_k(vx, y, dst, ncols, nrows, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { + dequantize_mul_mat_vec_q4_k(vx, y, dst, ncols, nrows, item_ct1); + }); } static void dequantize_mul_mat_vec_q5_K_sycl(const void *vx, const float *y, @@ -1047,11 +1031,10 @@ static void dequantize_mul_mat_vec_q5_K_sycl(const void *vx, const float *y, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK_K == 0); const sycl::range<3> block_dims(1, 1, QK_WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { - dequantize_mul_mat_vec_q5_k(vx, y, dst, ncols, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims), + [=](sycl::nd_item<3> 
item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { + dequantize_mul_mat_vec_q5_k(vx, y, dst, ncols, item_ct1); + }); } static void dequantize_mul_mat_vec_q6_K_sycl(const void *vx, const float *y, @@ -1063,11 +1046,10 @@ static void dequantize_mul_mat_vec_q6_K_sycl(const void *vx, const float *y, const int block_num_y = (nrows + ny - 1) / ny; const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { - dequantize_mul_mat_vec_q6_k(vx, y, dst, ncols, nrows, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { + dequantize_mul_mat_vec_q6_k(vx, y, dst, ncols, nrows, item_ct1); + }); } void ggml_sycl_op_dequantize_mul_mat_vec( diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp index d538965b096bf..27c7278607832 100644 --- a/ggml/src/ggml-sycl/dpct/helper.hpp +++ b/ggml/src/ggml-sycl/dpct/helper.hpp @@ -13,10 +13,10 @@ #ifndef GGML_SYCL_DPCT_HELPER_HPP #define GGML_SYCL_DPCT_HELPER_HPP +#include #include #include #include -#include #ifdef GGML_SYCL_USE_INTEL_ONEMKL #include @@ -118,6 +118,36 @@ inline auto get_onemath_backend(sycl::queue& queue) #endif } +#ifdef SYCL_EXT_ONEAPI_ENQUEUE_FUNCTIONS + namespace syclex = sycl::ext::oneapi::experimental; +#endif + +template +__dpct_inline__ void sycl_parallel_for(sycl::handler & cgh, sycl::nd_range nd_range, Func && func) { +#ifdef SYCL_EXT_ONEAPI_ENQUEUE_FUNCTIONS + syclex::nd_launch(cgh, nd_range, func); +#else + cgh.parallel_for(nd_range, func); +#endif +} + +template +__dpct_inline__ void sycl_parallel_for(sycl::queue * q, sycl::nd_range nd_range, Func && func) { +#ifdef SYCL_EXT_ONEAPI_ENQUEUE_FUNCTIONS + syclex::nd_launch(*q, nd_range, func); +#else + q->parallel_for(nd_range, func); +#endif +} + +template __dpct_inline__ void sycl_launch(sycl::queue * stream, Func && func) { +#ifdef SYCL_EXT_ONEAPI_ENQUEUE_FUNCTIONS + syclex::submit(*stream, func); +#else + stream->submit(func); +#endif +} + namespace dpct { typedef sycl::queue *queue_ptr; diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp index 5b7c4f0b4f003..c56924ce8322f 100644 --- a/ggml/src/ggml-sycl/element_wise.cpp +++ b/ggml/src/ggml-sycl/element_wise.cpp @@ -329,60 +329,51 @@ static void acc_f32_sycl(const float *x, const float *y, float *dst, const int ne12, const int nb1, const int nb2, const int offset, queue_ptr stream) { int num_blocks = (n_elements + SYCL_ACC_BLOCK_SIZE - 1) / SYCL_ACC_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - acc_f32(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset, - item_ct1); - }); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + acc_f32(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset, item_ct1); + }); } template static void gelu_sycl(const T *x, T *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE; - stream->parallel_for( - 
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - gelu(x, dst, k, item_ct1); - }); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { gelu(x, dst, k, item_ct1); }); } template static void silu_sycl(const T *x, T *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_SILU_BLOCK_SIZE - 1) / SYCL_SILU_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - silu(x, dst, k, item_ct1); - }); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { silu(x, dst, k, item_ct1); }); } template static void sgn_sycl(const T * x, T * dst, const int k, queue_ptr stream) { // hard code for now const int num_blocks = ceil_div(k, 256); - stream->parallel_for( - sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range(1, 1, 256)), sycl::range(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) { - sgn(x, dst, k, item_ct1); - }); + sycl_parallel_for( + stream, sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range(1, 1, 256)), sycl::range(1, 1, 256)), + [=](sycl::nd_item<3> item_ct1) { sgn(x, dst, k, item_ct1); }); } template static void abs_sycl(const T * x, T * dst, const int k, queue_ptr stream) { // hard code for now const int num_blocks = ceil_div(k, 256); - stream->parallel_for( - sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) { - abs_op(x, dst, k, item_ct1); - }); + sycl_parallel_for( + stream, + sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)), + [=](sycl::nd_item<3> item_ct1) { abs_op(x, dst, k, item_ct1); }); } @@ -390,23 +381,20 @@ template static void elu_sycl(const T * x, T * dst, const int k, queue_ptr stream) { // hard code for now const int num_blocks = ceil_div(k, 256); - stream->parallel_for( - sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) { - elu_op(x, dst, k, item_ct1); - }); + sycl_parallel_for( + stream, + sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)), + [=](sycl::nd_item<3> item_ct1) { elu_op(x, dst, k, item_ct1); }); } template static void gelu_quick_sycl(const T *x, T *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - gelu_quick(x, dst, k, item_ct1); - }); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { gelu_quick(x, dst, k, item_ct1); }); } @@ -414,169 +402,133 @@ template static void gelu_erf_sycl(const T *x, T *dst, const 
int k, queue_ptr stream) { const int num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE); - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - gelu_erf(x, dst, k, item_ct1); - }); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { gelu_erf(x, dst, k, item_ct1); }); } template static void tanh_sycl(const T *x, T *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_TANH_BLOCK_SIZE - 1) / SYCL_TANH_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - tanh(x, dst, k, item_ct1); - }); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { tanh(x, dst, k, item_ct1); }); } template static void relu_sycl(const T *x, T *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - relu(x, dst, k, item_ct1); - }); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { relu(x, dst, k, item_ct1); }); } template static void hardsigmoid_sycl(const T *x, T *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_HARDSIGMOID_BLOCK_SIZE - 1) / SYCL_HARDSIGMOID_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE), + sycl_parallel_for( + stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - hardsigmoid(x, dst, k, item_ct1); - }); + [=](sycl::nd_item<3> item_ct1) { hardsigmoid(x, dst, k, item_ct1); }); } template static void hardswish_sycl(const T *x, T *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_HARDSWISH_BLOCK_SIZE - 1) / SYCL_HARDSWISH_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE), + sycl_parallel_for( + stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - hardswish(x, dst, k, item_ct1); - }); + [=](sycl::nd_item<3> item_ct1) { hardswish(x, dst, k, item_ct1); }); } template static void exp_sycl(const T *x, T *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - exp(x, dst, k, 
item_ct1); - }); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { exp(x, dst, k, item_ct1); }); } template static void log_sycl(const T *x, T *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - log(x, dst, k, item_ct1); - }); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { log(x, dst, k, item_ct1); }); } template static void neg_sycl(const T *x, T *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - neg(x, dst, k, item_ct1); - }); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { neg(x, dst, k, item_ct1); }); } template static void step_sycl(const T *x, T *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - step(x, dst, k, item_ct1); - }); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { step(x, dst, k, item_ct1); }); } template static void sigmoid_sycl(const T *x, T *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_SIGMOID_BLOCK_SIZE - 1) / SYCL_SIGMOID_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_SIGMOID_BLOCK_SIZE), + sycl_parallel_for( + stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SIGMOID_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_SIGMOID_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - sigmoid(x, dst, k, item_ct1); - }); + [=](sycl::nd_item<3> item_ct1) { sigmoid(x, dst, k, item_ct1); }); } template static void sqrt_sycl(const T *x, T *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_SQRT_BLOCK_SIZE - 1) / SYCL_SQRT_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_SQRT_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_SQRT_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - sqrt(x, dst, k, item_ct1); - }); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SQRT_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_SQRT_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { sqrt(x, dst, k, item_ct1); }); } template static void sin_sycl(const T *x, T *dst, const int k, queue_ptr stream) { const int num_blocks = (k + 
SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - sin(x, dst, k, item_ct1); - }); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { sin(x, dst, k, item_ct1); }); } template static void cos_sycl(const T *x, T *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - cos(x, dst, k, item_ct1); - }); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { cos(x, dst, k, item_ct1); }); } template @@ -584,26 +536,20 @@ static void leaky_relu_sycl(const T *x, T *dst, const int k, const float negative_slope, queue_ptr stream) { const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - leaky_relu(x, dst, k, negative_slope, item_ct1); - }); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { leaky_relu(x, dst, k, negative_slope, item_ct1); }); } template static void sqr_sycl(const T *x, T *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_SQR_BLOCK_SIZE - 1) / SYCL_SQR_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - sqr(x, dst, k, item_ct1); - }); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { sqr(x, dst, k, item_ct1); }); } template @@ -614,9 +560,8 @@ static void upscale_sycl(const T *x, T *dst, const int nb00, const int nb01, int dst_size = ne10 * ne11 * ne12 * ne13; int num_blocks = (dst_size + SYCL_UPSCALE_BLOCK_SIZE - 1) / SYCL_UPSCALE_BLOCK_SIZE; sycl::range<1> gridDim(num_blocks * SYCL_UPSCALE_BLOCK_SIZE); - stream->parallel_for( - sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)), - [=](sycl::nd_item<1> item_ct1) { + sycl_parallel_for<1>( + stream, sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) { upscale(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, item_ct1); }); } @@ -627,12 +572,10 @@ static void pad_sycl(const T *x, T *dst, const int ne00, const int ne1, const int ne2, queue_ptr stream) { int num_blocks = (ne0 + SYCL_PAD_BLOCK_SIZE - 1) / SYCL_PAD_BLOCK_SIZE; sycl::range<3> gridDim(ne2, ne1, num_blocks); - stream->parallel_for( - sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE), - 
sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - pad(x, dst, ne0, ne00, ne01, ne02, item_ct1); - }); + sycl_parallel_for(stream, + sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { pad(x, dst, ne0, ne00, ne01, ne02, item_ct1); }); } template @@ -640,13 +583,10 @@ static void clamp_sycl(const T *x, T *dst, const float min, const float max, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_CLAMP_BLOCK_SIZE - 1) / SYCL_CLAMP_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - clamp(x, dst, min, max, k, item_ct1); - }); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { clamp(x, dst, min, max, k, item_ct1); }); } inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { diff --git a/ggml/src/ggml-sycl/gemm.hpp b/ggml/src/ggml-sycl/gemm.hpp index 6cbc7e0f6938c..5efe03d364b1b 100644 --- a/ggml/src/ggml-sycl/gemm.hpp +++ b/ggml/src/ggml-sycl/gemm.hpp @@ -65,6 +65,9 @@ class DnnlGemmWrapper { dnnl::primitive_attr primitive_attr; primitive_attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); +#ifdef GGML_SYCL_F16 + primitive_attr.set_fpmath_mode(dnnl::fpmath_mode::f16); +#endif auto a_mem = dnnl::memory(a_in_md, eng, const_cast(a)); auto b_mem = dnnl::memory(b_in_md, eng, const_cast(b)); diff --git a/ggml/src/ggml-sycl/getrows.cpp b/ggml/src/ggml-sycl/getrows.cpp index 4a7712781364e..9c76ffeb9508a 100644 --- a/ggml/src/ggml-sycl/getrows.cpp +++ b/ggml/src/ggml-sycl/getrows.cpp @@ -60,54 +60,6 @@ static void k_get_rows( dst_row[iybs + iqs + y_offset] = v.y(); } -template -static void k_get_rows_reorder( - const void * src0, const void *src0_dq, const int32_t * src1, dst_t * dst, - int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/ - /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/ - /*size_t s0,*/ size_t s1, size_t s2, size_t s3, - /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03, - size_t s10, size_t s11, size_t s12, - const sycl::nd_item<3> &item_ct1/*, size_t s13*/) { - - const int i00 = (item_ct1.get_group(2) * item_ct1.get_local_range(2) + - item_ct1.get_local_id(2)) * - 2; - const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) + - item_ct1.get_local_id(1); - const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) + - item_ct1.get_local_id(0)) / - ne12; - const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) + - item_ct1.get_local_id(0)) % - ne12; - - if (i00 >= ne00) { - return; - } - auto ncols = ne00; - const int i01 = src1[i10*s10 + i11*s11 + i12*s12]; - - dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; - - const int src0_off = i01 * ncols + i00; - const int ib = src0_off / QK4_0; // block index - const int iqs = (i00%qk)/qr; // x quant index - const int iybs = i00 - i00%qk; // dst block start index - const int y_offset = qr == 1 ? 
1 : qk/2; - - // dequantize - dfloat2 v; - dequantize_kernel_recorder((const void *)src0_dq, ib, (const void *)src0, src0_off/2, v); - - dst_row[iybs + iqs + 0] = v.x(); - dst_row[iybs + iqs + y_offset] = v.y(); - - GGML_UNUSED(nb01); - GGML_UNUSED(nb02); - GGML_UNUSED(nb03); -} - template static void k_get_rows_float( const src0_t * src0, const int32_t * src1, dst_t * dst, @@ -166,58 +118,15 @@ static void get_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor *sr GGML_ASSERT(ne00 % 2 == 0); - stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - k_get_rows( - src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2, - s3, nb01, nb02, nb03, s10, s11, s12, item_ct1); - }); - - GGML_UNUSED(dst); - GGML_UNUSED(ctx); -} - -template -static void get_rows_sycl_reorder(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const void *src0_dd, - const int32_t *src1_dd, float *dst_dd, - queue_ptr stream) { - - GGML_TENSOR_BINARY_OP_LOCALS - - const sycl::range<3> block_dims(1, 1, SYCL_GET_ROWS_BLOCK_SIZE); - const int block_num_x = (ne00 + 2*SYCL_GET_ROWS_BLOCK_SIZE - 1) / (2*SYCL_GET_ROWS_BLOCK_SIZE); - const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x); - - // strides in elements - //const size_t s0 = nb0 / ggml_element_size(dst); - const size_t s1 = nb1 / ggml_element_size(dst); - const size_t s2 = nb2 / ggml_element_size(dst); - const size_t s3 = nb3 / ggml_element_size(dst); - - const size_t s10 = nb10 / ggml_element_size(src1); - const size_t s11 = nb11 / ggml_element_size(src1); - const size_t s12 = nb12 / ggml_element_size(src1); - //const size_t s13 = nb13 / ggml_element_size(src1); - - GGML_ASSERT(ne00 % 2 == 0); - - const uint8_t* src0_q = (const uint8_t*)src0_dd; - const size_t ncols = ne00; - const size_t nrows = ne01; - const sycl::half* src0_dq = (const sycl::half*)(src0_q + nrows * ncols / 2); - stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]]{ - k_get_rows_reorder( - src0_dd, src0_dq, src1_dd, dst_dd, ne00, ne12, s1, s2, - s3, nb01, nb02, nb03, s10, s11, s12, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { + k_get_rows(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2, s3, nb01, nb02, nb03, s10, s11, s12, + item_ct1); + }); GGML_UNUSED(dst); GGML_UNUSED(ctx); } - template static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, @@ -245,9 +154,8 @@ static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, const ggml_tens dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { k_get_rows_float(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2, s3, nb01, nb02, nb03, s10, s11, s12, item_ct1); }); @@ -277,13 +185,8 @@ void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { src1_i32, (float *)dst->data, ctx.stream()); break; case GGML_TYPE_Q4_0: - if (ctx.opt_feature.reorder && dst->op == GGML_OP_MUL_MAT) { - get_rows_sycl_reorder(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, - src1_i32, (float *)dst->data, 
ctx.stream()); - } else { - get_rows_sycl(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, - src1_i32, (float *)dst->data, ctx.stream()); - } + get_rows_sycl(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, + src1_i32, (float *)dst->data, ctx.stream()); break; case GGML_TYPE_Q4_1: get_rows_sycl(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index bcd2ea5366f76..9cb36ae99e7f5 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -83,9 +83,7 @@ static ggml_sycl_device_info ggml_sycl_init() { info.devices[i].cc = 100 * prop.get_major_version() + 10 * prop.get_minor_version(); - info.devices[i].hw_info = get_device_hw_info(&device); - info.devices[i].opt_feature = check_gpu_optimize_feature(info.devices[i].hw_info.arch); - + info.devices[i].opt_feature.reorder = !device.ext_oneapi_architecture_is(syclex::arch_category::intel_gpu); info.max_work_group_sizes[i] = prop.get_max_work_group_size(); } @@ -195,7 +193,7 @@ static void ggml_check_sycl() try { if (!initialized) { g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0); - g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 1); + g_ggml_sycl_disable_optimize = get_sycl_env("GGML_SYCL_DISABLE_OPT", 0); g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1); g_ggml_sycl_disable_dnn = get_sycl_env("GGML_SYCL_DISABLE_DNN", 0); g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0); @@ -347,14 +345,15 @@ static enum ggml_status ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) try { GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - debug_print_tensor(": tensor=", tensor, "\n"); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor, "\n").c_str()); ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context; if (tensor->view_src != NULL) { assert(tensor->view_src->buffer->buft == buffer->buft); return GGML_STATUS_SUCCESS; } - if ((tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_K) && !g_ggml_sycl_disable_optimize) { + if ((tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_K || tensor->type == GGML_TYPE_Q6_K) && + !g_ggml_sycl_disable_optimize) { ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{}; tensor->extra = extra; ctx->tensor_extras.push_back(extra); //used to release it when destroy ctx. 
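// ---------------------------------------------------------------------------
// Annotation (not part of the patch): the GGML_TYPE_Q6_K case added to the
// init_tensor gate above opts Q6_K tensors in to the same "reorder"
// optimization already used for Q4_0 and Q4_K. The reordered buffer that
// dequantize_block_q6_K_reorder (see the dequantize.hpp hunk earlier in this
// patch) indexes into is a struct-of-arrays layout: the low quants of every
// block, then the high quants, then the scales, then the fp16 super-block
// scales. Below is a minimal sketch of that offset arithmetic, assuming
// QK_K == 256 and a 2-byte ggml_half as in ggml's k-quants; the names are
// illustrative only and do not exist in the patch.
#include <cstddef>
#include <cstdint>

namespace sketch {

constexpr int64_t QK_K = 256;  // assumed k-quant super-block size

struct q6_K_reorder_offsets {
    size_t ql;      // QK_K/2 bytes of low 4-bit quants for block ib
    size_t qh;      // QK_K/4 bytes of high 2-bit quants for block ib
    size_t scales;  // QK_K/16 signed 8-bit scales for block ib
    size_t d;       // 2-byte fp16 super-block scale for block ib
};

// Mirrors the ql/qh/scales/d offset computation in
// dequantize_block_q6_K_reorder: each field is laid out contiguously for all
// n_blocks blocks before the next field begins, so a block's pieces are found
// by adding a per-field base (a multiple of n_blocks) to a per-block stride.
inline q6_K_reorder_offsets offsets(int64_t ib, int64_t n_blocks) {
    return {
        /*ql    =*/ size_t(ib * (QK_K / 2)),
        /*qh    =*/ size_t((QK_K / 2) * n_blocks + (QK_K / 4) * ib),
        /*scales=*/ size_t(((QK_K / 2) + (QK_K / 4)) * n_blocks + (QK_K / 16) * ib),
        /*d     =*/ size_t(((QK_K / 2) + (QK_K / 4) + (QK_K / 16)) * n_blocks + ib * 2),
    };
}

}  // namespace sketch
// ---------------------------------------------------------------------------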
@@ -384,7 +383,7 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer, const void *data, size_t offset, size_t size) try { GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - debug_print_tensor(": tensor=", tensor); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str()); GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset); ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; ggml_sycl_set_device(ctx->device); @@ -412,7 +411,7 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer, void *data, size_t offset, size_t size) try { GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - debug_print_tensor(": tensor=", tensor); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str()); GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset); ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; @@ -443,8 +442,8 @@ ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer, ggml_tensor *dst) try { bool is_cpy_supported = ggml_backend_buffer_is_sycl(src->buffer); GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - debug_print_tensor(": dst=", dst); - debug_print_tensor(" src=", src); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": dst", dst).c_str()); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(" src", src).c_str()); GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported); if (is_cpy_supported) { ggml_backend_sycl_buffer_context * src_ctx = (ggml_backend_sycl_buffer_context *)src->buffer->context; @@ -524,7 +523,7 @@ catch (sycl::exception const &exc) { static void ggml_backend_sycl_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - debug_print_tensor(": tensor=", tensor); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str()); GGML_SYCL_DEBUG(" size=%zu offset=%zu value=%u\n", size, offset, value); ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *) buffer->context; SYCL_CHECK(ggml_sycl_set_device(ctx->device)); @@ -804,7 +803,7 @@ static enum ggml_status ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) try { GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - debug_print_tensor(": tensor=", tensor, "\n"); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor, "\n").c_str()); GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context; @@ -890,7 +889,7 @@ ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data, size_t offset, size_t size) try { GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - debug_print_tensor(": tensor=", tensor); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str()); GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset); // split tensors must always be set in their entirety at once GGML_ASSERT(offset == 0); @@ -946,7 +945,7 @@ ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor *tensor, void *data, size_t offset, size_t size) try { GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - debug_print_tensor(": tensor=", tensor); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str()); GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset); // split tensors must 
always be set in their entirety at once GGML_ASSERT(offset == 0); @@ -1434,6 +1433,59 @@ static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, reinterpret_cast(y[ib].ds.y()) = sum; } +template +static __dpct_inline__ void quantize_and_reorder_q8_1(const float * __restrict__ x, void * reordered_q8_tensor, + const int kx, const int kx_padded, const sycl::nd_item<1> & it) { + /* + Quantizes and reorders the resultant q8 tensor in a per row fashion + Each sub-group calculates one quant block. i.e. QK8_1 quant values and the d and sum values + */ + + auto subgroup_id = it.get_group(0); + auto wi_id = it.get_local_id(0); + + const int num_blocks_per_row = kx / QK8_1; + auto row = subgroup_id / num_blocks_per_row; + auto col = subgroup_id % num_blocks_per_row; + + auto row_offset = row * (kx_padded / QK8_1) * sizeof(block_q8_1); + auto col_offset = QK8_1 * col + wi_id * ElementsPerWI; + + auto quant_ptr = (int8_t *) ((char *) reordered_q8_tensor + row_offset + col_offset); + auto ds_ptr = (sycl::half2 *) ((char *) reordered_q8_tensor + row_offset + kx + col * sizeof(sycl::half2)); + + sycl::vec wi_f32_vals; + sycl::vec quantized_values; + + auto float_ptr_offset = subgroup_id * QK8_1 + ElementsPerWI * wi_id; + wi_f32_vals = *reinterpret_cast *>(x + float_ptr_offset); + + float sum = 0.0f; + float amax = 0.0f; + +#pragma unroll(ElementsPerWI) + for (int i = 0; i < ElementsPerWI; i++) { + sum += wi_f32_vals[i]; + amax = sycl::fmax(amax, sycl::fabs(wi_f32_vals[i])); + quantized_values[i] = 0; + } + sum = sycl::reduce_over_group(it.get_group(), sum, sycl::plus()); + amax = sycl::reduce_over_group(it.get_group(), amax, sycl::maximum()); + float d = amax == 0 ? 1 : amax / 127; + +#pragma unroll(ElementsPerWI) + for (int i = 0; i < ElementsPerWI; i++) { + quantized_values[i] = sycl::round(wi_f32_vals[i] / d); + } + + d = amax == 0 ? 
0 : d; + + *reinterpret_cast *>(quant_ptr) = quantized_values; + if (wi_id == 0) { + *ds_ptr = sycl::half2(sycl::half(d), sycl::half(sum)); + } +} + static void mul_mat_p021_f16_f32( const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y, @@ -1718,23 +1770,30 @@ static void pool2d_nchw_kernel( o_ptr[cur_oh * ow + cur_ow] = res; } -static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx, - const int ky, const int kx_padded, - queue_ptr stream) { - const int block_num_x = (kx_padded + SYCL_QUANTIZE_BLOCK_SIZE - 1) / SYCL_QUANTIZE_BLOCK_SIZE; - const sycl::range<3> num_blocks(1, ky, block_num_x); - int constexpr QUANT_BLOCK_TILE = QK8_1 / WARP_SIZE; - static_assert(QK8_1 % WARP_SIZE == 0); - const sycl::range<3> block_size(1, 1, SYCL_QUANTIZE_BLOCK_SIZE / QUANT_BLOCK_TILE); - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); +static void quantize_row_q8_1_sycl(const float * x, void * vy, const int kx, const int ky, const int kx_padded, + bool reorder_q8_tensor, queue_ptr stream) { + if (reorder_q8_tensor) { + auto local_range = std::size_t(WARP_SIZE); + auto num_quant_blocks = ky * (kx / QK8_1); + auto global_range = num_quant_blocks * local_range; + stream->parallel_for(sycl::nd_range<1>({ global_range }, { local_range }), + [=](sycl::nd_item<1> it) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + quantize_and_reorder_q8_1(x, vy, kx, kx_padded, it); + }); + } else { + const int block_num_x = (kx_padded + SYCL_QUANTIZE_BLOCK_SIZE - 1) / SYCL_QUANTIZE_BLOCK_SIZE; + const sycl::range<3> num_blocks(1, ky, block_num_x); + int constexpr QUANT_BLOCK_TILE = QK8_1 / WARP_SIZE; + static_assert(QK8_1 % WARP_SIZE == 0); + const sycl::range<3> block_size(1, 1, SYCL_QUANTIZE_BLOCK_SIZE / QUANT_BLOCK_TILE); + { + dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); - stream->parallel_for( - sycl::nd_range<3>(num_blocks * block_size, block_size), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - quantize_q8_1(x, vy, kx, kx_padded, item_ct1); - }); + stream->parallel_for(sycl::nd_range<3>(num_blocks * block_size, block_size), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + quantize_q8_1(x, vy, kx, kx_padded, item_ct1); + }); + } } } @@ -1826,13 +1885,12 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols, const size_t shared_mem = ncols_pad * sizeof(int); if (order == GGML_SORT_ORDER_ASC) { - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor dpct_local_acc_ct1( sycl::range<1>(shared_mem), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { k_argsort_f32_i32( x, dst, ncols, ncols_pad, item_ct1, dpct_local_acc_ct1.get_multi_ptr() @@ -1840,13 +1898,12 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols, }); }); } else if (order == GGML_SORT_ORDER_DESC) { - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor dpct_local_acc_ct1( sycl::range<1>(shared_mem), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums 
* block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { k_argsort_f32_i32( x, dst, ncols, ncols_pad, item_ct1, dpct_local_acc_ct1.get_multi_ptr() @@ -1864,50 +1921,47 @@ static void argmax_f32_i32_sycl(const float *x, int *dst, const int ncols, const sycl::range<3> block_nums(1, nrows, 1); const size_t shared_mem = 256 * sizeof(float); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor shared_data( sycl::range<1>(shared_mem/sizeof(float)), cgh); sycl::local_accessor shared_indices( sycl::range<1>(shared_mem/sizeof(float)), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - const int tid = item_ct1.get_local_id(2); - const int row = item_ct1.get_global_id(1); - - float max_val = -INFINITY; - int max_idx = -1; - - for (int col = tid; col < ncols; col += 256) { - float val = x[row * ncols + col]; - if (val > max_val) { - max_val = val; - max_idx = col; - } - } + sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { + const int tid = item_ct1.get_local_id(2); + const int row = item_ct1.get_global_id(1); - shared_data[tid] = max_val; - shared_indices[tid] = max_idx; - item_ct1.barrier(sycl::access::fence_space::local_space); + float max_val = -INFINITY; + int max_idx = -1; - for (int stride = 256/2; stride > 0; stride >>= 1) { - if (tid < stride) { - float val1 = shared_data[tid]; - float val2 = shared_data[tid + stride]; - if (val2 > val1) { - shared_data[tid] = val2; - shared_indices[tid] = shared_indices[tid + stride]; - } - } - item_ct1.barrier(sycl::access::fence_space::local_space); + for (int col = tid; col < ncols; col += 256) { + float val = x[row * ncols + col]; + if (val > max_val) { + max_val = val; + max_idx = col; } + } + shared_data[tid] = max_val; + shared_indices[tid] = max_idx; + item_ct1.barrier(sycl::access::fence_space::local_space); - if (tid == 0) { - dst[row] = shared_indices[0]; + for (int stride = 256 / 2; stride > 0; stride >>= 1) { + if (tid < stride) { + float val1 = shared_data[tid]; + float val2 = shared_data[tid + stride]; + if (val2 > val1) { + shared_data[tid] = val2; + shared_indices[tid] = shared_indices[tid + stride]; + } } - }); + item_ct1.barrier(sycl::access::fence_space::local_space); + } + + if (tid == 0) { + dst[row] = shared_indices[0]; + } + }); }); } static void diag_mask_inf_f32_sycl(const float *x, float *dst, @@ -2066,21 +2120,18 @@ inline void ggml_sycl_op_mul_mat_sycl( const sycl::half *src1_ptr = src1->type == GGML_TYPE_F16 ? 
(const sycl::half *)src1->data + src1_padded_row_size : src1_as_f16.get(); - ggml_sycl_pool_alloc dst_f16(ctx.pool(), row_diff * src1_ncols); #if GGML_SYCL_DNNL if (!g_ggml_sycl_disable_dnn) { DnnlGemmWrapper::row_gemm(ctx, src1_ncols, row_diff, ne10, src1_ptr, DnnlGemmWrapper::to_dt(), src0_ptr, DnnlGemmWrapper::to_dt(), - dst_f16.get(), DnnlGemmWrapper::to_dt(), stream); - scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2, - " : converting dst to fp32"); - const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst); - to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream); + dst_dd_i, DnnlGemmWrapper::to_dt(), stream); } else #endif { + ggml_sycl_pool_alloc dst_f16(ctx.pool(), row_diff * src1_ncols); + const sycl::half alpha_f16 = 1.0f; const sycl::half beta_f16 = 0.0f; SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm( @@ -2446,9 +2497,10 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten dev[i].src1_ddq = dev[i].src1_ddq_alloc.alloc(ctx.pool(i), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs); if (src1_on_device && src1_is_contiguous) { + bool reorder_q8_tensor = src0->extra && ((ggml_tensor_extra_gpu *)src0->extra)->optimized_feature.reorder; scope_op_debug_print scope_dbg_print(__func__, "/quantize_row_q8_1_sycl", dst, /*num_src=*/2, " : converting src1 to Q8_1"); - quantize_row_q8_1_sycl(dev[i].src1_ddf, dev[i].src1_ddq, ne10, nrows1, src1_padded_col_size, stream); + quantize_row_q8_1_sycl(dev[i].src1_ddf, dev[i].src1_ddq, ne10, nrows1, src1_padded_col_size, reorder_q8_tensor, stream); /* DPCT1010:90: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to @@ -2554,7 +2606,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten if (convert_src1_to_q8_1 && !src1_is_contiguous) { scope_op_debug_print scope_dbg_print(__func__, "/quantize_row_q8_1_sycl", dst, /*num_src=*/2, " : converting src1 to Q8_1"); - quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream); + quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, false, stream); /* DPCT1010:92: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You @@ -2893,7 +2945,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons void ** ptrs_dst_get = ptrs_dst.get(); size_t nb12_scaled = src1->type == GGML_TYPE_F16 ? nb12 : s12 * sizeof(sycl::half); size_t nb13_scaled = src1->type == GGML_TYPE_F16 ? 
nb13 : s13 * sizeof(sycl::half); - cgh.parallel_for(sycl::nd_range<3>(block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for(cgh, sycl::nd_range<3>(block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { k_compute_batched_ptrs(src0_f16, src1_f16, dst_ddf, ptrs_src_get, ptrs_dst_get, ne12, ne13, ne23, nb02, nb03, nb12_scaled, nb13_scaled, nbd2, nbd3, r2, r3, item_ct1); }); @@ -2928,6 +2980,7 @@ inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) { case GGML_TYPE_Q4_0: return true; case GGML_TYPE_Q4_K: + case GGML_TYPE_Q6_K: return !g_ggml_sycl_prioritize_dmmv; default: return false; @@ -2947,6 +3000,7 @@ inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) { switch (type) { case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_K: + case GGML_TYPE_Q6_K: return true; default: return false; @@ -3031,6 +3085,50 @@ static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, d sycl::free(tmp_buf, *stream); } +static void reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) { + GGML_ASSERT(size % sizeof(block_q6_K) == 0); + GGML_ASSERT(offset % sizeof(block_q6_K) == 0); + + const int nblocks = size / sizeof(block_q6_K); + + auto * tmp_buf = sycl::malloc_shared(size, *stream); + SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size).wait())); + + auto * ql_ptr = data_device; + auto * qh_ptr = ql_ptr + (QK_K / 2) * nblocks; + auto * scales_ptr = qh_ptr + (QK_K / 4) * nblocks; + sycl::half * dm_ptr = (sycl::half *) (scales_ptr + (QK_K / 16) * nblocks); + + stream + ->parallel_for(nblocks, + [=](auto i) { + const block_q6_K * x = (const block_q6_K *) tmp_buf; + const int ib = i; + + const uint8_t * ql = x[ib].ql; + const uint8_t * qh = x[ib].qh; + uint8_t * base_ql_ptr = ql_ptr + (QK_K / 2) * ib; + uint8_t * base_qh_ptr = qh_ptr + (QK_K / 4) * ib; + uint8_t * base_scales_ptr = scales_ptr + (QK_K / 16) * ib; + + for (int j = 0; j < QK_K / 2; ++j) { + base_ql_ptr[j] = ql[j]; + } + for (int j = 0; j < QK_K / 4; ++j) { + base_qh_ptr[j] = qh[j]; + } + + for (int j = 0; j < QK_K / 16; ++j) { + base_scales_ptr[j] = x[ib].scales[j]; + } + + dm_ptr[ib] = x[ib].d; + }) + .wait_and_throw(); + + sycl::free(tmp_buf, *stream); +} + static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) { uint8_t * data_device = (uint8_t *) src0->data; size_t ncols = src0->ne[0]; @@ -3044,6 +3142,9 @@ static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) { case GGML_TYPE_Q4_K: reorder_qw_q4_k(data_device, size, 0, stream); break; + case GGML_TYPE_Q6_K: + reorder_qw_q6_k(data_device, size, 0, stream); + break; default: GGML_ABORT("reorder_qw() called with unsupported type"); break; @@ -3348,7 +3449,7 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx, { sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne10, 768u)); sycl::range<3> grid_dims(1, n_ids, ids->ne[1]); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor src1_row_acc(cgh); char *__restrict src1_contiguous_get = @@ -3360,9 +3461,8 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx, size_t ids_nb_ct6 = ids->nb[1]; size_t ids_nb_ct7 = ids->nb[0]; - cgh.parallel_for( - sycl::nd_range<3>(grid_dims * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { k_copy_src1_to_contiguous( src1_original, 
src1_contiguous_get, dev_cur_src1_row_get, @@ -3393,15 +3493,14 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx, { sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne0, 768u)); sycl::range<3> grid_dims(1, 1, num_src1_rows); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { const char *__restrict dst_contiguous_get = dst_contiguous.get(); const mmid_row_mapping *__restrict dev_row_mapping_get = dev_row_mapping.get(); - cgh.parallel_for( - sycl::nd_range<3>(grid_dims * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { k_copy_dst_from_contiguous(dst_original, dst_contiguous_get, dev_row_mapping_get, @@ -3755,7 +3854,7 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend, const void *data, size_t offset, size_t size) try { GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - debug_print_tensor(": tensor=", tensor); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str()); GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset); ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; @@ -3776,7 +3875,7 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend, void *data, size_t offset, size_t size) try { GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - debug_print_tensor(": tensor=", tensor); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str()); GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset); ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; ggml_backend_buffer_t buf = tensor->view_src ? 
tensor->view_src->buffer : tensor->buffer; @@ -3799,8 +3898,8 @@ static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend, bool is_cpy_supported = dst->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && ggml_backend_buffer_is_sycl(src->buffer); GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - debug_print_tensor(": dst=", dst); - debug_print_tensor(" src=", src); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": dst", dst).c_str()); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(" src", src).c_str()); GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported); if (is_cpy_supported) { /* @@ -4165,6 +4264,9 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g { ggml_type src0_type = op->src[0]->type; ggml_type src1_type = op->src[1]->type; + if (src0_type == src1_type && (ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1])) && src0_type != GGML_TYPE_BF16) { + return true; + } if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) { return true; } @@ -4210,6 +4312,21 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_IQ4_NL) { return true; } + if(src0_type == GGML_TYPE_Q8_0 && src1_type == GGML_TYPE_Q8_0) { + return true; + } + if(src0_type == GGML_TYPE_Q5_0 && src1_type == GGML_TYPE_Q5_0) { + return true; + } + if(src0_type == GGML_TYPE_Q5_1 && src1_type == GGML_TYPE_Q5_1) { + return true; + } + if(src0_type == GGML_TYPE_Q4_0 && src1_type == GGML_TYPE_Q4_0) { + return true; + } + if(src0_type == GGML_TYPE_Q4_1 && src1_type == GGML_TYPE_Q4_1) { + return true; + } return false; } case GGML_OP_CONCAT: diff --git a/ggml/src/ggml-sycl/gla.cpp b/ggml/src/ggml-sycl/gla.cpp index 879184fdd3111..b40cbf1f14fb2 100644 --- a/ggml/src/ggml-sycl/gla.cpp +++ b/ggml/src/ggml-sycl/gla.cpp @@ -11,13 +11,13 @@ static void gated_linear_attn_f32_kernel(const dpct::queue_ptr stream, u_int B, const u_int n_seq_tokens = T / B; sycl::range<1> block_dims((C / H)); sycl::range<1> grid_dims((B * H)); - stream->submit([&](sycl::handler & cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { /* local memory accessors*/ auto _k = sycl::local_accessor(sycl::range<1>(head_size), cgh); auto _r = sycl::local_accessor(sycl::range<1>(head_size), cgh); auto _td = sycl::local_accessor(sycl::range<1>(head_size), cgh); - cgh.parallel_for(sycl::nd_range<1>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<1> item) { + sycl_parallel_for<1>(cgh, sycl::nd_range<1>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<1> item) { u_int tid = item.get_local_id(0); u_int bid = item.get_group(0); diff --git a/ggml/src/ggml-sycl/im2col.cpp b/ggml/src/ggml-sycl/im2col.cpp index aa19c2527dc41..52737cc746dfa 100644 --- a/ggml/src/ggml-sycl/im2col.cpp +++ b/ggml/src/ggml-sycl/im2col.cpp @@ -70,7 +70,7 @@ static void im2col_sycl_internal(const float * x, T * dst, int64_t IW, int64_t I const int64_t CHW = IC * KH * KW; - stream->parallel_for(sycl::nd_range<3>(block_nums * local_range, local_range), [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * local_range, local_range), [=](sycl::nd_item<3> item_ct1) { im2col_kernel(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, CHW, s0, s1, p0, p1, d0, d1, item_ct1); }); diff --git a/ggml/src/ggml-sycl/mmq.cpp b/ggml/src/ggml-sycl/mmq.cpp index ffb272aa28378..c72fcd38ebeff 100644 --- a/ggml/src/ggml-sycl/mmq.cpp +++ b/ggml/src/ggml-sycl/mmq.cpp @@ -1818,7 +1818,7 
@@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor tile_x_qs_q4_0_acc_ct1( sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); sycl::local_accessor tile_x_d_q4_0_acc_ct1( @@ -1829,9 +1829,8 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy, sycl::local_accessor tile_y_ds_acc_ct1( sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { mul_mat_q4_0( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, @@ -1853,7 +1852,7 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor tile_x_qs_q4_0_acc_ct1( sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); sycl::local_accessor tile_x_d_q4_0_acc_ct1( @@ -1864,9 +1863,8 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy, sycl::local_accessor tile_y_ds_acc_ct1( sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { mul_mat_q4_0( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, @@ -1933,7 +1931,7 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor tile_x_qs_q4_1_acc_ct1( sycl::range<1>(mmq_y * (WARP_SIZE) + +mmq_y), cgh); sycl::local_accessor tile_x_dm_q4_1_acc_ct1( @@ -1944,9 +1942,8 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy, sycl::local_accessor tile_y_ds_acc_ct1( sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { mul_mat_q4_1( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, @@ -1968,7 +1965,7 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor tile_x_qs_q4_1_acc_ct1( sycl::range<1>(mmq_y * (WARP_SIZE) + +mmq_y), cgh); sycl::local_accessor tile_x_dm_q4_1_acc_ct1( @@ -1979,9 +1976,8 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy, sycl::local_accessor tile_y_ds_acc_ct1( sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { mul_mat_q4_1( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, 
nrows_dst, item_ct1, @@ -2048,7 +2044,7 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor tile_x_ql_q5_0_acc_ct1( sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); sycl::local_accessor tile_x_d_q5_0_acc_ct1( @@ -2059,9 +2055,8 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy, sycl::local_accessor tile_y_ds_acc_ct1( sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { mul_mat_q5_0( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, @@ -2083,7 +2078,7 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor tile_x_ql_q5_0_acc_ct1( sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); sycl::local_accessor tile_x_d_q5_0_acc_ct1( @@ -2094,9 +2089,8 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy, sycl::local_accessor tile_y_ds_acc_ct1( sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { mul_mat_q5_0( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, @@ -2163,7 +2157,7 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor tile_x_ql_q5_1_acc_ct1( sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); sycl::local_accessor tile_x_dm_q5_1_acc_ct1( @@ -2174,9 +2168,8 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy, sycl::local_accessor tile_y_ds_acc_ct1( sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { mul_mat_q5_1( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, @@ -2198,7 +2191,7 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor tile_x_ql_q5_1_acc_ct1( sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); sycl::local_accessor tile_x_dm_q5_1_acc_ct1( @@ -2209,9 +2202,8 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy, sycl::local_accessor tile_y_ds_acc_ct1( sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { 
mul_mat_q5_1( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, @@ -2278,7 +2270,7 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor tile_x_qs_q8_0_acc_ct1( sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); sycl::local_accessor tile_x_d_q8_0_acc_ct1( @@ -2289,9 +2281,8 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy, sycl::local_accessor tile_y_ds_acc_ct1( sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { mul_mat_q8_0( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, @@ -2313,7 +2304,7 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor tile_x_qs_q8_0_acc_ct1( sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); sycl::local_accessor tile_x_d_q8_0_acc_ct1( @@ -2324,9 +2315,8 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy, sycl::local_accessor tile_y_ds_acc_ct1( sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { mul_mat_q8_0( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, @@ -2393,7 +2383,7 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor tile_x_ql_q2_K_acc_ct1( sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); sycl::local_accessor tile_x_dm_q2_K_acc_ct1( @@ -2406,9 +2396,8 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy, sycl::local_accessor tile_y_ds_acc_ct1( sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { mul_mat_q2_K( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, @@ -2431,7 +2420,7 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor tile_x_ql_q2_K_acc_ct1( sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); sycl::local_accessor tile_x_dm_q2_K_acc_ct1( @@ -2444,9 +2433,8 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy, sycl::local_accessor tile_y_ds_acc_ct1( sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, 
block_dims), [=](sycl::nd_item<3> item_ct1) { mul_mat_q2_K( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, @@ -2516,7 +2504,7 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor tile_x_ql_q3_K_acc_ct1( sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); sycl::local_accessor tile_x_dm_q3_K_acc_ct1( @@ -2531,9 +2519,8 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy, sycl::local_accessor tile_y_ds_acc_ct1( sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { mul_mat_q3_K( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, @@ -2557,7 +2544,7 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor tile_x_ql_q3_K_acc_ct1( sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); sycl::local_accessor tile_x_dm_q3_K_acc_ct1( @@ -2572,9 +2559,8 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy, sycl::local_accessor tile_y_ds_acc_ct1( sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { mul_mat_q3_K( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, @@ -2644,7 +2630,7 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor tile_x_ql_q4_K_acc_ct1( sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); sycl::local_accessor tile_x_dm_q4_K_acc_ct1( @@ -2657,9 +2643,8 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy, sycl::local_accessor tile_y_ds_acc_ct1( sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { mul_mat_q4_K( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, @@ -2682,7 +2667,7 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor tile_x_ql_q4_K_acc_ct1( sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); sycl::local_accessor tile_x_dm_q4_K_acc_ct1( @@ -2695,9 +2680,8 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy, sycl::local_accessor tile_y_ds_acc_ct1( sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, 
sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { mul_mat_q4_K( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, @@ -2765,7 +2749,7 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor tile_x_ql_q5_K_acc_ct1( sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); sycl::local_accessor tile_x_dm_q5_K_acc_ct1( @@ -2778,9 +2762,8 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy, sycl::local_accessor tile_y_ds_acc_ct1( sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { mul_mat_q5_K( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, @@ -2803,7 +2786,7 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor tile_x_ql_q5_K_acc_ct1( sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); sycl::local_accessor tile_x_dm_q5_K_acc_ct1( @@ -2816,9 +2799,8 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy, sycl::local_accessor tile_y_ds_acc_ct1( sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { mul_mat_q5_K( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, @@ -2886,7 +2868,7 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor tile_x_ql_acc_ct1( sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); sycl::local_accessor tile_x_dm_acc_ct1( @@ -2899,9 +2881,8 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy, sycl::local_accessor tile_y_ds_acc_ct1( sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { mul_mat_q6_K( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, @@ -2924,7 +2905,7 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor tile_x_ql_acc_ct1( sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); sycl::local_accessor tile_x_dm_acc_ct1( @@ -2937,9 +2918,8 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy, sycl::local_accessor tile_y_ds_acc_ct1( sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> 
item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { mul_mat_q6_K( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, diff --git a/ggml/src/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp index cb70f83a4f9a6..c21929d51e94c 100644 --- a/ggml/src/ggml-sycl/mmvq.cpp +++ b/ggml/src/ggml-sycl/mmvq.cpp @@ -29,24 +29,23 @@ static void mul_mat_vec_q_reorder(const void * __restrict__ vx, const void * __r static_assert(blocks_per_subgroup > 0); static_assert(block_elements_per_subgroup > 0); - const block_q8_1 * y = (const block_q8_1 *) vy; - float partial_sum = 0.0f; for (int i = sg.get_local_linear_id() / block_elements_per_subgroup; i < blocks_per_row; i += blocks_per_subgroup) { - const int ibx = row * blocks_per_row + i; // x block index - // TODO: Generalize offsets, right now only works for quantizations that don't split high and low bits - const int bx_offset = block_type::get_block_offset(ibx); - const int d_offset = block_type::get_d_offset(nrows, ncols, ibx); + const int ibx = row * blocks_per_row + i; // x block index + const auto bx_offset = block_type::get_block_offset(ibx, nblocks); + const auto d_offset = block_type::get_d_offset(nrows, ncols, ibx); // Y block index that aligns with ibx const int iby = i * block_type::block_to_q8_1_ratio(); + const int8_t* q8_1_quant_ptr = (const int8_t*)vy + iby * QK8_1; + const sycl::half2* q8_1_ds_ptr = (const sycl::half2*)((const char*)vy + ncols + iby * sizeof(sycl::half2)); #pragma unroll for (int elem = 0; elem < block_elements_per_subgroup; elem += WARP_SIZE) { // x block quant index when casting the quants to int const int iqs = elem + block_traits::vdr_mmvq * (sg.get_local_linear_id() % block_elements_per_subgroup); - partial_sum += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset, &y[iby], iqs, nblocks); + partial_sum += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset, q8_1_quant_ptr, q8_1_ds_ptr, iqs); } } @@ -545,12 +544,12 @@ static void reorder_mul_mat_vec_q4_0_q8_1_sycl(const void * vx, const void * vy, const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, (block_num_y * WARP_SIZE)); const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); - stream->submit([&](sycl::handler & cgh) { - cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), - [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q_reorder>(vx, vy, dst, ncols, nrows, - nd_item); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(global_size, workgroup_size), + [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_reorder>(vx, vy, dst, ncols, nrows, + nd_item); + }); }); } @@ -562,12 +561,12 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void * vx, const void * vy, float * const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - stream->submit([&](sycl::handler & cgh) { - cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q( + vx, vy, dst, ncols, nrows, item_ct1); + }); }); } } @@ -581,17 +580,12 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const 
void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - - stream->submit([&](sycl::handler &cgh) { - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q( + vx, vy, dst, ncols, nrows, item_ct1); + }); }); } } @@ -605,17 +599,12 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - - stream->submit([&](sycl::handler &cgh) { - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q( + vx, vy, dst, ncols, nrows, item_ct1); + }); }); } } @@ -629,17 +618,12 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - - stream->submit([&](sycl::handler &cgh) { - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q( + vx, vy, dst, ncols, nrows, item_ct1); + }); }); } } @@ -653,17 +637,12 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - - stream->submit([&](sycl::handler &cgh) { - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q( + vx, vy, dst, ncols, nrows, item_ct1); + }); }); } } @@ -677,17 +656,12 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - - stream->submit([&](sycl::handler &cgh) { - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, 
block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q( + vx, vy, dst, ncols, nrows, item_ct1); + }); }); } } @@ -701,17 +675,12 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - - stream->submit([&](sycl::handler &cgh) { - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q( + vx, vy, dst, ncols, nrows, item_ct1); + }); }); } } @@ -725,17 +694,12 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - - stream->submit([&](sycl::handler &cgh) { - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q( + vx, vy, dst, ncols, nrows, item_ct1); + }); }); } } @@ -751,12 +715,12 @@ static void reorder_mul_mat_vec_q4_k_q8_1_sycl(const void * vx, const void * vy, const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE); const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); - stream->submit([&](sycl::handler & cgh) { - cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), - [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q_reorder>(vx, vy, dst, ncols, - nrows, nd_item); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(global_size, workgroup_size), + [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_reorder>(vx, vy, dst, ncols, nrows, + nd_item); + }); }); } @@ -770,21 +734,34 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - - stream->submit([&](sycl::handler &cgh) { - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q( + vx, vy, dst, ncols, nrows, item_ct1); + }); }); } } +static void reorder_mul_mat_vec_q6_k_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols, + const int nrows, dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y); + constexpr size_t num_subgroups = 16; + GGML_ASSERT(block_num_y % num_subgroups == 
0); + + const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE); + const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(global_size, workgroup_size), + [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_reorder>(vx, vy, dst, ncols, nrows, + nd_item); + }); + }); +} static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy, float *dst, const int ncols, const int nrows, @@ -794,17 +771,12 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - - stream->submit([&](sycl::handler &cgh) { - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q( + vx, vy, dst, ncols, nrows, item_ct1); + }); }); } } @@ -819,14 +791,12 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q_iq2_xxs_q8_1( - vx, vy, dst, ncols, nrows, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_iq2_xxs_q8_1(vx, vy, dst, ncols, + nrows, item_ct1); + }); }); } } @@ -840,14 +810,12 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - stream->submit([&](sycl::handler & cgh) { - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q_iq2_xs_q8_1( - vx, vy, dst, ncols, nrows, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_iq2_xs_q8_1(vx, vy, dst, ncols, + nrows, item_ct1); + }); }); } } @@ -861,15 +829,12 @@ static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q_iq2_s_q8_1( - vx, vy, dst, ncols, nrows, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) 
[[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_iq2_s_q8_1(vx, vy, dst, ncols, nrows, + item_ct1); + }); }); } } @@ -883,15 +848,12 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q_iq3_xxs_q8_1( - vx, vy, dst, ncols, nrows, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_iq3_xxs_q8_1(vx, vy, dst, ncols, + nrows, item_ct1); + }); }); } } @@ -905,15 +867,12 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q_iq3_s_q8_1( - vx, vy, dst, ncols, nrows, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_iq3_s_q8_1(vx, vy, dst, ncols, nrows, + item_ct1); + }); }); } } @@ -927,15 +886,12 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q_iq1_s_q8_1( - vx, vy, dst, ncols, nrows, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_iq1_s_q8_1(vx, vy, dst, ncols, nrows, + item_ct1); + }); }); } } @@ -949,14 +905,12 @@ static void mul_mat_vec_iq1_m_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q_iq1_m_q8_1( - vx, vy, dst, ncols, nrows, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_iq1_m_q8_1(vx, vy, dst, ncols, nrows, + item_ct1); + }); }); } } @@ -970,15 +924,12 @@ static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, 
block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q_iq4_nl_q8_1( - vx, vy, dst, ncols, nrows, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_iq4_nl_q8_1(vx, vy, dst, ncols, nrows, + item_ct1); + }); }); } } @@ -992,15 +943,12 @@ static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q_iq4_xs_q8_1( - vx, vy, dst, ncols, nrows, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_iq4_xs_q8_1(vx, vy, dst, ncols, + nrows, item_ct1); + }); }); } } @@ -1070,7 +1018,14 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); break; case GGML_TYPE_Q6_K: - mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + if ((ggml_tensor_extra_gpu *) dst->src[0]->extra && + ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) { + GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q6_k_q8_1_sycl\n"); + reorder_mul_mat_vec_q6_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + } else { + GGML_SYCL_DEBUG("Calling mul_mat_vec_q6_k_q8_1_sycl\n"); + mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + } break; case GGML_TYPE_IQ1_S: mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp index 4ec1416849c7e..79d846b41a15d 100644 --- a/ggml/src/ggml-sycl/norm.cpp +++ b/ggml/src/ggml-sycl/norm.cpp @@ -254,14 +254,13 @@ static void norm_f32_sycl(const float * x, float * dst, const int ncols, const i GGML_ASSERT(ncols % WARP_SIZE == 0); if (ncols < 1024) { const sycl::range<3> block_dims(1, 1, WARP_SIZE); - stream->submit([&](sycl::handler& cgh) { - cgh.parallel_for( - sycl::nd_range<3>(global_dims * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, nullptr, WARP_SIZE); - }); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(global_dims * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, + nullptr, WARP_SIZE); + }); + }); } else { const int work_group_size = ggml_sycl_info().max_work_group_sizes[device]; @@ -272,16 +271,15 @@ static void norm_f32_sycl(const float * x, float * dst, const int ncols, const i the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. 
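For the Q6_K reorder path selected above (reorder_mul_mat_vec_q6_k_q8_1_sycl when optimized_feature.reorder is set), the layout is produced by reorder_qw_q6_k earlier in this diff: the kernel copies the tensor to a temporary buffer and rewrites it in place as four planar arrays (all ql, then all qh, then all scales, then all d). A hedged CPU sketch of the same AoS-to-SoA regrouping, writing to a separate destination for clarity and treating the fp16 d as raw uint16_t bits:

```cpp
#include <cstdint>
#include <cstring>

constexpr int QK_K = 256;

struct block_q6_K {
    uint8_t  ql[QK_K / 2];       // low 4 bits of the quants
    uint8_t  qh[QK_K / 4];       // high 2 bits of the quants
    int8_t   scales[QK_K / 16];  // per-sub-block scales
    uint16_t d;                  // super-block scale, fp16 bits
};

// dst must hold nblocks * (QK_K/2 + QK_K/4 + QK_K/16 + sizeof(uint16_t)) bytes.
static void reorder_q6_k(const block_q6_K * src, uint8_t * dst, int nblocks) {
    uint8_t *  ql     = dst;
    uint8_t *  qh     = ql + (QK_K / 2) * nblocks;
    uint8_t *  scales = qh + (QK_K / 4) * nblocks;
    uint16_t * d      = (uint16_t *) (scales + (QK_K / 16) * nblocks);

    for (int ib = 0; ib < nblocks; ++ib) {
        std::memcpy(ql     + (QK_K / 2)  * ib, src[ib].ql,     QK_K / 2);
        std::memcpy(qh     + (QK_K / 4)  * ib, src[ib].qh,     QK_K / 4);
        std::memcpy(scales + (QK_K / 16) * ib, src[ib].scales, QK_K / 16);
        d[ib] = src[ib].d;  // the halves are packed last, after all scales
    }
}
```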
*/ - stream->submit([&](sycl::handler& cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor s_sum_acc_ct1( sycl::range<1>(work_group_size / WARP_SIZE), cgh); - cgh.parallel_for( - sycl::nd_range<3>(global_dims * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size); - }); - }); + sycl_parallel_for(cgh, sycl::nd_range<3>(global_dims * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, + get_pointer(s_sum_acc_ct1), work_group_size); + }); + }); } } @@ -290,18 +288,14 @@ static void group_norm_f32_sycl(const float* x, float* dst, const int ne_elements, queue_ptr stream, int device) { if (group_size < 1024) { const sycl::range<3> block_dims(1, 1, WARP_SIZE); - stream->submit([&](sycl::handler& cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { const float eps_ct4 = eps; - cgh.parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims, - block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - group_norm_f32( - x, dst, group_size, ne_elements, eps_ct4, item_ct1, - nullptr, WARP_SIZE); - }); - }); + sycl_parallel_for(cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + group_norm_f32(x, dst, group_size, ne_elements, eps_ct4, item_ct1, nullptr, + WARP_SIZE); + }); + }); } else { const int work_group_size = ggml_sycl_info().max_work_group_sizes[device]; @@ -313,22 +307,18 @@ static void group_norm_f32_sycl(const float* x, float* dst, info::device::max_work_group_size. Adjust the work-group size if needed. 
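        (For reference, group_norm_f32 reduces each group twice: a plain sum
        first, giving mean = sum / group_size, then a sum of squared
        deviations, giving var; the output is written roughly as

            dst[i] = (x[i] - mean) * sycl::rsqrt(var + eps);

        using the same one-slot-per-subgroup scratch scheme as norm_f32.)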
*/ - stream->submit([&](sycl::handler& cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE), cgh); const float eps_ct4 = eps; - cgh.parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims, - block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - group_norm_f32(x, dst, group_size, ne_elements, - eps_ct4, item_ct1, - get_pointer(s_sum_acc_ct1), work_group_size); - }); - }); + sycl_parallel_for(cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + group_norm_f32(x, dst, group_size, ne_elements, eps_ct4, item_ct1, + get_pointer(s_sum_acc_ct1), work_group_size); + }); + }); } } @@ -340,14 +330,13 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const const sycl::range<3> global_dims(nsamples, nchannels, nrows); if (ncols < 1024) { const sycl::range<3> block_dims(1, 1, WARP_SIZE); - stream->submit([&](sycl::handler& cgh) { - cgh.parallel_for( - sycl::nd_range<3>(global_dims * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, nullptr, WARP_SIZE); - }); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(global_dims * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, + nullptr, WARP_SIZE); + }); + }); } else { const int work_group_size = ggml_sycl_info().max_work_group_sizes[device]; @@ -358,16 +347,15 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. 
        */
-        stream->submit([&](sycl::handler& cgh) {
+        sycl_launch(stream, [&](sycl::handler & cgh) {
             sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE), cgh);
-            cgh.parallel_for(
-                sycl::nd_range<3>(global_dims * block_dims, block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                        rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size);
-                    });
-            });
+            sycl_parallel_for(cgh, sycl::nd_range<3>(global_dims * block_dims, block_dims),
+                              [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                                  rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1,
+                                               get_pointer(s_sum_acc_ct1), work_group_size);
+                              });
+        });
     }
 }

@@ -378,16 +366,12 @@ static void l2_norm_f32_sycl(const float* x, float* dst, const int ncols,
     // printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE);
     if (ncols < 1024) {
         const sycl::range<3> block_dims(1, 1, WARP_SIZE);
-        stream->submit([&](sycl::handler& cgh) {
-            cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
-                                  block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                        l2_norm_f32(x, dst, ncols, eps, item_ct1,
-                                    nullptr, WARP_SIZE);
-                    });
-        });
+        sycl_launch(stream, [&](sycl::handler & cgh) {
+            sycl_parallel_for(cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims),
+                              [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                                  l2_norm_f32(x, dst, ncols, eps, item_ct1, nullptr, WARP_SIZE);
+                              });
+        });
     }
     else {
         const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
@@ -398,18 +382,15 @@ static void l2_norm_f32_sycl(const float* x, float* dst, const int ncols,
        the limit. To get the device limit, query
        info::device::max_work_group_size. Adjust the work-group size if needed.
        */
-        stream->submit([&](sycl::handler& cgh) {
+        sycl_launch(stream, [&](sycl::handler & cgh) {
             sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE), cgh);
-            cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
-                                  block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                        l2_norm_f32(x, dst, ncols, eps, item_ct1,
-                                    get_pointer(s_sum_acc_ct1), work_group_size);
-                    });
-        });
+            sycl_parallel_for(cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims),
+                              [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                                  l2_norm_f32(x, dst, ncols, eps, item_ct1, get_pointer(s_sum_acc_ct1),
+                                              work_group_size);
+                              });
+        });
     }
 }
diff --git a/ggml/src/ggml-sycl/quants.hpp b/ggml/src/ggml-sycl/quants.hpp
index 88ec13ea26999..8b952db43bfe2 100644
--- a/ggml/src/ggml-sycl/quants.hpp
+++ b/ggml/src/ggml-sycl/quants.hpp
@@ -14,12 +14,13 @@
 #ifndef GGML_SYCL_QUANTS_HPP
 #define GGML_SYCL_QUANTS_HPP

+#include <utility>
+
 #include "ggml-common.h"
 #include "ggml.h"

 namespace ggml_sycl_reordered {

-
 // The reordered block moves quants (qs) and scales (d) to two
 // uniform regions of memory that are contiguous in the same tensor.
 // What this means is that instead of having:
@@ -32,7 +33,6 @@ namespace ggml_sycl_reordered {

 template <ggml_type type> struct block_q_t;

-
 // qk number of weights / quants in a block
 // qr number of weights in a byte (described as 'before dequantization')
 // for quantization types that have low and high bits split, qr is calculated with
@@ -47,10 +47,12 @@ template <> struct block_q_t<GGML_TYPE_Q4_0> {
         static constexpr uint32_t vdr_mmvq = 2;
     };

-    static constexpr int get_block_offset(const int block_index) { return block_index * (traits::qk / traits::qr); }
+    static constexpr std::pair<int, int> get_block_offset(const int block_index, const int /* nblocks */) {
+        return { block_index * (traits::qk / traits::qr), 0 };
+    }

-    static constexpr int get_d_offset(int nrows, int ncols, const int block_index) {
-        return (ncols / traits::qr * nrows) + block_index * sizeof(ggml_half);
+    static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
+        return { (ncols / traits::qr * nrows) + block_index * sizeof(ggml_half), 0 };
     }

     static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
@@ -64,20 +66,46 @@ template <> struct block_q_t<GGML_TYPE_Q4_K> {
         static constexpr uint32_t vdr_mmvq = 2;
     };

-    static constexpr int get_block_offset(const int block_index) { return block_index * (traits::qk / traits::qr); }
+    static constexpr std::pair<int, int> get_block_offset(const int block_index, const int /* nblocks */) {
+        return { block_index * (traits::qk / traits::qr), 0 };
+    }

-    static constexpr int get_d_offset(int nrows, int ncols, const int block_index) {
+    static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
         auto nblocks = (nrows * (ncols / traits::qk));
-        return (nblocks * QK_K / 2) + (nblocks * K_SCALE_SIZE) + (block_index * sizeof(ggml_half2));
+        return { nblocks * (QK_K / 2),
+                 (nblocks * QK_K / 2) + (nblocks * K_SCALE_SIZE) + (block_index * sizeof(ggml_half2)) };
     }

     static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }

     constexpr size_t get_total_qs_bytes(int nblocks) { return nblocks * QK_K / 2; }
-
-    constexpr size_t get_dm_offset(int nblocks) { return get_total_qs_bytes(nblocks) + nblocks * K_SCALE_SIZE; }
 };

+template <> struct block_q_t<GGML_TYPE_Q6_K> {
+    struct traits {
+        static constexpr uint32_t qk = QK_K;
+        static constexpr uint32_t qi = QI6_K;
+        static constexpr uint32_t qr = QR6_K;
+        static constexpr uint32_t vdr_mmvq = 1;
+    };
+
+    static constexpr std::pair<int, int> get_block_offset(const int block_index, const int n_blocks) {
+        auto low_bits_index = block_index * (traits::qk / traits::qr);
+        // the high bits are stored after all the low bits
+        auto high_bits_index = n_blocks * (QK_K / 2) + (block_index * (QK_K / 4));
+        return { low_bits_index, high_bits_index };
+    }
+
+    static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
+        auto nblocks = (nrows * (ncols / traits::qk));
+        auto total_qs_bytes = nblocks * (QK_K / 2) + nblocks * (QK_K / 4);
+        auto block_scales = total_qs_bytes + block_index * (QK_K / 16);
+        auto sb_scale = total_qs_bytes + nblocks * (QK_K / 16);
+        return { block_scales, sb_scale };
+    }
+
+    static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
+};
 } // namespace ggml_sycl_reordered

 #endif // GGML_SYCL_QUANTS_HPP
diff --git a/ggml/src/ggml-sycl/rope.cpp b/ggml/src/ggml-sycl/rope.cpp
index 44473e1e5580c..e44c6b6ef8f42 100644
--- a/ggml/src/ggml-sycl/rope.cpp
+++ b/ggml/src/ggml-sycl/rope.cpp
@@ -235,20 +235,22 @@ static void rope_norm_sycl(const T * x, T * dst, const int ne0, const int ne1, c
        the limit.
To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. */ - stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { - rope_norm(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, - theta_scale, freq_factors, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + rope_norm(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, + attn_factor, corr_dims, theta_scale, freq_factors, item_ct1); + }); } else { /* DPCT1049:41: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. */ - stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { - rope_norm(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, - theta_scale, freq_factors, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + rope_norm(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, + attn_factor, corr_dims, theta_scale, freq_factors, item_ct1); + }); } } @@ -267,15 +269,17 @@ static void rope_neox_sycl(const T * x, T * dst, const int ne0, const int ne1, c dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); if (freq_factors == nullptr) { - stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { - rope_neox(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, - theta_scale, freq_factors, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + rope_neox(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, + attn_factor, corr_dims, theta_scale, freq_factors, item_ct1); + }); } else { - stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { - rope_neox(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, - theta_scale, freq_factors, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + rope_neox(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, + attn_factor, corr_dims, theta_scale, freq_factors, item_ct1); + }); } } @@ -298,12 +302,12 @@ static void rope_multi_sycl(const T * x, T * dst, const int ne0, const int ne1, } // launch kernel if (freq_factors == nullptr) { - stream->parallel_for(nd_range, [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for(stream, nd_range, [=](sycl::nd_item<3> item_ct1) { rope_multi(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale, freq_factors, sections, item_ct1); }); } else { - stream->parallel_for(nd_range, [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for(stream, nd_range, [=](sycl::nd_item<3> item_ct1) { rope_multi(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale, freq_factors, sections, item_ct1); }); @@ -333,12 +337,12 @@ static void rope_vision_sycl(const T * x, T * dst, const int ne0, const int ne1, } // launch kernel if (freq_factors == nullptr) { - stream->parallel_for(nd_range, 
[=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for(stream, nd_range, [=](sycl::nd_item<3> item_ct1) { rope_vision(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale, freq_factors, sections, item_ct1); }); } else { - stream->parallel_for(nd_range, [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for(stream, nd_range, [=](sycl::nd_item<3> item_ct1) { rope_vision(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale, freq_factors, sections, item_ct1); }); diff --git a/ggml/src/ggml-sycl/softmax.cpp b/ggml/src/ggml-sycl/softmax.cpp index 52fcf4b3dbd24..7b60c292e0c92 100644 --- a/ggml/src/ggml-sycl/softmax.cpp +++ b/ggml/src/ggml-sycl/softmax.cpp @@ -127,11 +127,11 @@ static void soft_max_f32_submitter(const float * x, const T * mask, float * dst, const int nrows_y, const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2, sycl::range<3> block_nums, sycl::range<3> block_dims, const size_t n_local_scratch, queue_ptr stream) { - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor local_buf_acc(n_local_scratch, cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { soft_max_f32(x, mask, dst, ncols_par, nrows_y, scale, max_bias, m0, diff --git a/ggml/src/ggml-sycl/sycl_hw.cpp b/ggml/src/ggml-sycl/sycl_hw.cpp index da121ffc261e8..7041140034b45 100644 --- a/ggml/src/ggml-sycl/sycl_hw.cpp +++ b/ggml/src/ggml-sycl/sycl_hw.cpp @@ -1,6 +1,7 @@ #include "sycl_hw.hpp" - +// TODO: currently not used +/* sycl_hw_info get_device_hw_info(sycl::device *device_ptr) { sycl_hw_info res; int32_t id = device_ptr->get_info(); @@ -11,3 +12,4 @@ sycl_hw_info get_device_hw_info(sycl::device *device_ptr) { return res; } +*/ diff --git a/ggml/src/ggml-sycl/sycl_hw.hpp b/ggml/src/ggml-sycl/sycl_hw.hpp index bf689450ce61f..36b140bf03737 100644 --- a/ggml/src/ggml-sycl/sycl_hw.hpp +++ b/ggml/src/ggml-sycl/sycl_hw.hpp @@ -10,6 +10,8 @@ namespace syclex = sycl::ext::oneapi::experimental; +// TODO: currently not used +/* struct sycl_hw_info { syclex::architecture arch; int32_t device_id; @@ -18,6 +20,7 @@ struct sycl_hw_info { bool is_in_vector(std::vector &vec, int item); sycl_hw_info get_device_hw_info(sycl::device *device_ptr); +*/ #endif // SYCL_HW_HPP diff --git a/ggml/src/ggml-sycl/tsembd.cpp b/ggml/src/ggml-sycl/tsembd.cpp index f6ca626ea7a53..721c8fa6fa27e 100644 --- a/ggml/src/ggml-sycl/tsembd.cpp +++ b/ggml/src/ggml-sycl/tsembd.cpp @@ -45,14 +45,9 @@ static void timestep_embedding_f32_sycl( int num_blocks = (half_ceil + SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE - 1) / SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE; sycl::range<3> block_dims(1, 1, SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE); sycl::range<3> gridDim(1, ne00, num_blocks); - stream->parallel_for( - sycl::nd_range<3>( - gridDim * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - timestep_embedding_f32( - x, dst, nb1, dim, max_period, item_ct1 - ); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(gridDim * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { + timestep_embedding_f32(x, dst, nb1, dim, max_period, item_ct1); + }); } void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { diff --git a/ggml/src/ggml-sycl/vecdotq.hpp 
b/ggml/src/ggml-sycl/vecdotq.hpp
index ed3699313466b..0a5d4999419c9 100644
--- a/ggml/src/ggml-sycl/vecdotq.hpp
+++ b/ggml/src/ggml-sycl/vecdotq.hpp
@@ -284,22 +284,23 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0> {
         return d4 * (sumi * ds8f.x() - (8 * q4_0_traits::vdr_mmvq / q4_0_traits::qi) * ds8f.y());
     }

-    __dpct_inline__ float operator()(const void * __restrict__ vbq, const int ibx_offset, const int d_offset,
-                                     const block_q8_1 * __restrict__ bq8_1, const int & iqs, int /* nblocks */) {
-        const uint8_t * bq4_0 = static_cast<const uint8_t *>(vbq) + ibx_offset;
-        const ggml_half d = *(reinterpret_cast<const ggml_half *>(static_cast<const uint8_t *>(vbq) + d_offset));
+    __dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
+                                     const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr,
+                                     const sycl::half2 * q8_1_ds, const int & iqs) {
+        const uint8_t * bq4_0 = static_cast<const uint8_t *>(vbq) + ibx_offset.first;
+        const ggml_half d = *(reinterpret_cast<const ggml_half *>(static_cast<const uint8_t *>(vbq) + d_offset.first));
         int v[q4_0_traits::vdr_mmvq];
         int u[2 * q4_0_traits::vdr_mmvq];

-#pragma unroll
+#pragma unroll
         for (size_t i = 0; i < q4_0_traits::vdr_mmvq; ++i) {
             v[i] = get_int_from_uint8(bq4_0, iqs + i);
-            u[2 * i + 0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-            u[2 * i + 1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + q4_0_traits::qi);
+            u[2 * i + 0] = get_int_from_int8_aligned(q8_1_quant_ptr, iqs + i);
+            u[2 * i + 1] = get_int_from_int8_aligned(q8_1_quant_ptr, iqs + i + q4_0_traits::qi);
         }
-        return vec_dot_q4_0_q8_1_impl<q4_0_traits::vdr_mmvq>(v, u, d, bq8_1->ds);
+        return vec_dot_q4_0_q8_1_impl<q4_0_traits::vdr_mmvq>(v, u, d, *q8_1_ds);
     };
 };

@@ -346,24 +347,115 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K> {
     using q4_k_block = ggml_sycl_reordered::block_q_t<GGML_TYPE_Q4_K>;
     using q4_k_traits = typename q4_k_block::traits;

-    float operator()(const void * __restrict__ vbq, const int ibx_offset, const int d_offset,
-                     const block_q8_1 * __restrict__ bq8_1, const int & iqs, int nblocks) {
-        const int ib = ibx_offset / (QK_K / 2);
+    __dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
+                                     const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr,
+                                     const sycl::half2 * q8_1_ds, const int & iqs) {
+        const int ib = ibx_offset.first / (QK_K / 2);

         const uint8_t * base = static_cast<const uint8_t *>(vbq);
-        const uint8_t * qs = base + ibx_offset;
-        const int total_qs_bytes = nblocks * (QK_K / 2);
-        const uint8_t * scs = base + total_qs_bytes + ib * K_SCALE_SIZE;
-        const ggml_half2 * dms = reinterpret_cast<const ggml_half2 *>(base + d_offset);
+        const uint8_t * qs = base + ibx_offset.first;
+        const uint8_t * scs = base + d_offset.first + ib * K_SCALE_SIZE;
+        const ggml_half2 * dms = reinterpret_cast<const ggml_half2 *>(base + d_offset.second);

         const int bq8_offset = QR4_K * ((iqs / 2) / (QI8_1 / 2));
         const int * q4 = (const int *) (qs + 16 * bq8_offset + 4 * ((iqs / 2) % 4));
         const uint16_t * scales = (const uint16_t *) scs;

-        return vec_dot_q4_K_q8_1_common(q4, scales, *dms, bq8_1, iqs);
+        int   v[2];
+        int   u[2 * QR4_K];
+        float d8[QR4_K];
+
+        v[0] = q4[0];
+        v[1] = q4[4];
+
+        uint16_t aux[2];
+        const int j = (QR4_K * ((iqs / 2) / (QI8_1 / 2))) / 2;
+        if (j < 2) {
+            aux[0] = scales[j + 0] & 0x3f3f;
+            aux[1] = scales[j + 2] & 0x3f3f;
+        } else {
+            aux[0] = ((scales[j + 2] >> 0) & 0x0f0f) | ((scales[j - 2] & 0xc0c0) >> 2);
+            aux[1] = ((scales[j + 2] >> 4) & 0x0f0f) | ((scales[j - 0] & 0xc0c0) >> 2);
+        }
+
+        const uint8_t * sc = (const uint8_t *) aux;
+        const uint8_t * m = sc + 2;
+
+        for (int i = 0; i < QR4_K; ++i) {
+            const int8_t * quant_base_ptr = q8_1_quant_ptr + (bq8_offset + i) * QK8_1;
+            sycl::half2 ds_values = *(q8_1_ds +
bq8_offset + i); + + d8[i] = ds_values[0]; + + const int * q8 = (const int *) quant_base_ptr + ((iqs / 2) % 4); + u[2 * i + 0] = q8[0]; + u[2 * i + 1] = q8[4]; + } + + return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, *dms, d8); } }; +template <> struct reorder_vec_dot_q_sycl { + static constexpr ggml_type gtype = GGML_TYPE_Q6_K; + + using q6_k_block = ggml_sycl_reordered::block_q_t; + using q6_k_traits = typename q6_k_block::traits; + + __dpct_inline__ float vec_dot_q6_K_q8_1_impl_mmvq(const int vl, const int vh, const int * __restrict__ u, + const int8_t * __restrict__ scales, const float d, + const float * __restrict__ d8) { + float sumf = 0.0f; + +#pragma unroll + for (int i = 0; i < QR6_K; ++i) { + const int sc = scales[4 * i]; + + const int vil = (vl >> (4 * i)) & 0x0F0F0F0F; + + const int vih = ((vh >> (4 * i)) << 4) & 0x30303030; + + const int vi = dpct::vectorized_binary((vil | vih), 0x20202020, + dpct::sub_sat()); // vi = (vil | vih) - 32 + + sumf += d8[i] * (dpct::dp4a(vi, u[i], 0) * sc); // SIMD dot product + } + + return d * sumf; + } + + __dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair ibx_offset, + const std::pair d_offset, const int8_t * q8_1_quant_ptr, const sycl::half2 * q8_1_ds, + const int iqs) { + const int ib = ibx_offset.first / (QK_K / 2); + + const uint8_t * base = static_cast(vbq); + const uint8_t * ql = base + ibx_offset.first; + const uint8_t * qh = base + ibx_offset.second; + const int8_t * scales = reinterpret_cast(base + d_offset.first); + const ggml_half * d = (const ggml_half *) (base + d_offset.second) + ib; + + const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K / 2)) + (iqs % (QI6_K / 2)) / (QI6_K / 4); + const int scale_offset = (QI6_K / 4) * (iqs / (QI6_K / 2)) + (iqs % (QI6_K / 2)) / (QI6_K / 8); + const int vh_shift = 2 * ((iqs % (QI6_K / 2)) / (QI6_K / 4)); + + const int vl = get_int_from_uint8(ql, iqs); + const int vh = get_int_from_uint8(qh, (QI6_K / 4) * (iqs / (QI6_K / 2)) + iqs % (QI6_K / 4)) >> vh_shift; + + const int8_t * scs = scales + scale_offset; + + int u[QR6_K]; + float d8[QR6_K]; + +#pragma unroll + for (int i = 0; i < QR6_K; ++i) { + u[i] = get_int_from_int8_aligned(q8_1_quant_ptr + (bq8_offset + 2 * i) * QK8_1, iqs % QI8_1); + const sycl::half2 ds_values = *(q8_1_ds + bq8_offset + 2 * i); + d8[i] = ds_values[0]; + } + return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scs, *d, d8); + } +}; #define VDR_Q4_0_Q8_1_MMVQ 2 #define VDR_Q4_0_Q8_1_MMQ 4 diff --git a/ggml/src/ggml-sycl/wkv.cpp b/ggml/src/ggml-sycl/wkv.cpp index c10e2f7645e89..3ed5bbf355ad9 100644 --- a/ggml/src/ggml-sycl/wkv.cpp +++ b/ggml/src/ggml-sycl/wkv.cpp @@ -207,12 +207,11 @@ void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { // Submit kernel if (C / H == WKV_BLOCK_SIZE) { - stream->submit([&](sycl::handler& cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor shared_mem_acc(shared_mem_size, cgh); - cgh.parallel_for( - sycl::nd_range<3>(grid_dims * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { rwkv_wkv6_f32_kernel( B, T, C, H, k_d, v_d, r_d, tf_d, td_d, s_d, dst_d, item_ct1, (float*)shared_mem_acc.get_multi_ptr().get() @@ -220,12 +219,11 @@ void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { }); }); } else { - stream->submit([&](sycl::handler& cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor 
shared_mem_acc(shared_mem_size, cgh); - cgh.parallel_for( - sycl::nd_range<3>(grid_dims * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { rwkv_wkv6_f32_kernel( B, T, C, H, k_d, v_d, r_d, tf_d, td_d, s_d, dst_d, item_ct1, (float*)shared_mem_acc.get_multi_ptr().get() @@ -264,12 +262,11 @@ void ggml_sycl_op_rwkv_wkv7(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { // Submit kernel if (C / H == WKV_BLOCK_SIZE) { - stream->submit([&](sycl::handler& cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor shared_mem_acc(shared_mem_size, cgh); - cgh.parallel_for( - sycl::nd_range<3>(grid_dims * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { rwkv_wkv7_f32_kernel( B, T, C, H, r_d, w_d, k_d, v_d, a_d, b_d, s_d, dst_d, item_ct1, (float*)shared_mem_acc.get_multi_ptr().get() @@ -277,12 +274,11 @@ void ggml_sycl_op_rwkv_wkv7(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { }); }); } else { - stream->submit([&](sycl::handler& cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor shared_mem_acc(shared_mem_size, cgh); - cgh.parallel_for( - sycl::nd_range<3>(grid_dims * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { rwkv_wkv7_f32_kernel( B, T, C, H, r_d, w_d, k_d, v_d, a_d, b_d, s_d, dst_d, item_ct1, (float*)shared_mem_acc.get_multi_ptr().get() diff --git a/ggml/src/ggml-vulkan/CMakeLists.txt b/ggml/src/ggml-vulkan/CMakeLists.txt index 4a88415f96eae..0bf4cb14f88c7 100644 --- a/ggml/src/ggml-vulkan/CMakeLists.txt +++ b/ggml/src/ggml-vulkan/CMakeLists.txt @@ -49,15 +49,7 @@ if (Vulkan_FOUND) ../../include/ggml-vulkan.h ) - set(VULKAN_SHADER_GEN_CMAKE_ARGS - -DCMAKE_INSTALL_PREFIX=${CMAKE_BINARY_DIR} - -DCMAKE_RUNTIME_OUTPUT_DIRECTORY=${CMAKE_RUNTIME_OUTPUT_DIRECTORY} - ) - - set(VULKAN_SHADER_GEN_CMAKE_BUILD_ARGS "") - if (CMAKE_BUILD_TYPE AND CMAKE_BUILD_TYPE MATCHES "Debug|Release|MinSizeRel|RelWithDebInfo") - list(APPEND VULKAN_SHADER_GEN_CMAKE_BUILD_ARGS --config=${CMAKE_BUILD_TYPE}) - endif() + set(VULKAN_SHADER_GEN_CMAKE_ARGS "") # Test all shader extensions test_shader_extension_support( @@ -136,42 +128,54 @@ if (Vulkan_FOUND) set(HOST_CMAKE_TOOLCHAIN_FILE "") endif() - # Always use ExternalProject_Add approach include(ExternalProject) - # Add toolchain file if cross-compiling if (CMAKE_CROSSCOMPILING) list(APPEND VULKAN_SHADER_GEN_CMAKE_ARGS -DCMAKE_TOOLCHAIN_FILE=${HOST_CMAKE_TOOLCHAIN_FILE}) message(STATUS "vulkan-shaders-gen toolchain file: ${HOST_CMAKE_TOOLCHAIN_FILE}") endif() - # Native build through ExternalProject_Add ExternalProject_Add( vulkan-shaders-gen SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders - CMAKE_ARGS ${VULKAN_SHADER_GEN_CMAKE_ARGS} - BUILD_COMMAND ${CMAKE_COMMAND} --build . ${VULKAN_SHADER_GEN_CMAKE_BUILD_ARGS} - INSTALL_COMMAND ${CMAKE_COMMAND} --install . - INSTALL_DIR ${CMAKE_BINARY_DIR} + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${CMAKE_BINARY_DIR}/$ + -DCMAKE_INSTALL_BINDIR=. + -DCMAKE_BUILD_TYPE=$ + ${VULKAN_SHADER_GEN_CMAKE_ARGS} + + BUILD_COMMAND ${CMAKE_COMMAND} --build . 
--config $ + BUILD_ALWAYS TRUE + + # NOTE: When DESTDIR is set using Makefile generators and + # "make install" triggers the build step, vulkan-shaders-gen + # would be installed into the DESTDIR prefix, so it is unset + # to ensure that does not happen. + + INSTALL_COMMAND ${CMAKE_COMMAND} -E env --unset=DESTDIR + ${CMAKE_COMMAND} --install . --config $ ) - ExternalProject_Add_StepTargets(vulkan-shaders-gen build install) set (_ggml_vk_host_suffix $,.exe,>) - set (_ggml_vk_genshaders_cmd ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/vulkan-shaders-gen${_ggml_vk_host_suffix}) - set (_ggml_vk_header ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.hpp) - set (_ggml_vk_source ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.cpp) - set (_ggml_vk_input_dir ${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders) - set (_ggml_vk_output_dir ${CMAKE_CURRENT_BINARY_DIR}/vulkan-shaders.spv) + set (_ggml_vk_genshaders_dir "${CMAKE_BINARY_DIR}/$") + set (_ggml_vk_genshaders_cmd "${_ggml_vk_genshaders_dir}/vulkan-shaders-gen${_ggml_vk_host_suffix}") + set (_ggml_vk_header "${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.hpp") + set (_ggml_vk_source "${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.cpp") + set (_ggml_vk_input_dir "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders") + set (_ggml_vk_output_dir "${CMAKE_CURRENT_BINARY_DIR}/vulkan-shaders.spv") - file(GLOB _ggml_vk_shader_deps "${_ggml_vk_input_dir}/*.comp") - set (_ggml_vk_shader_deps ${_ggml_vk_shader_deps} vulkan-shaders-gen) + file(GLOB _ggml_vk_shader_files CONFIGURE_DEPENDS "${_ggml_vk_input_dir}/*.comp") - # Add build and install dependencies for all builds - set(_ggml_vk_shader_deps ${_ggml_vk_shader_deps} vulkan-shaders-gen-build vulkan-shaders-gen-install) + # Because external projects do not provide source-level tracking, + # the vulkan-shaders-gen sources need to be explicitly added to + # ensure that changes will cascade into shader re-generation. 
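+    # (CONFIGURE_DEPENDS additionally makes the build re-check these globs
+    # and re-run CMake automatically when the set of matched files changes.)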
+ + file(GLOB _ggml_vk_shaders_gen_sources + CONFIGURE_DEPENDS "${_ggml_vk_input_dir}/*.cpp" + "${_ggml_vk_input_dir}/*.h") add_custom_command( OUTPUT ${_ggml_vk_header} - ${_ggml_vk_source} + ${_ggml_vk_source} COMMAND ${_ggml_vk_genshaders_cmd} --glslc ${Vulkan_GLSLC_EXECUTABLE} @@ -181,7 +185,10 @@ if (Vulkan_FOUND) --target-cpp ${_ggml_vk_source} --no-clean - DEPENDS ${_ggml_vk_shader_deps} + DEPENDS ${_ggml_vk_shader_files} + ${_ggml_vk_shaders_gen_sources} + vulkan-shaders-gen + COMMENT "Generate vulkan shaders" ) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index ab0303646f505..99be5e45b2af7 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -78,7 +78,7 @@ static bool is_pow2(uint32_t x) { return x > 1 && (x & (x-1)) == 0; } #define VK_VENDOR_ID_INTEL 0x8086 #define VK_VENDOR_ID_NVIDIA 0x10de -#define VK_DEVICE_DESCRIPTOR_POOL_SIZE 32 +#define VK_DEVICE_DESCRIPTOR_POOL_SIZE 256 #define GGML_VK_MAX_NODES 8192 @@ -102,25 +102,11 @@ static bool is_pow2(uint32_t x) { return x > 1 && (x & (x-1)) == 0; } struct ggml_backend_vk_context; -struct vk_queue { - uint32_t queue_family_index; - vk::Queue queue; - vk::CommandPool pool; - uint32_t cmd_buffer_idx; - std::vector cmd_buffers; - - vk::PipelineStageFlags stage_flags; - - bool transfer_only; -}; +#define MAX_PARAMETER_COUNT 8 struct vk_pipeline_struct { std::string name; vk::ShaderModule shader_module; - vk::DescriptorSetLayout dsl; - std::vector descriptor_pools; - std::vector descriptor_sets; - uint32_t descriptor_set_idx; vk::PipelineLayout layout; vk::Pipeline pipeline; uint32_t push_constant_size; @@ -167,6 +153,45 @@ struct ggml_backend_vk_buffer_type_context { vk_device device; }; +struct vk_queue; + +// Stores command pool/buffers. There's an instance of this +// for each (context,queue) pair and for each (device,queue) pair. +struct vk_command_pool { + void init(vk_device& device, vk_queue *q_); + void destroy(vk::Device& device); + + vk::CommandPool pool; + uint32_t cmd_buffer_idx; + std::vector cmd_buffers; + + vk_queue *q; +}; + +// Prevent simultaneous submissions to the same queue. +// This could be per vk_queue if we stopped having two vk_queue structures +// sharing the same vk::Queue. 
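+// The Vulkan spec requires external synchronization for vkQueueSubmit:
+// two threads must never submit to the same VkQueue concurrently, hence a
+// single global lock guarding every queue.submit() call.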
+static std::mutex queue_mutex; + +struct vk_queue { + uint32_t queue_family_index; + vk::Queue queue; + + vk_command_pool cmd_pool; + + vk::PipelineStageFlags stage_flags; + + bool transfer_only; + + // copy everything except the cmd_pool + void copyFrom(vk_queue &other) { + queue_family_index = other.queue_family_index; + queue = other.queue; + stage_flags = other.stage_flags; + transfer_only = other.transfer_only; + } +}; + static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft); static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size); static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft); @@ -196,6 +221,7 @@ enum vk_device_architecture { AMD_RDNA1, AMD_RDNA2, AMD_RDNA3, + INTEL_XE2, }; static vk_device_architecture get_device_architecture(const vk::PhysicalDevice& device) { @@ -246,6 +272,34 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice& } return vk_device_architecture::AMD_RDNA2; } + } else if (props.vendorID == VK_VENDOR_ID_INTEL) { + const std::vector ext_props = device.enumerateDeviceExtensionProperties(); + + bool subgroup_size_control = false; + + for (const auto& properties : ext_props) { + if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) { + subgroup_size_control = true; + } + } + + if (!subgroup_size_control) { + return vk_device_architecture::OTHER; + } + + vk::PhysicalDeviceProperties2 props2; + vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props; + + props2.pNext = &subgroup_size_control_props; + device.getProperties2(&props2); + + if (subgroup_size_control_props.minSubgroupSize == 16) { + // Xe2 architecture uses SIMD16 while previous Xe and Gen architecture uses SIMD8. + // Minimum subgroup size matches the SIMD width so we distinguish architecture by checking this value. 
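+            // (Earlier Xe/Gen parts report a minimum subgroup size of 8 here,
+            // so this check does not misfire on older Intel iGPUs.)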
+ // https://www.intel.com/content/www/us/en/content-details/824434/2024-intel-tech-tour-xe2-and-lunar-lake-s-gpu.html + // https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2025-0/intel-xe-gpu-architecture.html + return vk_device_architecture::INTEL_XE2; + } } return vk_device_architecture::OTHER; } @@ -312,6 +366,8 @@ struct vk_device_struct { // set to true to indicate that some shaders need to be compiled after the dryrun bool need_compiles {}; + vk::DescriptorSetLayout dsl; + vk_matmul_pipeline pipeline_matmul_f32 {}; vk_matmul_pipeline pipeline_matmul_f32_f16 {}; vk_matmul_pipeline pipeline_matmul_bf16 {}; @@ -396,6 +452,7 @@ struct vk_device_struct { vk_pipeline pipeline_count_equal_i32; vk_pipeline pipeline_im2col_f32, pipeline_im2col_f32_f16; vk_pipeline pipeline_timestep_embedding_f32; + vk_pipeline pipeline_conv_transpose_1d_f32; vk_pipeline pipeline_pool2d_f32; vk_pipeline pipeline_rwkv_wkv6_f32; vk_pipeline pipeline_rwkv_wkv7_f32; @@ -428,7 +485,6 @@ struct vk_device_struct { vk_pipeline pipeline_flash_attn_split_k_reduce; std::unordered_map pipelines; - std::unordered_map pipeline_descriptor_set_requirements; std::vector> pinned_memory; @@ -444,7 +500,7 @@ struct vk_device_struct { // for GGML_VK_PERF_LOGGER std::unique_ptr perf_logger; vk::QueryPool query_pool; - uint32_t num_queries; + int32_t num_queries; ~vk_device_struct() { VK_LOG_DEBUG("destroy device " << name); @@ -453,10 +509,8 @@ struct vk_device_struct { ggml_vk_destroy_buffer(sync_staging); - device.destroyCommandPool(compute_queue.pool); - if (!single_queue) { - device.destroyCommandPool(transfer_queue.pool); - } + compute_queue.cmd_pool.destroy(device); + transfer_queue.cmd_pool.destroy(device); for (auto& pipeline : pipelines) { if (pipeline.second.expired()) { @@ -468,10 +522,26 @@ struct vk_device_struct { } pipelines.clear(); + device.destroyDescriptorSetLayout(dsl); + device.destroy(); } }; +void vk_command_pool::init(vk_device& device, vk_queue *q_) { + cmd_buffer_idx = 0; + q = q_; + + vk::CommandPoolCreateInfo command_pool_create_info(vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT), q->queue_family_index); + pool = device->device.createCommandPool(command_pool_create_info); +} + +void vk_command_pool::destroy(vk::Device& device) { + device.destroyCommandPool(pool); + pool = nullptr; + cmd_buffers.clear(); +} + struct vk_buffer_struct { vk::Buffer buffer = VK_NULL_HANDLE; vk::DeviceMemory device_memory = VK_NULL_HANDLE; @@ -706,6 +776,21 @@ struct vk_op_timestep_embedding_push_constants { uint32_t max_period; }; +struct vk_op_conv_transpose_1d_push_constants { + uint32_t Cout; + uint32_t Cin; + uint32_t K; + uint32_t L; + uint32_t KL; + + uint32_t nb01; + uint32_t nb02; + uint32_t nb11; + uint32_t nb1; + + int32_t s0; +}; + struct vk_op_pool2d_push_constants { uint32_t IW; uint32_t IH; uint32_t OW; uint32_t OH; @@ -774,7 +859,7 @@ struct vk_context_struct { std::vector in_memcpys; std::vector out_memcpys; - vk_queue * q; + vk_command_pool * p {}; }; typedef std::shared_ptr vk_context; typedef std::weak_ptr vk_context_ref; @@ -885,6 +970,14 @@ struct ggml_backend_vk_context { vk_context_ref transfer_ctx; std::vector tensor_ctxs; + + std::vector descriptor_pools; + std::vector descriptor_sets; + uint32_t descriptor_set_idx {}; + uint32_t pipeline_descriptor_set_requirements {}; + + vk_command_pool compute_cmd_pool; + vk_command_pool transfer_cmd_pool; }; static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT @@ -948,6 +1041,14 @@ void 
vk_memory_logger::log_deallocation(vk_buffer_ref buf_ref) { struct vk_instance_t { vk::Instance instance; + bool debug_utils_support = false; // VK_EXT_debug_utils enabled + PFN_vkSetDebugUtilsObjectNameEXT pfn_vkSetDebugUtilsObjectNameEXT = {}; + PFN_vkQueueBeginDebugUtilsLabelEXT pfn_vkQueueBeginDebugUtilsLabelEXT = {}; + PFN_vkQueueEndDebugUtilsLabelEXT pfn_vkQueueEndDebugUtilsLabelEXT = {}; + PFN_vkCmdBeginDebugUtilsLabelEXT pfn_vkCmdBeginDebugUtilsLabelEXT = {}; + PFN_vkCmdEndDebugUtilsLabelEXT pfn_vkCmdEndDebugUtilsLabelEXT = {}; + PFN_vkCmdInsertDebugUtilsLabelEXT pfn_vkCmdInsertDebugUtilsLabelEXT = {}; + std::vector device_indices; vk_device devices[GGML_VK_MAX_DEVICES]; }; @@ -1015,39 +1116,19 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << disable_robustness << ", " << require_full_subgroups << ", " << required_subgroup_size << ")"); GGML_ASSERT(parameter_count > 0); + GGML_ASSERT(parameter_count <= MAX_PARAMETER_COUNT); GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT vk::ShaderModuleCreateInfo shader_module_create_info({}, spv_size, reinterpret_cast(spv_data)); pipeline->shader_module = device->device.createShaderModule(shader_module_create_info); - std::vector dsl_binding; - std::vector dsl_binding_flags; - for (uint32_t i = 0; i < parameter_count; i++) { - dsl_binding.push_back({i, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute}); - dsl_binding_flags.push_back({}); - } - - vk::DescriptorSetLayoutBindingFlagsCreateInfo dslbfci = { dsl_binding_flags }; - vk::PushConstantRange pcr( vk::ShaderStageFlagBits::eCompute, 0, pipeline->push_constant_size ); - vk::DescriptorSetLayoutCreateInfo descriptor_set_layout_create_info( - {}, - dsl_binding); - descriptor_set_layout_create_info.setPNext(&dslbfci); - pipeline->dsl = device->device.createDescriptorSetLayout(descriptor_set_layout_create_info); - - vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count * VK_DEVICE_DESCRIPTOR_POOL_SIZE); - vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size); - pipeline->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info)); - - pipeline->descriptor_set_idx = 0; - - vk::PipelineLayoutCreateInfo pipeline_layout_create_info(vk::PipelineLayoutCreateFlags(), pipeline->dsl, pcr); + vk::PipelineLayoutCreateInfo pipeline_layout_create_info(vk::PipelineLayoutCreateFlags(), device->dsl, pcr); pipeline->layout = device->device.createPipelineLayout(pipeline_layout_create_info); std::vector specialization_entries(specialization_constants.size()); @@ -1107,6 +1188,14 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin } pipeline->compiled = true; + if (vk_instance.debug_utils_support) { + vk::DebugUtilsObjectNameInfoEXT duoni; + duoni.objectType = vk::ObjectType::ePipeline; + duoni.pObjectName = pipeline->name.c_str(); + duoni.objectHandle = reinterpret_cast(static_cast(pipeline->pipeline)); + vk_instance.pfn_vkSetDebugUtilsObjectNameEXT(device->device, &static_cast(duoni)); + } + { std::lock_guard guard(device->mutex); device->pipelines.insert({ pipeline->name, pipeline }); @@ -1122,15 +1211,6 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin static void ggml_vk_destroy_pipeline(vk::Device& device, 
vk_pipeline& pipeline) { VK_LOG_DEBUG("ggml_pipeline_destroy_pipeline(" << pipeline->name << ")"); - for (auto& pool : pipeline->descriptor_pools) { - device.destroyDescriptorPool(pool); - } - pipeline->descriptor_pools.clear(); - pipeline->descriptor_sets.clear(); - pipeline->descriptor_set_idx = 0; - - device.destroyDescriptorSetLayout(pipeline->dsl); - device.destroyPipelineLayout(pipeline->layout); device.destroyShaderModule(pipeline->shader_module); @@ -1138,97 +1218,77 @@ static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) device.destroyPipeline(pipeline->pipeline); } -static void ggml_pipeline_request_descriptor_sets(vk_device& device, vk_pipeline& pipeline, uint32_t n) { +static void ggml_pipeline_request_descriptor_sets(ggml_backend_vk_context *ctx, vk_pipeline& pipeline, uint32_t n) { VK_LOG_DEBUG("ggml_pipeline_request_descriptor_sets(" << pipeline->name << ", " << n << ")"); - device->pipeline_descriptor_set_requirements[pipeline->name] += n; + ctx->pipeline_descriptor_set_requirements += n; if (!pipeline->compiled) { pipeline->needed = true; - device->need_compiles = true; + ctx->device->need_compiles = true; } } -static void ggml_pipeline_allocate_descriptor_sets(vk_device& device) { - std::lock_guard guard(device->mutex); +static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx) { - for (auto& pair : device->pipeline_descriptor_set_requirements) { - vk_pipeline pipeline = device->pipelines.at(pair.first).lock(); - const uint64_t n = pair.second; - - VK_LOG_DEBUG("ggml_pipeline_allocate_descriptor_sets(" << pipeline->name << ", " << n << ")"); - - if (pipeline->descriptor_sets.size() >= pipeline->descriptor_set_idx + n) { - // Enough descriptors are available - continue; - } + if (ctx->descriptor_sets.size() >= ctx->pipeline_descriptor_set_requirements) { + // Enough descriptors are available + return; + } - uint32_t to_alloc = pipeline->descriptor_set_idx + n - pipeline->descriptor_sets.size(); - uint32_t pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE - pipeline->descriptor_sets.size() % VK_DEVICE_DESCRIPTOR_POOL_SIZE; - uint32_t pool_idx = pipeline->descriptor_sets.size() / VK_DEVICE_DESCRIPTOR_POOL_SIZE; + vk_device& device = ctx->device; - while (to_alloc > 0) { - const uint32_t alloc_count = std::min(pool_remaining, to_alloc); - to_alloc -= alloc_count; - pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE; + uint32_t to_alloc = ctx->pipeline_descriptor_set_requirements - ctx->descriptor_sets.size(); + uint32_t pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE - ctx->descriptor_sets.size() % VK_DEVICE_DESCRIPTOR_POOL_SIZE; + uint32_t pool_idx = ctx->descriptor_sets.size() / VK_DEVICE_DESCRIPTOR_POOL_SIZE; - if (pool_idx >= pipeline->descriptor_pools.size()) { - vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count * VK_DEVICE_DESCRIPTOR_POOL_SIZE); - vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size); - pipeline->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info)); - } + while (to_alloc > 0) { + const uint32_t alloc_count = std::min(pool_remaining, to_alloc); + to_alloc -= alloc_count; + pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE; - std::vector layouts(alloc_count); - for (uint32_t i = 0; i < alloc_count; i++) { - layouts[i] = pipeline->dsl; - } - vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pipeline->descriptor_pools[pool_idx], alloc_count, 
layouts.data()); - std::vector sets = device->device.allocateDescriptorSets(descriptor_set_alloc_info); - pipeline->descriptor_sets.insert(pipeline->descriptor_sets.end(), sets.begin(), sets.end()); + if (pool_idx >= ctx->descriptor_pools.size()) { + vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, MAX_PARAMETER_COUNT * VK_DEVICE_DESCRIPTOR_POOL_SIZE); + vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size); + ctx->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info)); + } - pool_idx++; + std::vector layouts(alloc_count); + for (uint32_t i = 0; i < alloc_count; i++) { + layouts[i] = device->dsl; } - } -} + vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(ctx->descriptor_pools[pool_idx], alloc_count, layouts.data()); + std::vector sets = device->device.allocateDescriptorSets(descriptor_set_alloc_info); + ctx->descriptor_sets.insert(ctx->descriptor_sets.end(), sets.begin(), sets.end()); -static void ggml_pipeline_cleanup(vk_pipeline& pipeline) { - VK_LOG_DEBUG("ggml_pipeline_cleanup(" << pipeline->name << ")"); - pipeline->descriptor_set_idx = 0; + pool_idx++; + } } -static vk::CommandBuffer ggml_vk_create_cmd_buffer(vk_device& device, vk_queue& q) { +static vk::CommandBuffer ggml_vk_create_cmd_buffer(vk_device& device, vk_command_pool& p) { VK_LOG_DEBUG("ggml_vk_create_cmd_buffer()"); - std::lock_guard guard(device->mutex); - if (q.cmd_buffers.size() > q.cmd_buffer_idx) { + if (p.cmd_buffers.size() > p.cmd_buffer_idx) { // Reuse command buffer - return q.cmd_buffers[q.cmd_buffer_idx++]; + return p.cmd_buffers[p.cmd_buffer_idx++]; } vk::CommandBufferAllocateInfo command_buffer_alloc_info( - q.pool, + p.pool, vk::CommandBufferLevel::ePrimary, 1); const std::vector cmd_buffers = device->device.allocateCommandBuffers(command_buffer_alloc_info); auto buf = cmd_buffers.front(); - q.cmd_buffers.push_back(buf); - q.cmd_buffer_idx++; + p.cmd_buffers.push_back(buf); + p.cmd_buffer_idx++; return buf; } -static vk_submission ggml_vk_create_submission(vk_device& device, vk_queue& q, std::vector wait_semaphores, std::vector signal_semaphores) { - VK_LOG_DEBUG("ggml_vk_create_submission()"); - vk_submission s; - s.buffer = ggml_vk_create_cmd_buffer(device, q); - s.wait_semaphores = std::move(wait_semaphores); - s.signal_semaphores = std::move(signal_semaphores); - return s; -} - static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) { if (ctx->seqs.empty()) { if (fence) { - ctx->q->queue.submit({}, fence); + std::lock_guard guard(queue_mutex); + ctx->p->q->queue.submit({}, fence); } return; } @@ -1267,7 +1327,7 @@ static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) { tl_signal_vals.push_back({}); tl_signal_semaphores.push_back({}); for (size_t i = 0; i < submission.wait_semaphores.size(); i++) { - stage_flags[idx].push_back(ctx->q->stage_flags); + stage_flags[idx].push_back(ctx->p->q->stage_flags); tl_wait_vals[idx].push_back(submission.wait_semaphores[i].value); tl_wait_semaphores[idx].push_back(submission.wait_semaphores[i].s); } @@ -1297,7 +1357,8 @@ static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) { } } - ctx->q->queue.submit(submit_infos, fence); + std::lock_guard guard(queue_mutex); + ctx->p->q->queue.submit(submit_infos, fence); ctx->seqs.clear(); } @@ -1355,28 +1416,25 @@ static void ggml_vk_create_queue(vk_device& device, vk_queue& q, uint32_t queue_ q.queue_family_index = queue_family_index; q.transfer_only = transfer_only; - 
vk::CommandPoolCreateInfo command_pool_create_info_compute(vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT), queue_family_index); - q.pool = device->device.createCommandPool(command_pool_create_info_compute); - - q.cmd_buffer_idx = 0; + q.cmd_pool.init(device, &q); q.queue = device->device.getQueue(queue_family_index, queue_index); q.stage_flags = stage_flags; } -static vk_context ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_queue& q) { +static vk_context ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_command_pool& p) { vk_context result = std::make_shared(); VK_LOG_DEBUG("ggml_vk_create_context(" << result << ")"); ctx->gc.contexts.emplace_back(result); - result->q = &q; + result->p = &p; return result; } -static vk_context ggml_vk_create_temporary_context(vk_queue& q) { +static vk_context ggml_vk_create_temporary_context(vk_command_pool& p) { vk_context result = std::make_shared(); VK_LOG_DEBUG("ggml_vk_create_temporary_context(" << result << ")"); - result->q = &q; + result->p = &p; return result; } @@ -1409,15 +1467,29 @@ static vk::Event ggml_vk_create_event(ggml_backend_vk_context * ctx) { return ctx->gc.events[ctx->event_idx++]; } -static void ggml_vk_queue_cleanup(vk_device& device, vk_queue& q) { - VK_LOG_DEBUG("ggml_vk_queue_cleanup()"); - std::lock_guard guard(device->mutex); +static void ggml_vk_command_pool_cleanup(vk_device& device, vk_command_pool& p) { + VK_LOG_DEBUG("ggml_vk_command_pool_cleanup()"); // Requires command buffers to be done - device->device.resetCommandPool(q.pool); - q.cmd_buffer_idx = 0; + device->device.resetCommandPool(p.pool); + p.cmd_buffer_idx = 0; } +static void ggml_vk_queue_command_pools_cleanup(vk_device& device) { + VK_LOG_DEBUG("ggml_vk_queue_command_pools_cleanup()"); + + // Arbitrary frequency to cleanup/reuse command buffers + static constexpr uint32_t cleanup_frequency = 10; + + if (device->compute_queue.cmd_pool.cmd_buffer_idx >= cleanup_frequency) { + ggml_vk_command_pool_cleanup(device, device->compute_queue.cmd_pool); + } + if (device->transfer_queue.cmd_pool.cmd_buffer_idx >= cleanup_frequency) { + ggml_vk_command_pool_cleanup(device, device->transfer_queue.cmd_pool); + } +} + + static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_props, vk::MemoryRequirements* mem_req, vk::MemoryPropertyFlags flags) { for (uint32_t i = 0; i < mem_props->memoryTypeCount; ++i) { vk::MemoryType memory_type = mem_props->memoryTypes[i]; @@ -1436,8 +1508,6 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::Memor throw vk::OutOfDeviceMemoryError("Requested buffer size exceeds device memory allocation limit"); } - std::lock_guard guard(device->mutex); - vk_buffer buf = std::make_shared(); if (size == 0) { @@ -1566,11 +1636,11 @@ static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) { static void ggml_vk_sync_buffers(vk_context& ctx) { VK_LOG_DEBUG("ggml_vk_sync_buffers()"); - const bool transfer_queue = ctx->q->transfer_only; + const bool transfer_queue = ctx->p->q->transfer_only; ctx->s->buffer.pipelineBarrier( - ctx->q->stage_flags, - ctx->q->stage_flags, + ctx->p->q->stage_flags, + ctx->p->q->stage_flags, {}, { { { !transfer_queue ? 
(vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) : (vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) }, @@ -1589,8 +1659,8 @@ static void ggml_vk_wait_events(vk_context& ctx, std::vector&& events ctx->s->buffer.waitEvents( events, - ctx->q->stage_flags, - ctx->q->stage_flags, + ctx->p->q->stage_flags, + ctx->p->q->stage_flags, {}, {}, {} @@ -1652,7 +1722,7 @@ static std::array fa_rows_cols(FaCodePath path, uint32_t D, uint32_ return {64, 32}; } return {64, 64}; -}; +} static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vector& warptile, bool mul_mat_id, ggml_type src0_type) { @@ -2726,6 +2796,8 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_conv_transpose_1d_f32, "conv_transpose_1d_f32", conv_transpose_1d_f32_len, conv_transpose_1d_f32_data, "main", 3, sizeof(vk_op_conv_transpose_1d_push_constants), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_pool2d_f32, "pool2d_f32", pool2d_f32_len, pool2d_f32_data, "main", 2, sizeof(vk_op_pool2d_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv6_f32, "rwkv_wkv6_f32", rwkv_wkv6_f32_len, rwkv_wkv6_f32_data, "main", 7, sizeof(vk_op_rwkv_wkv6_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); @@ -3322,6 +3394,22 @@ static vk_device ggml_vk_get_device(size_t idx) { } } + + std::vector dsl_binding; + std::vector dsl_binding_flags; + for (uint32_t i = 0; i < MAX_PARAMETER_COUNT; i++) { + dsl_binding.push_back({i, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute}); + dsl_binding_flags.push_back({}); + } + + vk::DescriptorSetLayoutBindingFlagsCreateInfo dslbfci = { dsl_binding_flags }; + + vk::DescriptorSetLayoutCreateInfo descriptor_set_layout_create_info( + {}, + dsl_binding); + descriptor_set_layout_create_info.setPNext(&dslbfci); + device->dsl = device->device.createDescriptorSetLayout(descriptor_set_layout_create_info); + ggml_vk_load_shaders(device); if (!device->single_queue) { @@ -3329,7 +3417,8 @@ static vk_device ggml_vk_get_device(size_t idx) { ggml_vk_create_queue(device, device->transfer_queue, transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer }, true); } else { // TODO: Use pointer or reference to avoid copy - device->transfer_queue = device->compute_queue; + device->transfer_queue.copyFrom(device->compute_queue); + device->transfer_queue.cmd_pool.init(device, &device->transfer_queue); } device->buffer_type = { @@ -3488,6 +3577,8 @@ static void ggml_vk_print_gpu_info(size_t idx) { static bool ggml_vk_instance_validation_ext_available(const std::vector& instance_extensions); static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector& instance_extensions); +static bool ggml_vk_instance_debug_utils_ext_available(const std::vector & instance_extensions); + static void ggml_vk_instance_init() { if (vk_instance_initialized) { return; @@ -3508,7 +3599,7 @@ static void ggml_vk_instance_init() { #ifdef __APPLE__ const bool portability_enumeration_ext = ggml_vk_instance_portability_enumeration_ext_available(instance_extensions); #endif - + const bool 
debug_utils_ext = ggml_vk_instance_debug_utils_ext_available(instance_extensions) && getenv("GGML_VK_DEBUG_MARKERS") != nullptr; std::vector layers; if (validation_ext) { @@ -3523,6 +3614,9 @@ static void ggml_vk_instance_init() { extensions.push_back("VK_KHR_portability_enumeration"); } #endif + if (debug_utils_ext) { + extensions.push_back("VK_EXT_debug_utils"); + } vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions); #ifdef __APPLE__ if (portability_enumeration_ext) { @@ -3546,13 +3640,25 @@ static void ggml_vk_instance_init() { vk_instance.instance = vk::createInstance(instance_create_info); vk_instance_initialized = true; - vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr; + if (debug_utils_ext) { + vk_instance.debug_utils_support = true; + vk_instance.pfn_vkSetDebugUtilsObjectNameEXT = (PFN_vkSetDebugUtilsObjectNameEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkSetDebugUtilsObjectNameEXT"); + vk_instance.pfn_vkQueueBeginDebugUtilsLabelEXT = (PFN_vkQueueBeginDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkQueueBeginDebugUtilsLabelEXT"); + vk_instance.pfn_vkQueueEndDebugUtilsLabelEXT = (PFN_vkQueueEndDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkQueueEndDebugUtilsLabelEXT"); + vk_instance.pfn_vkCmdBeginDebugUtilsLabelEXT = (PFN_vkCmdBeginDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkCmdBeginDebugUtilsLabelEXT"); + vk_instance.pfn_vkCmdEndDebugUtilsLabelEXT = (PFN_vkCmdEndDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkCmdEndDebugUtilsLabelEXT"); + vk_instance.pfn_vkCmdInsertDebugUtilsLabelEXT = (PFN_vkCmdInsertDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkCmdInsertDebugUtilsLabelEXT"); + + } size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size(); + vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr; // Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan char * devices_env = getenv("GGML_VK_VISIBLE_DEVICES"); if (devices_env != nullptr) { + size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size(); + std::string devices(devices_env); std::replace(devices.begin(), devices.end(), ',', ' '); @@ -3568,9 +3674,9 @@ static void ggml_vk_instance_init() { } else { std::vector devices = vk_instance.instance.enumeratePhysicalDevices(); - // Make sure at least one device exists + // If no vulkan devices are found, return early if (devices.empty()) { - std::cerr << "ggml_vulkan: Error: No devices found." << std::endl; + GGML_LOG_INFO("ggml_vulkan: No devices found.\n"); return; } @@ -3653,9 +3759,20 @@ static void ggml_vk_instance_init() { } } - // If no dedicated GPUs found, fall back to GPU 0 + // If no dedicated GPUs found, fall back to the first non-CPU device. + // If only CPU devices are available, return without devices. 
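+        // (Software implementations such as llvmpipe report
+        // vk::PhysicalDeviceType::eCpu and are intentionally skipped.)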
if (vk_instance.device_indices.empty()) { - vk_instance.device_indices.push_back(0); + for (size_t i = 0; i < devices.size(); i++) { + if (devices[i].getProperties().deviceType != vk::PhysicalDeviceType::eCpu) { + vk_instance.device_indices.push_back(i); + break; + } + } + } + + if (vk_instance.device_indices.empty()) { + GGML_LOG_INFO("ggml_vulkan: No devices found.\n"); + return; } } GGML_LOG_DEBUG("ggml_vulkan: Found %zu Vulkan devices:\n", vk_instance.device_indices.size()); @@ -3684,6 +3801,9 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) { ctx->fence = ctx->device->device.createFence({}); ctx->almost_ready_fence = ctx->device->device.createFence({}); + ctx->compute_cmd_pool.init(ctx->device, &ctx->device->compute_queue); + ctx->transfer_cmd_pool.init(ctx->device, &ctx->device->transfer_queue); + #ifdef GGML_VULKAN_CHECK_RESULTS const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS"); vk_skip_checks = (skip_checks == NULL ? 0 : atoi(skip_checks)); @@ -4049,9 +4169,9 @@ static void ggml_vk_host_get(vk_device& device, const void * ptr, vk_buffer& buf } } -static vk_submission ggml_vk_begin_submission(vk_device& device, vk_queue& q, bool one_time = true) { +static vk_submission ggml_vk_begin_submission(vk_device& device, vk_command_pool& p, bool one_time = true) { vk_submission s; - s.buffer = ggml_vk_create_cmd_buffer(device, q); + s.buffer = ggml_vk_create_cmd_buffer(device, p); if (one_time) { s.buffer.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit }); } else { @@ -4061,7 +4181,33 @@ static vk_submission ggml_vk_begin_submission(vk_device& device, vk_queue& q, bo return s; } -static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& subctx, vk_pipeline& pipeline, std::initializer_list<vk::DescriptorBufferInfo> const& descriptor_buffer_infos, size_t push_constant_size, const void* push_constants, std::array<uint32_t, 3> elements) { +template <typename T> size_t push_constant_size(const T &t) { + static_assert(std::is_class<T>::value, "T must be a struct/class"); + GGML_UNUSED(t); + return sizeof(T); +} +template <typename T> size_t push_constant_size(const std::vector<T> &t) { + GGML_UNUSED(t); + return sizeof(T) * t.size(); +} +template <typename T, size_t N> size_t push_constant_size(const std::array<T, N> &t) { + GGML_UNUSED(t); + return sizeof(T) * N; +} + +template <typename T> const T *push_constant_data(const T &t) { + static_assert(std::is_class<T>::value, "T must be a struct/class"); + return &t; +} +template <typename T> const T *push_constant_data(const std::vector<T> &t) { + return t.data(); +} +template <typename T, size_t N> const T *push_constant_data(const std::array<T, N> &t) { + return t.data(); +} + +template <typename T> +static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& subctx, vk_pipeline& pipeline, std::initializer_list<vk::DescriptorBufferInfo> const& descriptor_buffer_infos, const T &push_constants, std::array<uint32_t, 3> elements) { const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]); const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]); const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]); @@ -4070,14 +4216,14 @@ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.range << "), "; } std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))"); - GGML_ASSERT(pipeline->descriptor_set_idx < pipeline->descriptor_sets.size()); - GGML_ASSERT(descriptor_buffer_infos.size() == pipeline->parameter_count); + GGML_ASSERT(ctx->descriptor_set_idx < ctx->descriptor_sets.size()); + GGML_ASSERT(descriptor_buffer_infos.size() <= MAX_PARAMETER_COUNT);
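// The typed push-constant plumbing above can be exercised on its own; the
// following standalone sketch (pc_size mirrors push_constant_size, and all
// names are illustrative, not part of the patch) shows how overload
// resolution picks the struct, vector, or array variant and why the
// reported byte sizes differ:
//
//   #include <array>
//   #include <cstdint>
//   #include <cstdio>
//   #include <vector>
//
//   template <typename T> size_t pc_size(const T &) { return sizeof(T); }
//   template <typename T> size_t pc_size(const std::vector<T> &v) { return sizeof(T) * v.size(); }
//   template <typename T, size_t N> size_t pc_size(const std::array<T, N> &) { return sizeof(T) * N; }
//
//   struct push { uint32_t m, n, k; };
//
//   int main() {
//       printf("%zu %zu %zu\n",
//              pc_size(push{1, 2, 3}),              // 12: whole struct
//              pc_size(std::vector<float>(4)),      // 16: element size * runtime length
//              pc_size(std::array<uint32_t, 2>{})); // 8:  element size * static length
//       return 0;
//   }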
- vk::DescriptorSet& descriptor_set = pipeline->descriptor_sets[pipeline->descriptor_set_idx++]; + vk::DescriptorSet& descriptor_set = ctx->descriptor_sets[ctx->descriptor_set_idx++]; vk::WriteDescriptorSet write_descriptor_set{ descriptor_set, 0, 0, pipeline->parameter_count, vk::DescriptorType::eStorageBuffer, nullptr, descriptor_buffer_infos.begin() }; ctx->device->device.updateDescriptorSets({ write_descriptor_set }, {}); - subctx->s->buffer.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size, push_constants); + subctx->s->buffer.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size(push_constants), push_constant_data(push_constants)); subctx->s->buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->pipeline); subctx->s->buffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute, pipeline->layout, @@ -4110,7 +4256,7 @@ static void ggml_vk_ctx_begin(vk_device& device, vk_context& subctx) { ggml_vk_ctx_end(subctx); } - subctx->seqs.push_back({ ggml_vk_begin_submission(device, *subctx->q) }); + subctx->seqs.push_back({ ggml_vk_begin_submission(device, *subctx->p) }); subctx->s = subctx->seqs[subctx->seqs.size() - 1].data(); } @@ -4311,7 +4457,9 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void * memcpy((uint8_t *)dst->ptr + offset + i * width, (const uint8_t *) src + i * spitch, width); } } else { - vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue); + std::lock_guard<std::mutex> guard(dst->device->mutex); + + vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool); ggml_vk_ctx_begin(dst->device, subctx); ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, spitch, width, height, true); ggml_vk_ctx_end(subctx); @@ -4323,6 +4471,7 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void * ggml_vk_submit(subctx, dst->device->fence); VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences"); dst->device->device.resetFences({ dst->device->fence }); + ggml_vk_queue_command_pools_cleanup(dst->device); } } @@ -4399,7 +4548,9 @@ static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_ memcpy(dst, (uint8_t *) src->ptr + offset, size); } else { - vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue); + std::lock_guard<std::mutex> guard(src->device->mutex); + + vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool); ggml_vk_ctx_begin(src->device, subctx); ggml_vk_buffer_read_async(subctx, src, offset, dst, size, true); ggml_vk_ctx_end(subctx); @@ -4407,6 +4558,7 @@ ggml_vk_submit(subctx, src->device->fence); VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_read waitForFences"); src->device->device.resetFences({ src->device->fence }); + ggml_vk_queue_command_pools_cleanup(src->device); for (auto& cpy : subctx->out_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); @@ -4426,15 +4578,17 @@ static void ggml_vk_buffer_copy_async(vk_context& ctx, vk_buffer& dst, size_t ds static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) { if (src->device == dst->device) { + std::lock_guard<std::mutex> guard(src->device->mutex); VK_LOG_DEBUG("ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")"); // Copy within the device -
vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue); + vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool); ggml_vk_ctx_begin(src->device, subctx); ggml_vk_buffer_copy_async(subctx, dst, dst_offset, src, src_offset, size); ggml_vk_ctx_end(subctx); ggml_vk_submit(subctx, src->device->fence); VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences"); src->device->device.resetFences({ src->device->fence }); + ggml_vk_queue_command_pools_cleanup(src->device); } else { VK_LOG_DEBUG("ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")"); // Copy device to device @@ -4459,7 +4613,8 @@ static void ggml_vk_buffer_memset_async(vk_context& ctx, vk_buffer& dst, size_t static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, size_t size) { VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")"); - vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue); + std::lock_guard guard(dst->device->mutex); + vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool); ggml_vk_ctx_begin(dst->device, subctx); subctx->s->buffer.fillBuffer(dst->buffer, offset, size, c); ggml_vk_ctx_end(subctx); @@ -4467,6 +4622,7 @@ static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, siz ggml_vk_submit(subctx, dst->device->fence); VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_memset waitForFences"); dst->device->device.resetFences({ dst->device->fence }); + ggml_vk_queue_command_pools_cleanup(dst->device); } static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, int m, int n, int k, const vk_pipeline& pipeline) { @@ -4540,7 +4696,7 @@ static void ggml_vk_matmul( ggml_vk_sync_buffers(subctx); if (split_k == 1) { const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k, ne02, ne12, broadcast2, broadcast3, padded_n }; - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d }, sizeof(vk_mat_mat_push_constants), &pc, { m, n, batch }); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d }, pc, { m, n, batch }); return; } @@ -4548,10 +4704,10 @@ static void ggml_vk_matmul( const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, CEIL_DIV(k, split_k), ne02, ne12, broadcast2, broadcast3, padded_n }; // Make sure enough workgroups get assigned for split k to work - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, sizeof(vk_mat_mat_push_constants), &pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch }); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch }); ggml_vk_sync_buffers(subctx); const std::array pc2 = { (uint32_t)(m * n * batch), split_k }; - ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2.size() * sizeof(uint32_t), pc2.data(), { m * n * batch, 1, 1 }); + ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2, { m * n * batch, 1, 1 }); } static vk_pipeline ggml_vk_guess_matmul_id_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, uint32_t m, uint32_t n, bool 
aligned, ggml_type src0_type) { @@ -4599,7 +4755,7 @@ static void ggml_vk_matmul_id( ggml_vk_sync_buffers(subctx); const vk_mat_mat_id_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, nei0, nei1, nbi1, ne11, padded_n }; - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d, ids }, sizeof(vk_mat_mat_id_push_constants), &pc, { m, nei1, n_as }); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d, ids }, pc, { m, nei1, n_as }); } static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) { @@ -4720,7 +4876,7 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context& }; init_pushconst_fastdiv(pc); ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, pc, elements); } static vk_pipeline ggml_vk_get_quantize_pipeline(ggml_backend_vk_context * ctx, ggml_type type) { @@ -4739,7 +4895,7 @@ static void ggml_vk_quantize_q8_1(ggml_backend_vk_context * ctx, vk_context& sub vk_pipeline pipeline = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1); ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(uint32_t), &ne, { ne, 1, 1 }); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, std::array{ne}, { ne, 1, 1 }); } static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { @@ -4880,18 +5036,18 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub } // Request descriptor sets - ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1); + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); if (qx_needs_dequant) { - ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1); + ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1); } if (qy_needs_dequant) { - ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1); + ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1); } if (quantize_y) { - ggml_pipeline_request_descriptor_sets(ctx->device, to_q8_1, 1); + ggml_pipeline_request_descriptor_sets(ctx, to_q8_1, 1); } if (split_k > 1) { - ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, 1); + ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, 1); } return; } @@ -4939,7 +5095,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub } else if (qx_needs_dequant) { const std::vector pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) }; ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); + ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); } if (y_non_contig) { ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }); @@ -5073,12 +5229,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, 
vk_context& // Request descriptor sets if (qx_needs_dequant) { - ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1); + ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1); } if (qy_needs_dequant) { - ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1); + ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1); } - ggml_pipeline_request_descriptor_sets(ctx->device, dmmv, 1); + ggml_pipeline_request_descriptor_sets(ctx, dmmv, 1); return; } @@ -5155,7 +5311,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23} }, - sizeof(vk_mat_vec_push_constants), &pc, { groups_x, (uint32_t)(ne12 * ne13), groups_z }); + pc, { groups_x, (uint32_t)(ne12 * ne13), groups_z }); } static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { @@ -5211,7 +5367,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c if (dryrun) { // Request descriptor sets - ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], 1); + ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], 1); return; } @@ -5243,7 +5399,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c } ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, workgroups_z }); + ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, pc, { 1, (uint32_t)ne01, workgroups_z }); } static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { @@ -5300,7 +5456,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con if (dryrun) { // Request descriptor sets - ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, 1); + ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, 1); return; } @@ -5326,7 +5482,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con const std::array pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, channel_stride_y, (uint32_t)(ne12 / ne02), (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) }; ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, - { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + 
d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); + { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); } static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { @@ -5487,12 +5643,12 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& } // Request descriptor sets - ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1); + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); if (qx_needs_dequant) { - ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1); + ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1); } if (qy_needs_dequant) { - ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1); + ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1); } return; } @@ -5542,7 +5698,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& const std::vector pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) }; ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, - { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); + { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); } if (y_non_contig) { ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }); @@ -5681,12 +5837,12 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte // Request descriptor sets if (qx_needs_dequant) { - ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1); + ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1); } if (qy_needs_dequant) { - ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1); + ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1); } - ggml_pipeline_request_descriptor_sets(ctx->device, dmmv, 1); + ggml_pipeline_request_descriptor_sets(ctx, dmmv, 1); return; } @@ -5762,7 +5918,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23}, vk_subbuffer{ d_ids, ids_buf_offset, ids_sz } }, - sizeof(vk_mat_vec_id_push_constants), &pc, { groups_x, (uint32_t)nei0, groups_z }); + pc, { groups_x, (uint32_t)nei0, groups_z }); } static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, bool dryrun = false) { @@ -6006,9 +6162,9 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx if (dryrun) { // Request descriptor sets - ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1); + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); if (split_k > 1) { - ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_flash_attn_split_k_reduce, 1); + 
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_flash_attn_split_k_reduce, 1); } return; } @@ -6112,7 +6268,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx // there's no more than one tile of rows (i.e. workgroups_x would have been // one). We reuse workgroups_x to mean the number of splits, so we need to // cancel out the divide by wg_denoms[0]. - sizeof(vk_flash_attn_push_constants), &pc, { workgroups_x * pipeline->wg_denoms[0], workgroups_y, workgroups_z }); + pc, { workgroups_x * pipeline->wg_denoms[0], workgroups_y, workgroups_z }); ggml_vk_sync_buffers(subctx); const std::array pc2 = { D, (uint32_t)ne1, split_k }; @@ -6121,7 +6277,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE}, vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE}, }, - pc2.size() * uint32_t{sizeof(uint32_t)}, pc2.data(), { (uint32_t)ne1, 1, 1 }); + pc2, { (uint32_t)ne1, 1, 1 }); } else { ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { @@ -6131,7 +6287,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE}, vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE}, }, - sizeof(vk_flash_attn_push_constants), &pc, { workgroups_x, workgroups_y, workgroups_z }); + pc, { workgroups_x, workgroups_y, workgroups_z }); } } @@ -6392,6 +6548,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_timestep_embedding_f32; } return nullptr; + case GGML_OP_CONV_TRANSPOSE_1D: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_conv_transpose_1d_f32; + } + return nullptr; case GGML_OP_POOL_2D: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_pool2d_f32; @@ -6566,7 +6727,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co } if (dryrun) { - ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1); + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); return; } @@ -6726,6 +6887,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co uint32_t half_ceil = (dim + 1) / 2; elements = { half_ceil, (uint32_t)src0->ne[0], 1 }; } break; + case GGML_OP_CONV_TRANSPOSE_1D: + { + elements = {uint32_t(src0->ne[1]), 1, 1}; // parallelize in {Cout, 1, 1} + } break; case GGML_OP_POOL_2D: { const uint32_t N = dst->ne[3]; @@ -6800,7 +6965,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co } ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements); } else if (op == GGML_OP_ROPE || op == GGML_OP_ROPE_BACK) { // Empty src2 is possible in rope, but the shader needs a buffer vk_subbuffer subbuf_z; @@ -6811,26 +6976,26 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co } ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, 
{ vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements); } else if (op == GGML_OP_IM2COL) { // im2col uses only src1 and dst buffers ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements); } else if (op == GGML_OP_COUNT_EQUAL) { ggml_vk_sync_buffers(subctx); // count_equal assumes that destination buffer is initialized with zeroes ggml_vk_buffer_memset_async(subctx, d_D, d_buf_offset, 0, d_sz); ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements); } else if (use_src2) { ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements); } else if (use_src1) { ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements); } else { ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements); } } @@ -6943,7 +7108,7 @@ static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx GGML_ASSERT(pipeline != nullptr); if (dryrun) { - ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1); + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); return; } @@ -6999,7 +7164,7 @@ static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx vk_subbuffer{ d_srcs[4], src_offsets[4], src_sizes[4] }, vk_subbuffer{ d_srcs[5], src_offsets[5], src_sizes[5] }, vk_subbuffer{ d_D, dst_offset, dst_size } - }, sizeof(vk_op_rwkv_wkv6_push_constants), &pc, elements); + }, pc, elements); } else if (version == 7) { ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_srcs[0], src_offsets[0], src_sizes[0] }, @@ -7010,7 +7175,7 @@ static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx vk_subbuffer{ d_srcs[5], src_offsets[5], src_sizes[5] }, vk_subbuffer{ d_srcs[6], src_offsets[6], src_sizes[6] }, vk_subbuffer{ 
d_D, dst_offset, dst_size } - }, sizeof(vk_op_rwkv_wkv7_push_constants), &pc, elements); + }, pc, elements); } else { // shouldn't happen GGML_ASSERT(false); @@ -7082,7 +7247,7 @@ static void ggml_vk_op_f32_opt_step_adamw(ggml_backend_vk_cont GGML_ASSERT(pipeline != nullptr); if (dryrun) { - ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1); + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); return; } @@ -7147,7 +7312,7 @@ static void ggml_vk_op_f32_opt_step_adamw(ggml_backend_vk_cont vk_subbuffer{ d_GM, gm_offset, gm_size }, vk_subbuffer{ d_GV, gv_offset, gv_size }, vk_subbuffer{ d_P, p_offset, p_size }, - }, sizeof(vk_op_push_constants), &pc, elements); + }, pc, elements); } static void ggml_vk_opt_step_adamw(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, bool dryrun = false) { @@ -7529,6 +7694,37 @@ static void ggml_vk_timestep_embedding(ggml_backend_vk_context * ctx, vk_context }, dryrun); } +static void ggml_vk_conv_transpose_1d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { + // src0: (K, Cout, Cin, 1) -- kernel + // src1: (L, Cin, 1, 1) -- input + // dst: (*, Cout, 1, 1) + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + GGML_TENSOR_BINARY_OP_LOCALS + + GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(nb10 == sizeof(float)); + + const int32_t s0 = dst->op_params[0]; + + vk_op_conv_transpose_1d_push_constants p{}; + p.Cout = static_cast<uint32_t>(ne01); + p.Cin = static_cast<uint32_t>(ne02); + p.K = static_cast<uint32_t>(ne00); + p.L = static_cast<uint32_t>(ne10); + p.KL = static_cast<uint32_t>(ne0); + p.nb01 = static_cast<uint32_t>(nb01 / nb00); + p.nb02 = static_cast<uint32_t>(nb02 / nb00); + p.nb11 = static_cast<uint32_t>(nb11 / nb10); + p.nb1 = static_cast<uint32_t>(nb1 / nb0); + p.s0 = static_cast<int32_t>(s0); + + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_CONV_TRANSPOSE_1D, std::move(p), dryrun); +} + static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { uint32_t op = static_cast<uint32_t>(dst->op_params[0]); const int32_t k1 = dst->op_params[1]; @@ -7729,9 +7925,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t } } - ggml_pipeline_request_descriptor_sets(ctx->device, p, num_it); + ggml_pipeline_request_descriptor_sets(ctx, p, num_it); if (split_k > 1) { - ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it); + ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it); if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) { // Resize buffer @@ -7746,7 +7942,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t ggml_vk_load_shaders(ctx->device); } - ggml_pipeline_allocate_descriptor_sets(ctx->device); + ggml_pipeline_allocate_descriptor_sets(ctx); vk_buffer d_X = ggml_vk_create_buffer_check(ctx->device, sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal); vk_buffer d_Y = ggml_vk_create_buffer_check(ctx->device, sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal); @@ -7788,7 +7984,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t ggml_vk_buffer_write(d_X, 0, x, sizeof(X_TYPE) * k * m * batch); ggml_vk_buffer_write(d_Y, 0, y, sizeof(Y_TYPE) * k * n *
batch); - vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); + vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); ggml_vk_ctx_begin(ctx->device, subctx); for (size_t i = 0; i < num_it; i++) { ggml_vk_matmul( @@ -7804,6 +8000,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t ggml_vk_submit(subctx, ctx->fence); VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_matmul waitForFences"); ctx->device->device.resetFences({ ctx->fence }); + ggml_vk_queue_command_pools_cleanup(ctx->device); auto end = std::chrono::high_resolution_clock::now(); double time = std::chrono::duration_cast(end-begin).count() / 1000.0; @@ -7905,16 +8102,13 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t free(d_chk); - ggml_vk_queue_cleanup(ctx->device, ctx->device->transfer_queue); - ggml_vk_queue_cleanup(ctx->device, ctx->device->compute_queue); + ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool); + ggml_vk_command_pool_cleanup(ctx->device, ctx->transfer_cmd_pool); ggml_vk_destroy_buffer(d_X); ggml_vk_destroy_buffer(d_Y); ggml_vk_destroy_buffer(d_D); - ggml_pipeline_cleanup(p); - ggml_pipeline_cleanup(ctx->device->pipeline_matmul_split_k_reduce); - free(x); free(y); free(d); @@ -7992,20 +8186,20 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_ ggml_vk_quantize_data(x, qx, ne, quant); ggml_vk_dequantize_data(qx, x_ref, ne, quant); - ggml_pipeline_request_descriptor_sets(ctx->device, p, 1); + ggml_pipeline_request_descriptor_sets(ctx, p, 1); if (ctx->device->need_compiles) { ggml_vk_load_shaders(ctx->device); } - ggml_pipeline_allocate_descriptor_sets(ctx->device); + ggml_pipeline_allocate_descriptor_sets(ctx); ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz); - vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); + vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); ggml_vk_ctx_begin(ctx->device, subctx); const std::vector pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne }; - ggml_vk_dispatch_pipeline(ctx, subctx, p, { vk_subbuffer{ qx_buf, 0, qx_sz }, vk_subbuffer{ x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1}); + ggml_vk_dispatch_pipeline(ctx, subctx, p, { vk_subbuffer{ qx_buf, 0, qx_sz }, vk_subbuffer{ x_buf, 0, x_sz_f16 } }, pc, { (uint32_t)ne, 1, 1}); ggml_vk_ctx_end(subctx); auto begin = std::chrono::high_resolution_clock::now(); @@ -8013,6 +8207,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_ ggml_vk_submit(subctx, ctx->fence); VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant waitForFences"); ctx->device->device.resetFences({ ctx->fence }); + ggml_vk_queue_command_pools_cleanup(ctx->device); auto end = std::chrono::high_resolution_clock::now(); @@ -8092,17 +8287,17 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_ // // vk_pipeline p = ggml_vk_get_quantize_pipeline(ctx, quant); // -// ggml_pipeline_request_descriptor_sets(ctx->device, p, 1); +// ggml_pipeline_request_descriptor_sets(ctx, p, 1); // // if (ctx->device->need_compiles) { // ggml_vk_load_shaders(ctx->device); // } // -// ggml_pipeline_allocate_descriptor_sets(ctx->device); +// ggml_pipeline_allocate_descriptor_sets(ctx); // // ggml_vk_buffer_write(x_buf, 0, x, x_sz); // -// vk_context subctx = ggml_vk_create_context(ctx, 
ctx->device->compute_queue); +// vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); // ggml_vk_ctx_begin(ctx->device, subctx); // ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(x_buf), ggml_vk_subbuffer(qx_buf), ne); // ggml_vk_ctx_end(subctx); @@ -8112,6 +8307,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_ // ggml_vk_submit(subctx, ctx->fence); // VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_quantize waitForFences"); // ctx->device->device.resetFences({ ctx->fence }); +// ggml_vk_queue_command_pools_cleanup(ctx->device); // // auto end = std::chrono::high_resolution_clock::now(); // @@ -8251,9 +8447,9 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, // y[i] = i % k; } - ggml_pipeline_request_descriptor_sets(ctx->device, p, num_it); + ggml_pipeline_request_descriptor_sets(ctx, p, num_it); if (split_k > 1) { - ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it); + ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it); if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) { // Resize buffer @@ -8264,19 +8460,19 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, } } if (mmq) { - ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_quantize_q8_1, num_it); + ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_quantize_q8_1, num_it); } if (ctx->device->need_compiles) { ggml_vk_load_shaders(ctx->device); } - ggml_pipeline_allocate_descriptor_sets(ctx->device); + ggml_pipeline_allocate_descriptor_sets(ctx); ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz); ggml_vk_buffer_write(y_buf, 0, y, y_sz); - vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); + vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); ggml_vk_ctx_begin(ctx->device, subctx); if (mmq) { for (size_t i = 0; i < num_it; i++) { @@ -8305,6 +8501,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, ggml_vk_submit(subctx, ctx->fence); VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant waitForFences"); ctx->device->device.resetFences({ ctx->fence }); + ggml_vk_queue_command_pools_cleanup(ctx->device); auto end = std::chrono::high_resolution_clock::now(); @@ -8600,6 +8797,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod case GGML_OP_COUNT_EQUAL: case GGML_OP_IM2COL: case GGML_OP_TIMESTEP_EMBEDDING: + case GGML_OP_CONV_TRANSPOSE_1D: case GGML_OP_POOL_2D: case GGML_OP_CONV_2D_DW: case GGML_OP_RWKV_WKV6: @@ -8618,7 +8816,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod if (!dryrun) { if (ctx->compute_ctx.expired()) { - compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); + compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); ctx->compute_ctx = compute_ctx; ggml_vk_ctx_begin(ctx->device, compute_ctx); } else { @@ -8664,6 +8862,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod case GGML_OP_COUNT_EQUAL: case GGML_OP_IM2COL: case GGML_OP_TIMESTEP_EMBEDDING: + case GGML_OP_CONV_TRANSPOSE_1D: case GGML_OP_POOL_2D: case GGML_OP_CONV_2D_DW: case GGML_OP_LEAKY_RELU: @@ -8671,7 +8870,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * 
ctx, ggml_tensor * nod // These operations all go through ggml_vk_op_f32, so short-circuit and // do the only thing needed for the dryrun. vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, node, node->op); - ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1); + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); return false; } default: @@ -8835,6 +9034,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod case GGML_OP_TIMESTEP_EMBEDDING: ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node, dryrun); + break; + case GGML_OP_CONV_TRANSPOSE_1D: + ggml_vk_conv_transpose_1d(ctx, compute_ctx, src0, src1, node, dryrun); + break; case GGML_OP_POOL_2D: ggml_vk_pool_2d(ctx, compute_ctx, src0, node, dryrun); @@ -8963,6 +9166,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * case GGML_OP_COUNT_EQUAL: case GGML_OP_IM2COL: case GGML_OP_TIMESTEP_EMBEDDING: + case GGML_OP_CONV_TRANSPOSE_1D: case GGML_OP_POOL_2D: case GGML_OP_CONV_2D_DW: case GGML_OP_RWKV_WKV6: @@ -9058,19 +9262,8 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) { } ctx->gc.temp_buffers.clear(); - for (auto& dsr : ctx->device->pipeline_descriptor_set_requirements) { - vk_pipeline_ref plr = ctx->device->pipelines[dsr.first]; - - if (plr.expired()) { - continue; - } - - vk_pipeline pl = plr.lock(); - ggml_pipeline_cleanup(pl); - } - - ggml_vk_queue_cleanup(ctx->device, ctx->device->compute_queue); - ggml_vk_queue_cleanup(ctx->device, ctx->device->transfer_queue); + ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool); + ggml_vk_command_pool_cleanup(ctx->device, ctx->transfer_cmd_pool); for (size_t i = 0; i < ctx->gc.semaphores.size(); i++) { ctx->device->device.destroySemaphore({ ctx->gc.semaphores[i].s }); @@ -9091,7 +9284,8 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) { ctx->tensor_ctxs.clear(); ctx->gc.contexts.clear(); - ctx->device->pipeline_descriptor_set_requirements.clear(); + ctx->pipeline_descriptor_set_requirements = 0; + ctx->descriptor_set_idx = 0; } // Clean up on backend free @@ -9118,6 +9312,15 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) { ctx->device->device.destroyFence(ctx->fence); ctx->device->device.destroyFence(ctx->almost_ready_fence); + + for (auto& pool : ctx->descriptor_pools) { + ctx->device->device.destroyDescriptorPool(pool); + } + ctx->descriptor_pools.clear(); + ctx->descriptor_sets.clear(); + + ctx->compute_cmd_pool.destroy(ctx->device->device); + ctx->transfer_cmd_pool.destroy(ctx->device->device); } static int ggml_vk_get_device_count() { @@ -9325,6 +9528,12 @@ static size_t ggml_backend_vk_host_buffer_type_get_alignment(ggml_backend_buffer UNUSED(buft); } +static size_t ggml_backend_vk_host_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + return vk_instance.devices[0]->suballocation_block_size; + + UNUSED(buft); +} + // Should be changed to return device-specific host buffer type // but that probably requires changes in llama.cpp ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() { @@ -9333,7 +9542,7 @@ ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() { /* .get_name = */ ggml_backend_vk_host_buffer_type_name, /* .alloc_buffer = */ ggml_backend_vk_host_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_vk_host_buffer_type_get_alignment, - /* .get_max_size = */ NULL, // defaults to SIZE_MAX + /* .get_max_size = */ ggml_backend_vk_host_buffer_type_get_max_size, /* .get_alloc_size = 
*/ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, }, @@ -9384,7 +9593,7 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor if (ctx->transfer_ctx.expired()) { // Initialize new transfer context - transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue); + transfer_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool); ctx->transfer_ctx = transfer_ctx; ggml_vk_ctx_begin(ctx->device, transfer_ctx); } else { @@ -9407,7 +9616,7 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_ if (ctx->transfer_ctx.expired()) { // Initialize new transfer context - transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue); + transfer_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool); ctx->transfer_ctx = transfer_ctx; ggml_vk_ctx_begin(ctx->device, transfer_ctx); } else { @@ -9430,7 +9639,7 @@ static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_ if (ctx->transfer_ctx.expired()) { // Initialize new transfer context - transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue); + transfer_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool); ctx->transfer_ctx = transfer_ctx; ggml_vk_ctx_begin(ctx->device, transfer_ctx); } else { @@ -9480,6 +9689,13 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; + if (vk_instance.debug_utils_support) { + vk::DebugUtilsLabelEXT dul = {}; + dul.pLabelName = "ggml_backend_vk_graph_compute"; + dul.color = std::array{1.0f, 1.0f, 1.0f, 1.0f}; + vk_instance.pfn_vkQueueBeginDebugUtilsLabelEXT(ctx->device->compute_queue.queue, reinterpret_cast(&dul)); + } + uint64_t total_mat_mul_bytes = 0; for (int i = 0; i < cgraph->n_nodes; i++) { ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false, false); @@ -9491,7 +9707,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg ggml_vk_load_shaders(ctx->device); } ggml_vk_preallocate_buffers(ctx); - ggml_pipeline_allocate_descriptor_sets(ctx->device); + ggml_pipeline_allocate_descriptor_sets(ctx); int last_node = cgraph->n_nodes - 1; @@ -9513,8 +9729,8 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg if (ctx->device->query_pool) { ctx->device->device.destroyQueryPool(ctx->device->query_pool); } - VkQueryPoolCreateInfo query_create_info = { VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO }; - query_create_info.queryType = VK_QUERY_TYPE_TIMESTAMP; + vk::QueryPoolCreateInfo query_create_info; + query_create_info.queryType = vk::QueryType::eTimestamp; query_create_info.queryCount = cgraph->n_nodes + 100; ctx->device->query_pool = ctx->device->device.createQueryPool(query_create_info); ctx->device->num_queries = query_create_info.queryCount; @@ -9523,7 +9739,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg ctx->device->device.resetQueryPool(ctx->device->query_pool, 0, cgraph->n_nodes+1); GGML_ASSERT(ctx->compute_ctx.expired()); - compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); + compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); ctx->compute_ctx = compute_ctx; ggml_vk_ctx_begin(ctx->device, compute_ctx); compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, 
ctx->device->query_pool, 0); @@ -9558,7 +9774,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg if (vk_perf_logger_enabled) { if (ctx->compute_ctx.expired()) { - compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); + compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); ctx->compute_ctx = compute_ctx; ggml_vk_ctx_begin(ctx->device, compute_ctx); } else { @@ -9600,7 +9816,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg // Get the results and pass them to the logger std::vector<uint64_t> timestamps(cgraph->n_nodes + 1); - ctx->device->device.getQueryPoolResults(ctx->device->query_pool, 0, cgraph->n_nodes + 1, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait); + VK_CHECK(ctx->device->device.getQueryPoolResults(ctx->device->query_pool, 0, cgraph->n_nodes + 1, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait), "get timestamp results"); for (int i = 0; i < cgraph->n_nodes; i++) { if (!ggml_vk_is_empty(cgraph->nodes[i])) { ctx->device->perf_logger->log_timing(cgraph->nodes[i], uint64_t((timestamps[i+1] - timestamps[i]) * ctx->device->properties.limits.timestampPeriod)); @@ -10024,6 +10240,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_OP_LEAKY_RELU: case GGML_OP_OPT_STEP_ADAMW: return true; + case GGML_OP_CONV_TRANSPOSE_1D: + return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32; default: return false; } @@ -10167,11 +10385,28 @@ static bool ggml_vk_instance_portability_enumeration_ext_available(const std::ve UNUSED(instance_extensions); } +// Extension availability +static bool ggml_vk_instance_debug_utils_ext_available( + const std::vector<vk::ExtensionProperties> & instance_extensions) { + // Check for the debug utils extension (VK_EXT_debug_utils) + for (const auto & properties : instance_extensions) { + if (strcmp("VK_EXT_debug_utils", properties.extensionName) == 0) { + return true; + } + } + + std::cerr << "ggml_vulkan: WARNING: Instance extension VK_EXT_debug_utils not found." << std::endl; + return false; + + UNUSED(instance_extensions); +} + static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch) { switch (props.vendorID) { case VK_VENDOR_ID_INTEL: - // Intel drivers don't support coopmat properly yet - return false; + // Only allowing Xe2 GPU at the moment since Xe2 GPU can gain significant performance boost, + // while some older hardware (ex.
Arc A770) has performance regressions + return arch == vk_device_architecture::INTEL_XE2; case VK_VENDOR_ID_AMD: if (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource) { // Workaround for AMD proprietary driver reporting support on all GPUs @@ -10515,6 +10750,11 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) { const int32_t dim = tensor->op_params[0]; const int32_t max_period = tensor->op_params[1]; tensor_clone = ggml_timestep_embedding(ggml_ctx, src_clone[0], dim, max_period); + } else if (tensor->op == GGML_OP_CONV_TRANSPOSE_1D){ + const int32_t s0 = tensor->op_params[0]; + const int32_t p0 = tensor->op_params[1]; + const int32_t d0 = tensor->op_params[2]; + tensor_clone = ggml_conv_transpose_1d(ggml_ctx, src_clone[0], src_clone[1], s0, p0, d0); } else if (tensor->op == GGML_OP_POOL_2D) { enum ggml_op_pool op = static_cast(tensor->op_params[0]); const int32_t k0 = tensor->op_params[1]; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt b/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt index e60e9d1e5b5c5..14e9daaa01a25 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +++ b/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt @@ -25,15 +25,3 @@ add_executable(${TARGET} vulkan-shaders-gen.cpp) install(TARGETS ${TARGET} RUNTIME) target_compile_features(${TARGET} PRIVATE cxx_std_17) target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads) - -# Configure output directories for MSVC builds -if(MSVC) - # Get the main project's runtime output directory if possible - if(DEFINED CMAKE_RUNTIME_OUTPUT_DIRECTORY) - foreach(CONFIG ${CMAKE_CONFIGURATION_TYPES}) - string(TOUPPER ${CONFIG} CONFIG) - set_target_properties(${TARGET} PROPERTIES - RUNTIME_OUTPUT_DIRECTORY_${CONFIG} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) - endforeach() - endif() -endif() diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp b/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp new file mode 100644 index 0000000000000..b17b4e83eec4b --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp @@ -0,0 +1,98 @@ +#version 450 + +#include "types.comp" + +layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; // src0 - kernel: [K, Cout, Cin] +layout (binding = 1) readonly buffer B {B_TYPE data_b[];}; // src1 - input: [L, Cin] +layout (binding = 2) writeonly buffer D {D_TYPE data_d[];}; // dst - result [KL, Cout] + +layout(local_size_x = 128 , local_size_y = 1, local_size_z = 1) in; + +layout (push_constant) uniform parameter { + uint32_t Cout; + uint32_t Cin; + uint32_t K; + uint32_t L; + uint32_t KL; + + uint32_t nb01; + uint32_t nb02; + uint32_t nb11; + uint32_t nb1; + + int32_t s0; +} p; + + +uint32_t Cout_idx = gl_WorkGroupID.x; +const uint32_t bs = gl_WorkGroupSize.x; +uint32_t tid = gl_LocalInvocationID.x; +// Code is more straightforward if we assume it is bs*s0+K instead of (bs-1)*s0+K. 
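// A worked note on the sizes this shader relies on, assuming ggml's
// conv_transpose_1d semantics (p0 = 0, d0 = 1, stride s0): input position l
// scatters K kernel taps into the output starting at offset l*s0, so
//     KL = (L - 1) * s0 + K.
// A block of bs inputs therefore touches a window of (bs-1)*s0 + K outputs;
// rounding that window up to bs*s0 + K wastes at most s0 shared floats but
// keeps the shift logic below uniform across blocks. The fixed tmp[4096]
// backing store must satisfy bs*s0 + K <= 4096 for the dispatched s0 and K.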
+uint32_t tmp_len = bs*p.s0+p.K; +shared D_TYPE tmp[4096]; + +uint splitWork(uint workSize){ + return (bs + workSize -1) / bs; +} + +void main(){ + for(uint32_t i = 0; i < splitWork(tmp_len); i++){ + uint32_t idx = i*bs+tid; + if(idx < tmp_len){ + tmp[idx] = 0.0; + } + } + + uint32_t L_blocks = splitWork(p.L); + for(uint32_t L_block_id = 0; L_block_id < L_blocks; L_block_id++){ + if(L_block_id > 0){ + barrier(); + // Shift values in tmp to the current processing window + for(int i = 0; i < splitWork(tmp_len); i++){ + uint32_t idx = i*bs+tid; + if(idx >= bs*p.s0 && idx < tmp_len){ + tmp[idx-bs*p.s0] = tmp[idx]; + tmp[idx] = 0.0; + }else if(idx >= p.K && idx < bs*p.s0){ + tmp[idx] = 0.0; + } + } + } + barrier(); + + // Save contributions of the block to tmp + uint32_t L_idx = L_block_id*bs + tid; + for(uint32_t K_idx = 0; K_idx < p.K; K_idx++){ + D_TYPE dp = 0.0; + for(uint32_t Cin_idx = 0; Cin_idx < p.Cin; Cin_idx++){ + A_TYPE elemKrn = data_a[K_idx + Cout_idx * p.nb01 + Cin_idx * p.nb02]; + if(L_idx < p.L){ + B_TYPE elemInp = data_b[L_idx + Cin_idx*p.nb11]; + dp = fma(elemKrn, elemInp, dp); + } + } + tmp[tid*p.s0 + K_idx] += dp; + barrier(); + } + + // Save the computed values except the last block that can have different size + uint32_t KLb_idx = L_block_id*bs*p.s0; + if(L_block_id < L_blocks-1){ + for(uint32_t s0_idx = 0; s0_idx < p.s0; s0_idx++){ + uint32_t sh_idx = p.s0*tid+s0_idx; + uint32_t KL_idx = KLb_idx+sh_idx; + if(KL_idx < p.KL){ + data_d[KL_idx + Cout_idx*p.nb1] = tmp[sh_idx]; + } + } + } + } + + for(uint32_t i = 0; i < splitWork(tmp_len); i++){ + uint32_t idx = i*bs+tid; + uint32_t KL_idx = (L_blocks-1)*bs*p.s0+idx; + if(KL_idx < p.KL){ + data_d[KL_idx + Cout_idx*p.nb1] = tmp[idx]; + } + } +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index 9361e2ac83b0f..c63345ec8b4b6 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -622,6 +622,8 @@ void process_shaders() { string_to_spv("timestep_embedding_f32", "timestep_embedding.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); + string_to_spv("conv_transpose_1d_f32", "conv_transpose_1d.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}); + string_to_spv("pool2d_f32", "pool2d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); string_to_spv("rwkv_wkv6_f32", "wkv6.comp", merge_maps(base_dict, {{"A_TYPE", "float"}})); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index fb0d379dc8d68..ee605977f3a2c 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -61,9 +61,6 @@ #define m512i(p) (__m512i)(p) #endif -// precomputed f32 table for f16 (256 KB) (ggml-impl.h) -float ggml_table_f32_f16[1 << 16]; - #if defined(__linux__) || \ defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ (defined(__APPLE__) && !TARGET_OS_TV && !TARGET_OS_WATCH) @@ -133,7 +130,7 @@ static void ggml_print_backtrace_symbols(void) { } #endif -static void ggml_print_backtrace(void) { +void ggml_print_backtrace(void) { const char * GGML_NO_BACKTRACE = getenv("GGML_NO_BACKTRACE"); if (GGML_NO_BACKTRACE) { return; @@ -160,6 +157,10 @@ static void ggml_print_backtrace(void) { const int parent_pid = getpid(); const int child_pid = fork(); if (child_pid < 0) { // error +#if defined(__linux__) + close(lock[1]); + close(lock[0]); +#endif return; } else if (child_pid == 0) { // child char attach[32]; @@ -167,6 +168,7 
@@ static void ggml_print_backtrace(void) { #if defined(__linux__) close(lock[1]); (void) !read(lock[0], lock, 1); + close(lock[0]); #endif // try gdb execlp("gdb", "gdb", "--batch", @@ -195,7 +197,7 @@ static void ggml_print_backtrace(void) { } } #else -static void ggml_print_backtrace(void) { +void ggml_print_backtrace(void) { // platform not supported } #endif @@ -216,6 +218,8 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) { abort(); } +// ggml_print_backtrace is registered with std::set_terminate by ggml.cpp + // // logging // @@ -881,12 +885,6 @@ struct ggml_context { struct ggml_object * objects_end; }; -struct ggml_context_container { - bool used; - - struct ggml_context context; -}; - // // data types // @@ -954,6 +952,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "UPSCALE", "PAD", "PAD_REFLECT_1D", + "ROLL", "ARANGE", "TIMESTEP_EMBEDDING", "ARGSORT", @@ -984,7 +983,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "OPT_STEP_ADAMW", }; -static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); +static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1049,6 +1048,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "upscale(x)", "pad(x)", "pad_reflect_1d(x)", + "roll(x)", "arange(start, stop, step)", "timestep_embedding(timesteps, dim, max_period)", "argsort(x)", @@ -1079,7 +1079,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "adamw(x)", }; -static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); +static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -1419,14 +1419,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { // initialize time system (required on Windows) ggml_time_init(); - for (int i = 0; i < (1 << 16); ++i) { - union { - uint16_t u16; - ggml_fp16_t fp16; - } u = {i}; - ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16); - } - is_first_call = false; } @@ -4340,6 +4332,34 @@ struct ggml_tensor * ggml_pad_reflect_1d( return result; } +// ggml_roll + +struct ggml_tensor * ggml_roll( + struct ggml_context * ctx, + struct ggml_tensor * a, + int shift0, + int shift1, + int shift2, + int shift3) { + GGML_ASSERT(a->nb[0] == ggml_type_size(a->type)); + GGML_ASSERT(abs(shift0) < a->ne[0]); + GGML_ASSERT(abs(shift1) < a->ne[1]); + GGML_ASSERT(abs(shift2) < a->ne[2]); + GGML_ASSERT(abs(shift3) < a->ne[3]); + + struct ggml_tensor * result = ggml_dup_tensor(ctx, a); + + ggml_set_op_params_i32(result, 0, shift0); + ggml_set_op_params_i32(result, 1, shift1); + ggml_set_op_params_i32(result, 2, shift2); + ggml_set_op_params_i32(result, 3, shift3); + + result->op = GGML_OP_ROLL; + result->src[0] = a; + + return result; +} + // ggml_arange struct ggml_tensor * ggml_arange( diff --git a/ggml/src/ggml.cpp b/ggml/src/ggml.cpp new file mode 100644 index 0000000000000..0d388d45536d1 --- /dev/null +++ b/ggml/src/ggml.cpp @@ -0,0 +1,26 @@ +#include "ggml-impl.h" + +#include <cstdlib> +#include <exception> + +static std::terminate_handler previous_terminate_handler; + +GGML_NORETURN static void ggml_uncaught_exception() { + ggml_print_backtrace(); + if (previous_terminate_handler) { + previous_terminate_handler(); + } + abort(); // unreachable unless previous_terminate_handler was nullptr +} + +static bool ggml_uncaught_exception_init = []{ + const char * GGML_NO_BACKTRACE = getenv("GGML_NO_BACKTRACE"); + if (GGML_NO_BACKTRACE) { + return false; + } + const auto prev{std::get_terminate()};
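// At this point prev holds whatever terminate handler was installed before
// this initializer ran (the C++ runtime default, or one set by the embedding
// application), so ggml_uncaught_exception can chain to it rather than
// silently replacing it. A standalone sketch of the same chaining pattern,
// with illustrative names only:
//
//   static std::terminate_handler prev_handler;
//   static void on_terminate() {
//       // ... print a backtrace here ...
//       if (prev_handler) {
//           prev_handler();
//       }
//       std::abort();
//   }
//   static bool installed = [] {
//       prev_handler = std::set_terminate(on_terminate); // returns the old handler
//       return true;
//   }();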
+    GGML_ASSERT(prev != ggml_uncaught_exception);
+    previous_terminate_handler = prev;
+    std::set_terminate(ggml_uncaught_exception);
+    return true;
+}();
diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp
index 8667a80bd0685..5ffd12b8b2795 100644
--- a/ggml/src/gguf.cpp
+++ b/ggml/src/gguf.cpp
@@ -335,7 +335,11 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
     for (uint32_t i = 0; i < magic.size(); i++) {
         if (magic[i] != GGUF_MAGIC[i]) {
-            GGML_LOG_ERROR("%s: invalid magic characters: '%c%c%c%c', expected 'GGUF'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
+            char c0 = isprint(magic[0]) ? magic[0] : '?';
+            char c1 = isprint(magic[1]) ? magic[1] : '?';
+            char c2 = isprint(magic[2]) ? magic[2] : '?';
+            char c3 = isprint(magic[3]) ? magic[3] : '?';
+            GGML_LOG_ERROR("%s: invalid magic characters: '%c%c%c%c', expected 'GGUF'\n", __func__, c0, c1, c2, c3);
             gguf_free(ctx);
             return nullptr;
         }
@@ -347,11 +351,28 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
     int64_t n_tensors = 0;
 
     if (ok && gr.read(ctx->version)) {
-        if (ctx->version == 1) {
+        if (ok && ctx->version == 0) {
+            GGML_LOG_ERROR("%s: bad GGUF version: %" PRIu32 "\n", __func__, ctx->version);
+            ok = false;
+        }
+
+        /*
+         * bit layout is different when reading non-native endian models.
+         * assuming that the GGUF version is 3, the non-native endian model
+         * would read it as 0x03000000. we can use the AND operation against
+         * the last 4 hexadecimal digits to check if the model is the same
+         * endianness as the host system.
+         */
+        if (ok && (ctx->version & 0x0000FFFF) == 0x00000000) {
+            GGML_LOG_ERROR("%s: failed to load model: this GGUF file version %" PRIu32 " is extremely large, is there a mismatch between the host and model endianness?\n", __func__, ctx->version);
+            ok = false;
+        }
+
+        if (ok && ctx->version == 1) {
             GGML_LOG_ERROR("%s: GGUFv1 is no longer supported, please use a more up-to-date version\n", __func__);
             ok = false;
         }
-        if (ctx->version > GGUF_VERSION) {
+        if (ok && ctx->version > GGUF_VERSION) {
             GGML_LOG_ERROR("%s: this GGUF file is version %" PRIu32 " but this software only supports up to version %d\n",
                 __func__, ctx->version, GGUF_VERSION);
             ok = false;
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 3ee2b2064e1b4..fb75143b0b545 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -118,6 +118,10 @@ class LLM:
         EMBEDDING_SCALE           = "{arch}.embedding_scale"
         TOKEN_SHIFT_COUNT         = "{arch}.token_shift_count"
         INTERLEAVE_MOE_LAYER_STEP = "{arch}.interleave_moe_layer_step"
+        ACTIVATION_SPARSITY_SCALE = "{arch}.activation_sparsity_scale"
+        ALTUP_ACTIVE_IDX          = "{arch}.altup.active_idx"
+        ALTUP_NUM_INPUTS          = "{arch}.altup.num_inputs"
+        EMBD_LENGTH_PER_LAYER_INP = "{arch}.embedding_length_per_layer_input"
 
     class Attention:
         HEAD_COUNT = "{arch}.attention.head_count"
@@ -142,6 +146,8 @@ class Attention:
         SCALE                  = "{arch}.attention.scale"
         KEY_LENGTH_MLA         = "{arch}.attention.key_length_mla"
         VALUE_LENGTH_MLA       = "{arch}.attention.value_length_mla"
+        SHARED_KV_LAYERS       = "{arch}.attention.shared_kv_layers"
+        SLIDING_WINDOW_PATTERN = "{arch}.attention.sliding_window_pattern"
 
     class Rope:
         DIMENSION_COUNT = "{arch}.rope.dimension_count"
@@ -198,6 +204,7 @@ class Tokenizer:
         MASK_ID    = "tokenizer.ggml.mask_token_id"
         ADD_BOS    = "tokenizer.ggml.add_bos_token"
         ADD_EOS    = "tokenizer.ggml.add_eos_token"
+        ADD_SEP    = "tokenizer.ggml.add_sep_token"
         ADD_PREFIX = "tokenizer.ggml.add_space_prefix"
         REMOVE_EXTRA_WS = "tokenizer.ggml.remove_extra_whitespaces"
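The GGML_OP_ROLL operator added above is a circular shift applied independently per dimension, in the spirit of np.roll: elements pushed past the end of a dimension wrap around to its start, so rolling [0, 1, 2, 3] by +1 along dim 0 yields [3, 0, 1, 2]. A minimal usage sketch, assuming the usual ggml context/graph workflow; the helper name is illustrative, not part of the patch:

    #include "ggml.h"

    // Roll a tensor by +1 along dim 0, leaving dims 1..3 untouched.
    static struct ggml_tensor * roll_rows_by_one(struct ggml_context * ctx,
                                                 struct ggml_tensor  * x) {
        return ggml_roll(ctx, x, /*shift0=*/1, /*shift1=*/0, /*shift2=*/0, /*shift3=*/0);
    }

Note the guards in ggml_roll: dim 0 must be contiguous (nb[0] equals the type size) and each |shift| must be strictly smaller than the corresponding dimension.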
PRECOMPILED_CHARSMAP = "tokenizer.ggml.precompiled_charsmap" @@ -291,6 +298,7 @@ class MODEL_ARCH(IntEnum): BERT = auto() NOMIC_BERT = auto() NOMIC_BERT_MOE = auto() + NEO_BERT = auto() JINA_BERT_V2 = auto() BLOOM = auto() STABLELM = auto() @@ -312,6 +320,7 @@ class MODEL_ARCH(IntEnum): GEMMA = auto() GEMMA2 = auto() GEMMA3 = auto() + GEMMA3N = auto() STARCODER2 = auto() RWKV6 = auto() RWKV6QWEN2 = auto() @@ -343,6 +352,8 @@ class MODEL_ARCH(IntEnum): WAVTOKENIZER_DEC = auto() PLM = auto() BAILINGMOE = auto() + DOTS1 = auto() + ARCEE = auto() class VISION_PROJECTOR_TYPE(IntEnum): @@ -395,6 +406,22 @@ class MODEL_TENSOR(IntEnum): ATTN_Q_NORM = auto() ATTN_K_NORM = auto() LAYER_OUT_NORM = auto() + PER_LAYER_TOKEN_EMBD = auto() # gemma3n + PER_LAYER_MODEL_PROJ = auto() # gemma3n + PER_LAYER_INP_GATE = auto() # gemma3n + PER_LAYER_PROJ = auto() # gemma3n + PER_LAYER_PROJ_NORM = auto() # gemma3n + PER_LAYER_POST_NORM = auto() # gemma3n + ALTUP_PROJ = auto() # gemma3n + ALTUP_UNEMBD_PROJ = auto() # gemma3n + ALTUP_CORRECT_COEF = auto() # gemma3n + ALTUP_CORRECT_SCALE = auto() # gemma3n + ALTUP_PREDICT_COEF = auto() # gemma3n + ALTUP_ROUTER = auto() # gemma3n + ALTUP_ROUTER_NORM = auto() # gemma3n + LAUREL_L = auto() # gemma3n + LAUREL_R = auto() # gemma3n + LAUREL_POST_NORM = auto() # gemma3n SSM_IN = auto() SSM_CONV1D = auto() SSM_X = auto() @@ -571,6 +598,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.BERT: "bert", MODEL_ARCH.NOMIC_BERT: "nomic-bert", MODEL_ARCH.NOMIC_BERT_MOE: "nomic-bert-moe", + MODEL_ARCH.NEO_BERT: "neo-bert", MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2", MODEL_ARCH.BLOOM: "bloom", MODEL_ARCH.STABLELM: "stablelm", @@ -592,6 +620,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.GEMMA: "gemma", MODEL_ARCH.GEMMA2: "gemma2", MODEL_ARCH.GEMMA3: "gemma3", + MODEL_ARCH.GEMMA3N: "gemma3n", MODEL_ARCH.STARCODER2: "starcoder2", MODEL_ARCH.RWKV6: "rwkv6", MODEL_ARCH.RWKV6QWEN2: "rwkv6qwen2", @@ -623,6 +652,8 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec", MODEL_ARCH.PLM: "plm", MODEL_ARCH.BAILINGMOE: "bailingmoe", + MODEL_ARCH.DOTS1: "dots1", + MODEL_ARCH.ARCEE: "arcee", } VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = { @@ -675,6 +706,22 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps", MODEL_TENSOR.FFN_EXP_PROBS_B: "blk.{bid}.exp_probs_b", MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm", + MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: "per_layer_token_embd", # gemma3n + MODEL_TENSOR.PER_LAYER_MODEL_PROJ: "per_layer_model_proj", # gemma3n + MODEL_TENSOR.PER_LAYER_PROJ_NORM: "per_layer_proj_norm", # gemma3n + MODEL_TENSOR.ALTUP_UNEMBD_PROJ: "altup_unembd_proj", # gemma3n + MODEL_TENSOR.ALTUP_PROJ: "altup_proj", # gemma3n + MODEL_TENSOR.PER_LAYER_INP_GATE: "blk.{bid}.inp_gate", # gemma3n + MODEL_TENSOR.PER_LAYER_PROJ: "blk.{bid}.proj", # gemma3n + MODEL_TENSOR.PER_LAYER_POST_NORM: "blk.{bid}.post_norm", # gemma3n + MODEL_TENSOR.ALTUP_CORRECT_COEF: "blk.{bid}.altup_correct_coef", # gemma3n + MODEL_TENSOR.ALTUP_CORRECT_SCALE: "blk.{bid}.altup_correct_scale", # gemma3n + MODEL_TENSOR.ALTUP_PREDICT_COEF: "blk.{bid}.altup_predict_coef", # gemma3n + MODEL_TENSOR.ALTUP_ROUTER: "blk.{bid}.altup_router", # gemma3n + MODEL_TENSOR.ALTUP_ROUTER_NORM: "blk.{bid}.altup_router_norm", # gemma3n + MODEL_TENSOR.LAUREL_L: "blk.{bid}.laurel_l", # gemma3n + MODEL_TENSOR.LAUREL_R: "blk.{bid}.laurel_r", # gemma3n + MODEL_TENSOR.LAUREL_POST_NORM: "blk.{bid}.laurel_post_norm", # gemma3n MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in", 
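The gemma3n entries above define the base tensor names that end up in the .gguf file (the writer appends the usual .weight/.bias suffix). A hedged sketch of probing for one of them from C via the gguf API's gguf_find_tensor; the helper below is hypothetical:

    #include "gguf.h"
    #include <stdio.h>

    // Returns 1 if block `bid` of a gemma3n GGUF carries an AltUp router tensor.
    static int has_altup_router(const struct gguf_context * gctx, int bid) {
        char name[128];
        snprintf(name, sizeof(name), "blk.%d.altup_router.weight", bid);
        return gguf_find_tensor(gctx, name) >= 0; // returns -1 when absent
    }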
MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d", MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x", @@ -1077,6 +1124,18 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_UP_EXP, MODEL_TENSOR.LAYER_OUT_NORM, ], + MODEL_ARCH.NEO_BERT: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.ENC_OUTPUT_NORM, + MODEL_TENSOR.CLS, + MODEL_TENSOR.CLS_OUT, + ], MODEL_ARCH.JINA_BERT_V2: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD_NORM, @@ -1467,6 +1526,41 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_PRE_NORM, MODEL_TENSOR.FFN_POST_NORM, ], + MODEL_ARCH.GEMMA3N: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_POST_NORM, + MODEL_TENSOR.FFN_PRE_NORM, + MODEL_TENSOR.FFN_POST_NORM, + # altup / laurel + MODEL_TENSOR.PER_LAYER_TOKEN_EMBD, + MODEL_TENSOR.PER_LAYER_MODEL_PROJ, + MODEL_TENSOR.PER_LAYER_INP_GATE, + MODEL_TENSOR.PER_LAYER_PROJ, + MODEL_TENSOR.PER_LAYER_PROJ_NORM, + MODEL_TENSOR.PER_LAYER_POST_NORM, + MODEL_TENSOR.ALTUP_PROJ, + MODEL_TENSOR.ALTUP_UNEMBD_PROJ, + MODEL_TENSOR.ALTUP_CORRECT_COEF, + MODEL_TENSOR.ALTUP_CORRECT_SCALE, + MODEL_TENSOR.ALTUP_PREDICT_COEF, + MODEL_TENSOR.ALTUP_ROUTER, + MODEL_TENSOR.ALTUP_ROUTER_NORM, + MODEL_TENSOR.LAUREL_L, + MODEL_TENSOR.LAUREL_R, + MODEL_TENSOR.LAUREL_POST_NORM, + ], MODEL_ARCH.STARCODER2: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, @@ -2044,6 +2138,45 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN_SHEXP, MODEL_TENSOR.FFN_UP_SHEXP, ], + MODEL_ARCH.DOTS1: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_EXP_PROBS_B, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_GATE_SHEXP, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_DOWN_SHEXP, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.FFN_UP_SHEXP, + ], + MODEL_ARCH.ARCEE: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], # TODO } diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index de6e45ae827b9..d32cd479adb17 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -271,7 +271,7 @@ def write_ti_data_to_file(self) -> None: def add_key_value(self, key: str, val: Any, vtype: GGUFValueType, sub_type: GGUFValueType | None = None) -> None: if any(key in kv_data for kv_data in self.kv_data): - raise ValueError(f'Duplicated key name {key!r}') + logger.warning(f'Duplicated key name {key!r}, overwriting it with new value {val!r} of type {vtype.name}') self.kv_data[0][key] = GGUFValue(value=val, type=vtype, sub_type=sub_type) @@ -672,6 +672,18 @@ def add_parallel_residual(self, use: bool) -> None: def 
add_decoder_start_token_id(self, id: int) -> None: self.add_uint32(Keys.LLM.DECODER_START_TOKEN_ID.format(arch=self.arch), id) + def add_embedding_length_per_layer_input(self, value: int) -> None: + self.add_uint32(Keys.LLM.EMBD_LENGTH_PER_LAYER_INP.format(arch=self.arch), value) + + def add_altup_active_idx(self, val: int) -> None: + self.add_uint32(Keys.LLM.ALTUP_ACTIVE_IDX.format(arch=self.arch), val) + + def add_altup_num_inputs(self, val: int) -> None: + self.add_uint32(Keys.LLM.ALTUP_NUM_INPUTS.format(arch=self.arch), val) + + def add_activation_sparsity_scale(self, values: Sequence[float]) -> None: + self.add_array(Keys.LLM.ACTIVATION_SPARSITY_SCALE.format(arch=self.arch), values) + def add_head_count(self, count: int | Sequence[int]) -> None: if isinstance(count, int): self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count) @@ -702,6 +714,12 @@ def add_max_alibi_bias(self, bias: float) -> None: def add_clamp_kqv(self, value: float) -> None: self.add_float32(Keys.Attention.CLAMP_KQV.format(arch=self.arch), value) + def add_shared_kv_layers(self, value: float) -> None: + self.add_float32(Keys.Attention.SHARED_KV_LAYERS.format(arch=self.arch), value) + + def add_sliding_window_pattern(self, value: Sequence[bool]) -> None: + self.add_array(Keys.Attention.SLIDING_WINDOW_PATTERN.format(arch=self.arch), value) + def add_logit_scale(self, value: float) -> None: self.add_float32(Keys.LLM.LOGIT_SCALE.format(arch=self.arch), value) @@ -891,6 +909,9 @@ def add_add_bos_token(self, value: bool) -> None: def add_add_eos_token(self, value: bool) -> None: self.add_bool(Keys.Tokenizer.ADD_EOS, value) + def add_add_sep_token(self, value: bool) -> None: + self.add_bool(Keys.Tokenizer.ADD_SEP, value) + def add_add_space_prefix(self, value: bool) -> None: self.add_bool(Keys.Tokenizer.ADD_PREFIX, value) @@ -935,6 +956,9 @@ def add_eot_token_id(self, id: int) -> None: def add_eom_token_id(self, id: int) -> None: self.add_uint32(Keys.Tokenizer.EOM_ID, id) + def add_classifier_output_labels(self, labels: Sequence[str]) -> None: + self.add_array(Keys.Classifier.OUTPUT_LABELS.format(arch=self.arch), labels) + # for vision models def add_clip_has_vision_encoder(self, value: bool) -> None: diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 93dd1d8028f3d..b30f77dbe3be7 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -31,6 +31,7 @@ class TensorNameMap: "model.embeddings", # rwkv7 "model.word_embeddings", # bailingmoe "language_model.model.embed_tokens", # llama4 + "encoder", # neobert ), # Token type embeddings @@ -134,6 +135,7 @@ class TensorNameMap: "rwkv.blocks.{bid}.ln1", # rwkv6 "model.layers.{bid}.ln1", # rwkv7 "model.layers.{bid}.input_layernorm", # llama4 + "transformer_encoder.{bid}.attention_norm", # neobert ), # Attention norm 2 @@ -161,6 +163,7 @@ class TensorNameMap: "model.layers.{bid}.self_attn.qkv_proj", # phi3 "encoder.layers.{bid}.self_attention.query_key_value", # chatglm "transformer.layers.{bid}.attn.qkv_proj", # openelm + "transformer_encoder.{bid}.qkv", # neobert ), # Attention query @@ -236,6 +239,7 @@ class TensorNameMap: "transformer.layers.{bid}.attn.out_proj", # openelm "transformer.h.{bid}.attn.attention.out_proj", # exaone "model.layers.{bid}.self_attn.o_proj", # llama4 + "transformer_encoder.{bid}.wo", # neobert ), # Attention output norm @@ -276,6 +280,7 @@ class TensorNameMap: "encoder.layers.{bid}.post_attention_layernorm", # chatglm "transformer.layers.{bid}.ffn_norm", # openelm 
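The writer methods added above serialize to flat KV pairs keyed by architecture, so add_altup_num_inputs() on a gemma3n model lands as "gemma3n.altup.num_inputs". A sketch of reading such a key back through the gguf C API; the helper name is hypothetical:

    #include "gguf.h"
    #include <stdint.h>

    // Fetch gemma3n's AltUp input count, or 0 when the key is missing.
    static uint32_t altup_num_inputs(const struct gguf_context * gctx) {
        int64_t kid = gguf_find_key(gctx, "gemma3n.altup.num_inputs");
        return kid >= 0 ? gguf_get_val_u32(gctx, kid) : 0;
    }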
"model.layers.{bid}.post_attention_layernorm", # llama4 + "transformer_encoder.{bid}.ffn_norm", # neobert ), # Post feed-forward norm @@ -305,7 +310,7 @@ class TensorNameMap: ), MODEL_TENSOR.FFN_EXP_PROBS_B: ( - "model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3 + "model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3 dots1 ), # Feed-forward up @@ -333,11 +338,14 @@ class TensorNameMap: "encoder.layers.{bid}.mlp.fc11", # nomic-bert "encoder.layers.{bid}.mlp.fc1", # nomic-bert-moe "model.layers.{bid}.mlp.c_fc", # starcoder2 - "encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2 + "encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2 (split up/gate, no longer used) + "encoder.layer.{bid}.mlp.gated_layers", # jina-bert-v2 (GEGLU) + "encoder.layer.{bid}.mlp.up_gated_layer", # jina-v2-code (GEGLU) "model.layers.{bid}.residual_mlp.w3", # arctic "encoder.layers.{bid}.mlp.dense_h_to_4h", # chatglm "transformer.h.{bid}.mlp.c_fc_1", # exaone "model.layers.{bid}.feed_forward.up_proj", # llama4 + "transformer_encoder.{bid}.ffn.w12", # neobert ), MODEL_TENSOR.FFN_UP_EXP: ( @@ -370,7 +378,7 @@ class TensorNameMap: "model.layers.layers.{bid}.mlp.gate_proj", # plamo "model.layers.{bid}.feed_forward.w1", # internlm2 "encoder.layers.{bid}.mlp.fc12", # nomic-bert - "encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2 + "encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2 (split up/gate, no longer used) "transformer.h.{bid}.mlp.linear_1", # refact "model.layers.{bid}.residual_mlp.w1", # arctic "transformer.h.{bid}.mlp.c_fc_0", # exaone @@ -420,6 +428,7 @@ class TensorNameMap: "encoder.layers.{bid}.mlp.dense_4h_to_h", # chatglm "model.layers.h.{bid}.mlp.c_proj", # exaone "model.layers.{bid}.feed_forward.down_proj", # llama4 + "transformer_encoder.{bid}.ffn.w3", # neobert ), MODEL_TENSOR.FFN_DOWN_EXP: ( @@ -471,6 +480,70 @@ class TensorNameMap: "encoder.layer.{bid}.layer_norm_2" # jina-v2-code ), + MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: ( + "model.embed_tokens_per_layer", # gemma3n + ), + + MODEL_TENSOR.PER_LAYER_MODEL_PROJ: ( + "model.per_layer_model_projection", # gemma3n + ), + + MODEL_TENSOR.PER_LAYER_PROJ_NORM: ( + "model.per_layer_projection_norm", # gemma3n + ), + + MODEL_TENSOR.ALTUP_PROJ: ( + "model.altup_projections", # gemma3n + ), + + MODEL_TENSOR.ALTUP_UNEMBD_PROJ: ( + "model.altup_unembed_projections", # gemma3n + ), + + MODEL_TENSOR.PER_LAYER_INP_GATE: ( + "model.layers.{bid}.per_layer_input_gate", # gemma3n + ), + + MODEL_TENSOR.PER_LAYER_PROJ: ( + "model.layers.{bid}.per_layer_projection", # gemma3n + ), + + MODEL_TENSOR.PER_LAYER_POST_NORM: ( + "model.layers.{bid}.post_per_layer_input_norm", # gemma3n + ), + + MODEL_TENSOR.ALTUP_CORRECT_COEF: ( + "model.layers.{bid}.altup.correction_coefs", # gemma3n + ), + + MODEL_TENSOR.ALTUP_CORRECT_SCALE: ( + "model.layers.{bid}.altup.correct_output_scale", # gemma3n + ), + + MODEL_TENSOR.ALTUP_PREDICT_COEF: ( + "model.layers.{bid}.altup.prediction_coefs", # gemma3n + ), + + MODEL_TENSOR.ALTUP_ROUTER: ( + "model.layers.{bid}.altup.modality_router", # gemma3n + ), + + MODEL_TENSOR.ALTUP_ROUTER_NORM: ( + "model.layers.{bid}.altup.router_norm", # gemma3n + ), + + MODEL_TENSOR.LAUREL_L: ( + "model.layers.{bid}.laurel.linear_left", # gemma3n + ), + + MODEL_TENSOR.LAUREL_R: ( + "model.layers.{bid}.laurel.linear_right", # gemma3n + ), + + MODEL_TENSOR.LAUREL_POST_NORM: ( + "model.layers.{bid}.laurel.post_laurel_norm", # gemma3n + ), + MODEL_TENSOR.SSM_IN: ( "model.layers.{bid}.in_proj", "backbone.layers.{bid}.mixer.in_proj", @@ 
-830,12 +903,14 @@ class TensorNameMap: # TODO: these do not belong to block_mappings_cfg - move them to mappings_cfg MODEL_TENSOR.ENC_OUTPUT_NORM: ( "encoder.final_layer_norm", # t5 + "layer_norm", # neobert ), MODEL_TENSOR.CLS: ( "classifier", # jina "classifier.dense", # roberta "pre_classifier", # distillbert + "dense", # neobert ), MODEL_TENSOR.CLS_OUT: ( diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py index cca0979862a71..3f541b0c02e52 100644 --- a/gguf-py/gguf/vocab.py +++ b/gguf-py/gguf/vocab.py @@ -7,7 +7,10 @@ from pathlib import Path from typing import Any, Callable, Sequence, Mapping, Iterable, Protocol, ClassVar, runtime_checkable -from sentencepiece import SentencePieceProcessor +try: + from sentencepiece import SentencePieceProcessor +except ImportError: + SentencePieceProcessor = None import gguf @@ -116,6 +119,7 @@ def _set_special_token(self, typ: str, tid: Any) -> None: logger.warning(f'Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping') def _try_load_from_tokenizer_json(self, path: Path) -> bool: + tokenizer = None tokenizer_file = path / 'tokenizer.json' if tokenizer_file.is_file(): with open(tokenizer_file, encoding = 'utf-8') as f: @@ -149,11 +153,97 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool: added_tokens = tokenizer.get('added_tokens', {}) else: added_tokens = {} + tokenizer_config = None tokenizer_config_file = path / 'tokenizer_config.json' - if not tokenizer_config_file.is_file(): + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, encoding = 'utf-8') as f: + tokenizer_config = json.load(f) + if tokenizer: + special_bos = (tokenizer_config or {}).get('bos_token') + special_cls = (tokenizer_config or {}).get('cls_token') + special_eos = (tokenizer_config or {}).get('eos_token') + special_sep = (tokenizer_config or {}).get('sep_token') + if not special_bos and special_cls and tokenizer_config: + tokenizer_config['bos_token'] = special_bos = special_cls + if not special_eos and special_sep and tokenizer_config: + tokenizer_config['eos_token'] = special_eos = special_sep + if post_processor := tokenizer.get('post_processor'): + for processor in post_processor.get('processors', [post_processor]): + if processor.get('type') == 'RobertaProcessing': + self.add_special_token['bos'] = True + self.add_special_token['eos'] = True + self.add_special_token['sep'] = True + if not special_cls and tokenizer_config: + special_cls = processor.get('cls', [special_bos])[0] + tokenizer_config['cls_token'] = special_cls + if not special_sep and tokenizer_config: + special_sep = processor.get('sep', [special_eos])[0] + tokenizer_config['sep_token'] = special_sep + continue + # Crude parsing of TemplateProcessing to determine if BOS/SEP/EOS should be added + # Only works with simple templates, **will** get it wrong on unusual sequences + if processor.get('type') == 'TemplateProcessing': + tmpl_single = processor.get('single', []) + tmpl_pair = processor.get('pair', []) + special_first = None + special_last = None + if len(tmpl_single) > 1: + if special_first := tmpl_single[0].get('SpecialToken', {}).get('id'): + if not tokenizer_config: + special_bos = special_first + self.add_special_token['bos'] = True if special_first in (special_bos, special_cls) else False + if special_first not in (special_bos, special_cls): + logger.warning(f'Unknown leading special token {special_first!r} in TemplateProcessing') + if special_last := tmpl_single[-1].get('SpecialToken', {}).get('id'): + if not 
tokenizer_config: + special_eos = special_last + elif special_last != special_eos: + if 'eot' not in self.special_token_types: + self.special_token_types = tuple(self.special_token_types) + ('eot', ) + tokenizer_config['eot_token'] = special_eos + elif 'eom' not in self.special_token_types: + self.special_token_types = tuple(self.special_token_types) + ('eom', ) + tokenizer_config['eom_token'] = special_eos + else: + logger.warning(f'Overriding EOS token {special_eos!r} with {special_last!r} without EOT/EOM fallback!') + tokenizer_config['eos_token'] = special_eos = special_last + self.add_special_token['eos'] = True if special_last == special_eos else False + if special_last != special_eos: + logger.warning(f'Unknown trailing special token {special_last!r} in TemplateProcessing') + if tmpl_pair: + seq_start = 1 if special_first and tmpl_pair[0].get('SpecialToken', {}).get('id') == special_first else 0 + seq_stop = -1 if special_last and tmpl_pair[-1].get('SpecialToken', {}).get('id') == special_last else None + if (special_first and seq_start == 0) or (special_last and seq_stop is None): + logger.warning('TemplateProcessing leading/trailing special tokens do not match TemplateProcessing') + if tmpl_pair := tmpl_pair[slice(seq_start, seq_stop)]: + tmpl_a = tmpl_pair[0].get('Sequence', {}).get('id') + tmpl_b = tmpl_pair[-1].get('Sequence', {}).get('id') + if tmpl_a != 'A' or tmpl_b != 'B': + logger.warning(f'Unknown sequence {tmpl_a}...{tmpl_b} in TemplateProcessing') + # A [sep] [eos] B + if tmpl_a == 'A' and tmpl_b == 'B' and (tmpl_pair := tmpl_pair[1:-1]): + add_sep = False + if special_entry := tmpl_pair[0].get('SpecialToken', {}).get('id'): + if special_entry in (special_sep, special_eos) and not special_last: + add_sep = True + if special_entry not in (special_sep, special_eos): + logger.warning(f'Unknown separator token {special_entry!r} in TemplateProcessing') + else: + logger.warning(f'Unknown middle sequence {tmpl_pair[0]!r} in TemplateProcessing') + if len(tmpl_pair) == 2: + if special_entry := tmpl_pair[1].get('SpecialToken', {}).get('id'): + if special_entry in (special_sep, special_eos): + add_sep = True + if special_entry not in (special_sep, special_eos): + logger.warning(f'Unknown second separator token {special_entry!r} in TemplateProcessing') + else: + logger.warning(f'Unknown second middle sequence {tmpl_pair[1]!r} in TemplateProcessing') + self.add_special_token['sep'] = add_sep + if add_sep and not special_sep and tokenizer_config: + tokenizer_config['sep_token'] = special_eos + continue + if not tokenizer_config: return True - with open(tokenizer_config_file, encoding = 'utf-8') as f: - tokenizer_config = json.load(f) chat_template_alt = None chat_template_file = path / 'chat_template.json' if chat_template_file.is_file(): @@ -302,6 +392,9 @@ class SentencePieceVocab(Vocab): name = "spm" def __init__(self, base_path: Path): + if SentencePieceProcessor is None: + raise RuntimeError("sentencepiece is not installed") + added_tokens: dict[str, int] = {} if (fname_tokenizer := base_path / 'tokenizer.model').exists(): # normal location diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml index f11351cba1767..0f3a1eeee8304 100644 --- a/gguf-py/pyproject.toml +++ b/gguf-py/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "gguf" -version = "0.17.0" +version = "0.17.1" description = "Read and write ML models in GGUF for GGML" authors = ["GGML "] packages = [ @@ -22,7 +22,7 @@ python = ">=3.8" numpy = ">=1.17" tqdm = ">=4.27" pyyaml = ">=5.1" -sentencepiece = 
">=0.1.98,<=0.2.0" +sentencepiece = { version = ">=0.1.98,<=0.2.0", optional = true } PySide6 = { version = "^6.9", python = ">=3.9,<3.14", optional = true } [tool.poetry.dev-dependencies] diff --git a/include/llama.h b/include/llama.h index da0f652cfd63a..3eda9bc68608c 100644 --- a/include/llama.h +++ b/include/llama.h @@ -61,7 +61,10 @@ extern "C" { struct llama_model; struct llama_context; struct llama_sampler; - struct llama_kv_cache; + + typedef struct llama_memory_i * llama_memory_t; + + struct llama_kv_cache; // DEPRECATED (use llama_memory instead) typedef int32_t llama_pos; typedef int32_t llama_token; @@ -240,18 +243,21 @@ extern "C" { typedef bool (*llama_progress_callback)(float progress, void * user_data); - // Input data for llama_decode + // Input data for llama_encode/llama_decode // A llama_batch object can contain input about one or many sequences // The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens // // - token : the token ids of the input (used when embd is NULL) // - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL) // - pos : the positions of the respective token in the sequence - // (if set to NULL, the token position will be tracked automatically by llama_decode) + // (if set to NULL, the token position will be tracked automatically by llama_encode/llama_decode) // - seq_id : the sequence to which the respective token belongs // (if set to NULL, the sequence ID will be assumed to be 0) // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output - // (if set to NULL, only the logits for last token will be returned) + // (if set to NULL: + // - if embeddings: all tokens are output + // - if not: only the last token is output + // ) // typedef struct llama_batch { int32_t n_tokens; @@ -259,8 +265,8 @@ extern "C" { llama_token * token; float * embd; llama_pos * pos; - int32_t * n_seq_id; // TODO: remove, should belong to only 1 sequence - llama_seq_id ** seq_id; // TODO: become llama_seq_id * seq_id; + int32_t * n_seq_id; + llama_seq_id ** seq_id; int8_t * logits; // TODO: rename this to "output" } llama_batch; @@ -384,6 +390,7 @@ extern "C" { void * imatrix; // pointer to importance matrix data void * kv_overrides; // pointer to vector containing overrides void * tensor_types; // pointer to vector containing tensor types + void * prune_layers; // pointer to vector containing layer indices to prune } llama_model_quantize_params; typedef struct llama_logit_bias { @@ -493,9 +500,11 @@ extern "C" { DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead"); LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx); - LLAMA_API struct llama_kv_cache * llama_get_kv_self ( struct llama_context * ctx); + LLAMA_API llama_memory_t llama_get_memory (const struct llama_context * ctx); LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type + DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead"); + LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model); LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model); @@ -509,6 +518,13 @@ extern "C" { // Get the model's RoPE frequency scaling factor LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model); + // 
Returns the number of classifier outputs (only valid for classifier models)
+    // Undefined behavior for non-classifier models
+    LLAMA_API uint32_t llama_model_n_cls_out(const struct llama_model * model);
+
+    // Returns label of classifier output by index (<n_cls_out). Returns nullptr if no label provided
+    LLAMA_API const char * llama_model_cls_label(const struct llama_model * model, uint32_t i);
+
+    //
+    // Memory
+    //
+
+    // Clear the memory contents
+    LLAMA_API void llama_memory_clear(llama_memory_t mem);
+
+    // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
+    // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
+    // seq_id < 0 : match any sequence
+    // p0 < 0     : [0,  p1]
+    // p1 < 0     : [p0, inf)
+    LLAMA_API bool llama_memory_seq_rm(
+            llama_memory_t mem,
+              llama_seq_id seq_id,
+                 llama_pos p0,
+                 llama_pos p1);
+
+    // Copy all tokens that belong to the specified sequence to another sequence
+    // p0 < 0 : [0,  p1]
+    // p1 < 0 : [p0, inf)
+    LLAMA_API void llama_memory_seq_cp(
+            llama_memory_t mem,
+              llama_seq_id seq_id_src,
+              llama_seq_id seq_id_dst,
+                 llama_pos p0,
+                 llama_pos p1);
+
+    // Removes all tokens that do not belong to the specified sequence
+    LLAMA_API void llama_memory_seq_keep(
+            llama_memory_t mem,
+              llama_seq_id seq_id);
+
+    // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
+    // p0 < 0 : [0,  p1]
+    // p1 < 0 : [p0, inf)
+    LLAMA_API void llama_memory_seq_add(
+            llama_memory_t mem,
+              llama_seq_id seq_id,
+                 llama_pos p0,
+                 llama_pos p1,
+                 llama_pos delta);
+
+    // Integer division of the positions by factor of `d > 1`
+    // p0 < 0 : [0,  p1]
+    // p1 < 0 : [p0, inf)
+    LLAMA_API void llama_memory_seq_div(
+            llama_memory_t mem,
+              llama_seq_id seq_id,
+                 llama_pos p0,
+                 llama_pos p1,
+                       int d);
+
+    // Returns the smallest position present in the memory for the specified sequence
+    // This is typically non-zero only for SWA caches
+    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory
+    // Return -1 if the sequence is empty
+    LLAMA_API llama_pos llama_memory_seq_pos_min(
+            llama_memory_t mem,
+              llama_seq_id seq_id);
+
+    // Returns the largest position present in the memory for the specified sequence
+    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory
+    // Return -1 if the sequence is empty
+    LLAMA_API llama_pos llama_memory_seq_pos_max(
+            llama_memory_t mem,
+              llama_seq_id seq_id);
+
+    // Check if the memory supports shifting
+    LLAMA_API bool llama_memory_can_shift(llama_memory_t mem);
+
+    //
+    // KV cache for self-attention (TODO: deprecate in favor of llama_memory)
+    //
 
     // Returns the number of tokens in the KV cache (slow, use only for debug)
@@ -622,86 +712,95 @@ extern "C" {
            "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
 
     // Clear the KV cache - both cell info is erased and KV data is zeroed
-    LLAMA_API void llama_kv_self_clear(
-            struct llama_context * ctx);
+    DEPRECATED(LLAMA_API void llama_kv_self_clear(
+            struct llama_context * ctx),
+        "Use llama_memory_clear() instead");
 
     // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
     // Returns false if a partial sequence cannot be removed.
Removing a whole sequence never fails // seq_id < 0 : match any sequence // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) - LLAMA_API bool llama_kv_self_seq_rm( + DEPRECATED(LLAMA_API bool llama_kv_self_seq_rm( struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, - llama_pos p1); + llama_pos p1), + "Use llama_memory_seq_rm() instead"); // Copy all tokens that belong to the specified sequence to another sequence // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) - LLAMA_API void llama_kv_self_seq_cp( + DEPRECATED(LLAMA_API void llama_kv_self_seq_cp( struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, - llama_pos p1); + llama_pos p1), + "Use llama_memory_seq_cp() instead"); // Removes all tokens that do not belong to the specified sequence - LLAMA_API void llama_kv_self_seq_keep( + DEPRECATED(LLAMA_API void llama_kv_self_seq_keep( struct llama_context * ctx, - llama_seq_id seq_id); + llama_seq_id seq_id), + "Use llama_memory_seq_keep() instead"); // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) // If the KV cache is RoPEd, the KV data is updated accordingly: // - lazily on next llama_decode() // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) - LLAMA_API void llama_kv_self_seq_add( + DEPRECATED(LLAMA_API void llama_kv_self_seq_add( struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, - llama_pos delta); + llama_pos delta), + "Use llama_memory_seq_add() instead"); // Integer division of the positions by factor of `d > 1` // If the KV cache is RoPEd, the KV data is updated accordingly: // - lazily on next llama_decode() // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) - LLAMA_API void llama_kv_self_seq_div( + DEPRECATED(void llama_kv_self_seq_div( struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, - int d); + int d), + "Use llama_memory_seq_div() instead"); // Returns the smallest position present in the KV cache for the specified sequence // This is typically non-zero only for SWA caches // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache // Return -1 if the sequence is empty - LLAMA_API llama_pos llama_kv_self_seq_pos_min( + DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_min( struct llama_context * ctx, - llama_seq_id seq_id); + llama_seq_id seq_id), + "Use llama_memory_seq_pos_min() instead"); // Returns the largest position present in the KV cache for the specified sequence // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache // Return -1 if the sequence is empty - LLAMA_API llama_pos llama_kv_self_seq_pos_max( + DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_max( struct llama_context * ctx, - llama_seq_id seq_id); + llama_seq_id seq_id), + "Use llama_memory_seq_pos_max() instead"); // Defragment the KV cache // This will be applied: // - lazily on next llama_decode() - LLAMA_API DEPRECATED(void llama_kv_self_defrag(struct llama_context * ctx), + DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx), "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'"); // Check if the context supports KV cache shifting - LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx); + DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct 
llama_context * ctx),
+        "use llama_memory_can_shift() instead");
 
     // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
-    LLAMA_API DEPRECATED(void llama_kv_self_update(struct llama_context * ctx),
+    DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx),
         "simply remove this call, updates are applied lazily on the next llama_decode()");
 
     //
@@ -709,7 +808,7 @@ extern "C" {
     //
 
     // Returns the *actual* size in bytes of the state
-    // (logits, embedding and kv_cache)
+    // (logits, embedding and memory)
     // Only use when saving the state, not when restoring it, otherwise the size may be too small.
     LLAMA_API size_t llama_state_get_size(struct llama_context * ctx);
     LLAMA_API DEPRECATED(size_t llama_get_state_size(struct llama_context * ctx),
@@ -765,12 +864,12 @@ extern "C" {
             size_t   n_token_count),
         "use llama_state_save_file instead");
 
-    // Get the exact size needed to copy the KV cache of a single sequence
+    // Get the exact size needed to copy the state of a single sequence
     LLAMA_API size_t llama_state_seq_get_size(
             struct llama_context * ctx,
                     llama_seq_id   seq_id);
 
-    // Copy the KV cache of a single sequence into the specified buffer
+    // Copy the state of a single sequence into the specified buffer
     LLAMA_API size_t llama_state_seq_get_data(
             struct llama_context * ctx,
                          uint8_t * dst,
@@ -836,21 +935,23 @@ extern "C" {
     // For encoder-decoder contexts, processes the batch using the encoder.
     // Can store the encoder output internally for later use by the decoder's cross-attention layers.
     //    0 - success
-    //  < 0 - error. the KV cache state is restored to the state before this call
+    //  < 0 - error. the memory state is restored to the state before this call
     LLAMA_API int32_t llama_encode(
            struct llama_context * ctx,
               struct llama_batch   batch);
 
     // Process a batch of tokens.
-    // Requires KV cache.
+    // Requires the context to have a memory.
     // For encoder-decoder contexts, processes the batch using the decoder.
     // A positive return value does not mean a fatal error, but rather a warning.
-    // Upon non-zero return values, the KV cache state is restored to the state before this call
+    // Upon a fatal error or abort, the ubatches that managed to be processed will remain in the memory state of the context
+    // To handle this correctly, query the memory state using llama_memory_seq_pos_min() and llama_memory_seq_pos_max()
+    // Upon other return values, the memory state is restored to the state before this call
     //    0 - success
     //    1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
-    //    2 - aborted
+    //    2 - aborted (processed ubatches will remain in the context's memory)
     //   -1 - invalid input batch
-    // < -1 - error
+    // < -1 - fatal error (processed ubatches will remain in the context's memory)
     LLAMA_API int32_t llama_decode(
            struct llama_context * ctx,
               struct llama_batch   batch);
@@ -866,8 +967,8 @@ extern "C" {
 
     // Get the number of threads used for prompt and batch processing (multiple tokens).
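The deprecations above map one-to-one onto the llama_memory_* family, with the context argument replaced by the handle from llama_get_memory(). A minimal migration sketch, relying only on signatures shown in this header; the wrapper function itself is illustrative:

    #include "llama.h"

    // Drop one sequence from the context's memory and query what remains.
    static void forget_sequence(struct llama_context * ctx, llama_seq_id seq) {
        llama_memory_t mem = llama_get_memory(ctx);

        // was: llama_kv_self_seq_rm(ctx, seq, -1, -1);
        llama_memory_seq_rm(mem, seq, /*p0=*/-1, /*p1=*/-1); // whole sequence

        // was: llama_kv_self_seq_pos_max(ctx, seq);
        llama_pos last = llama_memory_seq_pos_max(mem, seq); // -1 when empty
        (void) last;
    }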
LLAMA_API int32_t llama_n_threads_batch(struct llama_context * ctx); - // Set whether the model is in embeddings mode or not - // If true, embeddings will be returned but logits will not + // Set whether the context outputs embeddings or not + // TODO: rename to avoid confusion with llama_get_embeddings() LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings); // Set whether to use causal attention or not @@ -916,7 +1017,7 @@ extern "C" { // Get the embeddings for a sequence id // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE - // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[1] with the rank of the sequence + // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[n_cls_out] with the rank(s) of the sequence // otherwise: float[n_embd] (1-dimensional) LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id); @@ -946,6 +1047,7 @@ extern "C" { LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab); LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab); + LLAMA_API bool llama_vocab_get_add_sep(const struct llama_vocab * vocab); LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab); LLAMA_API llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab); @@ -989,6 +1091,7 @@ extern "C" { /// @param tokens The tokens pointer must be large enough to hold the resulting tokens. /// @return Returns the number of tokens on success, no more than n_tokens_max /// @return Returns a negative number on failure - the number of tokens that would have been returned + /// @return Returns INT32_MIN on overflow (e.g., tokenization result size exceeds int32_t limit) /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so. /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated /// as plaintext. Does not insert a leading space. diff --git a/models/t5-very-small-random-F32.gguf b/models/t5-very-small-random-F32.gguf new file mode 100644 index 0000000000000..fd386d88562d2 Binary files /dev/null and b/models/t5-very-small-random-F32.gguf differ diff --git a/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja b/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja new file mode 100644 index 0000000000000..19a3eaee49be6 --- /dev/null +++ b/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja @@ -0,0 +1,124 @@ +{%- set today = strftime_now("%Y-%m-%d") %} +{%- set default_system_message = "You are Mistral Small 3, a Large Language Model (LLM) created by Mistral AI, a French startup headquartered in Paris.\nYour knowledge base was last updated on 2023-10-01. The current date is " + today + ".\n\nWhen you're not sure about some information or when the user's request requires up-to-date or specific data, you must use the available tools to fetch the information. Do not hesitate to use tools whenever they can provide a more accurate or complete response. If no relevant tools are available, then clearly state that you don't have the information and avoid making up anything. + +If the user's question is not clear, ambiguous, or does not provide enough context for you to accurately answer the question, you do not try to answer it right away and you rather ask the user to clarify their request (e.g. \"What are some good restaurants around me?\" => \"Where are you?\" or \"When is the next flight to Tokyo\" => \"Where do you travel from?\"). 
+You are always very attentive to dates, and when asked about information at specific dates, you discard information that is at another date. +You follow these instructions in all languages, and always respond to the user in the language they use or request. +Next sections describe the capabilities that you have. + +# WEB BROWSING INSTRUCTIONS + +You cannot perform any web search or access internet to open URLs, links etc. If it seems like the user is expecting you to do so, you clarify the situation and ask the user to copy paste the text directly in the chat. + +# MULTI-MODAL INSTRUCTIONS + +You have the ability to read images, but you cannot generate images. You also cannot transcribe audio files or videos. +You cannot read nor transcribe audio files or videos. + +# TOOL CALLING INSTRUCTIONS + +You may have access to tools that you can use to fetch information or perform actions. You must use these tools in the following situations: + +1. When the request requires up-to-date information. +2. When the request requires specific data that you do not have in your knowledge base. +3. When the request involves actions that you cannot perform without tools. + +Always prioritize using tools to provide the most accurate and helpful response. If tools are not available, inform the user that you cannot perform the requested action at the moment." %} + +{{- bos_token }} + +{%- set system_prompt = default_system_message %} +{%- set loop_messages = messages %} + +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{%- if messages|length > 0 and messages[0]['role'] == 'system' %} + {%- if messages[0]['content'] is string %} + {%- set system_prompt = messages[0]['content'] %} + {%- else %} + {%- set system_prompt = messages[0]['content'][0]['text'] %} + {%- endif %} + {%- set loop_messages = messages[1:] %} +{%- endif %} + +{%- set user_messages = loop_messages | selectattr("role", "equalto", "user") | list %} + +{%- set ns = namespace(index=0) %} +{%- for message in loop_messages %} + {%- if not (message.role == "tool" or (message.get('tool_calls'))) %} + {%- if (message["role"] == "user") != (ns.index % 2 == 0) %} + {{- raise_exception("After the optional system message, conversation roles must alternate user/assistant/user/assistant/...") }} + {%- endif %} + {%- set ns.index = ns.index + 1 %} + {%- endif %} +{%- endfor %} + +{{- '[SYSTEM_PROMPT]' + system_prompt + '[/SYSTEM_PROMPT]' }} + +{%- for message in loop_messages %} + {%- if message['role'] == 'system' %} + {%- if message['content'] is string %} + {{- '[SYSTEM_PROMPT]' + message['content'] + '[/SYSTEM_PROMPT]' }} + {%- else %} + {{- '[SYSTEM_PROMPT]' + message['content'][0]['text'] + '[/SYSTEM_PROMPT]' }} + {%- endif %} + {%- elif message['role'] == 'user' %} + {%- if tools is not none and (message == user_messages[-1]) %} + {{- '[AVAILABLE_TOOLS]' + tools|tojson + '[/AVAILABLE_TOOLS]' }} + {%- endif %} + {{- '[INST]' }} + {%- if message['content'] is string %} + {{- message['content'] }} + {%- else %} + {%- for block in message['content'] %} + {%- if block['type'] == 'text' %} + {{- block['text'] }} + {%- elif block['type'] in ['image', 'image_url'] %} + {{- '[IMG]' }} + {%- else %} + {{- raise_exception('Only text and image blocks are supported in message content!') }} + {%- endif %} + {%- endfor %} + {%- endif %} + {{- '[/INST]' }} + {%- elif message['role'] == 'assistant' %} + {%- if message.get('tool_calls') %} + {%- for tool_call in message.tool_calls %} + {{- '[TOOL_CALLS]' + tool_call.function.name }} + {%- if 
not tool_call.id is defined or tool_call.id is not string or tool_call.id|length != 9 %} + {{- raise_exception("Tool call IDs should be alphanumeric strings with length 9!") }} + {%- endif %} + {{- '[CALL_ID]' + tool_call.id }} + {{- '[ARGS]' + tool_call['function']['arguments']|tojson }} + {%- endfor %} + {{- eos_token }} + {%- elif message['content'] is string %} + {{- message['content'] + eos_token }} + {%- else %} + {%- for block in message['content'] %} + {%- if block['type'] == 'text' %} + {{- block['text'] }} + {%- elif block['type'] in ['image', 'image_url'] %} + {{- '[IMG]' }} + {%- else %} + {{- raise_exception('Only text and image blocks are supported in assistant content!') }} + {%- endif %} + {%- endfor %} + {{- eos_token }} + {%- endif %} + {%- elif message['role'] == 'tool_results' or message['role'] == 'tool' %} + {%- if message.content is defined and message.content.content is defined %} + {%- set content = message.content.content %} + {%- else %} + {%- set content = message.content %} + {%- endif %} + {%- if not message.tool_call_id is defined or message.tool_call_id is not string or message['tool_call_id']|length != 9 %} + {{- raise_exception("Tool call IDs should be alphanumeric strings with length 9!") }} + {%- endif %} + {{- '[TOOL_RESULTS]' + message.tool_call_id + '[TOOL_CONTENT]' + content|string + '[/TOOL_RESULTS]' }} + {%- else %} + {{- raise_exception('Only system, user, assistant, and tool roles are supported!') }} + {%- endif %} +{%- endfor %} diff --git a/prebuilts/Hexagon_SDK/.lock b/prebuilts/Hexagon_SDK/.lock new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_compute_res.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_compute_res.h new file mode 100755 index 0000000000000..c8e297a6a4474 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_compute_res.h @@ -0,0 +1,1412 @@ +/*----------------------------------------------------------------------------- + Copyright (c) 2019-2020-2022,2024 QUALCOMM Technologies, Incorporated. + All Rights Reserved. + QUALCOMM Proprietary. +-----------------------------------------------------------------------------*/ + +#ifndef HAP_COMPUTE_RES_H_ +#define HAP_COMPUTE_RES_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @defgroup types Macros and structures + * @{ + */ + +/** Error code for unsupported features. */ +#define HAP_COMPUTE_RES_NOT_SUPPORTED 0x80000404 +/** Maximum thread identifiers supported */ +#define HAP_COMPUTE_RES_MAX_NUM_THREADS 16 + +/** + * @file HAP_compute_res.h + * @brief Header file with APIs to allocate compute resources. + */ + +/** + * Structure containing attributes for compute resources. + */ +typedef struct { + unsigned long long attributes[8]; /**< Attribute array. */ +} compute_res_attr_t; + +/** + * Structure containing a VTCM page size and the number of pages with that size. + */ +typedef struct { + unsigned int page_size; /**< Page size in bytes. */ + unsigned int num_pages; /**< Number of pages of size page_size. */ +} compute_res_vtcm_page_def_t; + +/** + * Structure describing the VTCM memory pages. + */ +typedef struct { + unsigned int block_size; /**< Block size in bytes */ + unsigned int page_list_len; /**< Number of valid elements in page_list array */ + compute_res_vtcm_page_def_t page_list[8]; /**< Array of pages. 
*/
+} compute_res_vtcm_page_t;
+
+/**
+ * enum of HMX lock types
+ */
+typedef enum {
+    HAP_COMPUTE_RES_HMX_NON_SHARED = 0,    /**< No sharing of HMX across threads */
+    HAP_COMPUTE_RES_HMX_SHARED     = 1,    /**< To share HMX across threads */
+} compute_res_hmx_type_t;
+
+/**
+ * enum of capabilities supported by capability query API
+ */
+typedef enum {
+    HAP_COMPUTE_RES_PREEMPTION_CAPABILITY = 1,    /**< Preemption capability */
+} compute_res_capability_id;
+
+/**
+ * Masks returned by preemption capability query
+ */
+#define HAP_COMPUTE_RES_COOPERATIVE_PREEMPTION 1
+/**< Mask indicating support for cooperative preemption framework using
+ * capabilities query. The cooperative preemption framework involves applications
+ * registering a release callback for accepting yield requests from a high priority
+ * allocator.
+ */
+#define HAP_COMPUTE_RES_AUTONOMOUS_PREEMPTION 2
+/**< Mask indicating support for autonomous/optimized preemption framework using
+ * capabilities query. HMX resource management is moved out of #HAP_compute_res_acquire()/
+ * #HAP_compute_res_acquire_cached(), instead applications use #HAP_compute_res_hmx_lock3()/
+ * #HAP_compute_res_hmx_unlock3() to lock/unlock HMX resource directly from the threads
+ * using HMX. Applications shall implement HMX critical section using hmx_mutex object
+ * returned by #HAP_compute_res_hmx_lock3() around non-preemptable HMX sections.
+ */
+#define HAP_COMPUTE_RES_THREADS_FOR_AUTONOMOUS_PREEMPTION 4
+/**< Mask indicating support for thread identifiers based autonomous/optimized
+ * preemption framework. This feature is a subset of HAP_COMPUTE_RES_AUTONOMOUS_PREEMPTION.
+ * In this feature, applications register the threads that
+ * will be working on the allocated compute resources with the resource manager.
+ * The compute resource manager, as part of autonomous preemption, suspends the
+ * threads associated with the low priority context when a high priority thread
+ * requests for these resources.
+ */ + + +/** + * enum of commands for providing thread ids to the resource manager + */ +typedef enum { + HAP_COMPUTE_RES_THREADS_OVERRIDE = 1, + /**< Command ID to override the thread list registered with a context */ + HAP_COMPUTE_RES_THREADS_APPEND = 2, + /**< Command ID to append to an existing thread list associated with + * the context + */ + HAP_COMPUTE_RES_THREADS_REMOVE = 3, + /**< Command ID to remove a thread from an existing thread list associated + * with the context + */ +} compute_res_threads_cmd_id; + +/** + * Structure holding HMX critical section parameters + */ +typedef struct { + void *mutex; + /**< Mutex to be used for entering/exiting HMX critical section + * via lock and unlock functions + */ + void (*lock)(void *mutex); + /**< Lock function to be called for entering HMX critical section using + * mutex as argument + */ + void (*unlock)(void *mutex); + /**< Unlock function to be called for exiting HMX critical section using + * mutex as argument + */ +} compute_res_hmx_mutex_t; + +/** + * Structure for querying preemption data + */ +typedef struct { + unsigned int num_preemptions; + /**< Number of preemptions on the acquired context */ + unsigned long long preempted_duration; + /**< Total duration the context remained preempted in terms of 19.2MHz ticks */ + unsigned long long preemption_overhead; + /**< Total preemption overhead in terms of 19.2MHz ticks */ +} compute_res_preempt_data_t; + +/** + * @} + */ + +/** + * @cond DEV + */ +int __attribute__((weak)) compute_resource_attr_init( + compute_res_attr_t* attr); + +int __attribute__((weak)) compute_resource_attr_set_serialize( + compute_res_attr_t* attr, + unsigned char b_enable); + +int __attribute__((weak)) compute_resource_attr_set_hmx_param( + compute_res_attr_t* attr, + unsigned char b_enable); + +int __attribute__((weak)) compute_resource_attr_set_vtcm_param( + compute_res_attr_t* attr, + unsigned int vtcm_size, + unsigned char b_single_page); + +int __attribute__((weak)) compute_resource_attr_set_vtcm_param_v2( + compute_res_attr_t* attr, + unsigned int vtcm_size, + unsigned int min_page_size, + unsigned int min_vtcm_size); + +int __attribute__((weak)) compute_resource_attr_set_app_type( + compute_res_attr_t* attr, + unsigned int application_id); + +int __attribute__((weak)) compute_resource_attr_set_cache_mode( + compute_res_attr_t* attr, + unsigned char b_enable); + +int __attribute__((weak)) compute_resource_attr_set_release_callback( + compute_res_attr_t* attr, + int (*release_callback)( + unsigned int context_id, + void* client_context), + void* client_context); + +void* __attribute__((weak)) compute_resource_attr_get_vtcm_ptr( + compute_res_attr_t* attr); + +int __attribute__((weak)) compute_resource_attr_get_vtcm_ptr_v2( + compute_res_attr_t* attr, + void** vtcm_ptr, + unsigned int* vtcm_size); + +int __attribute__((weak)) compute_resource_query_VTCM( + unsigned int application_id, + unsigned int* total_block_size, + compute_res_vtcm_page_t* total_block_layout, + unsigned int* avail_block_size, + compute_res_vtcm_page_t* avail_block_layout); + +unsigned int __attribute__((weak)) compute_resource_acquire( + compute_res_attr_t* attr, + unsigned int timeout_us); + +int __attribute__((weak)) compute_resource_release( + unsigned int context_id); + +int __attribute__((weak)) compute_resource_acquire_cached( + unsigned int context_id, + unsigned int timeout_us); + +int __attribute__((weak)) compute_resource_release_cached( + unsigned int context_id); + +int __attribute__((weak)) compute_resource_hmx_lock( 
+ unsigned int context_id); + +int __attribute__((weak)) compute_resource_hmx_unlock( + unsigned int context_id); + +int __attribute__((weak)) compute_resource_check_release_request( + unsigned int context_id); + +int __attribute__((weak)) compute_resource_hmx_lock2( + unsigned int context_id, + compute_res_hmx_type_t type); + +int __attribute__((weak)) compute_resource_hmx_unlock2( + unsigned int context_id, + compute_res_hmx_type_t type); + +int __attribute__((weak)) compute_resource_update_priority( + unsigned int context_id, + unsigned short priority); + +int __attribute__((weak)) crm_hmx_lock3(unsigned int context_id, + compute_res_hmx_type_t type, + compute_res_hmx_mutex_t *mutex, + unsigned int timeout_us); + +int __attribute__((weak)) crm_hmx_unlock3(unsigned int context_id, + compute_res_hmx_type_t type, + compute_res_hmx_mutex_t *mutex); + +int __attribute__ ((weak)) crm_attr_set_vtcm_backup( + compute_res_attr_t* attr, + void *buffer, + unsigned int buffer_size); + +int __attribute__ ((weak)) crm_attr_set_threads( + compute_res_attr_t* attr, + unsigned int *threads, + unsigned int num_threads); + +int __attribute__ ((weak)) crm_attr_set_vtcm_clear_on_release( + compute_res_attr_t* attr, + unsigned char enable); + +int __attribute__ ((weak)) crm_cached_set_threads(compute_res_threads_cmd_id command, + unsigned int context_id, + unsigned int *threads, + unsigned int num_threads); + +int __attribute__((weak)) crm_query_capability(compute_res_capability_id capability_id, + unsigned int* data); + +int __attribute__((weak)) crm_get_preempt_data(unsigned int context_id, + compute_res_preempt_data_t *data); + +int __attribute__((weak)) crm_tid_preemption_lock(void); + +int __attribute__((weak)) crm_tid_preemption_unlock(void); +/** + * @endcond + */ + +/** + * @defgroup attributes Manage attributes + * Manage parameters affecting the requested shared resources + * @{ + */ + +/** + * Initializes the attribute structure for a resource request. + * + * The user must call this function before setting any specific resource property + * via other helper functions. + * + * @param[in] attr Pointer to compute resource attribute structure, + * #compute_res_attr_t. + * + * @return + * 0 upon success. \n + * Nonzero upon failure. \n + * HAP_COMPUTE_RES_NOT_SUPPORTED if unsupported. + */ +static inline int HAP_compute_res_attr_init(compute_res_attr_t* attr) +{ + if (compute_resource_attr_init) + return compute_resource_attr_init(attr); + + return HAP_COMPUTE_RES_NOT_SUPPORTED; +} + +/** + * Sets or clears the serialization option in the request resource structure. + * + * Serialization allows participating use cases to run with mutually exclusive + * access to the entire cDSP which helps, for example, in avoiding cache + * thrashing while trying to run simultaneously on different hardware threads. + * Participating use cases issue blocking acquires on the serialization + * resource when ready to run, and each runs in turn when it is granted that + * resource. + * + * Acquiring the serialization resource only ensures + * mutual exclusion from other cooperating use cases that also block on + * acquisition of that resource, it does not guarantee exclusion from + * concurrent use cases that do not request the serialization + * resource. + * + * @param[in] attr Pointer to the compute resource attribute structure, + * #compute_res_attr_t. + * @param[in] b_serialize 1 (TRUE) to participate in serialization. \n + * 0 (FALSE) otherwise. + * + * @return + * 0 upon success \n + * Nonzero upon failure. 
+ */ +static inline int HAP_compute_res_attr_set_serialize( + compute_res_attr_t* attr, + unsigned char b_serialize) +{ + if (compute_resource_attr_set_serialize) + { + return compute_resource_attr_set_serialize(attr, + b_serialize); + } + + return HAP_COMPUTE_RES_NOT_SUPPORTED; +} + +/** + * Sets VTCM request parameters in the provided resource attribute structure. + * + * The user calls this function to request the specified VTCM size in the acquire call. + * These VTCM request attributes are reset to 0 (no VTCM request) in the + * resource attribute structure by HAP_compute_res_attr_init(). + * + * @param[in] attr Pointer to compute resource attribute structure, + * #compute_res_attr_t. + * @param[in] vtcm_size Size of the VTCM request in bytes; + 0 if VTCM allocation is not required. + * @param[in] b_single_page 1 - Requested VTCM size to be allocated in a + * single page. \n + * 0 - No page requirement (allocation can spread + * across multiple pages. VTCM manager + * always attempts the best fit). + * + * @return + * 0 upon success. \n + * Non-zero upon failure. + */ +static inline int HAP_compute_res_attr_set_vtcm_param( + compute_res_attr_t* attr, + unsigned int vtcm_size, + unsigned char b_single_page) +{ + if (compute_resource_attr_set_vtcm_param) + { + return compute_resource_attr_set_vtcm_param(attr, + vtcm_size, + b_single_page); + } + + return HAP_COMPUTE_RES_NOT_SUPPORTED; +} + +/** + * Reads the VTCM memory pointer from the given attribute structure. + * + * On a successful VTCM resource request placed via #HAP_compute_res_acquire() + * using HAP_compute_res_attr_set_vtcm_param(), a user can invoke this helper + * function to retrieve the allocated VTCM address by passing the same attribute + * structure used in the respective HAP_compute_res_acquire() call. + * + * @param[in] attr Pointer to compute the resource attribute structure + * #compute_res_attr_t. + * + * @return + * Void pointer to the allocated VTCM section. \n + * 0 signifies no allocation. + */ +static inline void* HAP_compute_res_attr_get_vtcm_ptr(compute_res_attr_t* attr) +{ + if (compute_resource_attr_get_vtcm_ptr) + { + return compute_resource_attr_get_vtcm_ptr(attr); + } + + return 0; +} + +/** + * Sets an extended set of VTCM request parameters in the attribute structure, + * specifically VTCM Size, the minimum required page size, and the minimum + * required VTCM size. + * + * This function cannot be used with HAP_compute_res_attr_set_vtcm_param(). + * Call this function after HAP_compute_res_attr_init(). + * + * Supported starting with Lahaina. + * + * @param[in] attr Pointer to compute the resource attribute structure, + * #compute_res_attr_t. + * @param[in] vtcm_size Size of the VTCM request in bytes. 0 if VTCM allocation + * is NOT required. + * @param[in] min_page_size Minimum page size required in bytes. Valid pages include + * 4 KB, 16 KB, 64 KB, 256 KB, 1 MB, 4 MB, 16 MB. Setting 0 + * will select best possible fit (least page mappings) + * @param[in] min_vtcm_size Minimum VTCM size in bytes, if the specified size + * (vtcm_size) is not available. 0 means the + * size is an absolute requirement. + * + * @return + * 0 for success. \n + * Non-zero for failure. \n + * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported. 
+ */
+static inline int HAP_compute_res_attr_set_vtcm_param_v2(
+    compute_res_attr_t* attr,
+    unsigned int vtcm_size,
+    unsigned int min_page_size,
+    unsigned int min_vtcm_size)
+{
+    if (compute_resource_attr_set_vtcm_param_v2)
+    {
+        return compute_resource_attr_set_vtcm_param_v2(attr,
+                                                       vtcm_size,
+                                                       min_page_size,
+                                                       min_vtcm_size);
+    }
+
+    return HAP_COMPUTE_RES_NOT_SUPPORTED;
+}
+
+/**
+ * Sets the VTCM backup buffer in the provided attribute structure.
+ *
+ * The compute resource manager uses the provided buffer to back up the VTCM
+ * allocated to the user during preemption of the associated request/context.
+ * The backup buffer provided should be able to accommodate all of the
+ * requested VTCM size. A VTCM backup buffer is essential for preemption to
+ * work on architectures supporting
+ * HAP_COMPUTE_RES_THREADS_FOR_AUTONOMOUS_PREEMPTION (use
+ * HAP_compute_res_query_capability() to query the supported preemption model).
+ *
+ * Call this function after HAP_compute_res_attr_init().
+ *
+ * @param[in] attr Pointer to the compute resource attribute structure,
+ *                 #compute_res_attr_t.
+ * @param[in] buffer Pointer to the backup buffer in main memory (DDR). To be
+ *                   used by the compute resource manager for saving/restoring
+ *                   the user-allocated VTCM region during preemption.
+ * @param[in] buffer_size Size of the backup buffer in main memory (DDR) pointed
+ *                        to by the #buffer argument. The provided buffer should
+ *                        be sufficiently sized to accommodate the user-requested
+ *                        VTCM size. Align the buffer to 128B for better performance.
+ *
+ * @return
+ * 0 for success. \n
+ * Non-zero for failure. \n
+ * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported.
+ */
+static inline int HAP_compute_res_attr_set_vtcm_backup(
+    compute_res_attr_t* attr,
+    void *buffer,
+    unsigned int buffer_size)
+{
+    if (crm_attr_set_vtcm_backup)
+    {
+        return crm_attr_set_vtcm_backup(attr, buffer, buffer_size);
+    }
+
+    return HAP_COMPUTE_RES_NOT_SUPPORTED;
+}
+
+/**
+ * Updates the provided attribute structure with a user-provided thread ID array.
+ *
+ * On architectures supporting HAP_COMPUTE_RES_THREADS_FOR_AUTONOMOUS_PREEMPTION,
+ * the compute resource manager requires users to register the threads that will
+ * be using the compute resources requested via #HAP_compute_res_acquire().
+ *
+ * Call this function after HAP_compute_res_attr_init().
+ *
+ * @param[in] attr Pointer to the compute resource attribute structure,
+ *                 #compute_res_attr_t.
+ * @param[in] threads Pointer to an array of QuRT thread identifiers associated
+ *                    with the resource request. This array should be valid
+ *                    until #HAP_compute_res_acquire() is called on the prepared
+ *                    attribute.
+ * @param[in] num_threads Number of QuRT thread identifiers in the provided
+ *                        threads array #threads. A maximum of
+ *                        HAP_COMPUTE_RES_MAX_NUM_THREADS
+ *                        threads can be provided.
+ *
+ * @return
+ * 0 for success. \n
+ * Non-zero for failure. \n
+ * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported.
+ */
+static inline int HAP_compute_res_attr_set_threads(
+    compute_res_attr_t* attr,
+    unsigned int *threads,
+    unsigned int num_threads)
+{
+    if (crm_attr_set_threads)
+    {
+        return crm_attr_set_threads(attr, threads, num_threads);
+    }
+
+    return HAP_COMPUTE_RES_NOT_SUPPORTED;
+}
+
+/**
+ * Updates the thread ID array for the associated cached context.
+ *
+ * The compute resource manager uses the QuRT thread identifiers provided by
+ * the user during preemption of the associated context. For cached
+ * allocations, the thread identifiers can either be provided at the time
+ * of the HAP_compute_res_acquire() call using #HAP_compute_res_attr_set_threads(),
+ * or using this API with the context_id returned by a successful
+ * #HAP_compute_res_acquire() call when the cached attribute is set via
+ * #HAP_compute_res_attr_set_cache_mode().
+ * This API has to be called before the HAP_compute_res_acquire_cached() call.
+ *
+ * @param[in] command Specifies a command from compute_res_threads_cmd_id:
+ *                    HAP_COMPUTE_RES_THREADS_OVERRIDE : to provide a new
+ *                    set of threads.
+ *                    HAP_COMPUTE_RES_THREADS_APPEND : to append to the
+ *                    previously provided list of threads.
+ *                    HAP_COMPUTE_RES_THREADS_REMOVE : to remove given threads
+ *                    from the previously provided
+ *                    list of threads.
+ * @param[in] context_id Context ID returned by HAP_compute_res_acquire().
+ * @param[in] threads Pointer to an array of QuRT thread identifiers associated
+ *                    with the resource request.
+ * @param[in] num_threads Number of QuRT thread identifiers in the provided
+ *                        threads array #threads. A maximum of
+ *                        HAP_COMPUTE_RES_MAX_NUM_THREADS
+ *                        threads can be provided.
+ *
+ * @return
+ * 0 for success. \n
+ * Non-zero for failure. \n
+ * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported.
+ */
+static inline int HAP_compute_res_cached_set_threads(compute_res_threads_cmd_id command,
+                                                     unsigned int context_id,
+                                                     unsigned int *threads,
+                                                     unsigned int num_threads)
+{
+    if (crm_cached_set_threads)
+    {
+        return crm_cached_set_threads(command, context_id, threads, num_threads);
+    }
+
+    return HAP_COMPUTE_RES_NOT_SUPPORTED;
+}
+
+/**
+ * Sets the VTCM clear-on-release option in the provided attribute structure.
+ *
+ * By default, the compute resource manager initializes the VTCM memory to 0
+ * when VTCM is released by the caller, either at the time of release or when
+ * it is allocated to another process. For performance reasons (after weighing
+ * any security implications), a client can instruct the compute resource
+ * manager not to clear out (zero-initialize) the allocated VTCM on release.
+ *
+ * Call this function after HAP_compute_res_attr_init().
+ *
+ * @param[in] attr Pointer to the compute resource attribute structure,
+ *                 #compute_res_attr_t.
+ * @param[in] enable 1 - zero-initialize VTCM memory after release (default). \n
+ *                   0 - Do not zero-initialize VTCM memory after release.
+ * @return
+ * 0 for success. \n
+ * Non-zero for failure. \n
+ * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported.
+ */
+static inline int HAP_compute_res_attr_set_vtcm_clear_on_release(
+    compute_res_attr_t* attr,
+    unsigned char enable)
+{
+    if (crm_attr_set_vtcm_clear_on_release)
+    {
+        return crm_attr_set_vtcm_clear_on_release(attr, enable);
+    }
+
+    return HAP_COMPUTE_RES_NOT_SUPPORTED;
+}
+
+/**
+ * On a successful VTCM resource request placed via
+ * HAP_compute_res_acquire() or HAP_compute_res_acquire_cached() using
+ * HAP_compute_res_attr_set_vtcm_param_v2(), users invoke this helper function
+ * to retrieve the allocated VTCM address and size by passing the same
+ * attribute structure used in the respective acquire call.
+ *
+ * Supported starting with Lahaina.
+ *
+ * @param[in] attr Pointer to the compute resource attribute structure
+ *                 #compute_res_attr_t.
+ * @param[out] vtcm_ptr Assigned VTCM address; NULL for no allocation.
+ * @param[out] vtcm_size Size of the allocated VTCM memory from the assigned pointer.
+ *
+ * @return
+ * 0 upon success. \n
+ * Nonzero upon failure.
\n + * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported. + */ +static inline int HAP_compute_res_attr_get_vtcm_ptr_v2( + compute_res_attr_t* attr, + void** vtcm_ptr, + unsigned int* vtcm_size) +{ + if (compute_resource_attr_get_vtcm_ptr_v2) + { + return compute_resource_attr_get_vtcm_ptr_v2(attr, + vtcm_ptr, + vtcm_size); + } + + return HAP_COMPUTE_RES_NOT_SUPPORTED; +} + +/** + * On chipsets with HMX, sets/resets the HMX request parameter in the attribute + * structure for acquiring the HMX resource. + * + * Call this function after HAP_compute_res_attr_init(). + * + * Supported starting with Lahaina. + * + * @param[in] attr Pointer to compute the resource attribute structure, + * #compute_res_attr_t. + * @param[in] b_enable 0 - do not request HMX resource (resets option). \n + * 1 - request HMX resource (sets option). + * @return + * 0 upon success. \n + * Nonzero upon failure. + */ +static inline int HAP_compute_res_attr_set_hmx_param( + compute_res_attr_t* attr, + unsigned char b_enable) +{ + if (compute_resource_attr_set_hmx_param) + { + return compute_resource_attr_set_hmx_param(attr, + b_enable); + } + + return HAP_COMPUTE_RES_NOT_SUPPORTED; +} + +/** + * Sets or resets cacheable mode in the attribute structure. + * + * A cacheable request allows users to allocate and release based on the + * context ID of the request. On a successful cacheable request via + * HAP_compute_res_acquire(), users get the same VTCM address and + * size across calls of HAP_compute_res_acquire_cached() and + * HAP_compute_res_release_cached() until the context is explicitly + * released via HAP_compute_res_release(). + * + * After a successful cacheable request via HAP_compute_res_acquire(), + * users can get the assigned VTCM pointer (if requested) by passing + * the attribute structure to HAP_compute_res_attr_get_vtcm_ptr() + * for v1 and HAP_compute_res_attr_get_vtcm_ptr_v2() for v2, + * and they must call HAP_compute_res_acquire_cached() before using the + * assigned resources. + * + * Supported starting with Lahaina. + * + * @param[in] attr Pointer to compute resource attribute structure, + * #compute_res_attr_t. + * @param[in] b_enable 0 - Do not request cacheable mode (resets option). \n + * 1 - Request cacheable mode (sets option). + * + * @return + * 0 upon success. \n + * Nonzero upon failure.\n + * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported. + */ +static inline int HAP_compute_res_attr_set_cache_mode( + compute_res_attr_t* attr, + unsigned char b_enable) +{ + if (compute_resource_attr_set_cache_mode) + { + return compute_resource_attr_set_cache_mode(attr, + b_enable); + } + + return HAP_COMPUTE_RES_NOT_SUPPORTED; +} + +/** + * Sets the application ID parameter in the resource structure used to + * select the appropriate VTCM partition. + * + * If this application ID parameter is not explicitly set, the default partition is selected. + * The default application ID (0) is set when the attribute structure is initialized. + * Application IDs are defined in the kernel device tree configuration. + * If the given ID is not specified in the tree, the primary VTCM partition is selected. + * + * Call this function after HAP_compute_res_attr_init(). + * + * Supported starting with Lahaina. + * + * @param[in] attr Pointer to compute the resource attribute structure + * #compute_res_attr_t. + * @param[in] application_id Application ID used to specify the VTCM partition. + * + * @return + * 0 upon success. \n + * Nonzero upon failure. 
+ */
+static inline int HAP_compute_res_attr_set_app_type(
+    compute_res_attr_t* attr,
+    unsigned int application_id)
+{
+    if (compute_resource_attr_set_app_type)
+    {
+        return compute_resource_attr_set_app_type(attr,
+                                                  application_id);
+    }
+
+    return HAP_COMPUTE_RES_NOT_SUPPORTED;
+}
+
+/**
+ * @}
+ */
+
+
+/**
+* @defgroup query VTCM query API
+* @{
+*/
+
+/**
+ * Returns the total and available VTCM sizes and page layouts
+ * for the given application type.
+ *
+ * Supported starting with Lahaina.
+ *
+ * @param[in] application_id Application ID used to specify the VTCM partition.
+ * @param[out] total_block_size Total VTCM size assigned for this application type.
+ * @param[out] total_block_layout Total VTCM size (total_block_size)
+ *                                represented in pages.
+ * @param[out] avail_block_size Largest contiguous memory chunk available in
+ *                              VTCM for this application type.
+ * @param[out] avail_block_layout Available block size (avail_block_size)
+ *                                represented in pages.
+ *
+ * @return
+ * 0 upon success. \n
+ * Nonzero upon failure. \n
+ * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported.
+ */
+static inline int HAP_compute_res_query_VTCM(
+    unsigned int application_id,
+    unsigned int* total_block_size,
+    compute_res_vtcm_page_t* total_block_layout,
+    unsigned int* avail_block_size,
+    compute_res_vtcm_page_t* avail_block_layout)
+{
+    if (compute_resource_query_VTCM)
+    {
+        return compute_resource_query_VTCM(application_id,
+                                           total_block_size,
+                                           total_block_layout,
+                                           avail_block_size,
+                                           avail_block_layout);
+    }
+
+    return HAP_COMPUTE_RES_NOT_SUPPORTED;
+}
+
+/**
+ * @}
+ */
+
+/**
+ * @defgroup acquire_release Acquire and release
+ * Manage the process of resource acquisition and release
+ * @{
+ */
+
+/**
+ * Checks the release request status for the provided context.
+ * When a context is acquired by providing a release callback, the callback
+ * can be invoked by the compute resource manager when a high-priority client
+ * is waiting for the resource(s). If a client defers a release request waiting
+ * for an outstanding work item, this API can be used to check if a release is
+ * still required before releasing the context once the work is done.
+ *
+ * Note: It is not mandatory to call this API once a release request via
+ * the registered callback is received. The context can be released and reacquired
+ * if necessary. This API can be useful to avoid a release and reacquire in cases
+ * where the high-priority client times out and is no longer waiting for the
+ * resource(s).
+ *
+ * Supported starting with Lahaina.
+ *
+ * @param[in] context_id Context ID returned by a HAP_compute_res_acquire() call.
+ *
+ * @return
+ * 0 if the provided context need not be released. \n
+ * Nonzero upon failure or if the context needs to be released. \n
+ * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported. \n
+ */
+static inline int HAP_compute_res_check_release_request(
+    unsigned int context_id)
+{
+    if (compute_resource_check_release_request)
+    {
+        return compute_resource_check_release_request(context_id);
+    }
+
+    return HAP_COMPUTE_RES_NOT_SUPPORTED;
+}
+
+/**
+ * Accepts a prepared attribute structure (attr) and returns a context ID
+ * for a successful request within the provided timeout (microseconds).
+ *
+ * @param[in] attr Pointer to the compute resource attribute structure
+ *                 #compute_res_attr_t.
+ * @param[in] timeout_us Timeout in microseconds; 0 specifies no timeout
+ *                       i.e., requests with unavailable resources
+ *                       immediately return failure. If nonzero, should
+ *                       be at least 200.
+ *
+ * @return
+ * Nonzero context ID upon success. \n
+ * 0 upon failure (i.e., unable to acquire the requested resource
+ * in the given timeout duration).
+ */
+static inline unsigned int HAP_compute_res_acquire(
+    compute_res_attr_t* attr,
+    unsigned int timeout_us)
+{
+    if (compute_resource_acquire)
+    {
+        return compute_resource_acquire(attr, timeout_us);
+    }
+
+    return 0;
+}
+
+/**
+ * Releases all the resources linked to the given context ID.
+ *
+ * Call this function with the context_id returned by a successful
+ * HAP_compute_res_acquire() call.
+ *
+ * @param[in] context_id Context ID returned by a HAP_compute_res_acquire() call.
+ *
+ * @return
+ * 0 upon success. \n
+ * Nonzero upon failure. \n
+ * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported.
+ */
+static inline int HAP_compute_res_release(unsigned int context_id)
+{
+    if (compute_resource_release)
+    {
+        return compute_resource_release(context_id);
+    }
+
+    return HAP_COMPUTE_RES_NOT_SUPPORTED;
+}
+
+/**
+ * Acquires or reacquires the resources pointed to by the context_id returned by
+ * a successful HAP_compute_res_acquire() call. If a VTCM resource was requested,
+ * the VTCM address, size, and page configuration remain the same.
+ *
+ * Supported starting with Lahaina.
+ *
+ * @param[in] context_id Context ID returned by HAP_compute_res_acquire().
+ * @param[in] timeout_us Timeout in microseconds; 0 specifies no timeout
+ *                       i.e., requests with unavailable resources
+ *                       immediately return failure. If nonzero, should
+ *                       be at least 200.
+ *
+ * @return
+ * 0 upon success. \n
+ * Nonzero upon failure. \n
+ * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported.
+ */
+static inline int HAP_compute_res_acquire_cached(
+    unsigned int context_id,
+    unsigned int timeout_us)
+{
+    if (compute_resource_acquire_cached)
+    {
+        return compute_resource_acquire_cached(context_id, timeout_us);
+    }
+
+    return HAP_COMPUTE_RES_NOT_SUPPORTED;
+}
+
+/**
+ * Releases all the resources pointed to by the context_id acquired
+ * by a successful HAP_compute_res_acquire_cached() call, while allowing the
+ * user to reacquire the same resources via HAP_compute_res_acquire_cached()
+ * in the future until the context is released via HAP_compute_res_release().
+ *
+ * Supported starting with Lahaina.
+ *
+ * @param[in] context_id Context ID returned by
+ *                       #HAP_compute_res_acquire().
+ *
+ * @return
+ * 0 upon success. \n
+ * Nonzero upon failure. \n
+ * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported.
+ */
+static inline int HAP_compute_res_release_cached(unsigned int context_id)
+{
+    if (compute_resource_release_cached)
+    {
+        return compute_resource_release_cached(context_id);
+    }
+
+    return HAP_COMPUTE_RES_NOT_SUPPORTED;
+}
+
+/**
+ * Sets the release callback function in the attribute structure.
+ *
+ * The compute resource manager calls the release_callback function when any of the
+ * resources reserved by the specified context are required by a higher-priority
+ * client. Clients act on the release request by explicitly calling the release
+ * function HAP_compute_res_release() or HAP_compute_res_release_cached()
+ * to release all acquired resources of the given context_id.
+ *
+ * The client-provided context (client_context) is passed to the release callback. On
+ * receiving a release request via the provided callback, clients should call the
+ * release function within 5 milliseconds. The release_callback function
+ * should not have any blocking wait.
+ *
+ * Call this function after HAP_compute_res_attr_init().
+ *
+ * Supported starting with Lahaina.
+ *
+ * @param[in] attr Pointer to the compute resource attribute structure,
+ *                 #compute_res_attr_t.
+ * @param[in] release_callback Function pointer to the registered callback to
+ *                             receive the release request.
+ * @param[in] client_context User-provided client context.
+ *
+ * @return
+ * 0 upon success. \n
+ * Nonzero upon failure. \n
+ * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported.
+ */
+static inline int HAP_compute_res_attr_set_release_callback(
+    compute_res_attr_t* attr,
+    int (*release_callback)(
+        unsigned int context_id,
+        void* client_context),
+    void* client_context)
+{
+    if (compute_resource_attr_set_release_callback)
+    {
+        return compute_resource_attr_set_release_callback(attr,
+                                                          release_callback,
+                                                          client_context);
+    }
+
+    return HAP_COMPUTE_RES_NOT_SUPPORTED;
+}
+
+/**
+ * Updates the priority of an allocated context to reflect the caller's
+ * thread priority.
+ * The compute resource manager uses the caller's thread priority as the resource
+ * priority when acquired (HAP_compute_res_acquire() /
+ * HAP_compute_res_acquire_cached()). If the thread priority of the caller is
+ * changed after acquiring the resource, the caller should notify the compute
+ * resource manager of the priority change by invoking this API. Failing to do
+ * so will result in the resource manager assuming an incorrect priority for
+ * the allocated resource, which may result in unwanted release requests.
+ * For a cached allocation, this API should be called after a successful
+ * HAP_compute_res_acquire_cached() call.
+ *
+ * Supported on latest chipsets (released after Palima).
+ *
+ * @param[in] context_id Context ID returned by HAP_compute_res_acquire().
+ * @param[in] priority 0 - The compute resource manager uses the caller's
+ *                     thread priority.
+ *                     1..255 - priority value in terms of QuRT thread priority.
+ *                     A priority ceiling will be applied for unprivileged
+ *                     processes.
+ *
+ * @return
+ * 0 upon success. \n
+ * Nonzero upon failure. \n
+ * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported.
+ */
+static inline int HAP_compute_res_update_priority(unsigned int context_id,
+                                                  unsigned short priority)
+{
+    if (compute_resource_update_priority)
+    {
+        return compute_resource_update_priority(context_id, priority);
+    }
+
+    return HAP_COMPUTE_RES_NOT_SUPPORTED;
+}
+
+/**
+ * @}
+ */
+
+/**
+ * @defgroup Critical section for autonomous thread ID preemption
+ *
+ * APIs to enter and exit a critical section that prevents autonomous
+ * thread-identifier based preemption (HAP_COMPUTE_RES_THREADS_FOR_AUTONOMOUS_PREEMPTION)
+ * by the resource manager while acquiring global mutexes (used
+ * in I/O, standard library functions like printf, user-implemented
+ * serialization, etc.)
+ *
+ * @{
+ */
+
+/**
+ * API to enter a critical section that prevents autonomous thread-identifier
+ * based preemption (HAP_COMPUTE_RES_THREADS_FOR_AUTONOMOUS_PREEMPTION) by the
+ * resource manager while the caller is acquiring global mutexes (used
+ * in I/O, standard library functions like printf, user-implemented
+ * serialization, etc.)
+ *
+ * On architectures supporting HAP_COMPUTE_RES_THREADS_FOR_AUTONOMOUS_PREEMPTION,
+ * holding global mutexes can lead to deadlocks within the preempted task's
+ * user process. The critical section exposed by this API should be implemented
+ * by users around I/O, logging, or any standard libraries/user implementations
+ * that acquire global mutexes.
+ *
+ * The implementation uses a per-process global mutex; callers of this API will
+ * be serialized across threads within the caller's user process on the NSP.
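+ *
+ * As an illustrative sketch (assuming printf is one of the global-mutex
+ * users being guarded), the critical section could be used as:
+ * @code
+ * if (0 == HAP_compute_res_tid_preemption_lock())
+ * {
+ *     printf("progress log\n");  // I/O guarded against thread preemption
+ *     HAP_compute_res_tid_preemption_unlock();
+ * }
+ * @endcode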
+ * + * NOTE: The critical section implementation should only be done when, + * - HAP_COMPUTE_RES_THREADS_FOR_AUTONOMOUS_PREEMPTION is supported + * - Applications with different priorities co-exist in a single user process + * exposing the risk of deadlock between a running and preempted + * application. + * + * @return + * 0 upon success. \n + * Nonzero upon failure. \n + * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported. + * + */ + +static inline int HAP_compute_res_tid_preemption_lock(void) +{ + if (crm_tid_preemption_lock) + { + return crm_tid_preemption_lock(); + } + + return HAP_COMPUTE_RES_NOT_SUPPORTED; +} + +/** + * Releases the critical section acquired by #HAP_compute_res_tid_preemption_lock(). + * + * @return + * 0 upon success. \n + * Nonzero upon failure. \n + * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported. + */ + +static inline int HAP_compute_res_tid_preemption_unlock(void) +{ + if (crm_tid_preemption_unlock) + { + return crm_tid_preemption_unlock(); + } + + return HAP_COMPUTE_RES_NOT_SUPPORTED; +} + +/** + * @} + */ + +/** + * @defgroup Capability query and profiling data + * API to query capabilities of the compute resource manager and to get + * profiling data associated with a context. + * + * @{ + */ + +/** + * Queries compute resource manager capabilities listed under + * compute_res_capability_id enum. + * + * @param[in] capability_id Identifier from compute_res_capability_id corresponding + * to the compute resource manager capability. + * @param[out] data Pointer to an unsigned int data. On success, the memory + * is updated with the data associated with the queried capability. + * + * @return + * 0 upon success. \n + * Nonzero upon failure. \n + * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported. + */ +static inline int HAP_compute_res_query_capability(compute_res_capability_id capability_id, + unsigned int* data) +{ + if (crm_query_capability) + { + return crm_query_capability(capability_id, data); + } + + return HAP_COMPUTE_RES_NOT_SUPPORTED; +} + +/** + * On implementations supporting HAP_COMPUTE_RES_AUTONOMOUS_PREEMPTION, + * this API returns preemption statistics associated with the context_id + * acquired via HAP_compute_res_acquire(). + * + * This API needs to be called before the associated context is released via + * HAP_compute_res_release() call, data returned is invalid otherwise. + * + * @param[in] context_id Context ID returned by HAP_compute_res_acquire(). + * @param[out] Pointer to compute_res_preempt_data_t. + * On success, the preemption-related statistics are updated in + * the provided structure. + * + * @return + * 0 upon success. \n + * Nonzero upon failure. \n + * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported. + */ +static inline int HAP_compute_res_get_preempt_data(unsigned int context_id, + compute_res_preempt_data_t* data) +{ + if (crm_get_preempt_data) + { + return crm_get_preempt_data(context_id, data); + } + + return HAP_COMPUTE_RES_NOT_SUPPORTED; +} + +/** + * @} + */ + + +/** + * @defgroup HMX HMX lock and unlock + * Manage HMX lock once HMX has been acquired + * + * @{ + */ + +/** + * Locks the HMX unit to the current thread and prepares the thread to + * execute HMX instructions. The client must have already acquired the + * HMX resource with HAP_compute_res_acquire() or HAP_compute_res_acquire_cached(), + * and context_id must refer to the corresponding resource manager context. + * + * Before executing HMX instructions, a client must call this function from + * the same software thread used for HMX processing. 
Only the calling thread
+ * with a valid HMX lock may execute HMX instructions.
+ *
+ * Supported starting with Lahaina.
+ *
+ * @param[in] context_id Context ID returned by
+ *                       #HAP_compute_res_acquire().
+ *
+ * @return
+ * 0 upon success. \n
+ * Nonzero upon failure. \n
+ * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported.
+ */
+static inline int HAP_compute_res_hmx_lock(unsigned int context_id)
+{
+    if (compute_resource_hmx_lock)
+    {
+        return compute_resource_hmx_lock(context_id);
+    }
+
+    return HAP_COMPUTE_RES_NOT_SUPPORTED;
+}
+
+/**
+ * Unlocks the HMX unit from the calling thread. The HMX unit can then be
+ * locked to another thread or released with HAP_compute_res_release().
+ *
+ * This function must be called from the same thread as the previous
+ * HAP_compute_res_hmx_lock() call.
+ *
+ * Supported starting with Lahaina.
+ *
+ * @param[in] context_id Context ID returned by
+ *                       #HAP_compute_res_acquire().
+ *
+ * @return
+ * 0 upon success. \n
+ * Nonzero upon failure. \n
+ * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported.
+ */
+static inline int HAP_compute_res_hmx_unlock(unsigned int context_id)
+{
+    if (compute_resource_hmx_unlock)
+    {
+        return compute_resource_hmx_unlock(context_id);
+    }
+
+    return HAP_COMPUTE_RES_NOT_SUPPORTED;
+}
+
+/**
+ * This function is an extension to HAP_compute_res_hmx_lock() with an additional
+ * option to lock HMX across multiple participating threads within a user process
+ * and timeshare the HMX resource (only one thread should be using HMX at a time).
+ *
+ * Supported on latest chipsets (released after Palima).
+ *
+ * @param[in] context_id Context ID returned by
+ *                       #HAP_compute_res_acquire().
+ * @param[in] type HAP_COMPUTE_RES_HMX_NON_SHARED
+ *                 Analogous to #HAP_compute_res_hmx_lock()
+ *                 HAP_COMPUTE_RES_HMX_SHARED
+ *                 Threads within a process can lock and timeshare the same HMX
+ *                 resource. When using this option, it is the caller's responsibility
+ *                 to timeshare HMX (only one thread should use HMX at a time)
+ *                 among the participating threads using the HAP_COMPUTE_RES_HMX_SHARED
+ *                 option from the same process.
+ *                 Note that sharing of HMX is allowed only between threads of
+ *                 the same user process. A single context ID (context_id) should be
+ *                 used across the participating threads in a user process.
+ *
+ * @return
+ * 0 upon success. \n
+ * Nonzero upon failure. \n
+ * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported.
+ */
+static inline int HAP_compute_res_hmx_lock2(unsigned int context_id,
+                                            compute_res_hmx_type_t type)
+{
+    if (compute_resource_hmx_lock2)
+    {
+        return compute_resource_hmx_lock2(context_id, type);
+    }
+
+    return HAP_COMPUTE_RES_NOT_SUPPORTED;
+}
+
+/**
+ * To be used in conjunction with HAP_compute_res_hmx_lock2() to release a successfully
+ * locked HMX unit.
+ * The 'type' provided should match the type passed to the successful
+ * HAP_compute_res_hmx_lock2() call from this thread.
+ *
+ * Supported on latest chipsets (released after Palima).
+ *
+ * @param[in] context_id Context ID returned by
+ *                       #HAP_compute_res_acquire().
+ *
+ * @param[in] type Should be the same parameter used to lock HMX
+ *                 via #HAP_compute_res_hmx_lock2()
+ *
+ * @return
+ * 0 upon success. \n
+ * Nonzero upon failure. \n
+ * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported.
+ */
+static inline int HAP_compute_res_hmx_unlock2(unsigned int context_id,
+                                              compute_res_hmx_type_t type)
+{
+    if (compute_resource_hmx_unlock2)
+    {
+        return compute_resource_hmx_unlock2(context_id, type);
+    }
+
+    return HAP_COMPUTE_RES_NOT_SUPPORTED;
+}
+
+/**
+ * @}
+ */
+
+/**
+ * @defgroup HMX3 HMX lock and unlock
+ * Manage HMX on architectures supporting HAP_COMPUTE_RES_AUTONOMOUS_PREEMPTION
+ *
+ * @{
+ */
+
+/**
+ * On architectures supporting HAP_COMPUTE_RES_AUTONOMOUS_PREEMPTION, this
+ * function locks the HMX unit to the current thread and prepares the thread to
+ * execute HMX instructions. The client must have already acquired the
+ * VTCM using HAP_compute_res_acquire() or HAP_compute_res_acquire_cached(),
+ * and context_id must refer to the corresponding resource manager context.
+ *
+ * Before executing HMX instructions, a client must call this function from
+ * the same software thread used for HMX processing. Only the calling thread
+ * with a valid HMX lock may execute HMX instructions.
+ *
+ * The calling thread shall acquire the lock on the HMX mutex before executing
+ * HMX instructions and release the lock when the program reaches a point where
+ * the acquired HMX unit can be reassigned to a higher-priority waiter (in case
+ * of multiple clients contending for the HMX resource) without affecting
+ * functionality. To enter the HMX critical section, the user shall call
+ * hmx_mutex->lock(hmx_mutex->mutex). To exit the HMX critical section, the user
+ * shall call hmx_mutex->unlock(hmx_mutex->mutex).
+ * Autonomous preemption waits for applications to release the HMX critical section
+ * before preempting HMX from the allocator.
+ *
+ * @param[in] context_id Context ID returned by
+ *                       #HAP_compute_res_acquire().
+ * @param[in] type HAP_COMPUTE_RES_HMX_NON_SHARED
+ *                 Analogous to #HAP_compute_res_hmx_lock()
+ *                 HAP_COMPUTE_RES_HMX_SHARED
+ *                 Threads within a process can lock and timeshare the same HMX
+ *                 resource. When using this option, it is the caller's responsibility
+ *                 to timeshare HMX (only one thread should use HMX at a time)
+ *                 among the participating threads using the HAP_COMPUTE_RES_HMX_SHARED
+ *                 option from the same process.
+ *                 Note that sharing of HMX is allowed only between threads of
+ *                 the same user process. A single context ID (context_id) should be
+ *                 used across the participating threads in a user process.
+ * @param[out] hmx_mutex Pointer to a structure of type compute_res_hmx_mutex_t.
+ *                       On success, the structure is updated with mutex, lock
+ *                       and unlock parameters.
+ * @param[in] timeout_us Timeout in microseconds; 0 specifies no timeout
+ *                       i.e., requests with unavailable resources
+ *                       immediately return failure. If nonzero, should
+ *                       be at least 200.
+ * @return
+ * 0 upon success. \n
+ * Nonzero upon failure. \n
+ * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported.
+ */
+static inline int HAP_compute_res_hmx_lock3(unsigned int context_id,
+                                            compute_res_hmx_type_t type,
+                                            compute_res_hmx_mutex_t *hmx_mutex,
+                                            unsigned int timeout_us)
+{
+    if (crm_hmx_lock3)
+    {
+        return crm_hmx_lock3(context_id, type, hmx_mutex, timeout_us);
+    }
+
+    return HAP_COMPUTE_RES_NOT_SUPPORTED;
+}
+
+/**
+ * To be used in conjunction with HAP_compute_res_hmx_lock3() to release a
+ * successfully locked HMX unit.
+ * The 'type' provided should match the type passed to the successful
+ * #HAP_compute_res_hmx_lock3() call from this thread.
+ *
+ * @param[in] context_id Context ID returned for a successful VTCM acquisition by
+ *                       #HAP_compute_res_acquire().
+ *
+ * @param[in] type Should be the same parameter used to lock HMX
+ *                 via #HAP_compute_res_hmx_lock3()
+ *
+ * @param[in] hmx_mutex Should be the same parameter used to lock HMX via
+ *                      #HAP_compute_res_hmx_lock3()
+ * @return
+ * 0 upon success. \n
+ * Nonzero upon failure. \n
+ * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported.
+ */
+static inline int HAP_compute_res_hmx_unlock3(unsigned int context_id,
+                                              compute_res_hmx_type_t type,
+                                              compute_res_hmx_mutex_t *hmx_mutex)
+{
+    if (crm_hmx_unlock3)
+    {
+        return crm_hmx_unlock3(context_id, type, hmx_mutex);
+    }
+
+    return HAP_COMPUTE_RES_NOT_SUPPORTED;
+}
+
+/**
+ * @}
+ */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //HAP_COMPUTE_RES_H_
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_compute_res.md b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_compute_res.md
new file mode 100755
index 0000000000000..3b550bf9ed20c
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_compute_res.md
@@ -0,0 +1,635 @@
+# Compute resource manager framework
+
+The cDSP has several shared resources such as L2 cache, HVX, HMX (where available), VTCM, hardware threads, and memory
+buses. The compute resource manager framework exposes in @b HAP_compute_res.h a set of APIs for managing, requesting,
+and releasing some of these resources.
+
+## Legacy HAP_vtcm_mgr API
+
+VTCM allocation APIs exposed under [VTCM Manager](../../doxygen/HAP_vtcm_mgr/index.html) are being deprecated; we
+recommend using the compute resource APIs for VTCM management and allocation. The compute resource manager provides
+options to:
+
+* Query the VTCM defined on an architecture and its usage.
+* Cached mode: Release and reacquire the same VTCM virtual address, size and page configuration.
+* Cooperative preemption: Register release callbacks, which might be invoked when a high-priority client needs a resource
+used by a lower-priority client.
+* ThreadID based autonomous preemption: Register the threads that work on the compute resources with the resource manager;
+these threads will be suspended by the resource manager when a high-priority client requests the resources. Clients also
+provide a backup buffer for VTCM, used by the resource manager to save and restore VTCM context during preemption.
+* Query the supported preemption model (cooperative, ThreadID based autonomous preemption, etc.).
+
+## Serialization
+
+The resource manager also offers a virtualized serialization resource to aid concurrencies in which constituent use cases
+are to run with mutually exclusive access to the entire cDSP, for example, to avoid cache thrashing with each other.
+Participating use cases issue blocking acquires on the serialization resource when ready to run, and each use case runs
+in turn when it is granted that resource. Acquiring the serialization resource only ensures mutual exclusion from other
+cooperating use cases that also block on acquisition of that resource; it does not guarantee exclusion from concurrent
+use cases that do not block on the serialization resource.
+
+## Cached mode
+
+Clients requesting VTCM are provided with a pointer (virtual address) to VTCM on success. The pointer to VTCM can
+change once it's released (HAP_compute_res_release()) and re-acquired (HAP_compute_res_acquire()). Clients requiring a
+constant VTCM pointer throughout a session can use the cached mode. Cached mode can be enabled by setting the cached
+attribute using HAP_compute_res_attr_set_cache_mode() when acquiring (HAP_compute_res_acquire()) the resource. When the
+cached attribute is set while acquiring the resource, clients are expected to call HAP_compute_res_acquire_cached() with
+the context ID returned by HAP_compute_res_acquire() before accessing the resource.
+
+This mode is useful for periodic applications where the VTCM pointer needs to remain the same across executions while
+allocating and releasing the resource periodically:
+* HAP_compute_res_acquire() with the cached attribute set is called to allocate VTCM during initialization.
+* HAP_compute_res_acquire_cached() and HAP_compute_res_release_cached() are called before and after every execution.
+* HAP_compute_res_release() is called during de-initialization.
+
+Starting with the v73 architecture, cached mode also provides clients with an option to have an overlapping mapping from within a process.
+
+### Overmapping / overlapping page mapping
+
+Applications working on HMX may require all of the requested VTCM to be in a single page mapping in the MMU. When the
+overmapping/overlapping page mapping feature is supported, HMX applications requesting a page size covering the entire
+VTCM with a smaller VTCM size can allow other applications running from the same user process to allocate the remaining
+VTCM when cached mode is used.
+
+For example, on an architecture supporting 8MB of VTCM, an HMX application (APP1) requesting 6MB of VTCM with a minimum
+8MB page in cached mode can allow another application (APP2) to acquire the remaining 2MB of VTCM with a maximum page
+size of 1MB.
+
+![screenshot](../../images/CRM_VTCM_overmapping_example.png)
+
+Note:
+* Only cached allocations requesting a VTCM page size covering the entire VTCM defined for that architecture, but with a
+smaller VTCM size request, will result in an overmapping condition. For example, on an architecture with 8MB VTCM, a
+cached/non-cached request for 3MB VTCM with a 4MB page size will get a 4MB allocation (3MB wrapped to the page size).
+* Multiple cached/non-cached allocations from within the same process (as the overmapping client) can use the leftover
+space in VTCM as long as their requests can be accommodated in that space. For example, a 2MB size request with a single
+page/4MB page size cannot coexist concurrently with a cached 6MB request with an 8MB page size.
+* Multiple overmapping clients cannot coexist concurrently. For example, a 4MB request with an 8MB page cannot coexist
+concurrently with another 4MB request with an 8MB page.
+
+## VTCM window feature
+Starting with v79, the NSP has a VTCM window hardware feature, which can be used to prevent a thread from accessing a specific VTCM region. The compute resource manager utilizes this feature as an additional access control layer on top of the page mappings per process.
+### Use case:
+* Defined VTCM memory: 8MB
+* Process 1: VTCM memory request 6MB, single page mapping
+* Process 2: VTCM memory request 2MB
+
+In this use case, mapping 6MB of VTCM in a single page requires all 8MB of VTCM to be allocated. The difference between architectures with or without the VTCM window feature is in how the remaining 2 MB of VTCM allocated but unused by process 1 may be used by another process.
+
+### Q6 architecture < V79 (where the VTCM window feature is not available)
+As the entire 8MB of defined VTCM is mapped to the user process, the allocating user process, process 1, has access to the 2MB free space as well. The 2MB is marked free for other allocations from the same user process and is not available for requests from other user processes.
+
+![screenshot](../../images/hap_compute_res_mgr_no_vtcm_window.png)
+
+Process 2 cannot use the 2MB of free space while process 1 still holds its allocation.
+
+### Q6 architecture >= V79 (where the VTCM window feature is available)
+The compute resource manager restricts the allocating user process, process 1, to access only the 6MB of allocated space using the `VTCM window` hardware feature. This allows other user processes to access the remaining 2MB region. In this use case, process 1 has neither read nor write access to the remaining 2 MB of VTCM.
+
+![screenshot](../../images/hap_compute_res_mgr_vtcm_window.png)
+
+### VTCM window - restrictions
+The `VTCM window` can be useful to restrict threads within a process to desired VTCM regions. The `VTCM window` must be a single contiguous memory region within the VTCM space: gaps in between allocations cannot be made available for allocation by other user processes. For example, in the scenario below, the `VTCM window` cannot be used to allow other processes to allocate the 1MB of unallocated free space: it is only available for allocation from process 1.
+
+![screenshot](../../images/hap_compute_res_mgr_vtcm_window_restrictions.png)
+
+## Cooperative preemption framework
+
+The resource manager offers a cooperative preemption framework wherein clients can register a release callback when
+requesting compute resources using HAP_compute_res_attr_set_release_callback(). When a higher-priority client requests
+a resource already in use by a lower-priority client, the lower-priority client will be notified by the callback to suspend
+its work and release the resource.
+
+## Autonomous preemption framework (threadID based)
+
+On supported architectures (which can be queried using HAP_compute_res_query_capability()), the resource manager implements
+a threadID-based autonomous preemption framework where clients register the thread IDs associated with a resource request
+and provide a VTCM backup buffer when VTCM is requested. As part of preempting a context, the resource manager waits for
+the HMX critical section when HMX is in use, suspends the registered threads, and saves VTCM in the provided backup buffer.
+When the resource becomes available, the resource manager resumes the suspended threads after restoring VTCM and
+reattaching HMX (if previously assigned).
+
+HMX under this preemption scheme is handled differently in comparison to the cooperative preemption framework.
+In the cooperative preemption framework, HMX as a resource is acquired first and then locked using
+HAP_compute_res_hmx_lock()/lock2(), while in the autonomous preemption framework, HMX is directly locked via
+HAP_compute_res_hmx_lock3() using the context returned by a successful VTCM allocation done via a
+HAP_compute_res_acquire() call. As the resource manager can preempt a low-priority client, HMX applications need to
+implement an HMX critical section using the mutex structure returned by a successful HAP_compute_res_hmx_lock3() call.
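+
+As a minimal sketch (assuming `context` came from a successful VTCM acquisition via HAP_compute_res_acquire(), and that
+compute_res_hmx_mutex_t carries the `mutex` pointer and the `lock`/`unlock` function pointers filled in by the resource
+manager, as described above), the HMX critical section could look like:
+
+@code
+compute_res_hmx_mutex_t hmx_mutex;
+
+/* Lock HMX against the VTCM context; wait for up to 10 ms. */
+if (0 == HAP_compute_res_hmx_lock3(context, HAP_COMPUTE_RES_HMX_NON_SHARED, &hmx_mutex, 10000))
+{
+    /* Enter the HMX critical section before issuing HMX instructions. */
+    hmx_mutex.lock(hmx_mutex.mutex);
+    //Execute the HMX workload
+    /* Exit the critical section at a point where HMX can safely be
+     * reassigned to a higher-priority waiter. */
+    hmx_mutex.unlock(hmx_mutex.mutex);
+    HAP_compute_res_hmx_unlock3(context, HAP_COMPUTE_RES_HMX_NON_SHARED, &hmx_mutex);
+}
+@endcode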
+
+## Usage examples
+
+### Cached VTCM request - cooperative preemption
+
+@code
+application_state_t *state;  /* Global application state, assumed to be allocated elsewhere */
+
+int release_callback(unsigned int context, void *state)
+{
+    if (!context || !state) return FAILURE;
+    /*
+     * Got release request, set release required in state variable
+     */
+    application_state_t* local_state = (application_state_t *)state;
+    if (local_state->context != context) return FAILURE;
+    local_state->release_request = TRUE;
+    return 0;
+}
+
+void initialization_routine()
+{
+    compute_res_attr_t attr;
+    unsigned int context;
+    unsigned int vtcm_size = 8 * 1024 * 1024; //8MB of VTCM
+    void *p_vtcm = NULL;
+    unsigned int result_vtcm_size = 0;
+    /*
+     * Initialize the attribute structure
+     */
+    if (0 != HAP_compute_res_attr_init(&attr))
+        return;
+    /*
+     * Query the VTCM defined in the architecture and set our requested VTCM
+     * size to the defined one (request the entire VTCM size)
+     */
+    if (0 != HAP_compute_res_query_VTCM(0, &vtcm_size, NULL, NULL, NULL))
+        return;
+    /*
+     * Set VTCM params:
+     * Request the entire VTCM size, minimum page size set to the VTCM size,
+     * minimum required VTCM size set to the same as the VTCM size
+     */
+    if (0 != HAP_compute_res_attr_set_vtcm_param_v2(&attr, vtcm_size, vtcm_size, vtcm_size))
+        return;
+    /*
+     * Set cached mode
+     */
+    if (0 != HAP_compute_res_attr_set_cache_mode(&attr, 1))
+        return;
+    /*
+     * Set release callback
+     */
+    if (0 != HAP_compute_res_attr_set_release_callback(&attr, &release_callback, (void *)state))
+        return;
+    /*
+     * Acquire a context with the prepared attribute structure
+     */
+    if (0 == (context = HAP_compute_res_acquire(&attr, 0)))
+        return;
+    /*
+     * Get VTCM pointer
+     */
+    if (0 != HAP_compute_res_attr_get_vtcm_ptr_v2(&attr, &p_vtcm, &result_vtcm_size))
+    {
+        HAP_compute_res_release(context);
+        return;
+    }
+    state->context = context;
+    /*
+     * Setup algorithm using p_vtcm and result_vtcm_size
+     */
+    return;
+}
+
+int yield(unsigned int context)
+{
+    /*
+     * Synchronize with workers to make sure all accesses to VTCM are complete
+     * Backup VTCM if required
+     * Release context and reacquire
+     */
+    if (0 == HAP_compute_res_check_release_request(context))
+        return 0; //Release no longer required, continue without yielding
+    if (0 != HAP_compute_res_release_cached(context))
+        return FAILURE;
+    if (0 != HAP_compute_res_acquire_cached(context, 10000)) //Wait up to 10 ms
+        return FAILURE;
+    /*
+     * Restore VTCM and continue remaining work
+     */
+    return 0;
+}
+
+void execution_loop()
+{
+    unsigned int context = state->context;
+    /*
+     * Acquire the cached resource
+     */
+    if (0 != HAP_compute_res_acquire_cached(context, 10000)) //Wait up to 10 ms
+        return;
+    /*
+     * Work items
+     */
+    for (int i = 0; i < WORK_ITEMS; i++)
+    {
+        /*
+         * Check if cooperative preemption requested a release
+         * (release_request is set in release_callback)
+         */
+        if (state->release_request)
+        {
+            if (0 != yield(context))
+                return;
+        }
+        //Execute work item
+    }
+    /*
+     * Release the cached resource
+     */
+    if (0 != HAP_compute_res_release_cached(context))
+        return;
+}
+@endcode
+
+### Cached VTCM request - autonomous threadID based preemption
+
+@code
+application_state_t *state;  /* Global application state, assumed to be allocated elsewhere */
+
+int release_callback(unsigned int context, void *state)
+{
+    if (!context || !state) return FAILURE;
+    /*
+     * Got release request, set release required in state variable
+     */
+    application_state_t* local_state = (application_state_t *)state;
+    if (local_state->context != context) return FAILURE;
+    local_state->release_request = TRUE;
+    return 0;
+}
+
+int check_autonomous_threads_compute_res_capability()
+{
+    unsigned int capability = 0;
+
+    if (0 != HAP_compute_res_query_capability(HAP_COMPUTE_RES_PREEMPTION_CAPABILITY, &capability))
+        return FAILURE;
+    if (capability & HAP_COMPUTE_RES_THREADS_FOR_AUTONOMOUS_PREEMPTION)
+        return 0;
+    else
+        return FAILURE;
+}
+
+void initialization_routine()
+{
+    compute_res_attr_t attr;
+    unsigned int context;
+    unsigned int vtcm_size = 8 * 1024 * 1024; //8MB of VTCM
+    void *p_vtcm = NULL, *p_vtcm_backup = NULL;
+    unsigned int result_vtcm_size = 0;
+    unsigned int thread_id = 0;
+    /*
+     * Initialize the attribute structure
+     */
+    if (0 != HAP_compute_res_attr_init(&attr))
+        return;
+    /*
+     * Query the VTCM defined in the architecture and set our requested VTCM
+     * size to the defined one (request the entire VTCM size)
+     */
+    if (0 != HAP_compute_res_query_VTCM(0, &vtcm_size, NULL, NULL, NULL))
+        return;
+    /*
+     * Set VTCM params:
+     * Request the entire VTCM size, minimum page size set to the VTCM size,
+     * minimum required VTCM size set to the same as the VTCM size
+     */
+    if (0 != HAP_compute_res_attr_set_vtcm_param_v2(&attr, vtcm_size, vtcm_size, vtcm_size))
+        return;
+    /*
+     * Set cached mode
+     */
+    if (0 != HAP_compute_res_attr_set_cache_mode(&attr, 1))
+        return;
+    /*
+     * Check threads-based autonomous preemption support and register threads
+     */
+    if (0 == check_autonomous_threads_compute_res_capability())
+    {
+        /*
+         * Allocate the backup buffer for VTCM to be registered with the
+         * resource manager
+         */
+        p_vtcm_backup = malloc(vtcm_size);
+        if (NULL == p_vtcm_backup)
+            return;
+        /*
+         * Register VTCM backup buffer
+         */
+        if (0 != HAP_compute_res_attr_set_vtcm_backup(&attr, p_vtcm_backup, vtcm_size))
+        {
+            free(p_vtcm_backup);
+            return;
+        }
+        /*
+         * Register threads that will be working on the requested VTCM buffer
+         */
+        thread_id = qurt_thread_get_id();
+        if (0 != HAP_compute_res_attr_set_threads(&attr, &thread_id, 1))
+        {
+            free(p_vtcm_backup);
+            return;
+        }
+    } else {
+        /*
+         * Fall back to cooperative preemption when autonomous preemption
+         * is not supported
+         */
+        if (0 != HAP_compute_res_attr_set_release_callback(&attr, &release_callback, (void *)state))
+            return;
+    }
+    /*
+     * Acquire a context with the prepared attribute structure
+     */
+    if (0 == (context = HAP_compute_res_acquire(&attr, 0)))
+    {
+        if (p_vtcm_backup)
+            free(p_vtcm_backup);
+        return;
+    }
+    /*
+     * Get VTCM pointer
+     */
+    if (0 != HAP_compute_res_attr_get_vtcm_ptr_v2(&attr, &p_vtcm, &result_vtcm_size))
+    {
+        HAP_compute_res_release(context);
+        if (p_vtcm_backup)
+            free(p_vtcm_backup);
+        return;
+    }
+    state->context = context;
+    /*
+     * Setup algorithm using p_vtcm and result_vtcm_size
+     */
+    return;
+}
+
+int yield(unsigned int context)
+{
+    /*
+     * Synchronize with workers to make sure all accesses to VTCM are complete
+     * Backup VTCM if required
+     * Release context and reacquire
+     */
+    if (0 == HAP_compute_res_check_release_request(context))
+        return 0; //Release no longer required, continue without yielding
+    if (0 != HAP_compute_res_release_cached(context))
+        return FAILURE;
+    if (0 != HAP_compute_res_acquire_cached(context, 10000)) //Wait up to 10 ms
+        return FAILURE;
+    /*
+     * Restore VTCM and continue remaining work
+     */
+    return 0;
+}
+
+void execution_loop()
+{
+    unsigned int context = state->context;
+    /*
+     * Acquire the cached resource
+     */
+    if (0 != HAP_compute_res_acquire_cached(context, 10000)) //Wait up to 10 ms
+        return;
+    /*
+     * Work items
+     */
+    for (int i = 0; i < WORK_ITEMS; i++)
+    {
+        /*
+         * Check if cooperative preemption requested a release
+         * (release_request is set in release_callback)
+         */
+        if (state->release_request)
+        {
+            if (0 != yield(context))
+                return;
+        }
+        //Execute work item
+    }
+    /*
+     * Release the cached resource
+     */
+    if (0 != HAP_compute_res_release_cached(context))
+        return;
+}
+@endcode
+
+### Serialized VTCM acquisition
+
+This example shows two threads requesting VTCM and both participating in serialization by invoking HAP_compute_res_attr_set_serialize().
+ +@code + /* + * PROCESS/THREAD 1 + */ + compute_res_attr_t res_info; + unsigned int context_id = 0; + void *p_vtcm = NULL; + /* + * Initialize the attribute structure + */ + if (0 != HAP_compute_res_attr_init(&res_info) ) + return; + /* + * Set serialization option + */ + if (0 != HAP_compute_res_attr_set_serialize(&res_info, 1) ) + return; + /* + * Set VTCM request parameters - 256KB single page + */ + if (0 != HAP_compute_res_attr_set_vtcm_param(&res_info, + (256 * 1024), + 1) ) + return; + /* + * Call acquire with a timeout of 10 milliseconds. + */ + if (0 != (context_id = HAP_compute_res_acquire(&res_info, 10000) ) ) + { + /* + * Successfully requested for serialization and acquired VTCM. + * The serialization request from PROCESS/THREAD 2 waits + * until the resource is released here. + */ + p_vtcm = HAP_compute_res_attr_get_vtcm_ptr(&res_info); + if (0 == p_vtcm) + { + /* + * VTCM allocation failed, should not reach here as the acquire + * returned with valid context ID. + */ + HAP_compute_res_release(context_id); + return; + } + //Do my work in process/thread 1 + /* + * Done. Release the resource now using the acquired context ID. + * This releases both the serialization request and VTCM allocation. + */ + HAP_compute_res_release(context_id); + p_vtcm = NULL; + } else { + /* + * Unsuccessful allocation. Timeout would have triggered. + * Implement a fallback or fail gracefully. + */ + } + + ... + + /* + * PROCESS/THREAD 2 + */ + compute_res_attr_t res_info; + unsigned int context_id = 0; + /* + * Initialize the attribute structure. + */ + if (0 != HAP_compute_res_attr_init(&res_info) ) + return; + /* + * Set serialization option. + */ + if (0 != HAP_compute_res_attr_set_serialize(&res_info, 1) ) + return; + /* + * Call acquire with a timeout of 10 milliseconds. + */ + if (0 != (context_id = HAP_compute_res_acquire(&res_info, 10000) ) ) + { + /* + * Successfully requested for serialization. + * The serialization request from PROCESS/THREAD 1 waits + * until the resource is released here even when the PROCESS/THREAD 1s + * request for VTCM can be served. + */ + //Do my work in process/thread 2 + /* + * Done. Release the resource now using the acquired context ID. + */ + HAP_compute_res_release(context_id); + } else { + /* + * Unsuccessful allocation. Timeout would have triggered. + * Implement a fallback or fail gracefully. + */ + } +@endcode + +### Non-serialized VTCM acquisition + +This example shows two threads requesting VTCM alone without a serialization option. + +If the total size requested by both threads exceeds the size of VTCM that is available, only one thread gets +access to VTCM while the other thread waits. In this case, the threads are serializing their workload +implicitly. + +If enough VTCM memory is available to meet the requests of both threads, both threads acquire VTCM upon request +and can end up executing in parallel. + +@code + /* + * PROCESS/THREAD 1 + */ + compute_res_attr_t res_info; + unsigned int context_id = 0; + void *p_vtcm = NULL; + /* + * Initialize the attribute structure. + */ + if (0 != HAP_compute_res_attr_init(&res_info) ) + return; + + /* By not calling HAP_compute_res_attr_set_serialize, we enable thread 1 to acquire VTCM + * as long as enough memory is available + */ + + /* + * Set VTCM request parameters - 256 KB single page + */ + if (0 != HAP_compute_res_attr_set_vtcm_param(&res_info, + (256 * 1024), + 1) ) + return; + /* + * Call acquire with a timeout of 10 milliseconds. 
+ */ + if (0 != (context_id = HAP_compute_res_acquire(&res_info, 10000) ) ) + { + /* + * Successfully acquired VTCM. + * The VTCM request from PROCESS/THREAD 2 waits if enough + * VTCM is not left to serve the request until the resource is released + * here. + */ + p_vtcm = HAP_compute_res_attr_get_vtcm_ptr(&res_info); + if (0 == p_vtcm) + { + /* + * VTCM allocation failed, should not reach this point as the acquire + * returned with valid context ID. + */ + HAP_compute_res_release(context_id); + return; + } + //Do my work in process/thread 1 + /* + * Done. Release the resource now using the acquired context ID. + * This releases the VTCM allocation. + */ + HAP_compute_res_release(context_id); + p_vtcm = NULL; + } else { + /* + * Unsuccessful allocation. Timeout would have triggered. + * Implement a fallback or fail gracefully. + */ + } + + ... + + /* + * PROCESS/THREAD 2 + */ + compute_res_attr_t res_info; + unsigned int context_id = 0; + void *p_vtcm = NULL; + /* + * Initialize the attribute structure + */ + if (0 != HAP_compute_res_attr_init(&res_info) ) + return; + + /* By not calling HAP_compute_res_attr_set_serialize, we enable thread 2 to acquire VTCM + * as long as enough memory is available + */ + + /* + * Set VTCM request parameters - 256 KB single page. + */ + if (0 != HAP_compute_res_attr_set_vtcm_param(&res_info, + (256 * 1024), + 1) ) + return; + /* + * Call acquire with a timeout of 10 milliseconds. + */ + if (0 != (context_id = HAP_compute_res_acquire(&res_info, 10000) ) ) + { + /* + * Successfully acquired VTCM. + * The VTCM request from PROCESS/THREAD 1 waits if enough + * VTCM is not left to serve the request until the resource is released + * here. + */ + p_vtcm = HAP_compute_res_attr_get_vtcm_ptr(&res_info); + if (0 == p_vtcm) + { + /* + * VTCM allocation failed, should not reach this point as the acquire + * returned with valid context ID. + */ + HAP_compute_res_release(context_id); + return; + } + //Do work in PROCESS/THREAD 2 + /* + * Done. Release the resource now using the acquired context ID. + * This releases the VTCM allocation. + */ + HAP_compute_res_release(context_id); + p_vtcm = NULL; + } else { + /* + * Unsuccessful allocation. Timeout would have triggered. + * Implement a fallback or fail gracefully. + */ + } +@endcode + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_dcvs.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_dcvs.h new file mode 100755 index 0000000000000..34159800c5227 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_dcvs.h @@ -0,0 +1,332 @@ +/*----------------------------------------------------------------------------- + Copyright (c) 2021, 2022 QUALCOMM Technologies, Incorporated. + All Rights Reserved. + QUALCOMM Proprietary. +-----------------------------------------------------------------------------*/ + +#ifndef HAP_DCVS_H_ +#define HAP_DCVS_H_ + +/** + * @file HAP_dcvs.h + * @brief Header file for DCVS APIs. + */ + +#include "AEEStdErr.h" +#include "HAP_power.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Perf modes to specify core/bus clock frequency level within + * target voltage corner for HAP DCVS V3 interface. + */ +typedef enum { + HAP_DCVS_CLK_PERF_HIGH, /**< To select max frequency at target voltage corner. */ + HAP_DCVS_CLK_PERF_LOW, /**< To select min frequency at target voltage corner. 
*/ +} HAP_dcvs_clk_perf_mode_t; + +/** + * @cond DEV + */ +int __attribute__((weak)) sysmon_set_dcvs_v3_duty_cycle( + void* context, + uint32 max_active_time, + uint32 periodicity); + +int __attribute__((weak)) sysmon_set_dcvs_v3_duty_cycle_params( + HAP_power_request_t* request, + uint32 max_active_time, + uint32 periodicity); + +int __attribute__((weak)) sysmon_set_dcvs_v3_core_perf_mode( + HAP_power_request_t* request, + HAP_dcvs_clk_perf_mode_t perf_mode); + +int __attribute__((weak)) sysmon_set_dcvs_v3_bus_perf_mode( + HAP_power_request_t* request, + HAP_dcvs_clk_perf_mode_t perf_mode); + +int __attribute__((weak)) sysmon_set_dcvs_v3_protected_bus_corners( + HAP_power_request_t* request, + unsigned char enable_protected_corners); + +int __attribute__((weak)) sysmon_set_ddr_perf_mode( + HAP_power_request_t *request, + unsigned int perf_mode); +/** + * @endcond + */ + +/** + * @defgroup helperapi Helper APIs for DCVS Duty Cycle + * @{ + */ + +/** + * Method to enable DCVS Duty Cycle. + * + * Calls HAP_power_set API with the provided context and selects + * DCVS duty cycle mode via HAP_power_set_DCVS_v3 request type. + * + * @param[in] context User context - power client identifier to be used in + * HAP_power_set call. + * + * @param[in] max_active_time Max active time allowed per frame in ms + * (optional, can pass 0 if don’t want to specify). + * DCVS selects appropriate operating levels to + * keep the activity time within the provided + * maximum allowed time. + * + * @param[in] periodicity Frame time in ms (optional, can pass 0 if + * don’t want to specify periodicity). For example, + * periodicity = 100 (milli-seconds) for a + * 10 FPS activity. DCVS uses this as a hint while + * predicting activity. + * + * @return + * 0 upon success. \n + * Nonzero upon failure. \n + * AEE_EVERSIONNOTSUPPORT if unsupported. + */ +static inline int HAP_set_dcvs_v3_duty_cycle( + void* context, + uint32 max_active_time, + uint32 periodicity) +{ + if (sysmon_set_dcvs_v3_duty_cycle) + return sysmon_set_dcvs_v3_duty_cycle( + context, + max_active_time, + periodicity); + + return AEE_EVERSIONNOTSUPPORT; +} + +/** + * Method to set duty cycle threshold params (periodicity and activity time hints) + * in the request structure intended for HAP_power_set for request type set to + * HAP_power_set_DCVS_v3. + * + * Sets the max_active_time and periodicity fields under dcvs_v3 payload of given + * request structure. + * + * Note: Request type should be set to HAP_power_set_DCVS_v3. + * + * @param[in] request Pointer to request structure. + * + * @param[in] max_active_time Max active time allowed per frame in ms. + * DCVS selects appropriate operating levels to + * keep the activity time within the provided + * maximum allowed time. + * + * @param[in] periodicity Frame time in ms (optional, can pass 0 if + * don’t want to specify periodicity). For example, + * periodicity = 100 (milli-seconds) for a + * 10 FPS activity. DCVS uses this as a hint while + * predicting activity. + * + * @return + * 0 upon success. \n + * Nonzero upon failure. \n + * AEE_EVERSIONNOTSUPPORT if unsupported. 
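+ *
+ * A minimal, illustrative call sequence (assuming a HAP_power_request_t
+ * prepared by the caller with its request type set to HAP_power_set_DCVS_v3,
+ * and power_client_context being the client identifier otherwise used with
+ * HAP_power_set()) could be:
+ * @code
+ * HAP_power_request_t request = {0};
+ * request.type = HAP_power_set_DCVS_v3;
+ * // Hint a 33 ms frame with at most 10 ms of activity per frame.
+ * if (0 == HAP_set_dcvs_v3_duty_cycle_params(&request, 10, 33))
+ *     HAP_power_set(power_client_context, &request);
+ * @endcode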
+ */ +static inline int HAP_set_dcvs_v3_duty_cycle_params( + HAP_power_request_t* request, + uint32 max_active_time, + uint32 periodicity) +{ + if (sysmon_set_dcvs_v3_duty_cycle_params) + { + return sysmon_set_dcvs_v3_duty_cycle_params( + request, + max_active_time, + periodicity); + } + + return AEE_EVERSIONNOTSUPPORT; +} + +/** + * @} + */ + +/** + * @defgroup enable_protected_corner_api Helper API for protected bus corners + * + * @{ + */ +/** + * On chipsets supporting bus corners above HAP_DCVS_VCORNER_TURBO_PLUS, to optimize residency at these corners, + * target corner requests for bus are capped to HAP_DCVS_VCORNER_TURBO_PLUS by default. + * Any request beyond HAP_DCVS_VCORNER_TURBO_PLUS (including HAP_DCVS_VCORNER_MAX) will be wrapped to HAP_DCVS_VCORNER_TURBO_PLUS. + * + * This API enables clients of HAP_power_set to override this protection when voting explicitly for bus corners + * above HAP_DCVS_VCORNER_TURBO_PLUS in necessary use cases. + * + * Note: + * API is supported starting with V79 QDSP6 architecture, AEE_EVERSIONNOTSUPPORT error (can be safely ignored) is returned by the API when not supported. + * + * Request type should be set to HAP_power_set_DCVS_v3. + * + * @param[in] request Pointer to HAP_power_request_t structure with request type set to HAP_power_set_DCVS_v3. + * @param[in] enable_protected_corners 1 - to consider bus corner requests above HAP_DCVS_VCORNER_TURBO_PLUS + * 0 (default) - to cap bus corner requests to HAP_DCVS_VCORNER_TURBO_PLUS + * @return + * 0 upon success. \n + * Nonzero upon failure. \n + * AEE_EVERSIONNOTSUPPORT if unsupported. + */ + +static inline int HAP_set_dcvs_v3_protected_bus_corners( + HAP_power_request_t* request, + unsigned char enable_protected_corners) +{ + if (sysmon_set_dcvs_v3_protected_bus_corners) + { + return sysmon_set_dcvs_v3_protected_bus_corners(request, + enable_protected_corners); + } + + return AEE_EVERSIONNOTSUPPORT; +} + +/** + * @} + */ +/** + * @defgroup enable_ddr_perf_mode_api Helper API to enable DDR perf mode + * + * @{ + */ +/** + * This API enables clients of HAP_power_set to vote for DDR performance mode. + * + * Note: + * API is supported starting with V79 QDSP6 architecture, AEE_EVERSIONNOTSUPPORT error (can be safely ignored) is returned by the API when not supported. + * + * Note: Request type should be set to HAP_power_set_DCVS_v3. + * + * @param[in] request Pointer to HAP_power_request_t structure with request type set to HAP_power_set_DCVS_v3 + * + * @param[in] perf_mode 1 - to enable DDR performance mode + * 0 - to disable the DDR performance mode + * + * @return + * 0 upon success. \n + * Nonzero upon failure. \n + * AEE_EVERSIONNOTSUPPORT if unsupported. + */ +static inline int HAP_set_ddr_perf_mode( + HAP_power_request_t *request, + unsigned int perf_mode) +{ + if (sysmon_set_ddr_perf_mode) + { + return sysmon_set_ddr_perf_mode(request, perf_mode); + } + + return AEE_EVERSIONNOTSUPPORT; +} + +/** + * @} + */ + +/** + * @defgroup clk_perfmode_api APIs to specify core/bus clock frequency level within target voltage corner + * + * @{ + */ + +/** + * Method to specify core clock frequency level corresponding to the + * target corner request in the request structure intended for + * HAP_power_set for request type set to HAP_power_set_DCVS_v3. + * + * By default, the highest core clock frequency available at the requested + * target_corner is selected. 
Using this API, user can select either the + * highest (HAP_DCVS_CLK_PERF_HIGH) or the lowest (HAP_DCVS_CLK_PERF_LOW) + * core clock frequency at any given target_corner. If there is only one + * core clock frequency available at the requested target_corner, both the + * high and low settings will select the same. + * + * Note: Request type should be set to HAP_power_set_DCVS_v3. + * + * Supported on latest chipsets(released after Palima). + * + * @param[in] request Pointer to request structure. + * + * @param[in] perf_mode Perf mode to specify core clock frequency level + * within target voltage corner. + * + * @return + * 0 upon success. \n + * Nonzero upon failure. \n + * AEE_EVERSIONNOTSUPPORT if unsupported. + */ +static inline int HAP_set_dcvs_v3_core_perf_mode( + HAP_power_request_t* request, + HAP_dcvs_clk_perf_mode_t perf_mode) +{ + if (sysmon_set_dcvs_v3_core_perf_mode) + { + return sysmon_set_dcvs_v3_core_perf_mode( + request, + perf_mode); + } + + return AEE_EVERSIONNOTSUPPORT; +} + +/** + * Method to specify bus clock frequency level corresponding to the + * target corner request in the request structure intended for + * HAP_power_set for request type set to HAP_power_set_DCVS_v3. + * + * By default, the highest bus clock frequency available at the requested + * target_corner is selected. Using this API, user can select either the + * highest (HAP_DCVS_CLK_PERF_HIGH) or the lowest (HAP_DCVS_CLK_PERF_LOW) + * bus clock frequency at any given target_corner. If there is only one + * bus clock frequency available at the requested target_corner, both the + * high and low settings will select the same. + * + * Note: Request type should be set to HAP_power_set_DCVS_v3. + * + * Supported on latest chipsets(released after Palima). + * + * @param[in] request Pointer to request structure. + * + * @param[in] perf_mode Perf mode to specify bus clock frequency level + * within target voltage corner. + * + * @return + * 0 upon success. \n + * Nonzero upon failure. \n + * AEE_EVERSIONNOTSUPPORT if unsupported. + */ +static inline int HAP_set_dcvs_v3_bus_perf_mode( + HAP_power_request_t* request, + HAP_dcvs_clk_perf_mode_t perf_mode) +{ + if (sysmon_set_dcvs_v3_bus_perf_mode) + { + return sysmon_set_dcvs_v3_bus_perf_mode( + request, + perf_mode); + } + + return AEE_EVERSIONNOTSUPPORT; +} + +/** + * @} + */ + +#ifdef __cplusplus +} +#endif + +#endif //HAP_DCVS_H_ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_dcvs.md b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_dcvs.md new file mode 100755 index 0000000000000..e957258e4e664 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_dcvs.md @@ -0,0 +1,146 @@ +# DCVS Helper APIs + +DCVS Duty Cycle Helper APIs with usage examples. + +## DCVS Duty Cycle Helper APIs + +Header file: @b HAP_dcvs.h + +## Usage examples + +### HAP_set_dcvs_v3_duty_cycle + +This is the most straightforward and therefore the recommended simplified API to enable DCVS Duty Cycle. + +The user has to pass the context (power client identifier to be used in a HAP_power_set call) to this API. +The function calls into HAP_power_set API with the provided context and selects DCVS duty cycle mode via `HAP_power_set_DCVS_v3` request type. +The user optionally can provide max_active_time and periodicity. The DCVS algorithm selects appropriate operating levels to keep the activity time within the provided +maximum allowed time and uses periodicity as a hint while predicting activity. + +The user does not need to specify any clock corners. 
Instead, the DCVS algorithm selects the clock corner with the best performance-power tradeoff that keeps the active time under the maximum value provided by the user for a given period.
+
+The example below demonstrates the usage of the HAP_set_dcvs_v3_duty_cycle API.
+
+@code
+    /*
+     * Enabling DCVS duty cycle with 10 ms max_active_time and 33 ms periodicity
+     */
+    HAP_set_dcvs_v3_duty_cycle(context, 10, 33);
+@endcode
+
+Here, DCVS duty cycle starts with NOM as the active corner and LOW SVS (SVS2) as the idle-cycle corner. Then, if the DCVS algorithm observes an active time longer than 10 ms (the user-defined max active time), it increases the clock to the next level, NOM PLUS, to try to bring the active time under 10 ms.
+
+![screenshot](../../images/HAP_set_dcvs_v3_duty_cycle.png)
+
+### HAP_set_dcvs_v3_duty_cycle_params
+
+This API is useful for setting max_active_time and periodicity in an existing DCVS request structure.
+
+The user can set the DCVS params as per application requirements in a DCVS request structure with the request type set to `HAP_power_set_DCVS_v3` and pass it as an argument to this function.
+
+This API allows the user to set the maximum active time and period values used by the DCVS algorithm. After invoking this function, the user has to call HAP_power_set() using the same request structure.
+
+@code
+    HAP_power_request_t request;
+    request.type = HAP_power_set_DCVS_v3;
+    /*
+     * Selecting duty cycle mode with DCVS enabled
+     */
+    request.dcvs_v3.set_dcvs_enable = TRUE;
+    request.dcvs_v3.dcvs_enable = TRUE;
+    request.dcvs_v3.dcvs_option = HAP_DCVS_V2_DUTY_CYCLE_MODE;
+    /*
+     * Setting TURBO PLUS as Max corner, NOM PLUS as Target corner
+     * and LOW SVS as Min corner
+     */
+    request.dcvs_v3.set_core_params = TRUE;
+    request.dcvs_v3.core_params.min_corner = HAP_DCVS_VCORNER_SVS2;
+    request.dcvs_v3.core_params.max_corner = HAP_DCVS_VCORNER_TURBO_PLUS;
+    request.dcvs_v3.core_params.target_corner = HAP_DCVS_VCORNER_NOM_PLUS;
+    request.dcvs_v3.set_bus_params = TRUE;
+    request.dcvs_v3.bus_params.min_corner = HAP_DCVS_VCORNER_SVS2;
+    request.dcvs_v3.bus_params.max_corner = HAP_DCVS_VCORNER_TURBO_PLUS;
+    request.dcvs_v3.bus_params.target_corner = HAP_DCVS_VCORNER_NOM_PLUS;
+    /*
+     * Setting 20 ms max_active_time and 33 ms periodicity
+     */
+    HAP_set_dcvs_v3_duty_cycle_params(&request, 20, 33);
+    HAP_power_set(context, &request);
+@endcode
+
+Here, DCVS duty cycle applies LOW SVS (SVS2) for the idle cycle and an active-cycle corner in the range of Max to Target (TURBO PLUS to NOM PLUS) to maintain the user-given max_active_time (20 ms).
+
+![screenshot](../../images/HAP_set_dcvs_v3_duty_cycle_params.png)
+
+### HAP_set_dcvs_v3_core_perf_mode
+
+This API helps select the core clock frequency level within the target voltage corner.
+
+By default, the highest core clock frequency available at the requested target corner is selected. Using this API, the user can select either the highest (`HAP_DCVS_CLK_PERF_HIGH`) or the lowest (`HAP_DCVS_CLK_PERF_LOW`) core clock frequency at any given target corner. If there is only one core clock frequency available at the requested target corner, both the high and low settings select the same frequency.
+
+The user can set the DCVS params as per application requirements in a DCVS request structure with the request type set to `HAP_power_set_DCVS_v3` and pass it as an argument to this function, along with the perf_mode argument, which specifies the core clock frequency level (`HAP_DCVS_CLK_PERF_HIGH`/`HAP_DCVS_CLK_PERF_LOW`).
+
+This API sets the user-provided perf_mode for the core clock in the given request structure. After invoking this function, the user has to call HAP_power_set() using the same request structure.
+
+@code
+    HAP_power_request_t request;
+    request.type = HAP_power_set_DCVS_v3;
+    /*
+     * Setting TURBO as Max corner, NOM as Target corner
+     * and LOW SVS as Min corner for core clock
+     */
+    request.dcvs_v3.set_core_params = TRUE;
+    request.dcvs_v3.core_params.min_corner = HAP_DCVS_VCORNER_SVS2;
+    request.dcvs_v3.core_params.max_corner = HAP_DCVS_VCORNER_TURBO;
+    request.dcvs_v3.core_params.target_corner = HAP_DCVS_VCORNER_NOM;
+    /*
+     * Setting perf_mode as HAP_DCVS_CLK_PERF_LOW
+     */
+    HAP_set_dcvs_v3_core_perf_mode(&request, HAP_DCVS_CLK_PERF_LOW);
+    HAP_power_set(context, &request);
+@endcode
+
+Here, DCVS votes for the minimum available core clock frequency at the NOM target corner.
+
+### HAP_set_dcvs_v3_bus_perf_mode
+
+This API helps select the bus clock frequency level within the target voltage corner.
+
+By default, the highest bus clock frequency available at the requested target corner is selected. Using this API, the user can select either the highest (`HAP_DCVS_CLK_PERF_HIGH`) or the lowest (`HAP_DCVS_CLK_PERF_LOW`) bus clock frequency at any given target corner. If there is only one bus clock frequency available at the requested target corner, both the high and low settings select the same frequency.
+
+The user can set the DCVS params as per application requirements in a DCVS request structure with the request type set to `HAP_power_set_DCVS_v3` and pass it as an argument to this function, along with the perf_mode argument, which specifies the bus clock frequency level (`HAP_DCVS_CLK_PERF_HIGH`/`HAP_DCVS_CLK_PERF_LOW`).
+
+This API sets the user-provided perf_mode for the bus clock in the given request structure. After invoking this function, the user has to call HAP_power_set() using the same request structure.
+
+@code
+    HAP_power_request_t request;
+    request.type = HAP_power_set_DCVS_v3;
+    /*
+     * Setting TURBO PLUS as Max corner, TURBO as Target corner
+     * and LOW SVS as Min corner for bus clock
+     */
+    request.dcvs_v3.set_bus_params = TRUE;
+    request.dcvs_v3.bus_params.min_corner = HAP_DCVS_VCORNER_SVS2;
+    request.dcvs_v3.bus_params.max_corner = HAP_DCVS_VCORNER_TURBO_PLUS;
+    request.dcvs_v3.bus_params.target_corner = HAP_DCVS_VCORNER_TURBO;
+    /*
+     * Setting perf_mode as HAP_DCVS_CLK_PERF_LOW
+     */
+    HAP_set_dcvs_v3_bus_perf_mode(&request, HAP_DCVS_CLK_PERF_LOW);
+    HAP_power_set(context, &request);
+@endcode
+
+Here, DCVS votes for the minimum available bus clock frequency at the TURBO target corner.
+
+### HAP_set_dcvs_v3_protected_bus_corners
+
+On chipsets supporting bus corners above `HAP_DCVS_VCORNER_TURBO_PLUS`, to optimize residency at these corners, target corner requests for the bus are capped to `HAP_DCVS_VCORNER_TURBO_PLUS` by default.
+Any request beyond `HAP_DCVS_VCORNER_TURBO_PLUS` (including `HAP_DCVS_VCORNER_MAX`) will be set to `HAP_DCVS_VCORNER_TURBO_PLUS`.
+
+This API enables clients of HAP_power_set to override this protection when voting explicitly for bus corners above `HAP_DCVS_VCORNER_TURBO_PLUS` in necessary use cases.
+
+Note:
+This API is supported starting with the V79 QDSP6 architecture; an `AEE_EVERSIONNOTSUPPORT` error (which can be safely ignored) is returned by the API when it is not supported.
+The request type should be set to `HAP_power_set_DCVS_v3`.
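+
+The sketch below follows the same pattern as the examples above; it is illustrative only and assumes a chipset that exposes bus corners above TURBO PLUS:
+
+@code
+    HAP_power_request_t request;
+    request.type = HAP_power_set_DCVS_v3;
+    /*
+     * Voting the bus for a corner above TURBO PLUS
+     * (HAP_DCVS_VCORNER_MAX), which would otherwise be capped
+     * to TURBO PLUS
+     */
+    request.dcvs_v3.set_bus_params = TRUE;
+    request.dcvs_v3.bus_params.min_corner = HAP_DCVS_VCORNER_SVS2;
+    request.dcvs_v3.bus_params.max_corner = HAP_DCVS_VCORNER_MAX;
+    request.dcvs_v3.bus_params.target_corner = HAP_DCVS_VCORNER_MAX;
+    /*
+     * Allow the request above TURBO PLUS to be honored
+     */
+    HAP_set_dcvs_v3_protected_bus_corners(&request, 1);
+    HAP_power_set(context, &request);
+@endcode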
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_debug.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_debug.h
new file mode 100755
index 0000000000000..aeed83ffcac27
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_debug.h
@@ -0,0 +1,81 @@
+#ifndef HAP_DEBUG_H
+#define HAP_DEBUG_H
+/*==============================================================================
+  Copyright (c) 2012-2013 Qualcomm Technologies, Inc.
+  All rights reserved. Qualcomm Proprietary and Confidential.
+==============================================================================*/
+
+#include "AEEStdDef.h"
+#include <stdarg.h>
+#include <stdio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define HAP_LEVEL_LOW     0
+#define HAP_LEVEL_MEDIUM  1
+#define HAP_LEVEL_HIGH    2
+#define HAP_LEVEL_ERROR   3
+#define HAP_LEVEL_FATAL   4
+
+#define HAP_LEVEL_RUNTIME (1 << 5)
+
+//Add a weak reference so shared objects work with older images
+#pragma weak HAP_debug_v2
+
+//Add a weak reference for enabling FARF in autogen stub files
+#pragma weak HAP_debug
+
+//Add a weak reference so runtime FARFs are ignored on older images
+#pragma weak HAP_debug_runtime
+
+/**************************************************************************
+  These HAP_debug* functions are not meant to be called directly.
+  Please use the FARF() macros to call them instead
+**************************************************************************/
+void HAP_debug_v2(int level, const char* file, int line, const char* format, ...);
+void HAP_debug_runtime(int level, const char* file, int line, const char* format, ...);
+int HAP_setFARFRuntimeLoggingParams(unsigned int mask, const char* files[],
+                                    unsigned short numberOfFiles);
+
+// Keep these around to support older shared objects and older images
+void HAP_debug(const char *msg, int level, const char *filename, int line);
+
+static __inline void _HAP_debug_v2(int level, const char* file, int line,
+                                   const char* format, ...){
+   char buf[256];
+   va_list args;
+   va_start(args, format);
+   vsnprintf(buf, sizeof(buf), format, args);
+   va_end(args);
+   HAP_debug(buf, level, file, line);
+}
+
+/*!
+This function is called to log an accumulated log entry. If logging is
+enabled for the entry by the external device, then the entry is copied
+into the diag allocation manager and committed.
+
+   [in] log_code_type  ID of the event to be reported
+   [in] dataLen        The length of the data to be logged
+   [in] *data          Points to the log data to be submitted
+
+Returns
+   TRUE if the log is submitted successfully into diag buffers
+   FALSE if there is no space left in the buffers.
+
+*/
+boolean HAP_log_data_packet(unsigned short log_code_type, unsigned int dataLen,
+                            byte* data);
+
+#define HAP_DEBUG_TRACEME 0
+
+long HAP_debug_ptrace(int req, unsigned int pid, void* addr, void* data);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // HAP_DEBUG_H
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_etm_config.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_etm_config.h
new file mode 100755
index 0000000000000..4039d7822b331
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_etm_config.h
@@ -0,0 +1,98 @@
+/*-----------------------------------------------------------------------
+ Copyright (c) 2022 QUALCOMM Technologies, Incorporated.
+ All Rights Reserved.
+ QUALCOMM Proprietary.
+-----------------------------------------------------------------------*/ + +/** + * @file HAP_etm_config.h + * @brief Header file with APIs to enable/disable etm tracing + */ + +#include "AEEStdErr.h" + +#ifdef __cplusplus +extern "C" { +#endif + + /** + * @cond DEV + */ + +int __attribute__((weak)) __HAP_user_etm_enable(void); +int __attribute__((weak)) __HAP_user_etm_disable(void); + +/** + * @endcond + */ + + +/** @defgroup helperapi Helper APIs to enable/disable etm trace. + * API for users to enable or disable ETM tracing. + * The HAP user ETM API provides user capability to start/stop + * ETM tracing in a user module to cover a desired portion of + * execution. This API is disabled by default and will return + * an error when in that mode. To enable it, use + * --hap_etm_enable option of sysMonApp etmTrace service as + * mentioned in the sample command for default subsystem CDSP below: + * ``` + * adb shell /data/local/tmp/sysMonApp etmTrace --command etm --hap_etm_enable 1 + * ``` + * ETM enablement requires setting up coresight driver on HLOS + * and configuring appropriate ETM trace type on Q6 subsystem. + * ETM configurations set via sysMonApp etmTrace option + * like etm tracing mode (cycle accurate PC tracing etc., + * sample command on CDSP below) + * ``` + * adb shell /data/local/tmp/sysMonApp etmTrace --command etm --etmType ca_pc + * ``` + * are preserved across HAP user etm enable and disable calls. + * The API is only for debug purpose and shouldn't be used in + * production environments. + * @{ + */ + +/** + * Requests ETM tracing to be enabled + * + * Call this function from the DSP user process to start ETM + * tracing. To stop the tracing, call @ref HAP_user_etm_disable(). + * Supported on latest chipsets(released after Palima). + * @param None + * @return 0 upon success, other values upon failure. + */ +static inline int HAP_user_etm_enable(void) { + if(__HAP_user_etm_enable) + return __HAP_user_etm_enable(); + return AEE_EVERSIONNOTSUPPORT; +} + +/** + * Requests ETM tracing to be disabled + * + * Call this function from the DSP user process to stop any active + * ETM tracing. API returns error if there is no active ETM trace + * enable call, e.g., if @ref HAP_user_etm_disable() is called + * first without any active @ref HAP_user_etm_enable() being + * present. The enable and disable requests are reference counted + * in the driver. Nested calls are supported, e.g. + * if @ref HAP_user_etm_enable() is called twice, two calls + * to the disable API @ref HAP_user_etm_disable() will be needed + * to disable the tracing. + * Supported on latest chipsets(released after Palima). + * @param None + * @return 0 upon success, other values upon failure. + */ +static inline int HAP_user_etm_disable(void) { + if(__HAP_user_etm_disable) + return __HAP_user_etm_disable(); + return AEE_EVERSIONNOTSUPPORT; +} + +/** + * @} + */ + +#ifdef __cplusplus +} +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_etm_config.md b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_etm_config.md new file mode 100755 index 0000000000000..debf8e3ccc999 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_etm_config.md @@ -0,0 +1,38 @@ +# ETM Trace enable/disable APIs + +The HAP ETM framework exposes a set of APIs to enable/disable +ETM tracing in a user module to trace a region of interest +provided ETM tracing is configured. + +For configuring ETM tracing, refer to "profile on device" section +of Profiling example in base SDK. 
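+
+Once tracing is configured and the runtime flag described below is set, the APIs simply bracket the region of interest. A minimal, illustrative sketch (the function name and the workload inside the bracket are placeholders):
+```
+#include "HAP_etm_config.h"
+
+int traced_workload(void)
+{
+    /* Start ETM trace collection for the region of interest.
+       Returns an error (which can be safely ignored) when tracing
+       is not configured or the hap_etm_enable flag is not set. */
+    int err = HAP_user_etm_enable();
+
+    /* ... region of interest: the DSP workload to be traced ... */
+
+    /* Enable/disable calls are reference counted; each successful
+       enable needs a matching disable. */
+    if (err == 0)
+        HAP_user_etm_disable();
+
+    return 0;
+}
+```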
+ +After ETM tracing is configured, the API requires setting the +'--hap_etm_enable' flag via sysMonApp etmTrace option as below: +``` +adb shell /data/local/tmp/sysMonApp etmTrace --command etm --hap_etm_enable 1 +``` + +After ETM trace collection, this flag should be reset with the +command: +``` +adb shell /data/local/tmp/sysMonApp etmTrace --command etm --hap_etm_enable 0 +``` + +Call to the APIs are ignored in the following cases: +* ETM tracing is not configured. +* The '--hap_etm_enable' flag is set to 0. + +***NOTE:*** The APIs work only on debug enabled device. +A test device or debug device, (Mobile Test Platform) MTP +or (Qualcomm Reference Design) QRD, is a device on which +the debug fuse is present. This fuse is not present on +production devices. + +## Supported chipsets + +Beyond Palima + +## Framework APIs + +Header file: @b HAP_etm_config.h \ No newline at end of file diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_farf.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_farf.h new file mode 100755 index 0000000000000..8f5d3ba9aa38c --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_farf.h @@ -0,0 +1,264 @@ +/*============================================================================== + Copyright (c) 2012-2013, 2020 Qualcomm Technologies, Inc. + All rights reserved. Qualcomm Proprietary and Confidential. +==============================================================================*/ + +#ifndef HAP_FARF_H +#define HAP_FARF_H + +/** + * @file HAP_farf.h + * @brief FARF API + */ + +#include "AEEStdDef.h" +#include "HAP_debug.h" + +/** + *\def FARF() + * FARF is used to log debug messages from DSP + * + * `Compile time logging options:` + * + * Logging is controlled via conditional compilation. + * The FARF level allows the user to selectively enable or disable certain types + * of messages according to their priority level. + * The following levels are supported and listed in increasing priority: + * + * LOW + * + * MEDIUM + * + * HIGH + * + * ERROR + * + * FATAL + * + * ALWAYS + * + * A FARF level should be defined to 1 for FARF macros to be compiled + * in. For example: + * + * @code + * #define FARF_LOW 1 + * #include "HAP_farf.h" + * + * FARF(LOW, "something happened: %s", (const char*)string); + * + * @endcode + * + * FARF_LOW, FARF_MEDIM, FARF_HIGH are defined to 0 and FARF_ERROR, + * FARF_FATAL, FARF_ALWAYS are defined to 1 by default. + * + * If FARF_LOW is defined to 0, as it is by default, the above + * FARF string will not be compiled in, if it is defined to 1 it + * will be compiled in. + * + * If both HIGH and LOW messages are used but only FARF_LOW is defined + * as shown in below example then only LOW message will be compiled in and sent to DIAG. + * + * @code + * #define FARF_LOW 1 + * #include "HAP_farf.h" + * + * FARF(LOW, "LOW message"); + * FARF(HIGH, "HIGH message"); // This message will not be compiled in + * + * @endcode + * + * Messages logged with ALWAYS level are always compiled in and logged. + * + * When building the Debug variant or builds defining _DEBUG the + * following FARF levels will be enabled: + * + * HIGH + * + * ERROR + * + * FATAL + * + * `Run time logging options:` + * + * In order to enable run-time logging (logging that can be enabled / disabled + * at run-time), the FARF_RUNTIME_* macros should be used. + * + * Log messages sent with these macros are compiled in by default. However by + * these messages WILL NOT be logged by default. 
In order to enable logging, + * the FASTRPC process will need to either call the + * HAP_SetFARFRuntimeLoggingParams() API, or by adding a ``.farf + * file to the HLOS file system with the appropriate contents. + * + * @code + * + * #include "HAP_farf.h" + * FARF(RUNTIME_HIGH, "something happened: %s", (const char*)string); + * + * @endcode + * + * @param[in] x the FARF level defined to either 0 to disable compilation or 1 to enable. + * @param[in] ... the format string and arguments. + */ +#define FARF(x, ...) _FARF_PASTE(_FARF_,_FARF_VAL(FARF_##x))(x, ##__VA_ARGS__) + + +/** +* @defgroup static_FARF Compile-time macros +* +* Set these compile time macros to 1 to enable logging at that +* level. Setting them to 0 will cause them to be COMPILED out. +* +* Usage Example: +* @code +* +* #define FARF_HIGH 1 +* FARF(HIGH,"Log message"); +* +* @endcode + +* The ALWAYS macro will cause log messages to be ALWAYS compiled in. +* @code +* +* FARF(ALWAYS,"Log message") +* +* @endcode +* +* Defining _DEBUG macro turns on ALWAYS, HIGH, ERROR, FATAL +*/ +/* @{ */ + +#ifdef _DEBUG +#ifndef FARF_HIGH +#define FARF_HIGH 1 +#endif +#endif + +/** + * The FARF_ALWAYS macro causes log messages to be ALWAYS compiled in + */ +#ifndef FARF_ALWAYS +#define FARF_ALWAYS 1 +#endif + +/** + * The FARF_LOW macro causes log messages to be compiled in when FARF_LOW is defined to 1 +*/ +#ifndef FARF_LOW +#define FARF_LOW 0 +#endif + +/** +* The FARF_MEDIUM macro causes log messages to be compiled in when FARF_MEDIUM is defined to 1 +*/ +#ifndef FARF_MEDIUM +#define FARF_MEDIUM 0 +#endif + +/** +* The FARF_HIGH macro causes log messages to be compiled in when FARF_HIGH is defined to 1 +*/ +#ifndef FARF_HIGH +#define FARF_HIGH 0 +#endif + +/** +* The FARF_ERROR macro causes log messages to be compiled in when FARF_ERROR is defined to 1 +*/ +#ifndef FARF_ERROR +#define FARF_ERROR 1 +#endif + +/** +* The FARF_FATAL macro causes log messages to be compiled in when FARF_FATAL is defined to 1 +*/ +#ifndef FARF_FATAL +#define FARF_FATAL 1 +#endif + +//! @cond Doxygen_Suppress +#define FARF_ALWAYS_LEVEL HAP_LEVEL_HIGH +#define FARF_LOW_LEVEL HAP_LEVEL_LOW +#define FARF_MEDIUM_LEVEL HAP_LEVEL_MEDIUM +#define FARF_HIGH_LEVEL HAP_LEVEL_HIGH +#define FARF_ERROR_LEVEL HAP_LEVEL_ERROR +#define FARF_FATAL_LEVEL HAP_LEVEL_FATAL +//! @endcond + +/* @} */ + + +/** +* @defgroup Runtime_FARF Runtime macros +* +* Runtime FARF macros can be enabled at runtime. +* They are turned OFF by default. +* +* Usage Example: +* @code +* +* FARF(RUNTIME_HIGH,"Log message"); +* +* @endcode +*/ +/* @{ */ +//! @cond Doxygen_Suppress +#ifndef FARF_RUNTIME_LOW +#define FARF_RUNTIME_LOW 1 +#endif +#define FARF_RUNTIME_LOW_LEVEL (HAP_LEVEL_RUNTIME | HAP_LEVEL_LOW) + +#ifndef FARF_RUNTIME_MEDIUM +#define FARF_RUNTIME_MEDIUM 1 +#endif +#define FARF_RUNTIME_MEDIUM_LEVEL (HAP_LEVEL_RUNTIME | HAP_LEVEL_MEDIUM) + +#ifndef FARF_RUNTIME_HIGH +#define FARF_RUNTIME_HIGH 1 +#endif +#define FARF_RUNTIME_HIGH_LEVEL (HAP_LEVEL_RUNTIME | HAP_LEVEL_HIGH) + +#ifndef FARF_RUNTIME_ERROR +#define FARF_RUNTIME_ERROR 1 +#endif +#define FARF_RUNTIME_ERROR_LEVEL (HAP_LEVEL_RUNTIME | HAP_LEVEL_ERROR) + +#ifndef FARF_RUNTIME_FATAL +#define FARF_RUNTIME_FATAL 1 +#endif +#define FARF_RUNTIME_FATAL_LEVEL (HAP_LEVEL_RUNTIME | HAP_LEVEL_FATAL) +//! @endcond +/* @} */ + + +//! @cond Doxygen_Suppress + +#define _FARF_PASTE(a,b) _FARF_PASTE_(a,b) +#define _FARF_PASTE_(a,b) a##b +#define _FARF_VAL(a) a + + +#define _FARF_0(x, ...) 
+ +#ifndef __FILENAME__ +#define __FILENAME__ __FILE__ +#endif + +#define _FARF_1(x, ...) \ + do { \ + if(0 == (HAP_debug_v2)) { \ + _HAP_debug_v2(FARF_##x##_LEVEL, __FILENAME__, __LINE__, ##__VA_ARGS__); \ + } else { \ + if (FARF_##x##_LEVEL & HAP_LEVEL_RUNTIME) { \ + if (0 != HAP_debug_runtime) { \ + HAP_debug_runtime(FARF_##x##_LEVEL ^ HAP_LEVEL_RUNTIME , __FILENAME__, __LINE__, ##__VA_ARGS__); \ + } else { \ + break; \ + } \ + } else { \ + HAP_debug_v2(FARF_##x##_LEVEL, __FILENAME__, __LINE__, ##__VA_ARGS__); \ + } \ + } \ + } while (0) + +#endif /* #ifndef HAP_FARF_H */ +//! @endcond diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_farf.md b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_farf.md new file mode 100755 index 0000000000000..35111fa2dfb37 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_farf.md @@ -0,0 +1,7 @@ +# HAP_farf + +## Overview + +The FARF API on DSP is used to generate diagnostic messages. These messages are sent to a diagnostic (or DIAG) framework on the DSP, from which they can be collected via USB using a tool called mini-dm running on the host computer. Parallelly, the DSP FARF messages can be routed to the application processor, allowing the user to collect DSP messages with logcat. These tools and the process for collecting messages is explained in the Messaging resources page from the SDK documentation. + +FARF messages can be enabled at compile-time and runtime. The Messaging resources page from the SDK documentation explains in detail the differences between compile-time and runtime FARF messages, how to enable them, and how to display them. diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_mem.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_mem.h new file mode 100755 index 0000000000000..0b6bae4d8336b --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_mem.h @@ -0,0 +1,466 @@ +/* + * Copyright (c) 2012-2020 Qualcomm Technologies, Inc. + * All Rights Reserved. + * Confidential and Proprietary - Qualcomm Technologies, Inc + */ + +#ifndef HAP_MEM_H +#define HAP_MEM_H +#include +#include "AEEStdDef.h" +#include "AEEStdErr.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @file HAP_mem.h + * @brief HAP Memory APIs + */ + + +/* + * Protections are chosen from these bits, or-ed together + */ + + + /*! @name HAP_PROT + \brief These macros define the permissions on memory block described by the file descriptor. + + It is passed as input parameter 'prot' to HAP_mmap(). These can be ORed to set the required permissions. + + +*/ + +///@{ + +/*! \def HAP_PROT_NONE + \brief Passing HAP_PROT_NONE as input results in setting 'NO' permissions on the buffer. +*/ +#define HAP_PROT_NONE 0x00 /* no permissions */ + +/*! \def HAP_PROT_READ + \brief Passing HAP_PROT_READ as input results in setting 'Read' permissions on the buffer. +*/ +#define HAP_PROT_READ 0x01 /* pages can be read */ +/*! \def HAP_PROT_WRITE + \brief Passing HAP_PROT_WRITE as input results in setting 'Write' permissions on the buffer. +*/ + +#define HAP_PROT_WRITE 0x02 /* pages can be written */ + +/*! + \def HAP_PROT_EXEC + \brief Passing HAP_PROT_EXEC as input results in setting 'Execute' permissions on the buffer. Currently not supported. +*/ +#define HAP_PROT_EXEC 0x04 /* pages can be executed */ + + +///@} + +/* + * Cache policy or-ed with protections parameter + */ + + /*! @name HAP_MEM_CACHE + \brief These macros define the cache policies for mapping memory pages to DSP MMU. Default cache policy is cache writeback. 
+ + It is passed as input parameter 'prot', or-ed with page protections to HAP_mmap(). + +*/ + +///@{ + +/*! \def HAP_MEM_CACHE_WRITEBACK + \brief Passing HAP_MEM_CACHE_WRITEBACK as input results in mapping memory as cache writeback +*/ +#define HAP_MEM_CACHE_WRITEBACK (0x10) /* cache write back */ + +/*! \def HAP_MEM_CACHE_NON_SHARED + \brief Passing HAP_MEM_CACHE_NON_SHARED as input results in mapping memory as uncached +*/ +#define HAP_MEM_CACHE_NON_SHARED (0x20) /* normal uncached memory */ + +/*! \def HAP_MEM_CACHE_WRITETHROUGH + \brief Passing HAP_MEM_CACHE_WRITETHROUGH as input results in mapping memory as cache write through +*/ + +#define HAP_MEM_CACHE_WRITETHROUGH (0x40) /* write through memory */ + +///@} + +/*! @name HAP_MEM_FLAGS + \brief These macros define the buffer attribute flags for allocating APPS memory from the DSP. + + It is passed as input parameter 'flags' to HAP_apps_mem_request(). + +*/ + +///@{ + +/*! \def HAP_MEM_FLAGS_SKIP_DSP_MAP + \brief Allocate memory on HLOS but skip DSP mapping +*/ + +#define HAP_MEM_FLAGS_SKIP_DSP_MAP 0 + +/*! \def HAP_MEM_FLAGS_DSP_MAP + \brief Allocate memory on HLOS and map on DSP +*/ + +#define HAP_MEM_FLAGS_DSP_MAP 1 + +/*! \def HAP_MEM_FLAGS_EXTENDED_MAP + \brief Allocate memory on HLOS and map beyond 4GB virtual address range on DSP. + + Unsupported currently. Reserved for future use. +*/ + +#define HAP_MEM_FLAGS_EXTENDED_MAP 2 + +/*! \def HAP_MEM_FLAGS_MAX + \brief Max number of flags supported by HAP_apps_mem_request +*/ + +#define HAP_MEM_FLAGS_MAX (HAP_MEM_FLAGS_EXTENDED_MAP + 1) + +///@} + +/** + * Allocate a block of memory. + * @param[in] bytes size of memory block in bytes. + * @param[out] pptr pointer to the memory block + * @return int AEE_SUCCESS for success and AEE_ENOMEMORY for failure. + */ + +static inline int HAP_malloc(uint32 bytes, void** pptr) +{ + *pptr = malloc(bytes); + if (*pptr) { + return AEE_SUCCESS; + } + return AEE_ENOMEMORY; +} + +/** + * Free the memory block allocated through HAP_malloc(). + * @param[in] ptr pointer to the memory block + * @return int AEE_EBADCLASS if ptr is NULL + AEE_SUCCESS if ptr is not NULL + + */ + +static inline int HAP_free(void* ptr) +{ + if(ptr == NULL) + return AEE_EBADCLASS; + free(ptr); + return AEE_SUCCESS; +} + +/** Statistics of user heap memory */ +struct HAP_mem_stats { + uint64 bytes_free; /**< number of bytes free from all the segments, + * may not be available for a single alloc + */ + uint64 bytes_used; /**< number of bytes used */ + uint64 seg_free; /**< number of segments free */ + uint64 seg_used; /**< number of segments used */ + uint64 min_grow_bytes; /**< minimum number of bytes to grow the heap by when creating a new segment */ +}; + +/** + * @brief Enum for reqID for HAP_mem_get_heap_info() + */ +enum HAP_mem_stats_request { + USAGE_STATS = 1, + MAX_USED +}; + +/** + * @brief RequestID/Response for HAP_mem_get_heap_info + */ +typedef struct { + enum HAP_mem_stats_request req_id; + union { + struct HAP_mem_stats usage_stats; + unsigned long max_used; /* Peak heap usage */ + }; +} HAP_mem_heap_info_t; + +/** + * Get the current statistics from the heap. + * + * @param[in,out] stats pointer to stats structure + * @retval AEE_SUCCESS + */ +int HAP_mem_get_stats(struct HAP_mem_stats *stats); + +/** + * Get the heap info. 
+ * + * @param payload, pointer to store the request/response + * @retval, 0 on success + */ +int HAP_mem_get_heap_info(HAP_mem_heap_info_t *payload); + +/** + * Enum to hold the START and END marker values + * + */ +typedef enum +{ + START = 0, + END +} marker_t; + +/** + * Request types: + * HAP_MEM_LOG_BLOCKS - to log all the blocks to csv + * file named: hprt_block_info_.csv + * + * HAP_MEM_SET_MARKER - to set markers for different instances. + * (2^16 instances are possible per application) + * + * HAP_MEM_MAP - to map buffer at random VA or reserved VA + * + * HAP_MEM_UNMAP - to unmap buffer + * + * HAP_RESERVE_VA - to reserve VA space on DSP without mapping + * + * HAP_UNRESERVE_VA - to unreserve VA space on DSP + */ +typedef enum +{ + HAP_MEM_LOG_BLOCKS = 1, + HAP_MEM_SET_MARKER = 2, + HAP_MEM_MAP = 3, + HAP_MEM_UNMAP = 4, + HAP_RESERVE_VA = 5, + HAP_UNRESERVE_VA = 6 +} HAP_mem_req_t; + +/** + * Payload structure for HAP_MEM_SET_MARKER request + * marker_type, START or END marker + * instance, incase of START - NOOP; if request is success, instance number. + * incase of END - instance number to find leaks + * + */ +typedef struct +{ + marker_t marker_type; + uint16_t instance; +} HAP_mem_marker_payload_t; + +/* Payload structure for HAP_MEM_MAP request */ +typedef struct { + uint64_t addr; // [in] reserved va (optional). If 0, buffer mapped at random VA + uint64_t len; // [in] length of buffer to be mapped + int prot; // [in] permissions and cache-mode of mapping + int flags; // [in] buffer flags + int fd; // [in] file descriptor of buffer + uint64_t dsp_pa; // [in] Offset + uint64_t dsp_va; // [out] Mapped DSP virtual address +} HAP_mem_map_t; + +/* Payload structure for HAP_MEM_UNMAP request */ +typedef struct { + uint64_t dsp_va; // [in] DSP VA to be unmapped + uint64_t len; // [in] length of mapping +} HAP_mem_unmap_t; + +/* Payload structure for HAP_RESERVE_VA request */ +typedef struct { + uint64_t len; // [in] Length of VA space to be reserved + int prot; // [in] Permissions of the VA space + int flags; // [in] flags (unused for now) + uint64_t dsp_va; // [out] Reserved DSP virtual address +} HAP_mem_reserve_t; + +/* Payload structure for HAP_UNRESERVE_VA request */ +typedef struct { + uint64_t dsp_va; // [in] DSP VA to be unreserved + uint64_t len; // [in] Length of buffer to be unreserved +} HAP_mem_unreserve_t; + +/** + * Payload for different requests + * New request payload structures should be + * added to the union. + */ +typedef struct +{ + HAP_mem_req_t request_id; + union { + HAP_mem_marker_payload_t mem_marker_payload; + HAP_mem_map_t mmap; + HAP_mem_unmap_t munmap; + HAP_mem_reserve_t reserve; + HAP_mem_unreserve_t unreserve; + }; +} HAP_mem_req_payload_t; + +/** + * Generic request API, which will decode request type + * and use the payload to parse the input and output + * for the request + * @param mem_payload- input and output payload for the request + * @retval 0 on success. + */ +int HAP_mem_request(HAP_mem_req_payload_t *mem_payload); + +/** + * Set the minimum and maximum grow size. + * + * This API allows to configure the minimum and maximum size that should + * be added to the DSP user heap when an allocation fails and more memory + * needs to be obtained from the HLOS. Using this API is optional. If not + * used, the runtime will try to choose reasonable growth sizes based on + * allocation history. 
+ * + + * @param[in] min minimum bytes to grow the heap by when requesting a new segment + * @param[in] max maximum bytes to grow the heap by when requesting a new segment + * @retval AEE_SUCCESS + * + */ +int HAP_mem_set_grow_size(uint64 min, uint64 max); + +/** + * Set low and high memory thresholds for heap + * + * Thresholds must be tuned according to the memory requirements + * + * Improper thresholds might led to heap failure + * + * @param[in] low_largest_block_size (in bytes) - the heap will grow if size of the largest free block is less than this threshold. + * Currently, setting this parameter will have no impact on the heap. + * @param[in] high_largest_block_size (in bytes) - the heap manager will release all unused sections if size of the largest free block is greater than this threshold. + * The recommended value for this, is the size of largest single allocation possible in your application. + * @return AEE_SUCCESS on success + * AEE_EBADPARM on failure + */ +int HAP_mem_set_heap_thresholds(unsigned int low_largest_block_size, unsigned int high_largest_block_size); + + +/** + * Map buffer associated with the file descriptor to DSP memory. The reference + * count gets incremented if the file descriptor is already mapped. This API is + * limited to buffer size less then 2 GB. Recommendation is to use HAP_mmap2 for + * buffer of size > 2 power(8*sizeof(size_t)) + * + * @param[in] addr mapping at fixed address, not supported currently. This has to be set to NULL + * @param[in] len size of the buffer to be mapped + * @param[in] prot protection flags - supported are only HAP_PROT_READ and HAP_PROT_WRITE. HAP_PROT_EXEC is not supported + * @param[in] flags HAP_MAP_NO_MAP - Increment reference count with no mapping + * 0 - map the buffer and increment the reference count + * @param[in] fd file descriptor for the buffer + * @param[in] offset offset into the buffer + * @retval mapped address + * -1 on failure + */ +void* HAP_mmap(void *addr, int len, int prot, int flags, int fd, long offset); + +/** + * Map buffer associated with the file descriptor to DSP memory. The reference + * count gets incremented if the file descriptor is already mapped. + * + * @param[in] addr mapping at fixed address, not supported currently. This has to be set to NULL + * @param[in] len size of the buffer to be mapped + * @param[in] prot protection flags - supported are only HAP_PROT_READ and HAP_PROT_WRITE. HAP_PROT_EXEC is not supported + * @param[in] flags HAP_MAP_NO_MAP - Increment reference count with no mapping + * 0 - map the buffer and increment the reference count + * @param[in] fd file descriptor for the buffer + * @param[in] offset offset into the buffer + * @retval mapped address + * -1 on failure + */ +void* HAP_mmap2(void *addr, size_t len, int prot, int flags, int fd, long offset); + +/** + * Decrements the reference count and unmaps the buffer from memory if the reference count goes to 0. + * This API is used for buffer size less then 2 GB. Recommendation is to use HAP_munmap2 for buffer of + * size > 2 power(8*sizeof(size_t)). + * + * @param[in] addr mapped address + * @param[in] len size of the mapped buffer + * @return 0 on success + * AEE_NOSUCHMAP in input addr is invalid + */ +int HAP_munmap(void *addr, int len); + +/** + * Decrements the reference count and unmaps the buffer from memory if the reference count goes to 0. 
+ *
+ * @param[in] addr mapped address
+ * @param[in] len size of the mapped buffer
+ * @return 0 on success
+ * AEE_NOSUCHMAP if input addr is invalid
+ */
+int HAP_munmap2(void *addr, size_t len);
+
+/**
+ * Gets the virtual and physical addresses associated with the buffer and
+ * increments the reference count.
+ *
+ * @param[in] fd file descriptor for the buffer
+ * @param[out] vaddr virtual address associated with the buffer
+ * @param[out] paddr physical address associated with the buffer
+ * @retval 0 on success
+ * AEE_ENOSUCHMAP if fd is invalid
+ */
+int HAP_mmap_get(int fd, void **vaddr, uint64 *paddr);
+
+/**
+ * Decrements the reference count of the file descriptor.
+ *
+ * @param[in] fd file descriptor of the buffer
+ * @retval 0 on success
+ * AEE_ENOSUCHMAP if fd is invalid
+ * AEE_EBADMAPREFCNT if map refcount is <=0
+ */
+int HAP_mmap_put(int fd);
+
+/**
+ * Get the stack size (in bytes) available for the current thread.
+ * Supported only on Lahaina and Cedros.
+ * @return available stack for the current thread, on success
+ * AEE_EINVALIDTHREAD if unable to get the current thread id
+ * AEE_ERESOURCENOTFOUND if unable to get the stack for the current thread
+ */
+uint64 HAP_mem_available_stack(void);
+
+/**
+ * Allocate and map APPS memory from the DSP.
+ *
+ * Usage of this API over malloc() is recommended when the client wants greater control over the DSP virtual address space,
+ * as free() does not necessarily free the allocated memory depending on heap thresholds.
+ * HAP_apps_mem_request and HAP_apps_mem_release guarantee freeing of the allocated memory.
+ *
+ * @param[in] len size of memory to be allocated
+ * @param[in] flags Buffer attribute flags HAP_MEM_FLAGS_SKIP_DSP_MAP, HAP_MEM_FLAGS_DSP_MAP or HAP_MEM_FLAGS_EXTENDED_MAP
+ * @param[out] fd file descriptor of buffer
+ * @param[out] dsp_va DSP mapped virtual address
+ * @return 0 on success
+ */
+int HAP_apps_mem_request(size_t len, uint32_t flags, int *fd, uint64_t *dsp_va);
+
+/**
+ * Release previously allocated APPS memory from the DSP.
+ * Releases memory from the HLOS. Also unmaps memory from the DSP
+ * if HAP_MEM_FLAGS_DSP_MAP was previously passed while
+ * requesting memory.
+ *
+ * @param[in] fd previously returned file descriptor of buffer
+ * @return 0 on success
+ */
+int HAP_apps_mem_release(int fd);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // HAP_MEM_H
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_mem.md b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_mem.md
new file mode 100755
index 0000000000000..42be2ab6cba12
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_mem.md
@@ -0,0 +1,75 @@
+# HAP_mem APIs
+
+## Overview
+The HAP_mem APIs provide functionality available from the DSP to
+
+* allocate and free memory - HAP_malloc() and HAP_free()
+* map and unmap ION buffers allocated on the application processor and passed to the DSP using file descriptors - HAP_mmap() and HAP_munmap()
+* get heap statistics and set properties - HAP_mem_get_stats(), HAP_mem_set_grow_size(), HAP_mmap_get(), HAP_mem_set_heap_thresholds() and HAP_mmap_put()
+* allocate and free APPS memory - HAP_apps_mem_request() and HAP_apps_mem_release()
+
+## Memory mapping
+
+A common usage scenario for the mapping functionality consists of the application processor allocating ION memory and passing the file descriptor to the DSP.
+The DSP then uses the HAP_mem APIs to map the buffer onto the DSP and obtain a memory pointer. The mapping remains valid until the buffer is unmapped.
+This approach makes it possible to maintain a mapping across multiple FastRPC calls.
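+
+A minimal sketch of this flow on the DSP side (assuming the file descriptor `fd` and buffer size `len` arrive as parameters of a FastRPC call):
+
+    #include "HAP_mem.h"
+
+    int process_shared_buffer(int fd, int len)
+    {
+        // Map the HLOS-allocated ION buffer and obtain a DSP pointer.
+        void *ptr = HAP_mmap(NULL, len, HAP_PROT_READ | HAP_PROT_WRITE, 0, fd, 0);
+        if ((void *)-1 == ptr)
+            return AEE_ENOMEMORY;
+
+        // ... use ptr, possibly across multiple FastRPC calls ...
+
+        // Drop the reference; the buffer is unmapped once the
+        // reference count reaches 0.
+        return HAP_munmap(ptr, len);
+    }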
+ +## Memory allocation + +HAP_malloc and HAP_free are simple wrappers around the DSP malloc and free functions. +If a user memory allocation request cannot be fulfilled with the existing DSP heap, the FastRPC +runtime will attempt to grow the DSP heap by reserving additional memory from the HLOS. + +The HAP_set_grow_size API can be called to configure the minimum and maximum size that should be added to the DSP heap when one of these growth events occurs. +If many growth events are anticipated, it may be appropriate to set a larger growth rate to reduce the number of growth events. However, increasing +the heap more than necessary will impact HLOS performance. Therefore, care must be taken in finding the appropriate growth rate for a given application. + +Here is how the min and max values set by the HAP_set_grow_size control the growth of the heap: + + min_grow_bytes = MIN(max,MAX(min,min_grow_bytes)); + + // The value will be aligned to the next 1MB boundary. + + actual_grow_bytes = min_grow_bytes + request_size + actual_grow_bytes = ALIGN(actual_grow_bytes,0x100000) + +`HAP_apps_mem_request()` and `HAP_apps_mem_release()` APIs can be called from the DSP to allocate APPS memory and map the same memory on the DSP if required. + +These HAP request and release APIs are recommended when the user wants greater control over the DSP virtual address space: unlike `malloc` and `free`, these APIs guarantee that the memory will be mapped when allocated and unmapped when freed. + +The mapping on the DSP can be controlled using the `flags` parameter in `HAP_apps_mem_request()`: + + * `HAP_MEM_FLAGS_SKIP_DSP_MAP` results in skipping the mapping on the DSP. In that case, the user needs to map the DSP memory by calling `HAP_mmap()`. + + * `HAP_MEM_FLAGS_DSP_MAP` results in mapping the buffer on the DSP upon calling `HAP_apps_mem_request()`. + +`HAP_apps_mem_release()` will always free the allocated HLOS memory but will only unmap the buffer on the DSP if the flag `HAP_MEM_FLAGS_DSP_MAP` was used when calling `HAP_apps_mem_request()`. + +***NOTE*** +If HAP_MEM_FLAGS_SKIP_DSP_MAP flag was used when calling `HAP_apps_mem_request()`, and the memory was mapped later using `HAP_mmap()`, then the user needs to unmap DSP memory by calling `HAP_munmap()`. + +## Memory statistics + +HAP_mem_get_stats is useful when called at the beginning and end of an application to check for any memory leaks. + +## Memory request API +HAP_mem_request is the request API, which support different request types. Requests supported are: + +* `HAP_MEM_LOG_BLOCKS`: This request will log all the heap blocks to the csv file named - hprt_block_info_.csv, for parsing use QMemCheck tool. + If block info logging is successful - 0 will be returned back by the HAP_mem_request. This request doesn't need any payload union. +* `HAP_MEM_SET_MARKER`: This request is to mark instances for leak detection, the markers can be START or END markers. + When START marker is called, a marker instance number will be returned back to caller of the API (if the request is SUCCESS(0)) in the payload member: + mem_marker_payload. + When END marker is called, the caller should fill the instance number for which marker needs to be ended. If the request is success, + all the leaks from the START to END of that instance will be logged to hprt_leak_block_info__.csv +* `HAP_MEM_MAP`: This request is to create a DSP mapping for a shared buffer. + The payload structure for this request can be referred to in `HAP_mem_map_t`. 
To create the mapping at a reserved va, the start address needs to be specified in the `addr` field of the payload. + If the request is SUCCESS(0), the `dsp_va` member of payload will hold the mapped virtual address (VA). +* `HAP_MEM_UNMAP`: This request is to unmap the memory region on the DSP. + The payload structure for this request can be referred to in `HAP_mem_unmap_t`. The starting virtual address and length of buffer needs to passed as payload members `dsp_va` and `len`. + If the request is SUCCESS(0), the virtual address (VA) mapping is removed from the DSP. +* `HAP_RESERVE_VA`: This request is to reserve virtual address (VA) space on the DSP without creating any mappings. + The payload structure for this request can be referred to in `HAP_mem_reserve_t`. + If the request is SUCCESS(0), the `dsp_va` member of payload will hold the reserved virtual address (VA). +* `HAP_UNRESERVE_VA`: This request is to unreserve the virtual address (VA) space on the DSP. + The payload structure for this request can be referred to in `HAP_mem_unreserve_t`. If the request is SUCCESS(0), the virtual address space is successfully unreserved. diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_pd_dtor.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_pd_dtor.h new file mode 100755 index 0000000000000..e612ed135a908 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_pd_dtor.h @@ -0,0 +1,51 @@ +#ifndef HAP_PD_DTOR_H +#define HAP_PD_DTOR_H +/*============================================================================== + Copyright (c) 2015 Qualcomm Technologies Incorporated. + All Rights Reserved Qualcomm Technologies Proprietary + + Export of this technology or software is regulated by the U.S. + Government. Diversion contrary to U.S. law prohibited. +==============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * This type is used to provide the qdi driver the register address, and the + * bits of the register to clear. + * + * @param register_addr, The register address whose value needs to modified on process exit + * @param register_mask, A mask that indicates which bits of the register need to be set. + * @param register_val, The value that needs to be applied to the unmasked bits. + */ +typedef struct { + uintptr_t register_addr; + uint32 register_mask; + uint32 register_value; +}HAP_register_t; + +/** + * A fastrpc process can call this method and provide a list of register addresses + * and their desired values. When the fastrpc process exits, a previous call to this + * method will ensure that the bits (defined in the bitmask) for the provided + * registers are cleared. + + * @param num_registers, Number of registers that need to be modified on process exit + * @param registers, The register address and masks. 
+ */ +int HAP_clear_registers(unsigned int num_registers, HAP_register_t* registers); + +/** + * This method is used by the kernel to free any memory that + * a fastrpc client might have line-locked +*/ +int HAP_free_linelocked_memory(unsigned int asid); + +#ifdef __cplusplus +} +#endif + +#endif /*HAP_PD_DTOR_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_perf.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_perf.h new file mode 100755 index 0000000000000..1f9da76d8c785 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_perf.h @@ -0,0 +1,120 @@ +/*============================================================================== +@file + HAP_perf.h + +@brief + Header file for DSP Perf APIs + +Copyright (c) 2012-2017, 2020 QUALCOMM Technologies, Incorporated. +All Rights Reserved. +QUALCOMM Proprietary. +==============================================================================*/ +#ifndef HAP_PERF_H +#define HAP_PERF_H + +#include "AEEStdDef.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** @defgroup timer_functionality Timer functionality. + * @{ + */ + +/** + * Gets the current value of 56-bit, 19.2MHz hardware counter converted + * to micro-seconds. This value should be treated as relative and + * not absolute. The value wraps around to zero once it exceeds the + * maxiumum value. This function performs an integer division in order + * to convert ticks to time, which adds some overhead. Consider using + * HAP_perf_get_qtimer_count for a lower overhead. +*/ +#ifdef __hexagon__ +#include "hexagon_sim_timer.h" +static inline uint64 HAP_perf_get_time_us(void) +{ + /* Converts ticks into microseconds + 1 tick = 1/19.2MHz seconds + Micro Seconds = Ticks * 10ULL/192ULL */ + unsigned long long count; + asm volatile (" %0 = c31:30 " : "=r"(count)); + return (uint64)(count) * 10ull / 192ull; +} +#else +uint64 HAP_perf_get_time_us(void) +{ + static long long start = 0; + // TODO + // assume 500 MHz on simulator + //return HAP_perf_get_pcycles() / 500; + return start++; +} +#endif + +/** + * Gets the current 56 bit, 19.2MHz global hardware clock count. + * This value should be treated as relative and not absolute. + * The value wraps around to zero once it exceeds the maxiumum value. +*/ +static inline uint64 HAP_perf_get_qtimer_count(void) { + unsigned long long cur_count; + asm volatile(" %0 = c31:30 " : "=r"(cur_count)); + return (uint64)cur_count; +} + +/** + * Converts the 19.2 MHz global hardware count to micro-seconds. + * @param[in] count - 19.2 MHz global hardware count + * @returns - Time in micro-seconds. +*/ +uint64 HAP_perf_qtimer_count_to_us(uint64 count); + +/** + * Gets the current 64-bit Hexagon Processor cycle count. + * The processor cycle count is the current number of processor + * cycles executed since the processor was last reset. Note + * that this counter stops incrementing whenever the DSP enters + * a low-power state (such as clock gating), as opposed to the + * qtimer, which increments regardless of the DSP power state. +*/ +#ifdef __hexagon__ +#include "hexagon_sim_timer.h" +static inline uint64 HAP_perf_get_pcycles(void) +{ + uint64_t pcycle; + asm volatile ("%[pcycle] = C15:14" : [pcycle] "=r"(pcycle)); + return pcycle; +} +#else +uint64 HAP_perf_get_pcycles(void) +{ + return (uint64)0; +} +#endif + +/** + * @} + */ + +/** @defgroup sleep_functionality Sleep functionality. + * @{ + */ + +/** + * Suspends the calling thread from execution until the + * specified duration has elapsed. + * @param[in] sleep_duration: - Sleep duration in micro-seconds. 
+ * @returns - returns 0 on success, non zero in error case. +*/ +int HAP_timer_sleep(unsigned long long sleep_duration); + +/** + * @} // sleep_functionality + */ + +#ifdef __cplusplus +} +#endif + +#endif // HAP_PERF_H diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_perf.md b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_perf.md new file mode 100755 index 0000000000000..39895f35a5699 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_perf.md @@ -0,0 +1,23 @@ +# Introduction {#intro} + +The Hexagon SDK provides APIs to measure the elapsed time in both microseconds +and processor cycles(pcycles). + + +# API Overview {#api-overview} + +The HAP_perf APIs are used by clients for profiling their code when running on the DSP. The profiling +can be done in both microseconds and pcycles based on the needs. Morevover, the HAP_perf library +also provides sleep APIs to the clients. + +The HAP_perf APIs include the following functions: + +::HAP_perf_get_time_us + +::HAP_perf_get_qtimer_count + +::HAP_perf_qtimer_count_to_us + +::HAP_perf_get_pcycles + +::HAP_timer_sleep diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_power.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_power.h new file mode 100755 index 0000000000000..55138edb1871f --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_power.h @@ -0,0 +1,825 @@ +/*============================================================================== +@file + HAP_power.h + +@brief + Header file of DSP power APIs. + +Copyright (c) 2015,2019,2022 Qualcomm Technologies, Inc. +All rights reserved. Qualcomm Proprietary and Confidential. +==============================================================================*/ + +#ifndef _HAP_POWER_H +#define _HAP_POWER_H + +#include "AEEStdErr.h" +#include "AEEStdDef.h" +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +//Add a weak reference so shared objects do not throw link error +#pragma weak HAP_power_destroy_client + +/** + * Low-power modes for HAP DCVS V3 interface, used in 'sleep_disable' param in DCVS_v3. + * + * In general, applications are expected to vote for their latency tolerance via the + * 'latency' parameter in DCVS_v3/DCVS_v2 options. The aggregated latency vote across + * clients is used in selecting appropriate low-power mode (LPM) of the DSP subsystem. + * LPM will save power when the DSP subsystem is idle by reducing leakage current. + * Deeper LPMs typically have higher wake up latencies, which will increase interrupt + * service delays and add to inter-processor communication latencies. Though the + * latency vote controls the selection of low-power modes, the vote required for + * disabling/allowing certain LPMs is difficult to calculate as the wakeup latency + * associated with these LPMs could change from chipset to chipset and between runs + * within the same chipset. + * + * This 'sleep_disable' parameter in DCVS_v3 allows user to directly prevent certain LPM + * levels of the DSP subsystem. By default, there is no restriction placed on LPMs i.e. + * all the LPMs are enabled and the aggregated latency vote (along with other system + * parameters) is used in LPM selection. The 'sleep_disable' parameter in DCVS_v3 is for + * the advanced developers who would like to disable certain low-power modes explicitly + * irrespective of the latency vote. Developers need to consider their power-performance + * tradeoff requirements and if necessary profile the results before voting using this + * parameter. Regular users are suggested to choose the default i.e. 
+ * 'HAP_DCVS_LPM_ENABLE_ALL'.
+ *
+ * If a particular LPM level is not supported on the DSP subsystem then the nearest
+ * shallower LPM level is enabled. For example, in the absence of 'HAP_DCVS_LPM_LEVEL3' it will select
+ * 'HAP_DCVS_LPM_LEVEL2', which is the nearest shallower LPM level to 'HAP_DCVS_LPM_LEVEL3'.
+ */
+#define HAP_DCVS_LPM_LEVEL1      1   /**< To disable all low-power modes */
+#define HAP_DCVS_LPM_LEVEL2      2   /**< To enable only standalone APCR */
+#define HAP_DCVS_LPM_LEVEL3      3   /**< To enable RPM assisted APCR */
+#define HAP_DCVS_LPM_ENABLE_ALL  0   /**< To enable all low-power modes (enables full power collapse) */
+
+#define HAP_DCVS_VOLT_CORNER_TURBO_L2_L3_DEFINED  /**< To indicate presence of L2 and L3 corners in HAP_dcvs_voltage_corner_t */
+#define HAP_POWER_SET_HMX_V2_DEFINED              /**< To indicate support for HAP_power_set_HMX_v2 request type in HAP_power_set */
+#define HAP_POWER_SET_CENG_BUS_VOTING_DEFINED     /**< To indicate support for HAP_power_set_CENG_bus request type in HAP_power_set */
+
+/**
+* Possible error codes returned
+*/
+typedef enum {
+    HAP_POWER_ERR_UNKNOWN = -1,
+    HAP_POWER_ERR_INVALID_PARAM = -2,
+    HAP_POWER_ERR_UNSUPPORTED_API = -3
+} HAP_power_error_codes;
+
+/** Payload for HAP_power_set_mips_bw */
+typedef struct {
+    boolean set_mips;                    /**< Set to TRUE to request MIPS */
+    unsigned int mipsPerThread;          /**< MIPS requested per thread, to establish a minimal clock frequency per HW thread */
+    unsigned int mipsTotal;              /**< Total MIPS requested, to establish the total number of MIPS required across all HW threads */
+    boolean set_bus_bw;                  /**< Set to TRUE to request bus_bw */
+    uint64 bwBytePerSec;                 /**< Max bus BW requested (bytes per second) */
+    unsigned short busbwUsagePercentage; /**< Percentage of time during which bwBytePerSec BW is required from the bus (0..100) */
+    boolean set_latency;                 /**< Set to TRUE to set latency */
+    int latency;                         /**< Maximum hardware wakeup latency in microseconds. The higher the value,
+                                          * the deeper the state of sleep that can be entered but the longer it may take
+                                          * to awaken. Only values > 0 are supported (1 microsecond is the smallest valid value) */
+} HAP_power_mips_bw_payload;
+
+/** @defgroup HAP_power_enums HAP POWER enums
+ * @{
+ */
+ /** Clock frequency match type */
+typedef enum {
+    HAP_FREQ_AT_LEAST,   /**< Matches at least the specified frequency. */
+    HAP_FREQ_AT_MOST,    /**< Matches at most the specified frequency. */
+    HAP_FREQ_CLOSEST,    /**< Closest match to the specified frequency. */
+    HAP_FREQ_EXACT,      /**< Exact match with the specified frequency. */
+    HAP_FREQ_MAX_COUNT   /**< Maximum count.
+                          */
+} HAP_freq_match_type;
+/**
+ * @} // HAP_power_enums
+ */
+
+/** Configuration for bus bandwidth */
+typedef struct {
+    boolean set_bus_bw;                  /**< Set to TRUE to request bus_bw */
+    uint64 bwBytePerSec;                 /**< Max bus BW requested (bytes per second) */
+    unsigned short busbwUsagePercentage; /**< Percentage of time during which bwBytePerSec BW is required from the bus (0..100) */
+} HAP_power_bus_bw;
+
+/**
+* @brief Payload for vapss power request
+* The VAPSS core is used for video post processing
+*/
+typedef struct {
+    boolean set_clk;               /**< Set to TRUE to request clock frequency */
+    unsigned int clkFreqHz;        /**< Clock frequency in Hz */
+    HAP_freq_match_type freqMatch; /**< Clock frequency match */
+    HAP_power_bus_bw dma_ext;      /**< DMA external bus bandwidth */
+    HAP_power_bus_bw hcp_ext;      /**< HCP external bus bandwidth */
+    HAP_power_bus_bw dma_int;      /**< DMA internal bus bandwidth */
+    HAP_power_bus_bw hcp_int;      /**< HCP internal bus bandwidth */
+} HAP_power_vapss_payload;
+
+/**
+* @brief Payload for vapss_v2 power request
+* Supported on targets which have a split VAPSS core (DMA and HCP), from Hana onwards
+*/
+typedef struct {
+    boolean set_dma_clk;           /**< Set to TRUE to request DMA clock frequency */
+    boolean set_hcp_clk;           /**< Set to TRUE to request HCP clock frequency */
+    unsigned int dmaClkFreqHz;     /**< DMA clock frequency in Hz */
+    unsigned int hcpClkFreqHz;     /**< HCP clock frequency in Hz */
+    HAP_freq_match_type freqMatch; /**< Clock frequency match type */
+    HAP_power_bus_bw dma_ext;      /**< DMA external bus bandwidth */
+    HAP_power_bus_bw hcp_ext;      /**< HCP external bus bandwidth */
+    HAP_power_bus_bw dma_int;      /**< DMA internal bus bandwidth */
+    HAP_power_bus_bw hcp_int;      /**< HCP internal bus bandwidth */
+} HAP_power_vapss_payload_v2;
+
+/** Payload for HAP_power_set_HVX */
+typedef struct {
+    boolean power_up;  /**< Set to TRUE to turn on HVX, and FALSE to turn off. */
+} HAP_power_hvx_payload;
+
+/**
+* Payload for HAP_power_set_HMX
+* Supported from Lahaina onwards */
+typedef struct {
+    boolean power_up;  /**< Set to TRUE to turn on HMX, and FALSE to turn off.
+                        * When TRUE, on chipsets with a separate HMX clock, a default
+                        * HMX clock will be selected based on the voted
+                        * Q6 core clock level from the same HAP_power_set context.
+                        */
+} HAP_power_hmx_payload;
+
+/** @defgroup HAP_power_enums HAP POWER enums
+ * @{
+ */
+/** Payload for HAP power client classes */
+typedef enum {
+    HAP_POWER_UNKNOWN_CLIENT_CLASS        = 0x00,  /**< Unknown client class */
+    HAP_POWER_AUDIO_CLIENT_CLASS          = 0x01,  /**< Audio client class */
+    HAP_POWER_VOICE_CLIENT_CLASS          = 0x02,  /**< Voice client class */
+    HAP_POWER_COMPUTE_CLIENT_CLASS        = 0x04,  /**< Compute client class */
+    HAP_POWER_STREAMING_1HVX_CLIENT_CLASS = 0x08,  /**< Camera streaming with 1 HVX client class */
+    HAP_POWER_STREAMING_2HVX_CLIENT_CLASS = 0x10,  /**< Camera streaming with 2 HVX client class */
+} HAP_power_app_type_payload;
+/**
+ * @} // HAP_power_enums
+ */
+
+/** Payload for HAP_power_set_linelock */
+typedef struct {
+    void* startAddress;       /**< Start address of the memory region to be locked. */
+    uint32 size;              /**< Size (bytes) of the memory region to be locked. Set size
+                               * to 0 to unlock memory. */
+    uint32 throttleBlockSize; /**< Block size for throttling, in bytes;
+                               * 0 for no throttling. The region to be locked will be divided into
+                               * blocks of this size for throttling purposes.
+                               * Use for locking larger cache blocks.
+                               * Applicable only when enabling line locking. Only ONE throttled linelock call is supported at this time.
+                               * You can linelock additional regions (without throttling) using HAP_power_set_linelock_nothrottle */
+    uint32 throttlePauseUs;   /**< Pause to be applied between locking each block, in microseconds. Applicable only when enabling line locking */
+} HAP_power_linelock_payload;
+
+/** Payload for HAP_power_set_linelock_nothrottle */
+typedef struct {
+    void* startAddress;  /**< Start address of the memory region to be locked. */
+    uint32 size;         /**< Size (bytes) of the memory region to be locked. Set size to 0
+                          * to unlock memory */
+} HAP_power_linelock_nothrottle_payload;
+
+/** @defgroup HAP_power_enums HAP POWER enums
+ * @{
+ */
+/** Option for dcvs payload */
+typedef enum {
+    HAP_DCVS_ADJUST_UP_DOWN = 0x1,  /**< Increase and decrease core/bus clock speed. */
+    HAP_DCVS_ADJUST_ONLY_UP = 0x2,  /**< Restricts DCVS from lowering the clock speed below the requested value. */
+} HAP_power_dcvs_payload_option;
+/**
+ * @} // HAP_power_enums
+ */
+
+/** Payload for HAP_power_set_DCVS */
+typedef struct {
+    boolean dcvs_enable;  /**< Set to TRUE to participate in DCVS, and FALSE otherwise. */
+    HAP_power_dcvs_payload_option dcvs_option;  /**< Set to one of
+                                                 * HAP_DCVS_ADJUST_UP_DOWN - Allows for DCVS to adjust up and down.
+                                                 * HAP_DCVS_ADJUST_ONLY_UP - Allows for DCVS to adjust up only. */
+} HAP_power_dcvs_payload;
+
+/** @defgroup HAP_power_enums HAP POWER enums
+ * @{
+ */
+/** Voltage corners for HAP DCVS V2 interface */
+typedef enum {
+    HAP_DCVS_VCORNER_DISABLE,
+    HAP_DCVS_VCORNER_SVS2,
+    HAP_DCVS_VCORNER_SVS,
+    HAP_DCVS_VCORNER_SVS_PLUS,
+    HAP_DCVS_VCORNER_NOM,
+    HAP_DCVS_VCORNER_NOM_PLUS,
+    HAP_DCVS_VCORNER_TURBO,
+    HAP_DCVS_VCORNER_TURBO_PLUS,
+    HAP_DCVS_VCORNER_TURBO_L2,  /**< On targets released up to Kailua, the HAP_DCVS_VCORNER_TURBO_L2 level will be treated as HAP_DCVS_VCORNER_TURBO_PLUS */
+    HAP_DCVS_VCORNER_TURBO_L3,  /**< On targets released up to Kailua, the HAP_DCVS_VCORNER_TURBO_L3 level will be treated as HAP_DCVS_VCORNER_TURBO_PLUS */
+    HAP_DCVS_VCORNER_MAX = 255,
+} HAP_dcvs_voltage_corner_t;
+
+/**
+* Expanded voltage corners for HAP_power_set corner voting options
+*/
+typedef enum {
+    HAP_DCVS_EXP_VCORNER_DISABLE = 0,
+    HAP_DCVS_EXP_VCORNER_MIN = 0x100,
+    /**< Selects the minimum voltage corner defined for the chipset */
+    HAP_DCVS_EXP_VCORNER_LOW_SVS_D2 = 0x134,
+    HAP_DCVS_EXP_VCORNER_LOW_SVS_D1 = 0x138,
+    HAP_DCVS_EXP_VCORNER_LOW_SVS = 0x140,
+    HAP_DCVS_EXP_VCORNER_SVS = 0x180,
+    HAP_DCVS_EXP_VCORNER_SVS_L1 = 0x1C0,
+    HAP_DCVS_EXP_VCORNER_NOM = 0x200,
+    HAP_DCVS_EXP_VCORNER_NOM_L1 = 0x240,
+    HAP_DCVS_EXP_VCORNER_TUR = 0x280,
+    HAP_DCVS_EXP_VCORNER_TUR_L1 = 0x2A0,
+    HAP_DCVS_EXP_VCORNER_TUR_L2 = 0x2B0,
+    HAP_DCVS_EXP_VCORNER_TUR_L3 = 0x2C0,
+    HAP_DCVS_EXP_VCORNER_MAX = 0xFFFF,
+    /**< Selects the maximum voltage corner defined for the chipset */
+} HAP_dcvs_exp_voltage_corner_t;
+
+/**
+* Perf modes to specify the clock frequency level within the
+* target voltage corner.
+*/
+typedef enum {
+    HAP_CLK_PERF_HIGH = 0,  /**< To select the max frequency at the target voltage corner. */
+    HAP_CLK_PERF_LOW,       /**< To select the min frequency at the target voltage corner.
+                             */
+} HAP_clk_perf_mode_t;
+
+/**
+ * @} // HAP_power_enums
+ */
+
+#define HAP_DCVS_VCORNER_SVSPLUS  HAP_DCVS_VCORNER_SVS_PLUS
+#define HAP_DCVS_VCORNER_NOMPLUS  HAP_DCVS_VCORNER_NOM_PLUS
+#define HAP_DCVS_VCORNER_TURBO_L1 HAP_DCVS_VCORNER_TURBO_PLUS
+
+/** DCVS parameters for HAP_power_dcvs_v2_payload */
+typedef struct {
+    HAP_dcvs_voltage_corner_t target_corner;  /**< target voltage corner */
+    HAP_dcvs_voltage_corner_t min_corner;     /**< minimum voltage corner */
+    HAP_dcvs_voltage_corner_t max_corner;     /**< maximum voltage corner */
+    uint32 param1;  /**< reserved */
+    uint32 param2;  /**< reserved */
+    uint32 param3;  /**< reserved */
+} HAP_dcvs_params_t;
+
+/** Core clock parameters for HAP_power_dcvs_v3_payload */
+typedef struct {
+    HAP_dcvs_voltage_corner_t target_corner;  /**< target voltage corner */
+    HAP_dcvs_voltage_corner_t min_corner;     /**< minimum voltage corner */
+    HAP_dcvs_voltage_corner_t max_corner;     /**< maximum voltage corner */
+    uint32 param1;  /**< reserved */
+    uint32 param2;  /**< reserved */
+    uint32 param3;  /**< reserved */
+} HAP_core_params_t;
+
+/** Bus clock parameters for HAP_power_dcvs_v3_payload */
+typedef struct {
+    HAP_dcvs_voltage_corner_t target_corner;  /**< target voltage corner */
+    HAP_dcvs_voltage_corner_t min_corner;     /**< minimum voltage corner */
+    HAP_dcvs_voltage_corner_t max_corner;     /**< maximum voltage corner */
+    uint32 param1;  /**< reserved */
+    uint32 param2;  /**< reserved */
+    uint32 param3;  /**< reserved */
+} HAP_bus_params_t;
+
+/** DCVS v3 parameters for HAP_power_dcvs_v3_payload */
+typedef struct {
+    uint32 param1;  /**< reserved */
+    uint32 param2;  /**< reserved */
+    uint32 param3;  /**< reserved */
+    uint32 param4;  /**< reserved */
+    uint32 param5;  /**< reserved */
+    uint32 param6;  /**< reserved */
+} HAP_dcvs_v3_params_t;
+
+/** @defgroup HAP_power_enums HAP POWER enums
+ * @{
+ */
+/** Option for dcvs_v2 payload */
+typedef enum {
+    HAP_DCVS_V2_ADJUST_UP_DOWN = 0x1,  /**< Allows for DCVS to adjust up and down. */
+    HAP_DCVS_V2_ADJUST_ONLY_UP = 0x2,  /**< Allows for DCVS to adjust up only. */
+    HAP_DCVS_V2_POWER_SAVER_MODE = 0x4,  /**< HAP_DCVS_POWER_SAVER_MODE - Higher thresholds for power efficiency. */
+    HAP_DCVS_V2_POWER_SAVER_AGGRESSIVE_MODE = 0x8,  /**< HAP_DCVS_POWER_SAVER_AGGRESSIVE_MODE - Higher thresholds for power efficiency with faster ramp down. */
+    HAP_DCVS_V2_PERFORMANCE_MODE = 0x10,  /**< HAP_DCVS_PERFORMANCE_MODE - Lower thresholds for maximum performance */
+    HAP_DCVS_V2_DUTY_CYCLE_MODE = 0x20,  /**< HAP_DCVS_DUTY_CYCLE_MODE - only for HVX based clients.
+                                          * For streaming class clients:
+                                          *   > detects periodicity based on HVX usage
+                                          *   > lowers clocks in the no-HVX-activity region of each period.
+                                          * For compute class clients:
+                                          *   > Lowers clocks when no HVX activity is detected and brings clocks up on detecting HVX activity again.
+                                          *   > Latency involved in bringing up the clock will be at most 1 to 2 ms.
+                                          */
+} HAP_power_dcvs_v2_payload_option;
+/**
+ * @} // HAP_power_enums
+ */
+/** Payload for HAP_power_set_DCVS_v2 */
+typedef struct {
+    boolean dcvs_enable;     /**< Set to TRUE to participate in DCVS, and FALSE otherwise */
+    HAP_power_dcvs_v2_payload_option dcvs_option;  /**< Set to one of HAP_power_dcvs_v2_payload_option */
+    boolean set_latency;     /**< TRUE to set the latency parameter, otherwise FALSE */
+    uint32 latency;          /**< sleep latency */
+    boolean set_dcvs_params; /**< TRUE to set DCVS params, otherwise FALSE */
+    HAP_dcvs_params_t dcvs_params;  /**< DCVS parameters */
+} HAP_power_dcvs_v2_payload;
+
+/** Payload for HAP_power_set_DCVS_v3 */
+typedef struct {
+    boolean set_dcvs_enable;    /**< TRUE to consider the DCVS enable/disable and option parameters, otherwise FALSE */
+    boolean dcvs_enable;        /**< Set to TRUE to participate in DCVS, and FALSE otherwise. */
+    HAP_power_dcvs_v2_payload_option dcvs_option;  /**< Set to one of HAP_power_dcvs_v2_payload_option */
+    boolean set_latency;        /**< TRUE to consider the latency parameter, otherwise FALSE */
+    uint32 latency;             /**< sleep latency */
+    boolean set_core_params;    /**< TRUE to consider core clock params, otherwise FALSE */
+    HAP_core_params_t core_params;  /**< Core clock parameters */
+    boolean set_bus_params;     /**< TRUE to consider bus clock params, otherwise FALSE */
+    HAP_bus_params_t bus_params;    /**< Bus clock parameters */
+    boolean set_dcvs_v3_params; /**< TRUE to consider DCVS v3 params, otherwise FALSE */
+    HAP_dcvs_v3_params_t dcvs_v3_params;  /**< DCVS v3 parameters */
+    boolean set_sleep_disable;  /**< TRUE to consider the sleep disable/enable parameter, otherwise FALSE */
+    unsigned char sleep_disable; /**< See HAP_DCVS_LPM_LEVEL1, HAP_DCVS_LPM_LEVEL2, HAP_DCVS_LPM_LEVEL3 and HAP_DCVS_LPM_ENABLE_ALL above */
+} HAP_power_dcvs_v3_payload;
+
+/** @defgroup HAP_power_enums HAP POWER enums
+ * @{
+ */
+ /** Type for dcvs update request */
+typedef enum {
+    HAP_POWER_UPDATE_DCVS = 1,
+    HAP_POWER_UPDATE_SLEEP_LATENCY,
+    HAP_POWER_UPDATE_DCVS_PARAMS,
+} HAP_power_update_type_t;
+/**
+ * @} // HAP_power_enums
+ */
+/** Payload for DCVS update */
+typedef struct {
+    boolean dcvs_enable;  /**< TRUE for DCVS enable and FALSE for DCVS disable */
+    HAP_power_dcvs_v2_payload_option dcvs_option;  /**< Requested DCVS policy in case DCVS enable is TRUE */
+} HAP_power_update_dcvs_t;
+
+/** Payload for latency update */
+typedef struct {
+    boolean set_latency;   /**< TRUE if the sleep latency request has to be considered */
+    unsigned int latency;  /**< Sleep latency request in microseconds */
+} HAP_power_update_latency_t;
+
+/** Payload for DCVS params update */
+typedef struct {
+    boolean set_dcvs_params;  /**< Flag to mark DCVS params structure validity, TRUE for a valid DCVS
+                               * params request and FALSE otherwise */
+    HAP_dcvs_params_t dcvs_params;  /**< Intended DCVS params if set_dcvs_params is set to TRUE */
+} HAP_power_update_dcvs_params_t;
+
+/** Payload for HAP_power_set_dcvs_v2_update */
+typedef struct {
+    HAP_power_update_type_t update_param;  /**< Type of param to update */
+    union {
+        HAP_power_update_dcvs_t dcvs_payload;
+        HAP_power_update_latency_t latency_payload;
+        HAP_power_update_dcvs_params_t dcvs_params_payload;
+    };  /**< Update payload for DCVS, latency or DCVS params */
+} HAP_power_dcvs_v2_update_payload;
+
+/** Payload for HAP_power_set_streamer */
+typedef struct {
+    boolean set_streamer0_clk;         /**< Set streamer 0 clock */
+    boolean set_streamer1_clk;         /**< Set streamer 1 clock */
+    unsigned int streamer0_clkFreqHz;  /**< Streamer 0 clock
+                                        * frequency */
+    unsigned int streamer1_clkFreqHz;  /**< Streamer 1 clock frequency */
+    HAP_freq_match_type freqMatch;     /**< Clock frequency match */
+    uint32 param1;  /**< Reserved for future streamer parameters */
+    uint32 param2;  /**< Reserved for future streamer parameters */
+    uint32 param3;  /**< Reserved for future streamer parameters */
+} HAP_power_streamer_payload;
+
+/**
+* Payload for HAP_power_set_HMX_v2.
+* Gives the user the flexibility to vote for the HMX clock based on either a voltage
+* corner or a frequency. The user can also provide DCVS limits for the HMX clock when
+* DCVS participation is enabled via the HAP_power_set_DCVS/_v2/_v3 options.
+* On chipsets without a separate HMX clock, requests made for a target corner or
+* frequency will return the AEE_EBADPARM error.
+*/
+
+typedef struct {
+    boolean set_power;  /**< Set to TRUE to consider the HMX power_up parameter to turn ON/OFF HMX, otherwise FALSE. */
+    boolean power_up;   /**< Set to TRUE to turn on HMX, and FALSE to turn off. */
+    boolean set_clock;  /**< TRUE to consider the HMX clock parameters. All the following parameters
+                         * will be ignored if set to FALSE. By default (pick_default, target_corner,
+                         * freq_mhz and floor_freq_mhz all 0), the lowest HMX clock frequency will be selected. */
+    boolean pick_default;  /**< Set to TRUE to select the default HMX clock based on the voted
+                            * Q6 core clock level from the same HAP_power_set context, otherwise FALSE.
+                            * When TRUE, the target_corner, freq_mhz and floor_freq_mhz params
+                            * should be set to 0. */
+    HAP_dcvs_exp_voltage_corner_t target_corner;  /**< Target voltage corner. See HAP_dcvs_exp_voltage_corner_t.
+                                                   * For target_corner > 0, the pick_default, freq_mhz and floor_freq_mhz
+                                                   * params should be set to 0.
+                                                   * The maximum target_corner request among the requesting clients
+                                                   * will be considered as the final vote. */
+    HAP_dcvs_exp_voltage_corner_t min_corner;  /**< minimum voltage corner for DCVS. See HAP_dcvs_exp_voltage_corner_t. */
+    HAP_dcvs_exp_voltage_corner_t max_corner;  /**< maximum voltage corner for DCVS. See HAP_dcvs_exp_voltage_corner_t. */
+    HAP_clk_perf_mode_t perf_mode;  /**< To specify the clock frequency level within the target voltage corner. */
+    uint32 freq_mhz;  /**< Frequency request in MHz. freq_mhz requests across clients
+                       * will be accumulated. For freq_mhz > 0, target_corner and
+                       * pick_default should be set to 0. */
+    uint32 floor_freq_mhz;  /**< Floor frequency request in MHz.
+                             * For floor_freq_mhz > 0, target_corner and pick_default
+                             * should be set to 0. The maximum floor_freq_mhz request among the
+                             * requesting clients will be considered.
+                             * The maximum between the aggregated freq_mhz and floor_freq_mhz
+                             * will be considered as the final frequency request. */
+    uint32 param1;  /**< Reserved */
+    uint32 param2;  /**< Reserved */
+    uint32 param3;  /**< Reserved */
+} HAP_power_hmx_payload_v2;
+
+/** Payload for HAP_power_set_CENG_bus */
+typedef struct {
+    HAP_dcvs_voltage_corner_t target_corner;  /**< Target voltage corner. For target_corner > 0,
+                                               * the bwBytePerSec and busbwUsagePercentage params should be set to 0. */
+    HAP_dcvs_voltage_corner_t min_corner;     /**< Minimum voltage corner for DCVS */
+    HAP_dcvs_voltage_corner_t max_corner;     /**< Maximum voltage corner for DCVS */
+    HAP_clk_perf_mode_t perf_mode;  /**< To specify the clock frequency level within the target voltage corner */
+    uint64 bwBytePerSec;  /**< Clock request in terms of bandwidth (bytes per second).
+                           * For bwBytePerSec > 0, target_corner should be set to 0.
+                           */
+    uint32 busbwUsagePercentage;  /**< Percentage of time during which bwBytePerSec BW is required from the bus (0..100) */
+    uint32 param1;  /**< Reserved */
+    uint32 param2;  /**< Reserved */
+    uint32 param3;  /**< Reserved */
+} HAP_power_ceng_bus_payload;
+
+/** @defgroup HAP_power_enums HAP POWER enums
+ * @{
+ */
+ /** Identifies the HAP power request type */
+typedef enum {
+    HAP_power_set_mips_bw = 1,  /**< Requests for MIPS. Provides
+                                 * fine-grained control to set MIPS values.
+                                 * Payload is set to HAP_power_mips_bw_payload */
+    HAP_power_set_HVX,          /**< Requests to enable / disable HVX.
+                                 * Payload is set to HAP_power_hvx_payload */
+    HAP_power_set_apptype,      /**< Sets the app_type.
+                                 * Payload is set to HAP_power_app_type_payload */
+    HAP_power_set_linelock,     /**< Sets the throttled L2 cache line locking parameters.
+                                 * Only one throttled call is supported at this time. Additional
+                                 * un-throttled line-locks can be performed using HAP_power_set_linelock_nothrottle.
+                                 * Payload is set to HAP_power_linelock_payload */
+    HAP_power_set_DCVS,         /**< Requests to participate / stop participating in DCVS */
+    HAP_power_set_linelock_nothrottle,  /**< Sets the L2 cache line locking parameters (non-throttled).
+                                 * Payload is set to HAP_power_linelock_nothrottle_payload */
+    HAP_power_set_DCVS_v2,      /**< Requests to participate / stop participating in DCVS_v2 */
+    HAP_power_set_vapss,        /**< Sets the VAPSS core clock and DDR/IPNOC bandwidth.
+                                 * Payload is set to HAP_power_vapss_payload */
+    HAP_power_set_vapss_v2,     /**< Sets the VAPSS core DMA/HCP clocks and DDR/IPNOC bandwidths.
+                                 * Payload is set to HAP_power_vapss_payload_v2 */
+    HAP_power_set_dcvs_v2_update,  /**< Updates DCVS params.
+                                 * Payload is set to HAP_power_dcvs_v2_update_payload */
+    HAP_power_set_streamer,     /**< Sets the streamer core clocks.
+                                 * Payload is set to HAP_power_streamer_payload */
+    HAP_power_set_DCVS_v3,      /**< Updates DCVS params.
+                                 * Payload is set to HAP_power_dcvs_v3_payload */
+    HAP_power_set_HMX,          /**< Requests to enable / disable HMX.
+                                 * Payload is set to HAP_power_hmx_payload */
+    HAP_power_set_HMX_v2,       /**< Requests for HMX power management along with the
+                                 * HMX clock requirement. On chipsets without a separate HMX
+                                 * clock, will return the AEE_EBADPARM error
+                                 * if a target corner / frequency is requested.
+                                 * Payload is set to HAP_power_hmx_payload_v2 */
+    HAP_power_set_CENG_bus,     /**< To vote for the CENG bus.
+                                 * Payload is set to HAP_power_ceng_bus_payload */
+} HAP_Power_request_type;
+/**
+ * @} // HAP_power_enums
+ */
+
+/** Data type to change power values on the DSP */
+typedef struct {
+    HAP_Power_request_type type;  /**< Identifies the request type */
+    union{
+        HAP_power_mips_bw_payload mips_bw;    /**< Requests for performance level */
+        HAP_power_vapss_payload vapss;        /**< Sets the VAPSS core clock and DDR/IPNOC bandwidth */
+        HAP_power_vapss_payload_v2 vapss_v2;  /**< Sets the VAPSS core clock and DDR/IPNOC bandwidth */
+        HAP_power_streamer_payload streamer;  /**< Sets the streamer core clocks */
+        HAP_power_hvx_payload hvx;            /**< Requests to enable / disable HVX */
+        HAP_power_app_type_payload apptype;   /**< Sets the app_type */
+        HAP_power_linelock_payload linelock;  /**< Sets the throttled L2 cache linelock parameters. Only one
+                                               * throttled linelock is permitted at this time.
+                                               * Additional un-throttled linelocks can be performed using linelock_nothrottle */
+        HAP_power_dcvs_payload dcvs;          /**< Updates DCVS params */
+        HAP_power_dcvs_v2_payload dcvs_v2;    /**< Updates DCVS_v2 params */
+        HAP_power_dcvs_v2_update_payload dcvs_v2_update;  /**< Updates DCVS_v2_update params */
+        HAP_power_linelock_nothrottle_payload linelock_nothrottle;  /**< Sets the un-throttled L2 cache linelock parameters */
+        HAP_power_dcvs_v3_payload dcvs_v3;    /**< Updates DCVS_v3 params */
+        HAP_power_hmx_payload hmx;            /**< Requests to turn on / off HMX.
+                                               * When the request is to turn on HMX, on chipsets with a separate HMX clock,
+                                               * a default HMX clock will be selected based on the voted
+                                               * Q6 core clock level from the same HAP_power_set context.
+                                               */
+        HAP_power_hmx_payload_v2 hmx_v2;      /**< Requests for HMX power management along with the HMX clock requirement.
+                                               * On chipsets without a separate HMX clock, will return the AEE_EBADPARM error
+                                               * if a target corner / frequency is requested. */
+        HAP_power_ceng_bus_payload ceng_bus;  /**< Votes for the CENG bus */
+    };
+} HAP_power_request_t;
+
+/** @defgroup HAP_power_functions HAP POWER functions
+ * @{
+ */
+/**
+* Method to set power values from the DSP
+* @param[in] context - To identify the power client
+* @param[in] request - Request params.
+* @retval 0 on success, AEE_EMMPMREGISTER on MMPM client register request failure, -1 on unknown error
+*/
+int HAP_power_set(void* context, HAP_power_request_t* request);
+/**
+ * @} // HAP_power_functions
+ */
+
+/** @defgroup HAP_power_enums HAP POWER enums
+ * @{
+ */
+ /** Identifies the HAP power response type */
+typedef enum {
+    HAP_power_get_max_mips = 1,  /**< Returns the max MIPS supported (max_mips) */
+    HAP_power_get_max_bus_bw,    /**< Returns the max bus bandwidth supported (max_bus_bw) */
+    HAP_power_get_client_class,  /**< Returns the client class (client_class) */
+    HAP_power_get_clk_Freq,      /**< Returns the core clock frequency (clkFreqHz) */
+    HAP_power_get_aggregateAVSMpps,  /**< Returns the aggregate Mpps used by audio and voice (aggregateAVSMpps) */
+    HAP_power_get_dcvsEnabled,   /**< Returns the DCVS status (enabled / disabled) */
+    HAP_power_get_vapss_core_clk_Freq,  /**< Returns the VAPSS core clock frequency (clkFreqHz) */
+    HAP_power_get_dma_core_clk_Freq,    /**< Returns the DMA core clock frequency (clkFreqHz) */
+    HAP_power_get_hcp_core_clk_Freq,    /**< Returns the HCP core clock frequency (clkFreqHz) */
+    HAP_power_get_streamer0_core_clk_Freq,  /**< Returns the streamer 0 core clock frequency (clkFreqHz) */
+    HAP_power_get_streamer1_core_clk_Freq,  /**< Returns the streamer 1 core clock frequency (clkFreqHz) */
+} HAP_Power_response_type;
+/**
+ * @} // HAP_power_enums
+ */
+
+/** Data type to retrieve power values from the DSP */
+typedef struct {
+    HAP_Power_response_type type;  /**< Identifies the type to retrieve. */
+    union{
+        unsigned int max_mips;     /**< Max MIPS supported */
+        uint64 max_bus_bw;         /**< Max bus BW supported */
+        unsigned int client_class; /**< Current client class */
+        unsigned int clkFreqHz;    /**< Current core clock frequency */
+        unsigned int aggregateAVSMpps;  /**< Aggregate AVS Mpps used by audio and voice */
+        boolean dcvsEnabled;       /**< Indicates if DCVS is enabled / disabled. */
+    };
+} HAP_power_response_t;
+
+/** @defgroup HAP_power_functions HAP POWER functions
+ * @{
+ */
+
+/**
+* Method to retrieve power values from the DSP
+* @param[in] context - Ignored
+* @param[out] response - Response.
+*/
+int HAP_power_get(void* context, HAP_power_response_t* response);
+
+/**
+* Method to initialize the dcvs v3 structure in the request param. It enables the
+* flags and resets the params for all fields in dcvs v3, so it
+* can also be used to remove applied dcvs v3 params and restore
+* defaults.
+* @param[in] request - Pointer to request params.
+*/
+static inline void HAP_power_set_dcvs_v3_init(HAP_power_request_t* request) {
+    memset(request, 0, sizeof(HAP_power_request_t));
+    request->type = HAP_power_set_DCVS_v3;
+    request->dcvs_v3.set_dcvs_enable = TRUE;
+    request->dcvs_v3.dcvs_enable = TRUE;
+    request->dcvs_v3.dcvs_option = HAP_DCVS_V2_POWER_SAVER_MODE;
+    request->dcvs_v3.set_latency = TRUE;
+    request->dcvs_v3.latency = 65535;
+    request->dcvs_v3.set_core_params = TRUE;
+    request->dcvs_v3.set_bus_params = TRUE;
+    request->dcvs_v3.set_dcvs_v3_params = TRUE;
+    request->dcvs_v3.set_sleep_disable = TRUE;
+    return;
+}
+
+/**
+* Method to enable/disable DCVS and set a particular DCVS policy.
+* @param[in] context - User context.
+* @param[in] dcvs_enable - TRUE to enable DCVS, FALSE to disable DCVS.
+* @param[in] dcvs_option - To set a particular DCVS policy. In case of a DCVS disable
+*                          request, this param will be ignored.
+* @returns - 0 on success
+*/
+static inline int HAP_power_set_dcvs_option(void* context, boolean dcvs_enable,
+                                            HAP_power_dcvs_v2_payload_option dcvs_option) {
+    HAP_power_request_t request;
+    memset(&request, 0, sizeof(HAP_power_request_t));
+    request.type = HAP_power_set_DCVS_v3;
+    request.dcvs_v3.set_dcvs_enable = TRUE;
+    request.dcvs_v3.dcvs_enable = dcvs_enable;
+    if(dcvs_enable)
+        request.dcvs_v3.dcvs_option = dcvs_option;
+    return HAP_power_set(context, &request);
+}
+
+/**
+* Method to set/reset sleep latency.
+* @param[in] context - User context.
+* @param[in] latency - Sleep latency value in microseconds, should be > 1.
+*                      Use the max value 65535 to reset it to the default.
+* @returns - 0 on success
+*/
+static inline int HAP_power_set_sleep_latency(void* context, uint32 latency) {
+    HAP_power_request_t request;
+    memset(&request, 0, sizeof(HAP_power_request_t));
+    request.type = HAP_power_set_DCVS_v3;
+    request.dcvs_v3.set_latency = TRUE;
+    request.dcvs_v3.latency = latency;
+    return HAP_power_set(context, &request);
+}
+
+/**
+* Method to set/reset DSP core clock voltage corners.
+* @param[in] context - User context.
+* @param[in] target_corner - Target voltage corner.
+* @param[in] min_corner - Minimum voltage corner.
+* @param[in] max_corner - Maximum voltage corner.
+* @returns - 0 on success
+*/
+static inline int HAP_power_set_core_corner(void* context, uint32 target_corner,
+                                            uint32 min_corner, uint32 max_corner) {
+    HAP_power_request_t request;
+    memset(&request, 0, sizeof(HAP_power_request_t));
+    request.type = HAP_power_set_DCVS_v3;
+    request.dcvs_v3.set_core_params = TRUE;
+    request.dcvs_v3.core_params.min_corner = (HAP_dcvs_voltage_corner_t)(min_corner);
+    request.dcvs_v3.core_params.max_corner = (HAP_dcvs_voltage_corner_t)(max_corner);
+    request.dcvs_v3.core_params.target_corner = (HAP_dcvs_voltage_corner_t)(target_corner);
+    return HAP_power_set(context, &request);
+}
+
+/**
+* Method to set/reset bus clock voltage corners.
+* @param[in] context - User context.
+* @param[in] target_corner - Target voltage corner.
+* @param[in] min_corner - Minimum voltage corner.
+* @param[in] max_corner - Maximum voltage corner.
+* @returns - 0 on success
+*/
+static inline int HAP_power_set_bus_corner(void* context, uint32 target_corner,
+                                           uint32 min_corner, uint32 max_corner) {
+    HAP_power_request_t request;
+    memset(&request, 0, sizeof(HAP_power_request_t));
+    request.type = HAP_power_set_DCVS_v3;
+    request.dcvs_v3.set_bus_params = TRUE;
+    request.dcvs_v3.bus_params.min_corner = (HAP_dcvs_voltage_corner_t)(min_corner);
+    request.dcvs_v3.bus_params.max_corner = (HAP_dcvs_voltage_corner_t)(max_corner);
+    request.dcvs_v3.bus_params.target_corner = (HAP_dcvs_voltage_corner_t)(target_corner);
+    return HAP_power_set(context, &request);
+}
+
+/**
+* Method to select the low power mode.
+* @param[in] context - User context.
+* @param[in] sleep_disable - See HAP_DCVS_LPM_LEVEL1, HAP_DCVS_LPM_LEVEL2, HAP_DCVS_LPM_LEVEL3 and HAP_DCVS_LPM_ENABLE_ALL above.
+* @returns - 0 on success
+*/
+static inline int HAP_power_set_sleep_mode(void* context, unsigned char sleep_disable) {
+    HAP_power_request_t request;
+    memset(&request, 0, sizeof(HAP_power_request_t));
+    request.type = HAP_power_set_DCVS_v3;
+    request.dcvs_v3.set_sleep_disable = TRUE;
+    request.dcvs_v3.sleep_disable = sleep_disable;
+    return HAP_power_set(context, &request);
+}
+
+
+/**
+* This API is deprecated and might generate undesired results.
+* Please use the HAP_power_get() and HAP_power_set() APIs instead.
+* Requests a performance level by percentage for clock speed
+* and bus speed. Passing 0 for any parameter results in no
+* request being issued for that particular attribute.
+* @param[in] clock - percentage of target's maximum clock speed
+* @param[in] bus - percentage of target's maximum bus speed
+* @param[in] latency - maximum hardware wake up latency in microseconds. The
+*                      higher the value the deeper the state of sleep
+*                      that can be entered but the longer it may
+*                      take to awaken.
+* @retval 0 on success
+* @par Comments : Performance metrics vary from target to target so the
+*                 intent of this API is to allow callers to set a relative
+*                 performance level to achieve the desired balance between
+*                 performance and power saving.
+*/
+int HAP_power_request(int clock, int bus, int latency);
+
+/**
+* This API is deprecated and might generate undesired results.
+* Please use the HAP_power_get() and HAP_power_set() APIs instead.
+* Requests a performance level by absolute values. Passing 0
+* for any parameter results in no request being issued for that
+* particular attribute.
+* @param[in] clock - speed in MHz
+* @param[in] bus - bus speed in MHz
+* @param[in] latency - maximum hardware wakeup latency in microseconds. The
+*                      higher the value the deeper the state of
+*                      sleep that can be entered but the
+*                      longer it may take to awaken.
+* @retval 0 on success
+* @par Comments : This API allows callers who are aware of their target-
+*                 specific capabilities to set them explicitly.
+*/
+int HAP_power_request_abs(int clock, int bus, int latency);
+
+/**
+* This API is deprecated and might generate undesired results.
+* Please use the HAP_power_get() and HAP_power_set() APIs instead.
+* Queries the target for its clock and bus speed capabilities.
+* @param[out] clock_max - maximum clock speed supported in MHz
+* @param[out] bus_max - maximum bus speed supported in MHz
+* @retval 0 on success
+*/
+int HAP_power_get_max_speed(int* clock_max, int* bus_max);
+
+/**
+* This API is deprecated and might generate undesired results.
+* Please use the HAP_power_get() and HAP_power_set() APIs instead.
+* Upvote for HVX power
+* @retval 0 on success
+*/
+int HVX_power_request(void);
+
+/**
+* This API is deprecated and might generate undesired results.
+* Please use the HAP_power_get() and HAP_power_set() APIs instead.
+* Downvote for HVX power
+* @retval 0 on success
+*/
+int HVX_power_release(void);
+
+/**
+* Method to destroy clients created through HAP_power_set
+* @param[in] context - To uniquely identify the client
+* @retval 0 on success, AEE_ENOSUCHCLIENT on invalid context, -1 on unknown error
+* @brief DO NOT call this API directly, use HAP_power_destroy instead.
+*/
+int HAP_power_destroy_client(void *context);
+
+/**
+* @param[in] client - To uniquely identify the client context.
+* @retval 0 on success, AEE_EUNSUPPORTEDAPI if the API is not supported on the DSP image, AEE_ENOSUCHCLIENT on invalid context, -1 on unknown error
+* @brief Method to destroy clients created through HAP_power_set, wrapper to the HAP_power_destroy_client API
+*/
+static inline int HAP_power_destroy(void *client){
+    if(0 != HAP_power_destroy_client)
+        return HAP_power_destroy_client(client);
+    return AEE_EUNSUPPORTEDAPI;
+}
+
+/**
+* Method to create a user client context
+* @retval context for client
+*/
+static inline void* HAP_utils_create_context(void) {
+    /*
+     * Allocate 1 byte of memory for a unique context identifier.
+     * Clients can also allocate memory and use it as a unique context identifier.
+     */
+    return malloc(1);
+}
+
+/**
+* Method to destroy a user client context
+* @param context of client
+*/
+static inline void HAP_utils_destroy_context(void* context) {
+    free(context);
+}
+
+/**
+ * @} // HAP_power_functions
+ */
+#ifdef __cplusplus
+}
+#endif
+#endif //_HAP_POWER_H
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_power.md b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_power.md
new file mode 100755
index 0000000000000..44ad184ac0eca
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_power.md
@@ -0,0 +1,946 @@
+# Introduction {#intro}
+
+The Hexagon SDK provides APIs to control DSP core and bus clocks based on power and performance needs.
+By default, every compute session votes for the NOMINAL voltage corner and powers on HVX.
+Clients can choose to override this via the HAP power APIs below.
+
+# HAP_power API {#api}
+
+
+## API Overview {#api-overview}
+
+The HAP_power_* APIs are used by clients to override the power settings of the DSPs according to their needs. This API is supported on the ADSP, CDSP and SLPI.
+
+The HAP power API contains a set of interfaces that allow programmers to adjust the DSP power usage as per the application's power requirement, thereby providing a good balance between power consumption and performance.
+
+* HAP_power_set(): This is used to vote for performance levels on the DSP
+* HAP_power_get(): This is used to query the DSP for current performance levels
+* HAP_power_destroy(): This is used to destroy power clients created through the HAP_power_set API
+
+::HAP_power_set can be used to control these parameters on the DSP:
+* DSP MIPS
+* Bus speed / bandwidth
+* Dynamic scaling of bus and DSP clocks and bus speeds (DCVS)
+* Application type (client class); more details on this can be found [here](#app-type)
+* L2 cache line locking
+* Hexagon Vector eXtension (HVX) blocks
+
+::HAP_power_get can be used to query the DSP for these parameters:
+* Max MIPS supported
+* Max bus speed / bandwidth supported
+* Current core clock speed
+* Current application type (client class)
+* Aggregate Mpps used by audio and voice
+
+::HAP_power_destroy can be used to destroy the power clients created through the HAP_power_set API.
+ Destroys any existing HAP_power votes associated with the provided client context, and disassociates that context from HAP_power.
+
+
+## Usage {#usage}
+
+See HAP_power.h for more information on this API.
+
+::HAP_power_set: This accepts two parameters:
+* context - Unique identifier (explained below)
+* request - The power request.
+
+context is a unique identifier (in the scope of the PD) provided by the user to identify an independent voting client of HAP_power. For each unique context passed in a HAP_power_set invocation, HAP_power adds a new client to its state to be associated with that context.
+
+On targets after Lahaina, the helper APIs HAP_utils_create_context and HAP_utils_destroy_context are added to create and destroy unique context identifiers. If these are not available, the recommended alternative is to create a context by allocating a dummy byte and using the pointer value as the context, and freeing that byte later after destroying the context's associated HAP_power client via HAP_power_destroy.
+* HAP_utils_create_context(): This is used to create a unique context identifier
+* HAP_utils_destroy_context(): This is used to destroy a unique context identifier. HAP_utils_destroy_context should only be called on a context after destroying the HAP_power client associated with that context, via HAP_power_destroy(context). Failure to destroy both in the proper order may cause a leak.
+
+Refer to the following table for the voting/unvoting call flow:
+
+| Voting/Unvoting call flow     | Library code |
+|-------------------------------|--------------|
+| Create unique client context  | context = userLibCodeToCreateUniqueContext() (or) context = HAP_utils_create_context() |
+| Create power client and vote  | HAP_power_set(context, request) |
+| Destroy power client          | HAP_power_destroy(context) (or) HAP_power_destroy_client(context) |
+| Destroy unique client context | userLibCodeToDestroyUniqueContext(context) (or) HAP_utils_destroy_context(context) |
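+
+A minimal sketch of this call flow is shown below (the DCVS_v2 payload is just an illustrative choice; any HAP_power_set request type follows the same pattern):
+
+~~~{.c}
+void* context = HAP_utils_create_context();     // create unique client context
+
+HAP_power_request_t request;
+memset(&request, 0, sizeof(HAP_power_request_t));
+request.type = HAP_power_set_DCVS_v2;
+request.dcvs_v2.dcvs_enable = TRUE;
+request.dcvs_v2.dcvs_option = HAP_DCVS_V2_POWER_SAVER_MODE;
+int retVal = HAP_power_set(context, &request);  // create power client and vote
+
+/* ... run the workload ... */
+
+HAP_power_destroy(context);                     // destroy power client
+HAP_utils_destroy_context(context);             // destroy unique client context
+~~~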
+
+NOTE: Using a context set to NULL has specific implications, discussed below in [default voting](#default_voting).
+
+Example: Module1 and Module2 are two different clients running in the same user PD on the DSP. Module1 creates a new, unique client context and votes for its needs. Module2 also creates a new, unique client context and votes for its needs. The figure below shows the different client contexts and their votes to the power manager.
+
+![screenshot](../../images/hap_power.png)
+
+The type in the request is set to one of:
+* HAP_power_set_mips_bw: Used to set MIPS and / or bus speed (bandwidth). The payload in this case should contain HAP_power_mips_bw_payload.
+* HAP_power_set_HVX: Used to enable / disable power for HVX. The payload in this case should contain HAP_power_hvx_payload.
+* HAP_power_set_apptype: Used to set the application type. The payload in this case should contain ::HAP_power_app_type_payload.
+* HAP_power_set_linelock: Used to line lock memory in the L2 cache. The payload in this case should contain HAP_power_linelock_payload.
+* HAP_power_set_DCVS: Used to participate / stop participating in DCVS. The payload in this case should contain HAP_power_dcvs_payload.
+* HAP_power_set_DCVS_v2: Enhanced version of HAP_power_set_DCVS with more options. The payload in this case should contain HAP_power_dcvs_v2_payload.
+* HAP_power_set_DCVS_v3: Enhanced version of HAP_power_set_DCVS_v2 with more options to select core and bus operating corners separately. The payload in this case should contain HAP_power_dcvs_v3_payload.
+
+NOTE:
+* More details on HAP_power_set_DCVS_v2 can be found [here](#DCVS_V2).
+* HAP_power_set_DCVS_v3 is supported from SM8250 onwards. More details can be found [here](#DCVS_V3).
+* On older targets, a maximum of 8 clients can be created per PD (including the default client). This limitation has been removed from SM8250 onwards.
+* HAP_power_hmx_payload_v2 is supported starting with v75. On chipsets (v75 onwards) without a separate HMX clock plan, requests made for a target corner or
+frequency will return the AEE_EBADPARM (invalid parameter) error.
+* HAP_power_set_CENG_bus is supported from v75 onwards. On chipsets (v75 onwards) not supporting independent Q6-CENG bus clock scaling, this request type
+will return the AEE_EBADPARM (invalid parameter) error.
+
+An example is provided below.
+
+~~~{.c}
+    //Vote
+    /* Populate request structure */
+    int retVal;
+    HAP_power_request_t request;
+    memset(&request, 0, sizeof(HAP_power_request_t)); //Important to clear the structure if only selected fields are updated.
+    request.type = HAP_power_set_DCVS_v2;
+    request.dcvs_v2.dcvs_enable = TRUE;
+    request.dcvs_v2.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE;
+    request.dcvs_v2.set_latency = TRUE;
+    request.dcvs_v2.latency = 1000;
+    request.dcvs_v2.set_dcvs_params = TRUE;
+    request.dcvs_v2.dcvs_params.min_corner = HAP_DCVS_VCORNER_SVS;
+    request.dcvs_v2.dcvs_params.max_corner = HAP_DCVS_VCORNER_TURBO;
+    request.dcvs_v2.dcvs_params.target_corner = HAP_DCVS_VCORNER_NOM;
+    /* Call HAP_power_set API with the updated request structure */
+    /* cv is a global variable or an address on the heap to uniquely identify the clients */
+    retVal = HAP_power_set(&cv, &request);
+    ...
+~~~
+
+::HAP_power_get: This accepts two parameters:
+
+* context - This parameter is ignored; the response is at the system level
+* response - The power response
+
+The type in the request is set to one of:
+* HAP_power_get_max_mips: Used to query for the maximum MIPS supported
+* HAP_power_get_max_bus_bw: Used to query for the maximum bus bandwidth supported
+* HAP_power_get_client_class: Used to query for the current application type.
+* HAP_power_get_clk_Freq: Used to query for the current core clock frequency.
+* HAP_power_get_aggregateAVSMpps: Used to query for the aggregate Mpps used by audio and voice.
+
+::HAP_power_destroy: This accepts one parameter:
+
+* context - the unique client context identifying the HAP_power client to destroy.
+
+Example of removing the default vote for a PD and destroying the default client:
+~~~{.c}
+    //Destroy default client
+    int nErr = 0;
+
+    if(0 == (nErr = HAP_power_destroy(NULL))){
+        //Client destroyed successfully
+    }
+~~~
+
+### Default voting {#default_voting}
+On older targets, NULL is a valid HAP_power context that is used by FastRPC to establish a default vote for some reasonable clock settings. In order to override the default vote on these targets, it is necessary for a client to place a canceling vote using the NULL context.
+
+Permitting clients to use the NULL context can lead to conflicts where multiple clients in the same PD may try to independently manage the NULL context.
+To address these conflicts, support for any NULL context voting has been removed starting in targets after Lahaina. For these targets, the behavior of default voting has been changed. A suitable vote is placed automatically at opportune times (such as startup or object loading) on a unique context, and is automatically removed when no longer needed. For example, the FastRPC driver places a high clock vote when a new session is started on the DSP, and removes it as soon as any user in that session places any other HAP_power vote. This ensures clocks are high during startup and object loading, up until the point the user application is able to place its own vote.
+
+The recommended client behavior is as follows:
+
+* Lahaina and older targets: While it is allowed to rely on the default vote to establish reasonable clocks, it is recommended to place a canceling vote (with low/zero clock values as shown below) on the NULL context, plus an active vote for the requirements on a different unique context.
+
+* Targets after Lahaina: Simply place an active vote for the required clock settings on a unique context.
+
+If a single client implementation is required to work correctly on all targets, the recommendation is as follows:
+* Make an attempt to place a canceling NULL-context vote. If the error code AEE_ENULLCONTEXT is returned, it means the target does not support the NULL context. Thus, this error can be ignored.
+* Place a (non-NULL) unique context vote for the clock requirements.
+
+On targets that support NULL context default voting, it can be removed using HAP_power_destroy(NULL) or as follows:
+
+~~~{.c}
+req.type = HAP_power_set_DCVS_v2;
+req.dcvs_v2.dcvs_enable = FALSE;
+req.dcvs_v2.set_latency = FALSE;
+req.dcvs_v2.set_dcvs_params = FALSE;
+VERIFY(AEE_SUCCESS == (nErr = HAP_power_set(NULL, &req)));
+~~~
+
+
+### Application type/Client class {#app-type}
+The HAP_power_set() API allows users to register an application type.
+
+The 'apptype' field in ::HAP_power_request_t, passed as a parameter to a HAP_power_set request, allows the user to register an application as one of the client classes available in ::HAP_power_app_type_payload.
+Setting an appropriate client class can be important, as this information is used by the DSP DCVS, QoS, and DSP power management drivers. HAP_power clients who do not explicitly vote their apptype are treated as general compute applications, which is appropriate for most cases.
+
+####Users of Client class information
+DCVS selects HAP_DCVS_V2_POWER_SAVER_MODE as the default DCVS policy for COMPUTE and STREAMING class clients. A client can always pick its own DCVS policy by issuing a DCVS_v2 request; click [here](#DCVS_V2) for more details on the DCVS_v2 request type of HAP_power_set.
+The QoS driver modifies L2 scoreboard thresholds on detecting STREAMING class clients to allow DSP L2 slave accesses.
+
+
+###DSP DCVS v2 HAP interface {#DCVS_V2}
+Based on user configuration, the DCVS module in the DSP (ADSP/CDSP) can adjust the core and bus clock frequencies based on core and bus usage metrics captured by SysMon. The existing DCVS interface via HAP_power_set() (type: HAP_power_set_DCVS) only allows users to vote for DCVS participation with 2 different options. The DSP DCVS v2 algorithm exposes an enhanced set of DCVS options for diversified clients and a simplified voltage corner based voting scheme. On supported targets (8998 and later), these new DCVS options and voting scheme are exposed to clients via HAP_power_set() (type: HAP_power_set_DCVS_v2).
+
+####HAP API Support
+The HAP_power_set API is enhanced to support the new mode registrations with the DSP DCVS logic. The following table illustrates the new request type and the new dcvs_v2 request structure associated with it.
+
+| Parameter | Field | Description |
+|-----------|-------|-------------|
+| API       |       | HAP_power_set (void* context, HAP_power_request_t* request) |
+| context   |       | Explained [here](#usage). Votes across all contexts will be aggregated accordingly. |
+| request   | type  | HAP_power_set_DCVS_v2. This new request type allows the user to request via the new dcvs_v2 request structure. |
+| dcvs_v2   | dcvs_enable | DCVS participation flag |
+|           | dcvs_option | These options instruct the DCVS algorithm to use a pre-defined set of thresholds and operation logic based on the selected option. |
+|           | set_latency | Latency vote validity flag. If FALSE, the default sleep latency vote of 65535 microseconds will be considered. |
+|           | latency | Sleep latency vote in microseconds. Valid when the set_latency flag is set to TRUE. |
+|           | set_dcvs_params | DCVS params validity flag. If FALSE, all parameters of dcvs_params will be set to default zero. |
+|           | dcvs_params | DCVS params structure with flexibility to set upper and lower DCVS thresholds and also vote for core and bus clocks using a voltage corner. |
+
+###DSP DCVS v3 HAP interface {#DCVS_V3}
+Based on user configuration, the DCVS module in the DSP can adjust the core and bus clock frequencies based on core and bus usage metrics captured by SysMon. The existing DCVS v2 algorithm via HAP_power_set() (type: HAP_power_set_DCVS_v2) exposes multiple DCVS options for diversified clients and a simplified voltage corner based voting scheme. In addition to the existing features, DCVS v3 provides a separate voltage corner voting option for the core and bus clocks, as well as an option to disable all low power modes without needing an explicit sleep latency vote. In scenarios where the user is fine with the same voltage corner vote for both core and bus clocks, DCVS v2 can still be used. Also, in DCVS v3 the user can vote for an individual field or multiple fields based on the requirement. On supported targets (SM8250 and later), these new DCVS options and voting scheme are exposed to clients via HAP_power_set() (type: HAP_power_set_DCVS_v3). Wrapper functions built around the same HAP_power_set() (type: HAP_power_set_DCVS_v3) have also been added to help the user select and vote for individual functionality in DCVS v3 without dealing with the DCVS v3 structure and related details. This document captures information on these new DCVS v3 features and ways to use them.
+
+####HAP API Support
+The HAP_power_set API is enhanced to support the new user options with DCVS v3 via the new request type HAP_power_set_DCVS_v3 with HAP_power_dcvs_v3_payload.
+
+| Parameter | Field | Description |
+|-----------|-------|-------------|
+| API       |       | HAP_power_set (void* context, HAP_power_request_t* request) |
+| context   |       | Explained [here](#usage). Votes across all contexts will be aggregated accordingly. |
+| request   | type  | HAP_power_set_DCVS_v3. This new request type allows the user to request via the new dcvs_v3 request structure. |
+| dcvs_v3   | set_dcvs_enable | DCVS participation validity flag. If FALSE, the dcvs_enable and dcvs_option fields will be ignored. |
+|           | dcvs_enable | DCVS participation flag. Valid when set_dcvs_enable is set to TRUE. |
+|           | dcvs_option | These options instruct the DCVS algorithm to use a pre-defined set of thresholds and operation logic based on the selected option. |
+|           | set_latency | Latency vote validity flag. If FALSE, the latency field will be ignored. |
+|           | latency | Sleep latency vote in microseconds. Valid when the set_latency flag is set to TRUE. |
+|           | set_core_params | Core clock params validity flag. If FALSE, the core_params field will be ignored. |
+|           | core_params | Core clock params structure with flexibility to set upper and lower core clock DCVS thresholds and also vote for the core clock using a voltage corner. Valid when set_core_params is set to TRUE. |
+|           | set_bus_params | Bus clock params validity flag. If FALSE, the bus_params field will be ignored. |
+|           | bus_params | Bus clock params structure with flexibility to set upper and lower bus clock DCVS thresholds and also vote for the bus clock using a voltage corner. Valid when set_bus_params is set to TRUE. |
+|           | set_dcvs_v3_params | Validity flag for reserved DCVS params. If FALSE, the dcvs_v3_params field will be ignored. |
+|           | dcvs_v3_params | Reserved DCVS params |
+|           | set_sleep_disable | Sleep param validity flag. If FALSE, the sleep_disable field will be ignored. |
+|           | sleep_disable | To select the low-power mode (LPM). Valid when set_sleep_disable is set to TRUE. Refer to [Sleep Disable](#sleep_disable) for options. |
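+
+As a reference, a minimal sketch of a dcvs_v3 vote with separate core and bus corner requests (the corner values are chosen purely for illustration; context is assumed to be created as described in [Usage](#usage)):
+
+~~~{.c}
+HAP_power_request_t request;
+memset(&request, 0, sizeof(HAP_power_request_t));
+request.type = HAP_power_set_DCVS_v3;
+request.dcvs_v3.set_dcvs_enable = TRUE;
+request.dcvs_v3.dcvs_enable = TRUE;
+request.dcvs_v3.dcvs_option = HAP_DCVS_V2_POWER_SAVER_MODE;
+request.dcvs_v3.set_core_params = TRUE;                              /* core clock corner vote */
+request.dcvs_v3.core_params.target_corner = HAP_DCVS_VCORNER_TURBO;
+request.dcvs_v3.set_bus_params = TRUE;                               /* independent bus clock corner vote */
+request.dcvs_v3.bus_params.target_corner = HAP_DCVS_VCORNER_NOM;
+int retVal = HAP_power_set(context, &request);
+~~~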
+
+####Wrapper APIs
+Wrapper functions built around the same HAP_power_set() (type: HAP_power_set_DCVS_v3) help the user select and vote for individual functionality in DCVS v3 without dealing with the DCVS v3 structure and related details. The section below lists these APIs.
+
+* HAP_power_set_dcvs_v3_init()
+* HAP_power_set_dcvs_option()
+* HAP_power_set_sleep_latency()
+* HAP_power_set_core_corner()
+* HAP_power_set_bus_corner()
+* HAP_power_set_sleep_mode()
+
+####DCVS Enable
+The 'dcvs_enable' parameter of the dcvs_v2 structure enables the user to vote for DCVS participation.
+
+| Value | Description |
+|-------|-------------|
+| TRUE  | Enable DSP DCVS (if not already enabled). Using dcvs_option, based on the application demand, the user can choose a particular option to guide the DSP DCVS logic. |
+| FALSE | Don't enable DSP DCVS. Valid only when the requesting client is the only one actively voting for clocks or is one among the clients voting for this same option. |
+
+The 'set_dcvs_enable' and 'dcvs_enable' parameters of the dcvs_v3 structure enable the user to vote for DCVS participation.
+
+| Parameter | Value | Description |
+|-----------|-------|-------------|
+| set_dcvs_enable | FALSE | No DCVS request from the client; the dcvs_enable and dcvs_option fields will be ignored. |
+|                 | TRUE  | The client request for DCVS is valid and the desired DCVS participation is provided in the dcvs_enable field. |
+| dcvs_enable     | TRUE  | Enable DSP DCVS (if not already enabled). Using dcvs_option, based on the application demand, the user can choose a particular option to guide the DSP DCVS logic. |
+|                 | FALSE | Don't enable DSP DCVS. Valid only when the requesting client is the only one actively voting for clocks or is one among the clients voting for this same option. |
+
+When a DCVS participating client is active, the DCVS logic will be enabled, but the aggregated vote of clients requesting DCVS disable will be considered as a FLOOR request in the DCVS logic, i.e., DCVS won't lower the clocks below the aggregated value.
+
+DCVS participation and options are considered only for active clients. A client is deemed inactive when there is no MIPS and bandwidth request (made by setting the request type to 'HAP_power_set_mips_bw' in the 'HAP_power_set' [API](#usage)) and when the target_corner for core and bus under dcvs_params is set to HAP_DCVS_VCORNER_DISABLE.
+
+####DCVS Options
+The 'dcvs_option' parameter of the dcvs_v2 structure enables the user to request a particular DCVS mode when the 'dcvs_enable' option is set to TRUE.
+
+The 'dcvs_option' parameter of the dcvs_v3 structure enables the user to request a particular DCVS mode when 'set_dcvs_enable' and 'dcvs_enable' are both set to TRUE.
+
+The following table captures the gist of the available DCVS modes.
+
+| Value | Description |
+|-------|-------------|
+| HAP_DCVS_V2_ADJUST_UP_DOWN | Legacy option: for clients voting via the HAP_power_set_mips_bw request type. This mode allows DCVS to both increase and decrease core/bus clock speeds based on need. DCVS selects thresholds corresponding to a balanced mode (legacy) of operation with respect to power and performance. min_corner and max_corner votes via dcvs_params are used as lower and upper limit guidelines in DCVS. NOTE: If the client votes via target_corner under dcvs_params of this structure, the HAP_DCVS_V2_ADJUST_ONLY_UP and HAP_DCVS_V2_ADJUST_UP_DOWN modes are identical; min_corner and max_corner votes are used as lower and upper limit guidelines in DCVS while using balanced mode (legacy) thresholds. |
+| HAP_DCVS_V2_ADJUST_ONLY_UP | Legacy option: for clients voting via the HAP_power_set_mips_bw request type. This mode restricts DCVS from lowering the clock below the values requested via the HAP_power_set_mips_bw request. DCVS can only increase the clock above the requested levels. DCVS selects thresholds corresponding to a balanced mode (legacy) of operation with respect to power and performance. The max_corner vote via dcvs_params is used as an upper limit guideline in DCVS. NOTE: If the client votes via target_corner under dcvs_params of this structure, the HAP_DCVS_V2_ADJUST_ONLY_UP and HAP_DCVS_V2_ADJUST_UP_DOWN modes are identical; min_corner and max_corner votes are used as lower and upper limit guidelines in DCVS while using balanced mode (legacy) thresholds. |
+| HAP_DCVS_V2_POWER_SAVER_MODE | New option: default for all clients participating in DCVS. DCVS can both increase and decrease the core/bus clock speeds while min_corner and max_corner votes are used as lower and upper limit guidelines. DCVS selects thresholds corresponding to a power saving model. This mode is meant for applications where saving power is of higher priority than achieving the fastest performance. Performance may be slower in this mode than in HAP_DCVS_V2_PERFORMANCE_MODE or the legacy modes, i.e., HAP_DCVS_V2_ADJUST_ONLY_UP and HAP_DCVS_V2_ADJUST_UP_DOWN. |
+| HAP_DCVS_V2_POWER_SAVER_AGGRESSIVE_MODE | New option: DCVS can both increase and decrease the core/bus clock speeds while min_corner and max_corner votes are used as lower and upper limit guidelines. DCVS selects thresholds corresponding to a power saving model. Further, the DCVS monitoring durations for lowering the clocks are decreased for a faster ramp down and hence greater power saving compared to the power saver mode. This mode is meant for applications where saving power is of higher priority than achieving the fastest performance. Performance may be slower in this mode than in HAP_DCVS_V2_PERFORMANCE_MODE, HAP_DCVS_V2_POWER_SAVER_MODE or the legacy modes, i.e., HAP_DCVS_V2_ADJUST_ONLY_UP and HAP_DCVS_V2_ADJUST_UP_DOWN. |
+| HAP_DCVS_V2_PERFORMANCE_MODE | New option: DCVS can both increase and decrease the core/bus clock speeds while min_corner and max_corner votes are used as lower and upper limit guidelines. DCVS selects a set of aggressive thresholds in terms of performance. DCVS can quickly bump up the clocks in this mode, assisting higher performance at the cost of power. |
+| HAP_DCVS_V2_DUTY_CYCLE_MODE | This mode is for periodic use cases. Starting with Lahaina, DCVS in this mode detects the periodicity and sets/removes the core and bus clock votes for the active/idle durations respectively. This mode helps save power significantly by reducing idle leakage current while keeping the performance intact. Compared to applications setting/removing clock votes for each active frame to save power, the DCVS duty cycle mode provides better performance and more power savings, as in this mode the voting is done upfront by DCVS just before the active duration starts, based on periodicity prediction. |
+ +In cases where multiple clients have registered different DCVS options, the following table depicts the DCVS policy aggregation logic. + +
PERFORMANCE (Yes / No) POWER SAVER (Yes / No) POWER SAVER AGGRESSIVE (Yes / No) BALANCED (UP ONLY/UP AND DOWN clients) (Yes / No) Final DCVS thresholds +
Y Y/N Y/N Y/N PERFORMANCE +
N Y Y/N Y/N POWER SAVER +
N N Y Y POWER SAVER +
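+ +For example, if one client registers HAP_DCVS_V2_PERFORMANCE_MODE while another registers HAP_DCVS_V2_POWER_SAVER_MODE, the final DCVS thresholds are PERFORMANCE per the first row of the table. A minimal sketch of this scenario (assuming two hypothetical client contexts, ctx1 and ctx2, each obtained as explained [here](#usage)): +~~~{.c} +HAP_power_request_t request; + +/* Client 1 registers PERFORMANCE mode */ +memset(&request, 0, sizeof(HAP_power_request_t)); +request.type = HAP_power_set_DCVS_v2; +request.dcvs_v2.dcvs_enable = TRUE; +request.dcvs_v2.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE; +HAP_power_set(ctx1, &request); + +/* Client 2 registers POWER SAVER mode */ +memset(&request, 0, sizeof(HAP_power_request_t)); +request.type = HAP_power_set_DCVS_v2; +request.dcvs_v2.dcvs_enable = TRUE; +request.dcvs_v2.dcvs_option = HAP_DCVS_V2_POWER_SAVER_MODE; +HAP_power_set(ctx2, &request); + +/* Aggregation: a PERFORMANCE client is present, so PERFORMANCE thresholds apply */ +~~~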
+ +####DCVS Duty Cycle +DCVS duty cycle mode is for periodic use cases. The DCVS algorithm detects periodicity and sets the core and bus clock votes as per the active and idle durations. This helps save power to a great extent by reducing idle leakage current while keeping performance intact. + +The example below illustrates DCVS duty cycle operation for an application with 30 FPS activity and TURBO_PLUS votes for core and bus clocks. + +For this application run, the core and bus clocks and related DSP metrics with and without DCVS duty cycle mode are shown below. In the no duty cycle case, core and bus clocks are at TURBO_PLUS throughout the application run. +In the DCVS duty cycle case, the DCVS algorithm detects periodicity in the use case and sets core and bus clocks to TURBO_PLUS during the active time and to LOW SVS (SVS2) during the idle time of each frame. + +![screenshot](../../images/DCVS_CoreClock_DutyCycle.png) + +![screenshot](../../images/DCVS_BusClock_DutyCycle.png) + +With increasing processing capabilities, the active time for applications will improve, resulting in greater power savings for periodic activities with DCVS duty cycle mode due to the increased idle time. + +The DCVS duty cycle mode is supported starting with Lahaina. On chipsets prior to Lahaina, DCVS falls back to power saver mode when duty cycling is selected. + +####DCVS Duty Cycle Modes +Starting with Waipio, DCVS duty cycle mode is further expanded to cover the following scenarios/sub-modes. + +####Fixed Corners Mode +Fixed active and idle clock corners: +* Client decides the fixed active clock and idle clock +* DCVS only uses those selected corners + +Example: +* Max corner : HAP_DCVS_VCORNER_DISABLE +* Target corner : TURBO +* Min corner : LOW SVS (SVS2) +* Mode : Duty_cycle +* DCVS Enable flag: 0 +* Expectation : Duty cycle between TURBO and LOW SVS (SVS2) only + +![screenshot](../../images/HAP_set_dcvs_v3_duty_cycle_fixed_corners_mode.png) + +####Active Range Mode +Client and the DCVS algorithm decide the active clock corners: +* Client decides the active clock range and idle clock +* The DCVS algorithm decides the active corner within the provided range based on the power vs. performance tradeoff and the user-given max active time (if provided) + +Example: +* Max corner : TURBO +* Target corner : SVS PLUS +* Min corner : LOW SVS (SVS2) +* Mode : Duty_cycle +* DCVS Enable flag: 1 +* Expectation : Active clock is decided by the DCVS algorithm within the provided range (target corner, max corner) +* The DCVS algorithm starts with the client-provided max corner for the active clock, then tunes it based on the performance vs. power tradeoff and the user-given max active time. + +![screenshot](../../images/HAP_set_dcvs_v3_duty_cycle_active_range_mode.png) + +####Full DCVS Control Mode +DCVS decides the active and idle clock corners: +* Client does not provide active and idle clock corners +* The DCVS algorithm can decide any clock corner for the active and idle durations based on the power vs. performance tradeoff and the user-given max active time (if provided) +* Max corner : HAP_DCVS_VCORNER_DISABLE +* Target corner : HAP_DCVS_VCORNER_DISABLE +* Min corner : HAP_DCVS_VCORNER_DISABLE +* Mode : Duty_cycle +* DCVS Enable flag: 1 +* Expectation : Active and idle clocks are decided by the DCVS algorithm +* DCVS starts with NOM as the active corner and LOW SVS (SVS2) as the idle corner and later tunes them based on the performance vs. power tradeoff and the user-given max active time (if provided). +* DCVS picks the LOW SVS (SVS2) clock corner when there is no activity.
+ +![screenshot](../../images/HAP_set_dcvs_v3_duty_cycle_full_dcvs_control_mode.png) + +####DCVS Duty Cycle Helper APIs +Starting with Waipio, DCVS duty cycle helper APIs are provided for ease of configuration. See [HAP_dcvs.h](../../doxygen/HAP_dcvs/index.html) for more information. + +####Sleep latency {#sleep-latency} +The 'set_latency' and 'latency' parameters of structure dcvs_v2 can be used to request a sleep latency in microseconds. + + +
set_latency FALSE No sleep latency request from the client. If FALSE, the default sleep latency vote of 65535 microseconds is considered. +
TRUE The client's sleep latency request is valid and the desired latency is provided in the latency field. +
latency Sleep latency request in microseconds. +
+ +Similarly, the 'set_latency' and 'latency' parameters of structure dcvs_v3 can be used to request a sleep latency in microseconds. + + +
set_latency FALSE No sleep latency request from the client. If FALSE, the latency field is ignored. +
TRUE The client's sleep latency request is valid and the desired latency is provided in the latency field. +
latency Sleep latency request in microseconds. +
+ +NOTE: HAP_power_set provides the following ways to vote for sleep latency: + +1. via HAP_power_set_mips_bw request type: +~~~{.c} +/* For sleep latency */ +mips_bw.set_latency = TRUE; +mips_bw.latency = +~~~ +2. via HAP_power_set_DCVS_v2 request type: +~~~{.c} +/* For sleep latency */ +dcvs_v2.set_latency = TRUE; +dcvs_v2.latency = +~~~ + Or via HAP_power_set_DCVS_v3 request type: +~~~{.c} +/* For sleep latency */ +dcvs_v3.set_latency = TRUE; +dcvs_v3.latency = +~~~ + +Clients should use only one of the above methods to vote for latency, i.e., either via mips_bw or via dcvs_v2/dcvs_v3, but not both. Voting via dcvs_v2/dcvs_v3 does NOT cancel any previous vote done via mips_bw and vice versa. + +The latency value can be set to a minimum of 10 microseconds. The application should vote for a latency that is tolerable. For latency-critical applications, the latency can be set to its minimum value of 10 microseconds. + + +####DCVS params +The set_dcvs_params and dcvs_params parameters of dcvs_v2 can be used to update DCVS thresholds and the target corner vote. +The set_core_params and core_params parameters of dcvs_v3 can be used to update DCVS thresholds and the target corner vote for the core clock; similarly, set_bus_params and bus_params apply to the bus clock. +This structure is valid irrespective of the chosen dcvs_enable and dcvs_option values. A client can request a target_corner even when the dcvs_enable option is set to FALSE. + +When set_dcvs_params/set_core_params/set_bus_params is TRUE, the target_corner, min_corner and max_corner parameters of dcvs_params/core_params/bus_params can take one of the values in ::HAP_dcvs_voltage_corner_t: + +
HAP_dcvs_voltage_corner_t Description +
HAP_DCVS_VCORNER_DISABLE No specific corner request (No Vote) +
HAP_DCVS_VCORNER_SVS2 SVS2 / LOW SVS corner +Note: On targets that don't support this voltage corner, this option will be interpreted as HAP_DCVS_VCORNER_SVS +
HAP_DCVS_VCORNER_SVS SVS corner +
HAP_DCVS_VCORNER_SVS_PLUS SVS Plus corner +Note: On targets that don't support this voltage corner, this option will be interpreted as HAP_DCVS_VCORNER_SVS +
HAP_DCVS_VCORNER_NOM NOMINAL corner +
HAP_DCVS_VCORNER_NOM_PLUS NOMINAL Plus corner +Note: On targets that don't support this voltage corner, this option will be interpreted as HAP_DCVS_VCORNER_NOM +
HAP_DCVS_VCORNER_TURBO TURBO corner +
HAP_DCVS_VCORNER_TURBO_PLUS TURBO Plus corner +Note: On targets released up to Kailua, this option selects the clock frequencies defined under corners TURBO_PLUS and above (TURBO_L2 / L3) and falls back to TURBO when there is no clock frequency available at these corners. On targets after Kailua, this option selects clock frequencies defined under TURBO_PLUS (or TURBO when no frequency is defined under TURBO_PLUS). Frequencies defined under the TURBO_L2 / L3 corners can be selected via the new HAP_DCVS_VCORNER_TURBO_L2 / L3 options. +
HAP_DCVS_VCORNER_TURBO_L2 TURBO L2 corner +Note: On targets released up to Kailua, this option is interpreted as HAP_DCVS_VCORNER_TURBO_PLUS. On targets after Kailua, this option selects the closest TURBO clock frequency (corresponding to HAP_DCVS_VCORNER_TURBO_PLUS / TURBO) when there is no clock frequency defined under the TURBO_L2 voltage corner. +
HAP_DCVS_VCORNER_TURBO_L3 TURBO L3 corner +Note: On targets released up to Kailua, this option is interpreted as HAP_DCVS_VCORNER_TURBO_PLUS. On targets after Kailua, this option selects the closest TURBO clock frequency (corresponding to HAP_DCVS_VCORNER_TURBO_L2 / TURBO_PLUS / TURBO) when there is no clock frequency defined under the TURBO_L3 voltage corner. +
HAP_DCVS_VCORNER_MAX MAX possible corner defined for maximum performance. +
+
+ +
dcvs_params/core_params/bus_params target_corner Type: HAP_dcvs_voltage_corner_t. +Alternative to the HAP_power_set_mips_bw MIPS and bandwidth request. HAP_power_set provides the following ways to vote for sleep latency and core/bus clocks. +1. via HAP_power_set_mips_bw request type: +~~~{.c} +/* For core clock */ +mips_bw.set_mips = TRUE; +mips_bw.mipsPerThread = +mips_bw.mipsTotal = +/* For bus clock */ +mips_bw.set_bus_bw = TRUE; +mips_bw.bwBytePerSec = +mips_bw.busbwUsagePercentage = +/* For sleep latency */ +mips_bw.set_latency = TRUE; +mips_bw.latency = +~~~ +2. via HAP_power_set_DCVS_v2 request type: +~~~{.c} +/* For core and bus clock */ +dcvs_v2.set_dcvs_params = TRUE; +dcvs_v2.dcvs_params.target_corner = +/* For sleep latency */ +dcvs_v2.set_latency = TRUE; +dcvs_v2.latency = +~~~ +or + +3. via HAP_power_set_DCVS_v3 request type: +~~~{.c} +/* For core clock */ +dcvs_v3.set_core_params = TRUE; +dcvs_v3.core_params.target_corner = +/* For bus clock */ +dcvs_v3.set_bus_params = TRUE; +dcvs_v3.bus_params.target_corner = +/* For sleep latency */ +dcvs_v3.set_latency = TRUE; +dcvs_v3.latency = +~~~ +A client can request the core and bus clocks to run at a particular voltage corner instead of providing MIPS and bandwidth (bytes per second) requests. DCVS will convert the requested voltage corner value to appropriate core clock and bus clock votes and forward the request to the power manager on the client's behalf. Clients should use only one of the above methods to vote, i.e., either via mips_bw or via dcvs_v2/dcvs_v3, but not both. Voting via dcvs_v2/dcvs_v3 does NOT cancel any previous vote done via mips_bw and vice versa. To switch between these methods, cancel any previous vote done via the other method before requesting. + +When target_corner = HAP_DCVS_VCORNER_DISABLE (no vote), DSP DCVS doesn't request any core or bus clocks at the time of the API call and it is the client's responsibility to vote for core and bus clocks using the HAP_power_set_mips_bw request type. + +If enabled (> HAP_DCVS_VCORNER_DISABLE), DSP DCVS logic will pick the highest available frequency plan for both core and bus clocks at the given voltage corner and request these clock frequencies synchronously in the API context on the client's behalf. When the HAP_power_set API returns with success, the core and bus clock frequencies will have been set by DSP DCVS on a valid target_corner request. +
min_corner Type: HAP_dcvs_voltage_corner_t. + +If disabled (min_corner == HAP_DCVS_VCORNER_DISABLE), the lower threshold/minimum value to which DCVS can correct the clock remains unchanged. If enabled (> HAP_DCVS_VCORNER_DISABLE), DSP DCVS picks the lowest core clock frequency at the given voltage corner and uses it as the lower threshold/minimum value that DCVS can correct the clock to, irrespective of the dcvs_option selected. + +min_corner should always be less than or equal to target_corner and max_corner unless they are disabled (HAP_DCVS_VCORNER_DISABLE). + +For clients requesting dcvs_enable as FALSE and using target_corner, min_corner should be equal to target_corner. + +
max_corner Type: HAP_dcvs_voltage_corner_t. + +If disabled (max_corner == HAP_DCVS_VCORNER_DISABLE), the upper threshold/maximum value to which DCVS can correct the clock remains unchanged; typically, that would be HAP_DCVS_VCORNER_MAX in this case. If enabled (> HAP_DCVS_VCORNER_DISABLE), DSP DCVS picks the highest core and bus clock frequencies at the given voltage corner and uses them as the upper threshold/maximum value that DCVS can correct the clocks to, irrespective of the dcvs_option selected. + +DSP DCVS logic overrides the max_corner vote from a client to MAX in the presence of a concurrency. Concurrency is defined as a scenario where two or more dynamically loaded FastRPC clients are active, or Audio/Voice sessions are active with an MPPS load greater than a pre-defined threshold. + +max_corner should always be greater than or equal to the target_corner and min_corner votes, or should be disabled (HAP_DCVS_VCORNER_DISABLE). + +
param1 Type: HAP_dcvs_voltage_corner_t. + +NOTE: Set this option to HAP_DCVS_VCORNER_DISABLE unless required. + +This parameter allows the user to set the CPU L3 clock frequency to the requested corner. It is valid only on the CDSP subsystem in targets with CPU L3 cache and IO-coherency enabled (SDM845, SDM710, SM8150...), and ignored elsewhere. On CDSP, based on the requested target_corner, the CPU L3 clock vote from CDSP is set to a balanced level (with minimal power impact) to start with, and DCVS (if enabled) increases the vote based on need to attain higher performance. This option is useful to peg the CPU L3 clock at a higher level (at the cost of higher power) than that of the default balanced vote and that of the DCVS algorithm votes. This option is for advanced users and should be configured to the default (HAP_DCVS_VCORNER_DISABLE) unless there is a need to explicitly set the CPU L3 clock frequency based on performance and power analysis/characterization. + +
param2 Reserved. +
param3 Reserved. +
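+ +As an illustration of the param1 option, here is a minimal sketch (a hypothetical configuration, not taken from the SDK examples; ctx is a unique identifier, explained [here](#usage)) that pegs the CPU L3 clock at NOM alongside a NOM target corner: +~~~{.c} +int retVal; +HAP_power_request_t request; +memset(&request, 0, sizeof(HAP_power_request_t)); //Important to clear the structure if only selected fields are updated. +request.type = HAP_power_set_DCVS_v2; +request.dcvs_v2.dcvs_enable = TRUE; +request.dcvs_v2.dcvs_option = HAP_DCVS_V2_POWER_SAVER_MODE; +request.dcvs_v2.set_dcvs_params = TRUE; +request.dcvs_v2.dcvs_params.min_corner = HAP_DCVS_VCORNER_SVS; +request.dcvs_v2.dcvs_params.max_corner = HAP_DCVS_VCORNER_TURBO; +request.dcvs_v2.dcvs_params.target_corner = HAP_DCVS_VCORNER_NOM; +/* Advanced option (illustrative value): peg the CPU L3 clock at NOM */ +request.dcvs_v2.dcvs_params.param1 = HAP_DCVS_VCORNER_NOM; +retVal = HAP_power_set(ctx, &request); +~~~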
+ +####Clock frequency level selection at given target corner +By default, DCVS picks the highest available frequency for a given core/bus clock target corner. On the latest chipsets (released after Palima), APIs are added to allow the user to specify the frequency level (highest/lowest) for a given core/bus clock target corner. See [HAP_dcvs.h](../../doxygen/HAP_dcvs/index.html) for more information. + +####DCVS vote aggregation logic in case of concurrency +The following explains the aggregation logic for min and target corner votes when there are multiple requesting clients: +~~~{.c} +DCVS min_corner vote = MAX (min_corner vote client 1, client 2, ...) +DCVS target_corner vote = MAX (target_corner vote client 1, client 2, ...) +~~~ +The following scenarios are treated as a concurrency in the DCVS vote aggregation logic, where the DCVS max corner vote is set to MAX by DCVS: +* More than one active HAP client, with or without active Audio/Voice clients. +* One active HAP client and active Audio/Voice clients with an MPPS load greater than a pre-defined threshold. +~~~{.c} + DCVS max_corner vote = HAP_DCVS_VCORNER_MAX +~~~ + +Note that DCVS overrides the client's max corner vote to MAX to accommodate any concurrency requirement. A max corner vote of MAX doesn't necessarily mean that DCVS will push the clocks to the MAX corner; the max corner vote just sets the upper threshold for the DCVS vote logic. DCVS will only bump up the clocks on a need basis, per the selected DCVS option. + +####Sleep Disable {#sleep_disable} +The 'set_sleep_disable' and 'sleep_disable' parameters of the dcvs_v3 structure enable the user to control low-power mode (LPM) selection in the DSP. + +In general, applications are expected to vote for their latency tolerance via the [latency](#sleep-latency) parameter in the dcvs_v3/dcvs_v2 options. The aggregated latency vote across clients is used in selecting the appropriate low-power mode (LPM) of the DSP subsystem. LPM saves power when the DSP subsystem is idle by reducing leakage current. Deeper LPMs typically have higher wake-up latencies, which will increase interrupt service delays and add to inter-processor communication latencies. Though the latency vote controls the selection of low-power modes, the vote required for disabling/allowing certain LPMs is difficult to calculate, as the wakeup latency associated with these LPMs can change from chipset to chipset and between runs within the same chipset. + +The 'sleep_disable' parameter in dcvs_v3 allows the user to directly prevent certain LPM levels of the DSP subsystem. By default, there is no restriction placed on LPMs, i.e., all the LPMs are enabled and the aggregated latency vote (along with other system parameters) is used in LPM selection. The 'sleep_disable' parameter in dcvs_v3 is for advanced developers who would like to disable certain low-power modes explicitly irrespective of the latency vote. Developers need to consider their power-performance tradeoff requirements and, if necessary, profile the results before voting using this parameter. Regular users should choose the default, i.e., 'HAP_DCVS_LPM_ENABLE_ALL'. + +If a particular LPM level is not supported on the DSP subsystem, the nearest shallower LPM level is enabled. For example, in the absence of 'HAP_DCVS_LPM_LEVEL3', 'HAP_DCVS_LPM_LEVEL2' is selected, which is the nearest shallower LPM level to 'HAP_DCVS_LPM_LEVEL3'. + +
set_sleep_disable FALSE No low-power mode request from the client. If FALSE, the sleep_disable field is ignored. +
TRUE The client's low-power mode request is valid and the desired option is provided in the sleep_disable field. +
sleep_disable HAP_DCVS_LPM_LEVEL1 To disable sleep/low-power modes. +
HAP_DCVS_LPM_LEVEL2 To enable only standalone APCR. +
HAP_DCVS_LPM_LEVEL3 To enable RPM assisted APCR. +
HAP_DCVS_LPM_ENABLE_ALL To enable all low-power modes (enables full power collapse). +
+ +***NOTE:*** Up to Palima, only HAP_DCVS_LPM_LEVEL1 and HAP_DCVS_LPM_ENABLE_ALL are supported. + +####Illustrations (DCVS_V2) +NOTE: +For a working example, refer to the `$HEXAGON_SDK_ROOT\examples\common\benchmark_v65` application; see benchmark_setClocks() in src_dsp\benchmark_imp.c + +1. Requirement: Enable DCVS in PERFORMANCE mode, set sleep latency to 1000 microseconds, vote NOM in Target with SVS as Min and TURBO as Max. +~~~{.c} +//Vote + +/* Populate request structure */ +int retVal; +HAP_power_request_t request; +memset(&request, 0, sizeof(HAP_power_request_t)); //Important to clear the structure if only selected fields are updated. +request.type = HAP_power_set_DCVS_v2; +request.dcvs_v2.dcvs_enable = TRUE; +request.dcvs_v2.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE; +request.dcvs_v2.set_latency = TRUE; +request.dcvs_v2.latency = 1000; +request.dcvs_v2.set_dcvs_params = TRUE; +request.dcvs_v2.dcvs_params.min_corner = HAP_DCVS_VCORNER_SVS; +request.dcvs_v2.dcvs_params.max_corner = HAP_DCVS_VCORNER_TURBO; +request.dcvs_v2.dcvs_params.target_corner = HAP_DCVS_VCORNER_NOM; +/* Call HAP_power_set API with the updated request structure */ +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); +... +/* + * Processing block + */ +... +//To remove the vote +memset(&request, 0, sizeof(HAP_power_request_t)); //Remove all votes. +request.type = HAP_power_set_DCVS_v2; +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); +~~~ + +2. Requirement: Disable DCVS; do NOT vote for any corners/latency +~~~{.c} +//Vote + +/* Populate request structure */ +int retVal; +HAP_power_request_t request; +memset(&request, 0, sizeof(HAP_power_request_t)); //Important to clear the structure if only selected fields are updated. +request.type = HAP_power_set_DCVS_v2; +request.dcvs_v2.dcvs_enable = FALSE; +/* Call HAP_power_set API with the updated request structure */ +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); +~~~ + +3. Requirement: Enable DCVS in Power saver mode. Do NOT vote for any target corner/latency, but set MIN and MAX thresholds for DCVS to SVS and TURBO respectively. Clock voting will be done via the HAP_power_set_mips_bw request. +~~~{.c} +//Vote + +/* Populate request structure with dcvs_v2 request*/ +int retVal; +HAP_power_request_t request; +memset(&request, 0, sizeof(HAP_power_request_t)); //Important to clear the structure if only selected fields are updated. +request.type = HAP_power_set_DCVS_v2; +request.dcvs_v2.dcvs_enable = TRUE; +request.dcvs_v2.dcvs_option = HAP_DCVS_V2_POWER_SAVER_MODE; +request.dcvs_v2.set_dcvs_params = TRUE; +request.dcvs_v2.dcvs_params.min_corner = HAP_DCVS_VCORNER_SVS; +request.dcvs_v2.dcvs_params.max_corner = HAP_DCVS_VCORNER_TURBO; +request.dcvs_v2.dcvs_params.target_corner = HAP_DCVS_VCORNER_DISABLE; //no vote +/* Call HAP_power_set API with the updated request structure */ +/* ctx is a unique identifier, explained [here](#usage).
*/ +retVal = HAP_power_set(ctx, &request); +/* Populate (reuse) request structure with mips_bw request */ +memset(&request, 0, sizeof(HAP_power_request_t)); +request.type = HAP_power_set_mips_bw; +request.mips_bw.set_mips = TRUE; +request.mips_bw.mipsPerThread = 150; +request.mips_bw.mipsTotal = 600; +request.mips_bw.set_bus_bw = TRUE; +request.mips_bw.bwBytePerSec = 10*1000*1000; +request.mips_bw.busbwUsagePercentage = 50; +request.mips_bw.set_latency = TRUE; +request.mips_bw.latency = 1000; +/* Call HAP_power_set API with the updated request structure */ +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); // Core and bus clocks will be set by this request. +... +/* + * Processing block + */ +... +//To remove the dcvs_v2 vote +memset(&request, 0, sizeof(HAP_power_request_t)); //Remove all votes. +request.type = HAP_power_set_DCVS_v2; +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); +//To remove the mips_bw vote +memset(&request, 0, sizeof(HAP_power_request_t)); //Remove all votes +request.type = HAP_power_set_mips_bw; +request.mips_bw.set_mips = TRUE; +request.mips_bw.set_bus_bw = TRUE; +request.mips_bw.set_latency = TRUE; +request.mips_bw.latency = 65535; +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); +~~~ + +4. Requirement: Enable DCVS in DUTY CYCLE mode, vote TURBO in Target with SVS as Min. +~~~{.c} +//Vote + +/* Populate request structure */ +int retVal; +HAP_power_request_t request; +memset(&request, 0, sizeof(HAP_power_request_t)); //Important to clear the structure if only selected fields are updated. +request.type = HAP_power_set_DCVS_v2; +request.dcvs_v2.dcvs_enable = TRUE; +request.dcvs_v2.dcvs_option = HAP_DCVS_V2_DUTY_CYCLE_MODE; +request.dcvs_v2.set_latency = TRUE; +request.dcvs_v2.latency = 1000; +request.dcvs_v2.set_dcvs_params = TRUE; +request.dcvs_v2.dcvs_params.min_corner = HAP_DCVS_VCORNER_SVS; +request.dcvs_v2.dcvs_params.max_corner = HAP_DCVS_VCORNER_TURBO; +request.dcvs_v2.dcvs_params.target_corner = HAP_DCVS_VCORNER_TURBO; +/* Call HAP_power_set API with the updated request structure */ +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); +... +/* + * Processing block + */ +... +//To remove the vote +memset(&request, 0, sizeof(HAP_power_request_t)); //Remove all votes. +request.type = HAP_power_set_DCVS_v2; +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); +~~~ + +####Illustrations (DCVS_V3) + + +1. Requirement: Enable DCVS in POWER SAVER mode, set sleep latency to 1000 microseconds, vote NOM in Target with SVS as Min and TURBO as Max for the core clock, vote TURBO in Target with NOM as Min and TURBO PLUS as Max for the bus clock. Later change the bus clock vote to SVS_PLUS in Target with SVS as Min and NOM as Max. +~~~{.c} +//Vote + +/* Populate request structure */ +int retVal; +HAP_power_request_t request; +memset(&request, 0, sizeof(HAP_power_request_t)); //Important to clear the structure if only selected fields are updated.
+request.type = HAP_power_set_DCVS_v3; +request.dcvs_v3.set_dcvs_enable = TRUE; +request.dcvs_v3.dcvs_enable = TRUE; +request.dcvs_v3.dcvs_option = HAP_DCVS_V2_POWER_SAVER_MODE; +request.dcvs_v3.set_latency = TRUE; +request.dcvs_v3.latency = 1000; +request.dcvs_v3.set_core_params = TRUE; +request.dcvs_v3.core_params.min_corner = HAP_DCVS_VCORNER_SVS; +request.dcvs_v3.core_params.max_corner = HAP_DCVS_VCORNER_TURBO; +request.dcvs_v3.core_params.target_corner = HAP_DCVS_VCORNER_NOM; +request.dcvs_v3.set_bus_params = TRUE; +request.dcvs_v3.bus_params.min_corner = HAP_DCVS_VCORNER_NOM; +request.dcvs_v3.bus_params.max_corner = HAP_DCVS_VCORNER_TURBO_PLUS; +request.dcvs_v3.bus_params.target_corner = HAP_DCVS_VCORNER_TURBO; +/* Call HAP_power_set API with the updated request structure */ +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); +... +/* + * Processing block 1 + */ +... +//To update bus clock votes while keeping core clock and other parameters of dcvs_v3 request intact. +memset(&request, 0, sizeof(HAP_power_request_t)); +request.type = HAP_power_set_DCVS_v3; +request.dcvs_v3.set_bus_params = TRUE; +request.dcvs_v3.bus_params.min_corner = HAP_DCVS_VCORNER_SVS; +request.dcvs_v3.bus_params.max_corner = HAP_DCVS_VCORNER_NOM; +request.dcvs_v3.bus_params.target_corner = HAP_DCVS_VCORNER_SVS_PLUS; +/* Call HAP_power_set API with the updated request structure */ +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); +... +/* + * Processing block 2 + */ +... +//To remove the vote +memset(&request, 0, sizeof(HAP_power_request_t)); +request.type = HAP_power_set_DCVS_v3; +request.dcvs_v3.set_dcvs_enable = TRUE; +request.dcvs_v3.set_latency = TRUE; +request.dcvs_v3.latency = 65535; +request.dcvs_v3.set_core_params = TRUE; +request.dcvs_v3.set_bus_params = TRUE; +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); +~~~ + +2. Requirement: Enable DCVS in PERFORMANCE mode, vote TURBO in Target with NOM as Min and TURBO PLUS as Max for the core clock, do NOT vote for latency or the bus clock. +~~~{.c} +//Vote + +/* Populate request structure */ +int retVal; +HAP_power_request_t request; +memset(&request, 0, sizeof(HAP_power_request_t)); //Important to clear the structure if only selected fields are updated. +request.type = HAP_power_set_DCVS_v3; +request.dcvs_v3.set_dcvs_enable = TRUE; +request.dcvs_v3.dcvs_enable = TRUE; +request.dcvs_v3.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE; +request.dcvs_v3.set_core_params = TRUE; +request.dcvs_v3.core_params.min_corner = HAP_DCVS_VCORNER_NOM; +request.dcvs_v3.core_params.max_corner = HAP_DCVS_VCORNER_TURBO_PLUS; +request.dcvs_v3.core_params.target_corner = HAP_DCVS_VCORNER_TURBO; +/* Call HAP_power_set API with the updated request structure */ +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); +... +/* + * Processing block + */ +... +//To remove the vote +memset(&request, 0, sizeof(HAP_power_request_t)); +request.type = HAP_power_set_DCVS_v3; +request.dcvs_v3.set_dcvs_enable = TRUE; +request.dcvs_v3.set_core_params = TRUE; +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); +~~~ + +3. Requirement: Disable DCVS; do NOT vote for any corners/latency.
+~~~{.c} +//Vote + +/* Populate request structure */ +int retVal; +HAP_power_request_t request; +memset(&request, 0, sizeof(HAP_power_request_t)); //Important to clear the structure if only selected fields are updated. +request.type = HAP_power_set_DCVS_v3; +request.dcvs_v3.set_dcvs_enable = TRUE; +request.dcvs_v3.dcvs_enable = FALSE; +/* Call HAP_power_set API with the updated request structure */ +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); +~~~ + +4. Requirement: Disable sleep (all low power modes) and re-enable it after task completion. +~~~{.c} +//Vote + +/* Populate request structure */ +int retVal; +HAP_power_request_t request; +memset(&request, 0, sizeof(HAP_power_request_t)); //Important to clear the structure if only selected fields are updated. +request.type = HAP_power_set_DCVS_v3; +request.dcvs_v3.set_sleep_disable = TRUE; +request.dcvs_v3.sleep_disable = HAP_DCVS_LPM_LEVEL1; +/* Call HAP_power_set API with the updated request structure */ +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); +... +/* + * Processing block + */ +... +//To re-enable sleep. +memset(&request, 0, sizeof(HAP_power_request_t)); +request.type = HAP_power_set_DCVS_v3; +request.dcvs_v3.set_sleep_disable = TRUE; +request.dcvs_v3.sleep_disable = HAP_DCVS_LPM_ENABLE_ALL; +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); +~~~ + +5. Requirement: Enable DCVS in PERFORMANCE mode. Do NOT vote for any target corner/latency, but set MIN and MAX DCVS thresholds for the core clock to NOM and TURBO respectively, and set MIN and MAX DCVS thresholds for the bus clock to SVS and NOM respectively. Clock voting will be done via the HAP_power_set_mips_bw request. +~~~{.c} +//Vote + +/* Populate request structure with dcvs_v3 request*/ +int retVal; +HAP_power_request_t request; +memset(&request, 0, sizeof(HAP_power_request_t)); //Important to clear the structure if only selected fields are updated. +request.type = HAP_power_set_DCVS_v3; +request.dcvs_v3.set_dcvs_enable = TRUE; +request.dcvs_v3.dcvs_enable = TRUE; +request.dcvs_v3.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE; +request.dcvs_v3.set_core_params = TRUE; +request.dcvs_v3.core_params.min_corner = HAP_DCVS_VCORNER_NOM; +request.dcvs_v3.core_params.max_corner = HAP_DCVS_VCORNER_TURBO; +request.dcvs_v3.set_bus_params = TRUE; +request.dcvs_v3.bus_params.min_corner = HAP_DCVS_VCORNER_SVS; +request.dcvs_v3.bus_params.max_corner = HAP_DCVS_VCORNER_NOM; +/* Call HAP_power_set API with the updated request structure */ +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); +/* Populate (reuse) request structure with mips_bw request */ +memset(&request, 0, sizeof(HAP_power_request_t)); +request.type = HAP_power_set_mips_bw; +request.mips_bw.set_mips = TRUE; +request.mips_bw.mipsPerThread = 150; +request.mips_bw.mipsTotal = 600; +request.mips_bw.set_bus_bw = TRUE; +request.mips_bw.bwBytePerSec = 10*1000*1000; +request.mips_bw.busbwUsagePercentage = 50; +request.mips_bw.set_latency = TRUE; +request.mips_bw.latency = 1000; +/* Call HAP_power_set API with the updated request structure */ +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); // Core and bus clocks will be set by this request. +... +/* + * Processing block + */ +...
+//To remove the dcvs_v3 vote +memset(&request, 0, sizeof(HAP_power_request_t)); +request.type = HAP_power_set_DCVS_v3; +request.dcvs_v3.set_dcvs_enable = TRUE; +request.dcvs_v3.set_core_params = TRUE; +request.dcvs_v3.set_bus_params = TRUE; +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); +//To remove the mips_bw vote +memset(&request, 0, sizeof(HAP_power_request_t)); //Remove all votes +request.type = HAP_power_set_mips_bw; +request.mips_bw.set_mips = TRUE; +request.mips_bw.set_bus_bw = TRUE; +request.mips_bw.set_latency = TRUE; +request.mips_bw.latency = 65535; +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); +~~~ + +6. Requirement: Use wrapper APIs to: Enable DCVS in POWER SAVER AGGRESSIVE mode, set sleep latency to 1000 microseconds, vote NOM in Target with SVS as Min and TURBO as Max for the core clock, vote TURBO in Target with NOM as Min and TURBO PLUS as Max for the bus clock. +~~~{.c} +//Vote + +/* Populate request structure */ +int retVal; +HAP_power_request_t request; +HAP_power_set_dcvs_v3_init(&request); +retVal = HAP_power_set_dcvs_option(NULL, TRUE, HAP_DCVS_V2_POWER_SAVER_AGGRESSIVE_MODE); +retVal = HAP_power_set_sleep_latency(NULL, 1000); +retVal = HAP_power_set_core_corner(NULL, HAP_DCVS_VCORNER_NOM, HAP_DCVS_VCORNER_SVS, HAP_DCVS_VCORNER_TURBO); +retVal = HAP_power_set_bus_corner(NULL, HAP_DCVS_VCORNER_TURBO, HAP_DCVS_VCORNER_NOM, HAP_DCVS_VCORNER_TURBO_PLUS); +... +/* + * Processing block + */ +... +//To remove the vote +HAP_power_set_dcvs_v3_init(&request); +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); +~~~ + +7. Requirement: Enable DCVS in DUTY CYCLE mode, vote TURBO_PLUS in Target with SVS as Min for the core and bus clocks. +~~~{.c} +//Vote + +/* Populate request structure */ +int retVal; +HAP_power_request_t request; +memset(&request, 0, sizeof(HAP_power_request_t)); //Important to clear the structure if only selected fields are updated. +request.type = HAP_power_set_DCVS_v3; +request.dcvs_v3.set_dcvs_enable = TRUE; +request.dcvs_v3.dcvs_enable = TRUE; +request.dcvs_v3.dcvs_option = HAP_DCVS_V2_DUTY_CYCLE_MODE; +request.dcvs_v3.set_latency = TRUE; +request.dcvs_v3.latency = 1000; +request.dcvs_v3.set_core_params = TRUE; +request.dcvs_v3.core_params.min_corner = HAP_DCVS_VCORNER_SVS; +request.dcvs_v3.core_params.max_corner = HAP_DCVS_VCORNER_TURBO_PLUS; +request.dcvs_v3.core_params.target_corner = HAP_DCVS_VCORNER_TURBO_PLUS; +request.dcvs_v3.set_bus_params = TRUE; +request.dcvs_v3.bus_params.min_corner = HAP_DCVS_VCORNER_SVS; +request.dcvs_v3.bus_params.max_corner = HAP_DCVS_VCORNER_TURBO_PLUS; +request.dcvs_v3.bus_params.target_corner = HAP_DCVS_VCORNER_TURBO_PLUS; +/* Call HAP_power_set API with the updated request structure */ +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); +... +/* + * Processing block + */ +... +//To remove the vote +memset(&request, 0, sizeof(HAP_power_request_t)); +request.type = HAP_power_set_DCVS_v3; +request.dcvs_v3.set_dcvs_enable = TRUE; +request.dcvs_v3.set_latency = TRUE; +request.dcvs_v3.latency = 65535; +request.dcvs_v3.set_core_params = TRUE; +request.dcvs_v3.set_bus_params = TRUE; +/* ctx is a unique identifier, explained [here](#usage).
*/ +retVal = HAP_power_set(ctx, &request); +~~~ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_process.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_process.h new file mode 100755 index 0000000000000..a7aacf0287159 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_process.h @@ -0,0 +1,31 @@ +#ifndef HAP_PROCESS_H +#define HAP_PROCESS_H +/*============================================================================== + Copyright (c) 2024 Qualcomm Technologies Incorporated. + All Rights Reserved Qualcomm Technologies Proprietary + + Export of this technology or software is regulated by the U.S. + Government. Diversion contrary to U.S. law prohibited. +==============================================================================*/ + +/** @defgroup process_type Process type + * @{ + */ +/** Return values for HAP_get_pd_type + Returns any one of the below values depending on the type of PD spawned */ +enum process_type { + ROOT_PD = 0, + AUDIO_STATIC_PD = 1, + SENSOR_STATIC_PD = 2, + DYNAMIC_SIGNED_PD = 3, + DYNAMIC_UNSIGNED_PD = 4, + DYNAMIC_CPZ_PD = 5, + SECURE_PD = 6, + DYNAMIC_SYS_UNSIGNED_PD = 7, + OIS_STATIC_PD = 8, + MAX_PD_TYPE = 9 /**< Maximum number of supported PD types */ +}; +/** + * @} // process_type + */ +#endif \ No newline at end of file diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_ps.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_ps.h new file mode 100755 index 0000000000000..89eac2a080350 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_ps.h @@ -0,0 +1,164 @@ +#ifndef HAP_PS_H +#define HAP_PS_H +/*============================================================================== + Copyright (c) 2012-2019,2024 Qualcomm Technologies Incorporated. + All Rights Reserved Qualcomm Technologies Proprietary + + Export of this technology or software is regulated by the U.S. + Government. Diversion contrary to U.S. law prohibited. +==============================================================================*/ + +#include "AEEStdDef.h" +#include "HAP_process.h" + +/** + * Maximum allowed remote process name length + */ +#define PROCESS_NAME_LEN 56 + + +/** @defgroup manage_dynamic_list Manage Dynamic List. + * @{ + */ + +typedef struct HAP_process HAP_process; +struct HAP_process { + char name[PROCESS_NAME_LEN]; + int32 asid; + int32 hlos_pid; +}; + +/** + * Get list of active processes + * @param[out] num_processes : Number of active processes + * @param[out] processes : Pointer to the list of processes + * @return 0 on success, valid non-zero error code on failure + */ +int HAP_get_process_list(uint32* num_processes, HAP_process** processes); + +/** + * Add new entry to process list + * @param[in] process : Pointer to node to be added to the process list + * @return 0 on success, valid non-zero error code on failure + */ +int HAP_add_to_process_list(HAP_process* process); + +/** + * Remove entry from process list + * @param[in] hlos_pid : HLOS process ID of entry to be removed from the process list + * @return 0 on success, valid non-zero error code on failure + */ +int HAP_remove_from_process_list(int hlos_pid); + +/** + * Set name of current process + * @param[in] name : Name of process + * @return 0 on success, valid non-zero error code on failure + */ +int HAP_set_process_name(char *name); + +/** + * API deprecated from SM8150 onwards. 
*/ +int HAP_thread_migrate(int tidQ); + +/** + * @} + */ + + +/** @defgroup early_wakeup Signal early wakeup + * @{ + */ + + +/** Send signal to CPU for early wake up + * + * Send signal to CPU for early wake up with approximate time to complete the job. + * This signal helps to reduce FastRPC latency. + * + * Args: + * @param[in] tidQ : QuRT thread id of a skel invoke thread. Use qurt_thread_get_id() + * to retrieve the thread ID. + * @param[in] earlyWakeTime : approximate time (in us) to complete job after sending the signal + * Returns: 0 on success, valid non-zero error code on failure + */ +int HAP_send_early_signal(uint32_t tidQ, uint32_t earlyWakeTime); + +/** + * API deprecated from Lahaina onwards. Use HAP_send_early_signal() instead + */ +int fastrpc_send_early_signal(uint32_t tidQ, uint32_t earlyWakeTime); + +/** + * @} + */ + + + +/** @defgroup thread_priority_ceiling Enquire thread priority ceiling + * @{ + */ + + +/** Return the ceiling thread priority for the current process + * + * Return the thread priority ceiling for the current process. QuRT thread priorities + * run from 1 to 255, with 1 being the highest. Unprivileged user processes will + * have a ceiling priority of 64. + * + * Args: None + * Returns: Thread priority ceiling value (between 1 and 255) on success, -1 on failure + */ +int HAP_get_thread_priority_ceiling(void); + +/** + * Identifies the HAP request user pd parameters type + * @param HAP_req_get_orig_apps_pid : Returns the process original apps pid. + */ +typedef enum { + HAP_req_get_orig_apps_pid = 1, +} HAP_req_userpd_params_type; + +/** + * Data type to get requested value from the DSP + * @param type : Identifies the request type. + * @param orig_apps_pid : Returns the process original apps pid. + */ +typedef struct { + HAP_req_userpd_params_type type; + union { + int orig_apps_pid; + }; +} HAP_req_userpd_params_t; + +/** + * Method to retrieve user process values from the DSP. This API is supported from SM8750 onwards. + * @param [in] request : Request params. + * @return Returns 0 for success, error code on failure.
+ */ +int HAP_get_userpd_params(HAP_req_userpd_params_t *request); + +/** + * @} + */ + +/** @defgroup HAP_get_pd_type Query the PD type of the process + * @{ + */ + + +/** Function to get PD type of the spawned process + * + * Args: + * @param[out] pd_type : Pointer to enum process_type to get PD type + * @return 0 on success, valid non-zero error code on failure + */ +int HAP_get_pd_type(enum process_type* pd_type); + +/** + * @} + */ + +#endif /*HAP_PS_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_ps.md b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_ps.md new file mode 100755 index 0000000000000..eed59f7adb3ce --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_ps.md @@ -0,0 +1,33 @@ +# Introduction {#intro} + +These APIs allow a user to perform the following actions: +* Manage the dynamic list of processes running on the current DSP +* Send a wakeup call to the CPU in order to decrease its response time upon returning from a FastRPC call +* Enquire about the thread priority ceiling for the current process + + +## API Overview {#api-overview} + +The HAP_ps.h APIs include the following functions: + +* ::HAP_get_process_list + +* ::HAP_add_to_process_list + +* ::HAP_remove_from_process_list + +* ::HAP_set_process_name + +* ::HAP_thread_migrate + +* ::HAP_send_early_signal + +* ::fastrpc_send_early_signal + +* ::HAP_get_thread_priority_ceiling + +* ::HAP_get_userpd_params + +* ::HAP_get_pd_type + +Header file: @b HAP_ps.h diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_traceme.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_traceme.h new file mode 100755 index 0000000000000..a46c56a835479 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_traceme.h @@ -0,0 +1,28 @@ +#ifndef HAP_TRACEME_H +#define HAP_TRACEME_H +/*============================================================================== + Copyright (c) 2012-2013 Qualcomm Technologies Incorporated. + All Rights Reserved Qualcomm Technologies Proprietary + + Export of this technology or software is regulated by the U.S. + Government. Diversion contrary to U.S. law prohibited. +==============================================================================*/ + +#include "AEEStdDef.h" +#include "HAP_debug.h" + +#if defined(_DEBUG) + +static __inline void HAP_traceme(void) +{ + (void)HAP_debug_ptrace(HAP_DEBUG_TRACEME, 0, 0, 0); +} + +#else /* #if defined(_DEBUG) */ + +#define HAP_traceme() + +#endif /* #if defined(_DEBUG) */ + +#endif /* #ifndef HAP_TRACEME_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_user_pmu.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_user_pmu.h new file mode 100755 index 0000000000000..cde225749d3c8 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_user_pmu.h @@ -0,0 +1,221 @@ +/*----------------------------------------------------------------------------- + Copyright (c) 2019-2020 QUALCOMM Technologies, Incorporated. + All Rights Reserved. + QUALCOMM Proprietary. +-----------------------------------------------------------------------------*/ + +#ifndef HAP_USER_PMU_H_ +#define HAP_USER_PMU_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @file HAP_user_pmu.h + * @brief HAP user PMU API + */ + +/** @defgroup Constants constants + * @{ + */ + +/** Error value for unsupported APIs. */ +#define HAP_USER_PMU_READ_NOT_SUPPORTED 0x80000FFF + +/** Error value for PMU read failure. 
*/ +#define HAP_USER_PMU_READ_FAILED 0xDEADDEAD + +/** @} + */ + +/** @defgroup Types Data types + * @{ + */ + +/** + * Input parameter type used when a group of PMU events must be read via + * HAP_register_pmu_group(), HAP_read_pmu_group() and HAP_deregister_pmu_group(). + + * The user must fill in the pmu_events[] array field of this structure with the + * specified PMU events to track and update the num_events field with the number + * of events to track. Only four unique PMU events can be tracked. + */ +typedef struct { + int contextId; + /**< Return value after registering the PMU group via HAP_register_pmu_group. */ + + unsigned int num_events; + /**< Input parameter specifying the number of PMU events to register.*/ + + unsigned short pmu_events[4]; + /**< Input parameter specifying the list of PMU events to register.*/ + + unsigned int pmu_value[4]; + /**< Output parameter containing values of PMU events registered. */ +} HAP_pmu_group_config_t; + +/** @} + */ + +/** + * @cond DEV + */ +int __attribute__((weak)) __HAP_register_pmu_group(HAP_pmu_group_config_t* pmu_config); +int __attribute__((weak)) __HAP_deregister_pmu_group(int contextId); +int __attribute__((weak)) __HAP_read_pmu_group(HAP_pmu_group_config_t* pmu_config); +int __attribute__((weak)) __HAP_register_pmu_event(unsigned short pmu_event); +int __attribute__((weak)) __HAP_deregister_pmu_event(unsigned short pmu_event); +unsigned int __attribute__((weak)) __HAP_read_pmu_event(unsigned short pmu_event); + +/** + * @endcond + */ + +/** @defgroup GroupFunc API for reading a group of PMUs + * These APIs expose a way to register and read an array of PMU events + * (maximum of four PMU events) by using the #HAP_pmu_group_config_t structure. + * Alternatively, the user can use a different set of APIs explained in the next + * section to configure and read a single PMU event. + * @{ + */ + +/** + * Registers a group of PMU events to read. + * + * Call this function from the DSP user process to register a set of PMU events + * (maximum of four) for tracking. Fill in the pmu_events[] array field of + * @p pmu_config with the specified PMU events to track (maximum of four) and + * update the num_events field of @p pmu_config with the number of PMU events + * written into the pmu_events[] array. + * + * @param pmu_config Pointer to HAP_pmu_group_config_t structure with + * pmu_events[] array and num_events fields updated. + * + * @return 0 upon success. Updates the contextId field of @p pmu_config. + * @par + * The same pmu_config structure should be used for reading the PMU + * counter values #HAP_read_pmu_group() corresponding to the + * configured events and for de-registration #HAP_deregister_pmu_group(). + */ +static inline int HAP_register_pmu_group(HAP_pmu_group_config_t* pmu_config) { + if(__HAP_register_pmu_group) + return __HAP_register_pmu_group(pmu_config); + + return HAP_USER_PMU_READ_NOT_SUPPORTED; +} + +/** + * Reads the PMU values of registered PMU events. + * + * Call this function after successfully calling HAP_register_pmu_group() with the + * same structure pointer, @p pmu_config. + * This API uses the context_id field of the input @p pmu_config + * structure, which is set in a successful HAP_register_pmu_group(). + * + * @param pmu_config Pointer to the #HAP_pmu_group_config_t structure used in + * #HAP_register_pmu_group() call. + * @return + * 0 upon success. Updates the pmu_value[] array corresponding to the + * configured pmu_events[] in the structure pointed to by @p pmu_config.
+ * pmu_value[x] is updated to HAP_USER_PMU_READ_FAILED if the corresponding pmu_event[x] + * configuration has failed or is invalid. + * @par + * Other values upon failure. \n + * @par + * #HAP_USER_PMU_READ_NOT_SUPPORTED when unsupported. + */ +static inline int HAP_read_pmu_group(HAP_pmu_group_config_t* pmu_config) { + if(__HAP_read_pmu_group) + return __HAP_read_pmu_group(pmu_config); + + return HAP_USER_PMU_READ_NOT_SUPPORTED; +} + +/** + * De-registers a group of PMU events registered via HAP_register_pmu_group(). + * + * @param pmu_config Pointer to the #HAP_pmu_group_config_t structure used in the + * HAP_register_pmu_group() call. + + * @return + * 0 upon success. \n + * Other values upon failure. + */ +static inline int HAP_deregister_pmu_group(HAP_pmu_group_config_t* pmu_config) { + if(__HAP_deregister_pmu_group) + return __HAP_deregister_pmu_group(pmu_config->contextId); + + return HAP_USER_PMU_READ_NOT_SUPPORTED; +} + +/** + * @} + */ + +/** @defgroup singleFunc API for reading single PMU event + * These APIs allow the user to configure and read single PMU events. + * The PMU event is used as an input to the register, read, and de-register APIs. + * Up to four unique PMU event requests can be served. + * @{ + */ + +/** + * Registers a PMU event for read. + * + * @param pmu_event PMU event to register. + * + * @return + * 0 upon success. \n + * Other values upon failure. + */ +static inline int HAP_register_pmu_event(unsigned short pmu_event) { + if(__HAP_register_pmu_event) + return __HAP_register_pmu_event(pmu_event); + + return HAP_USER_PMU_READ_NOT_SUPPORTED; +} + +/** + * Reads the PMU event registered via HAP_register_pmu_event(). + * + * @param pmu_event PMU event to read. Should already be registered via + * HAP_register_pmu_event(). + * + * @return + * The value of the PMU counter corresponding to the pmu_event. \n + * - HAP_USER_PMU_READ_NOT_SUPPORTED -- API is unsupported. \n + * - HAP_USER_PMU_READ_FAILED -- The given @p pmu_event read fails. + */ +static inline unsigned int HAP_read_pmu_event(unsigned short pmu_event) { + if(__HAP_read_pmu_event) + return __HAP_read_pmu_event(pmu_event); + + return HAP_USER_PMU_READ_NOT_SUPPORTED; +} + +/** + * De-registers the PMU event registered via HAP_register_pmu_event(). + * + * @param pmu_event PMU event to de-register. It should already be registered + * via #HAP_register_pmu_event(). + * + * @return + * 0 upon success. \n + * Other values upon failure. \n + * HAP_USER_PMU_READ_NOT_SUPPORTED when not supported. + */ +static inline int HAP_deregister_pmu_event(unsigned short pmu_event) { + if(__HAP_deregister_pmu_event) + return __HAP_deregister_pmu_event(pmu_event); + + return HAP_USER_PMU_READ_NOT_SUPPORTED; +} + +/** @} + */ + +#ifdef __cplusplus +} +#endif +#endif /*HAP_USER_PMU_H_*/ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_user_pmu.md b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_user_pmu.md new file mode 100755 index 0000000000000..1deab8f3af81d --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_user_pmu.md @@ -0,0 +1,26 @@ +# Performance monitoring unit + +The DSP subsystem has a PMU (Performance Monitoring Unit) with counters to track +hardware events (called PMU events). The HAP PMU framework exposes a +set of APIs to read these PMU counters configured with specified PMU events. PMU +events are Hexagon DSP architecture specific and the most common PMU events are briefly +described in the Hexagon DSP architecture documentation.
+The [itrace](../../doxygen/itrace/index.html) library's header file `itrace_dsp_events_pmu.h` +provides a complete list of all available public PMU events alongside their descriptions. + +***NOTE:*** +* aDSP and cDSP DCVS relies on a set of PMU events to monitor DSP +statistics and make necessary decisions. Using these HAP APIs to register PMU +events results in DCVS no longer being able to track these events. This might +lead DCVS to make incorrect decisions. +* HAP PMU APIs only work on [debug-enabled](../../tools/sign.html#test-device) devices. + +The HAP PMU APIs are not accessible from unsigned PD. + +## Supported chipsets + +SM8250 and beyond + +## Framework APIs + +Header file: @b HAP_user_pmu.h diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_vtcm_mgr.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_vtcm_mgr.h new file mode 100755 index 0000000000000..dbd66f0bf88d6 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_vtcm_mgr.h @@ -0,0 +1,214 @@ +/*----------------------------------------------------------------------------- + * Copyright (c) 2016-2020 Qualcomm Technologies, Inc. + * All Rights Reserved. + * Confidential and Proprietary - Qualcomm Technologies, Inc. +-----------------------------------------------------------------------------*/ + +#ifndef HAP_VTCM_MGR_H_ +#define HAP_VTCM_MGR_H_ + +#ifdef __cplusplus +extern "C" { +#endif + + +void* __attribute__((weak)) HAP_request_async_VTCM(unsigned int size, unsigned int single_page_flag, unsigned int timeout_us); + +/** + * @defgroup vtcmapi HAP VTCM manager API. + * This section describes the HAP VTCM manager API to allocate and release VTCM. + * @{ + */ + +/** + * @file HAP_vtcm_mgr.h + * @brief APIs used to allocate, release, and query Vector TCM (VTCM) memory. + * VTCM is a high-performance, tightly-coupled memory in the cDSP + * subsystem. It can be used for Hexagon Vector eXtensions (HVX) + * scatter/gather instructions, the Hexagon Matrix eXtension (HMX) engine + * (available in some cDSPs starting with Lahaina), or as high-performance + * scratch memory for other HVX workloads. + */ + +/** + * Request VTCM memory of a specified size and single page requirement. + * + * @param[in] size Size of the request in bytes. \n + * If (@p single_page_flag == 0), the size is aligned to 4 KB. \n + * If (@p single_page_flag == 1), the size is aligned to + * the closest possible page size: 4 KB, 16 KB, 64 KB, 256 KB, + * 1 MB, 4 MB, 16 MB. + * @param[in] single_page_flag Single page requirement for this allocation: + * 1 for single page requests, 0 otherwise. + * Single page requests are mandatory for + * scatter/gather operations because the operations + * must be contained within a single page of memory. + * (The memory region used by scatter/gather + * HVX instructions must reside in VTCM and cannot + * cross a page boundary). + * + * @return + * @c void* pointer to the allocated memory region on success. \n + * 0 on failure. + * + * @par Example + * @code + * // Request for a single page of 4000 bytes + * void *pVTCM = HAP_request_VTCM(4000, 1); + * if (0 != pVTCM) + * { + * // Allocation is successful. Try a release + * int result = HAP_release_VTCM(pVTCM); + * if (0 == result) + * { + * //Release successful + * } + * } + * @endcode + */ +void* HAP_request_VTCM(unsigned int size, unsigned int single_page_flag); + + /** + * Request VTCM memory of a specified size and single page requirement with a + * timeout option. + * + * This API can be used to wait for the provided timeout.
The calling thread is + * suspended until the requested VTCM memory is available or until the timeout, + * whichever happens first. + * + * @b NOTE: A deadlock might occur when calling this API if the same + * thread holds a part of, or the entire, VTCM memory prior to this call. + * This API is @a not supported from secure and CPZ PDs. + * + * @param[in] size Size of the request in bytes. \n + * If (@p single_page_flag == 0), the size is aligned to 4 KB. \n + * If (@p single_page_flag == 1), the size is aligned to + * the closest possible page size: 4 KB, 16 KB, 64 KB, 256 KB, + * 1 MB, 4 MB, 16 MB. + * @param[in] single_page_flag Single page requirement for this allocation: + * 1 for single page requests, 0 otherwise. + * Single page requests are mandatory for + * scatter/gather operations because the operations + * must be contained within a single page of memory. + * (The memory region used by scatter/gather + * instructions must reside in VTCM and cannot + * cross a page boundary). + * @param[in] timeout_us Timeout in microseconds. If the request is readily + * available, return success with a void pointer. If the + * request cannot be served, wait for the available VTCM + * memory until the timeout, or return failure on the + * timeout. This value must be greater than 200 for the + * timeout implementation to work; otherwise, it is treated + * like HAP_request_VTCM(). + * + * @return + * @c void* pointer to the allocated memory region on success. \n + * 0 on failure. + * + * @par Example + * @code + * // Request for a single page of 256 * 1024 bytes with + * // timeout set to 5 milliseconds + * void *pVTCM = HAP_request_async_VTCM(256 * 1024, 1, 5000); + * if (0 != pVTCM) + * { + * // Allocation is successful. Try a release + * int result = HAP_release_VTCM(pVTCM); + * if (0 == result) + * { + * //Release successful + * } + * } + * @endcode + */ +void* HAP_request_async_VTCM(unsigned int size, + unsigned int single_page_flag, + unsigned int timeout_us); + +/** + * Release a successful request for VTCM memory by providing the pointer + * to the previously allocated VTCM block. + * + * @param[in] pVA Pointer returned by a successful VTCM request call. + * + * @return + * @c int 0 on success. \n + * Non-zero on failure. + */ +int HAP_release_VTCM(void* pVA); + +/** + * Query for the VTCM size defined on target. + * + * @param[out] page_size Pointer to an @c unsigned @c int variable. + * If this parameter is non-zero on success, the memory + * location contains the maximum possible page size + * allocation (in bytes) in VTCM. + * @param[out] page_count Pointer to an @c unsigned @c int variable. + * If @p page_size is non-zero on success, the memory + * location contains the number of @p page_size + * blocks in VTCM. + * + * @return + * @c int 0 on success. \n + * Non-zero on failure. + * + * @par Example + * @code + * unsigned int page_size, page_count; + * if (0 == HAP_query_total_VTCM(&page_size, &page_count)) + * { + * // Query successful. + * // For SM8150 cDSP: + * // page_size will be 256 * 1024. + * // page_count will be 1. + * // VTCM memory defined for this chipset (256 KB) + * unsigned int total_vtcm = page_size * page_count; + * } + * @endcode + */ +int HAP_query_total_VTCM(unsigned int* page_size, unsigned int* page_count); + +/** + * API to query VTCM allocation status. + * + * @param[out] avail_block_size Pointer to an @c unsigned @c int variable.
+ * If this parameter is non-zero on success, the + * memory location contains the maximum contiguous + * memory chunk (in bytes) available in VTCM. + * @param[out] max_page_size Pointer to an @c unsigned @c int variable. + * If this parameter is non-zero, the memory location + * contains the maximum possible page size allocation + * (in bytes) in the available portion of VTCM. + * @param[out] num_pages Pointer to an @c unsigned @c int variable. + * If this parameter is non-zero on success, the memory + * location contains the number of @p max_page_size + * pages available in VTCM. + * + * @return + * @c int 0 on success. \n + * Non-zero on failure. + * + * @par Example + * @code + * unsigned int avail_block_size, max_page_size, num_pages; + * if (0 == HAP_query_avail_VTCM(&avail_block_size, &max_page_size, &num_pages)) + * { + * // Query successful. + * // Use avail_block_size, max_page_size, num_pages + * } + * @endcode + */ +int HAP_query_avail_VTCM(unsigned int* avail_block_size, + unsigned int* max_page_size, + unsigned int* num_pages); + +/** + * @} + */ + + +#ifdef __cplusplus +} +#endif + +#endif //HAP_VTCM_MGR_H_ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_vtcm_mgr.md b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_vtcm_mgr.md new file mode 100755 index 0000000000000..3d5542c05202a --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_vtcm_mgr.md @@ -0,0 +1,21 @@ +# VTCM manager + +Vector TCM (VTCM) is available on supported targets with cDSP. VTCM +is a high-performance, tightly-coupled memory in the cDSP subsystem that can be used +for Hexagon Vector eXtensions (HVX) scatter/gather instructions, the Hexagon Matrix +eXtension (HMX) (available in some cDSPs starting with Lahaina), or as +high-performance scratch memory for other HVX workloads. + +The VTCM manager exposes APIs from the `HAP_vtcm_mgr.h` file to allocate, free, and query the availability of VTCM. + +***NOTE:*** +Starting with Lahaina, use the [compute resource manager](../../doxygen/HAP_compute_res/index.html){target=_blank} API for VTCM allocations instead of this legacy VTCM manager API. The compute resource manager is expanded to provide user options to do the following: + +* Allocate other compute resources (including VTCM) +* Manage application IDs, which control VTCM partitions and privileges +* Send release callbacks, which can be invoked when a high priority client requires the resource +* Release and reacquire the same VTCM size and page configuration +* Request VTCM with granular sizes (minimum and maximum required) and specific page size requirements + +The VTCM manager API is restricted to allocate VTCM only from the +primary VTCM partition (if the partition is defined).
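+ +As a minimal usage sketch of this legacy API (the sizes here are illustrative only), a client can query the defined VTCM configuration, request a single page, and release it: +~~~{.c} +#include "HAP_vtcm_mgr.h" + +unsigned int page_size = 0, page_count = 0; +if (0 == HAP_query_total_VTCM(&page_size, &page_count)) +{ + // Request one single page (single page requests are required for scatter/gather) + void *pVTCM = HAP_request_VTCM(page_size, 1); + if (0 != pVTCM) + { + // ... use VTCM as HVX scratch or for scatter/gather ... + HAP_release_VTCM(pVTCM); + } +} +~~~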
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/adsp_mmap.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/adsp_mmap.h new file mode 100755 index 0000000000000..7e5fd438b0902 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/adsp_mmap.h @@ -0,0 +1,25 @@ +#ifndef ADSP_MMAP_H +#define ADSP_MMAP_H + +#ifdef __cplusplus +extern "C" { +#endif +#include "AEEStdDef.h" +/** + * @param buf, the buffer virtual address + * @param bufLen, the length + * @param flags, the flags it was mapped with, 0 by default + */ +int adsp_addref_mmap(void* buf, int bufLen, uint32 flags); + +/** + * @param buf, the buffer virtual address + * @param bufLen, the length + */ +int adsp_release_mmap(void* buf, int bufLen); + + +#ifdef __cplusplus +} +#endif +#endif// ADSP_MMAP_H diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/apps_mem.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/apps_mem.h new file mode 100755 index 0000000000000..bb9ed189aa41b --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/apps_mem.h @@ -0,0 +1,39 @@ +#ifndef _APPS_MEM_H +#define _APPS_MEM_H +#include "AEEStdDef.h" +#ifndef __QAIC_HEADER +#define __QAIC_HEADER(ff) ff +#endif //__QAIC_HEADER + +#ifndef __QAIC_HEADER_EXPORT +#define __QAIC_HEADER_EXPORT +#endif // __QAIC_HEADER_EXPORT + +#ifndef __QAIC_HEADER_ATTRIBUTE +#define __QAIC_HEADER_ATTRIBUTE +#endif // __QAIC_HEADER_ATTRIBUTE + +#ifndef __QAIC_IMPL +#define __QAIC_IMPL(ff) ff +#endif //__QAIC_IMPL + +#ifndef __QAIC_IMPL_EXPORT +#define __QAIC_IMPL_EXPORT +#endif // __QAIC_IMPL_EXPORT + +#ifndef __QAIC_IMPL_ATTRIBUTE +#define __QAIC_IMPL_ATTRIBUTE +#endif // __QAIC_IMPL_ATTRIBUTE +#ifdef __cplusplus +extern "C" { +#endif +__QAIC_HEADER_EXPORT int __QAIC_HEADER(apps_mem_request_map)(int heapid, uint32 ion_flags, uint32 rflags, uint32 vin, int32 len, uint32* vapps, uint32* vadsp) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(apps_mem_request_unmap)(uint32 vadsp, int32 len) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(apps_mem_request_map64)(int heapid, uint32 ion_flags, uint32 rflags, uint64 vin, int64 len, uint64* vapps, uint64* vadsp) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(apps_mem_request_unmap64)(uint64 vadsp, int64 len) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(apps_mem_share_map)(int fd, int size, uint64* vapps, uint64* vadsp) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(apps_mem_share_unmap)(uint64 vadsp, int size) __QAIC_HEADER_ATTRIBUTE; +#ifdef __cplusplus +} +#endif +#endif //_APPS_MEM_H diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/domain.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/domain.h new file mode 100755 index 0000000000000..62c6ecdadb0af --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/domain.h @@ -0,0 +1,14 @@ +/* + * Copyright (c) 2021 QUALCOMM Technologies Inc. + * All Rights Reserved. + * Confidential and Proprietary - Qualcomm Technologies, Inc. + * + */ + +#include "remote.h" + +#ifdef _AUTO + #include "domain_auto.h" +#else + #include "domain_default.h" +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/domain_default.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/domain_default.h new file mode 100755 index 0000000000000..efb741af8faee --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/domain_default.h @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2021 QUALCOMM Technologies Inc. + * All Rights Reserved. + * Confidential and Proprietary - Qualcomm Technologies, Inc. 
+ *
+ */
+
+#include "remote.h"
+
+domain supported_domains[] = {
+ {ADSP_DOMAIN_ID, ADSP_DOMAIN},
+ {MDSP_DOMAIN_ID, MDSP_DOMAIN},
+ {SDSP_DOMAIN_ID, SDSP_DOMAIN},
+ {CDSP_DOMAIN_ID, CDSP_DOMAIN},
+ {CDSP1_DOMAIN_ID, CDSP1_DOMAIN}
+};
+
+bool is_CDSP(int domain_id) {
+ return (domain_id == CDSP_DOMAIN_ID || domain_id == CDSP1_DOMAIN_ID);
+}
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/dspqueue.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/dspqueue.h
new file mode 100755
index 0000000000000..91a3a4b737b5e
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/dspqueue.h
@@ -0,0 +1,455 @@
+/*
+ Copyright (c) 2020 Qualcomm Technologies, Inc.
+ All Rights Reserved.
+ Confidential and Proprietary - Qualcomm Technologies, Inc.
+*/
+
+
+/** @file
+ Asynchronous DSP Packet Queue API.
+*/
+
+#ifndef DSPQUEUE_H
+#define DSPQUEUE_H
+
+#include <AEEStdDef.h>
+#include <stdint.h>
+#include <stddef.h>
+
+
+/** @defgroup dspqueue_consts Asynchronous DSP Packet Queue API Constants
+ * @{
+ */
+
+/** Infinite timeout */
+#define DSPQUEUE_TIMEOUT_NONE 0xffffffff
+
+
+/**
+ * Packet flags. The flags are used as a bitfield in packet read/write operations.
+ */
+enum dspqueue_packet_flags {
+ DSPQUEUE_PACKET_FLAG_MESSAGE = 0x0001, /**< Packet contains a message */
+ DSPQUEUE_PACKET_FLAG_BUFFERS = 0x0002, /**< Packet contains buffer references */
+ DSPQUEUE_PACKET_FLAG_WAKEUP = 0x0004, /**< Early wakeup packet */
+ DSPQUEUE_PACKET_FLAG_DRIVER_READY = 0x0008, /**< Packet is ready for driver consumption. Currently unused. */
+ DSPQUEUE_PACKET_FLAG_USER_READY = 0x0010, /**< Packet is ready for userspace library consumption */
+ DSPQUEUE_PACKET_FLAG_RESERVED_ZERO = 0xffe0
+};
+
+/**
+ * Buffer flags. The flags are used in dspqueue_buffer.flags as a bitfield.
+ */
+enum dspqueue_buffer_flags {
+ /* 1 and 2 reserved */
+ DSPQUEUE_BUFFER_FLAG_REF = 0x00000004, /**< Add a reference to a previously mapped buffer */
+ DSPQUEUE_BUFFER_FLAG_DEREF = 0x00000008, /**< Remove a reference from a previously mapped buffer */
+ DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER = 0x00000010, /**< Flush buffer from sender caches */
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_SENDER = 0x00000020, /**< Invalidate buffer from sender caches */
+ DSPQUEUE_BUFFER_FLAG_FLUSH_RECIPIENT = 0x00000040, /**< Flush buffer from recipient caches */
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT = 0x00000080, /**< Invalidate buffer from recipient caches */
+ DSPQUEUE_BUFFER_FLAG_RESERVED_ZERO = 0xffffff00
+};
+
+
+/**
+ * Statistics readable with dspqueue_get_stat()
+ */
+enum dspqueue_stat {
+ DSPQUEUE_STAT_READ_QUEUE_PACKETS = 1, /**< Number of packets in the read queue */
+ DSPQUEUE_STAT_READ_QUEUE_BYTES, /**< Number of bytes in the read queue */
+ DSPQUEUE_STAT_WRITE_QUEUE_PACKETS, /**< Number of packets in the write queue */
+ DSPQUEUE_STAT_WRITE_QUEUE_BYTES, /**< Number of bytes in the write queue */
+
+ DSPQUEUE_STAT_EARLY_WAKEUP_WAIT_TIME, /**< Total accumulated early wakeup wait time in microseconds.
+ Developers can use this value to tune their early wakeup
+ request timing; the target should be to have this value as
+ close to zero as possible while minimizing signaling latency.
+ For more information on tuning early wakeup requests, see the
+ "Performance Considerations" section in the main Hexagon SDK
+ "Asynchronous Packet Queue" document. */
+
+ DSPQUEUE_STAT_EARLY_WAKEUP_MISSES /**< Accumulated number of packets missed in the early wakeup loop.
+ Developers can use this value to tune their early wakeup
+ request timing.
If this value is above zero it indicates the + early wakeup request was sent too early and it expired before + the corresponding packet was received. + For more information on tuning early wakeup requests, see the + "Performance Considerations" section in the main Hexagon SDK + "Asynchronous Packet Queue" document. */ +}; + +/** @} + */ + + +/** @defgroup dspqueue_types Asynchronous DSP Packet Queue API Data Types + * @{ + */ + +struct dspqueue; +typedef struct dspqueue* dspqueue_t; /**< Queue handle */ + + +/** + * Buffer reference in a packet. + * The buffer must already be mapped to the DSP using the same file descriptor. + * The subsection of the buffer as specified by #offset and #size must fit + * entirely within the mapped buffer. + * Note that buffer references are tracked based on the buffer file descriptor, + * and taking/releasing a reference to a buffer applies to the entire buffer as + * mapped to the DSP, not just the subsection specified. + */ +struct dspqueue_buffer { + uint32_t fd; /**< Buffer file descriptor */ + uint32_t size; /**< Buffer size in bytes. The client can set this field + to zero when writing packets; in this case the + framework will set the field to the size of the + buffer as mapped. */ + uint32_t offset; /**< Offset within the buffer in bytes as allocated and mapped. + The virtual address #ptr includes the offset */ + uint32_t flags; /**< Buffer flags, see enum #dspqueue_buffer_flags */ + union { + void *ptr; /**< Buffer virtual address; NULL if not mapped in the local context */ + uint64_t address; + }; +}; + + +/** + * Callback function type for all queue callbacks + * + * @param queue Queue handle from dspqueue_create() / dspqueue_import() + * @param error Error code + * @param context Client-provided context pointer + */ +typedef void (*dspqueue_callback_t)(dspqueue_t queue, AEEResult error, void *context); + +/** @} + */ + + +#ifdef __cplusplus +extern "C" { +#endif + + +/** @defgroup dspqueue_funcs Asynchronous DSP Packet Queue API Functions + * @{ + */ + +/** + * Create a new queue to communicate with the DSP. Queues can only be + * created on the host CPU. + * + * @param [in] domain DSP to communicate with (CDSP_DOMAIN_ID in remote.h for cDSP) + * @param [in] flags Queue creation flags + * @param [in] req_queue_size Total request queue memory size in bytes; use 0 for system default + * @param [in] resp_queue_size Total response queue memory size in bytes; use 0 for system default + * @param [in] packet_callback Callback function called when there are new packets to read. + * The call will be done in a different thread's context. + * NULL to disable the callback. Clients cannot use blocking read + * calls if a packet callback has been set. + * @param [in] error_callback Callback function called on unrecoverable errors. NULL to disable. + * @param [in] callback_context Context pointer for callback functions + * @param [out] queue Queue handle + * + * @return 0 on success, error code on failure. + * - AEE_ENOMEMORY: Not enough memory available + * - AEE_EUNSUPPORTED: Message queue not supported on the given DSP + * - AEE_EBADPARM: Bad parameters, e.g. Invalid domain (use CDSP_DOMAIN_ID for cDSP), Too many queues open for the DSP in this process + * - AEE_ERPC: Internal RPC error, e.g. 
queue list corrupt
+ * - AEE_EBADSTATE: Bad internal state
+ */
+AEEResult dspqueue_create(int domain,
+ uint32_t flags,
+ uint32_t req_queue_size, uint32_t resp_queue_size,
+ dspqueue_callback_t packet_callback,
+ dspqueue_callback_t error_callback,
+ void *callback_context,
+ dspqueue_t *queue);
+
+/**
+ * Close a queue and free all memory associated with it. The
+ * function can be called on the host CPU with queue handles from
+ * dspqueue_create() or on the DSP with handles from
+ * dspqueue_import().
+ *
+ * @param [in] queue Queue handle from dspqueue_create() or dspqueue_import().
+ *
+ * @return 0 on success, error code on failure.
+ * - AEE_EBADPARM: Bad parameters, e.g. the queue is open on the DSP when attempting to close it on the host CPU
+ * - AEE_EBADSTATE: Bad internal state
+ */
+AEEResult dspqueue_close(dspqueue_t queue);
+
+/**
+ * Export a queue to the DSP. The CPU-side client calls this function
+ * and passes the resulting ID to the DSP, which can then call
+ * dspqueue_import() to access the queue.
+ *
+ * @param [in] queue Queue handle from dspqueue_create()
+ * @param [out] queue_id Queue ID
+ *
+ * @return 0 on success, error code on failure.
+ */
+AEEResult dspqueue_export(dspqueue_t queue, uint64_t *queue_id);
+
+/**
+ * Import a queue on the DSP based on an ID passed in from the host
+ * CPU. The DSP client can use the returned queue handle to access the
+ * queue and communicate with its host CPU counterpart.
+ *
+ * @param [in] queue_id Queue ID from dspqueue_export().
+ * @param [in] packet_callback Callback function called when there are new packets to read.
+ * The call will be done in a different thread's context.
+ * NULL to disable the callback.
+ * @param [in] error_callback Callback function called on unrecoverable errors. NULL to disable.
+ * @param [in] callback_context Context pointer for callback functions
+ * @param [out] queue Queue handle
+ *
+ * @return 0 on success, error code on failure.
+ * - AEE_EITEMBUSY: The queue has already been imported
+ * - AEE_EQURTTHREADCREATE: Unable to create callback thread; the system may have
+ * reached its thread limit.
+ * - AEE_EBADSTATE: Bad internal state
+ */
+AEEResult dspqueue_import(uint64_t queue_id,
+ dspqueue_callback_t packet_callback,
+ dspqueue_callback_t error_callback,
+ void *callback_context,
+ dspqueue_t *queue);
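+
+/*
+ * Illustrative sketch (not part of this header): a minimal DSP-side receive
+ * loop built on dspqueue_import() and the blocking dspqueue_read() declared
+ * below. Error handling is trimmed, and the queue ID is assumed to arrive
+ * through an application-defined FastRPC call.
+ *
+ *   dspqueue_t q;
+ *   if (0 == dspqueue_import(queue_id, NULL, NULL, NULL, &q)) {
+ *       uint32_t flags, nbufs, msglen;
+ *       struct dspqueue_buffer bufs[4];
+ *       uint8_t msg[64];
+ *       while (0 == dspqueue_read(q, &flags, 4, &nbufs, bufs,
+ *                                 sizeof(msg), &msglen, msg,
+ *                                 DSPQUEUE_TIMEOUT_NONE)) {
+ *           // Process the message and any buffer references here.
+ *       }
+ *       dspqueue_close(q);
+ *   }
+ */
+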
+/**
+ * Write a packet to a queue. This variant of the function will not
+ * block, and will instead return AEE_EWOULDBLOCK if the queue does not have
+ * enough space for the packet.
+ *
+ * With this function the client can pass separate pointers to the
+ * buffer references and message to include in the packet and the
+ * library copies the contents directly to the queue.
+ *
+ * @param [in] queue Queue handle from dspqueue_create() or dspqueue_import()
+ * @param [in] flags Packet flags. See enum #dspqueue_packet_flags
+ * @param [in] num_buffers Number of buffer references to insert to the packet;
+ * zero if there are no buffer references
+ * @param [in] buffers Pointer to buffer references
+ * @param [in] message_length Message length in bytes;
+ * zero if the packet contains no message
+ * @param [in] message Pointer to packet message
+ *
+ * @return 0 on success, error code on failure.
+ * - AEE_EWOULDBLOCK: The queue is full
+ * - AEE_EBADPARM: Bad parameters, e.g. buffers is NULL when num_buffers > 0, or
+ * the packet is too long to fit in the queue. The call will never succeed.
+ * - AEE_ENOSUCHMAP: Attempt to refer to an unmapped buffer. Buffers must be mapped to the DSP
+ * with fastrpc_mmap() before they can be used in queue packets.
+ * - AEE_EBADSTATE: Bad internal state
+ */
+AEEResult dspqueue_write_noblock(dspqueue_t queue, uint32_t flags,
+ uint32_t num_buffers, struct dspqueue_buffer *buffers,
+ uint32_t message_length, const uint8_t *message);
+
+/**
+ * Write a packet to a queue. If the queue is full this function will
+ * block until space becomes available or the request times out.
+ *
+ * With this function the client can pass separate pointers to the
+ * buffer references and message to include in the packet and the
+ * library copies the contents directly to the queue.
+ *
+ * @param [in] queue Queue handle from dspqueue_create() or dspqueue_import()
+ * @param [in] flags Packet flags. See enum #dspqueue_packet_flags
+ * @param [in] num_buffers Number of buffer references to insert to the packet;
+ * zero if there are no buffer references
+ * @param [in] buffers Pointer to buffer references
+ * @param [in] message_length Message length in bytes;
+ * zero if the packet contains no message
+ * @param [in] message Pointer to packet message
+ * @param [in] timeout_us Timeout in microseconds; use DSPQUEUE_TIMEOUT_NONE to
+ * block indefinitely until space is available or
+ * zero for non-blocking behavior.
+ *
+ * @return 0 on success, error code on failure.
+ * - AEE_EBADPARM: Bad parameters, e.g. buffers is NULL when num_buffers > 0, or
+ * the packet is too long to fit in the queue. The call will never succeed.
+ * - AEE_ENOSUCHMAP: Attempt to refer to an unmapped buffer. Buffers must be mapped to the DSP
+ * with fastrpc_mmap() before they can be used in queue packets.
+ * - AEE_EEXPIRED: Request timed out
+ * - AEE_EINTERRUPTED: The request was canceled
+ * - AEE_EBADSTATE: Bad internal state
+ */
+AEEResult dspqueue_write(dspqueue_t queue, uint32_t flags,
+ uint32_t num_buffers, struct dspqueue_buffer *buffers,
+ uint32_t message_length, const uint8_t *message,
+ uint32_t timeout_us);
+
+/**
+ * Read a packet from a queue. This variant of the function will not
+ * block, and will instead return AEE_EWOULDBLOCK if the queue has no
+ * packets available to read.
+ *
+ * This function will read packet contents directly into
+ * client-provided buffers. The buffers must be large enough to fit
+ * contents from the packet or the call will fail.
+ *
+ * @param [in] queue Queue handle from dspqueue_create() or dspqueue_import()
+ * @param [out] flags Packet flags. See enum #dspqueue_packet_flags
+ * @param [in] max_buffers The maximum number of buffer references that can fit in the "buffers" parameter
+ * @param [out] num_buffers The number of buffer references in the packet
+ * @param [out] buffers Buffer reference data from the packet
+ * @param [in] max_message_length Maximum message length that can fit in the "message" parameter
+ * @param [out] message_length Message length in bytes
+ * @param [out] message Packet message
+ *
+ * @return 0 on success, error code on failure.
+ * - AEE_EBADPARM: Bad parameters, e.g. the packet is too large to fit in the provided buffers
+ * - AEE_ENOSUCHMAP: The packet refers to an unmapped buffer. Buffers must be mapped to the DSP
+ * with fastrpc_mmap() before they can be used in queue packets.
+ * - AEE_EWOULDBLOCK: The queue is empty; try again later
+ * - AEE_EBADITEM: The queue contains a corrupted packet. Internal error.
+ * - AEE_EBADSTATE: Bad internal state + */ +AEEResult dspqueue_read_noblock(dspqueue_t queue, uint32_t *flags, + uint32_t max_buffers, uint32_t *num_buffers, struct dspqueue_buffer *buffers, + uint32_t max_message_length, uint32_t *message_length, uint8_t *message); + +/** + * Read a packet from a queue. If the queue is empty this function + * will block until a packet is available or the request times out. + * The queue must not have a packet callback set. + * + * This function will read packet contents directly into + * client-provided buffers. The buffers must be large enough to fit + * contents from the packet or the call will fail. + * + * @param [in] queue Queue handle from dspqueue_create() or dspqueue_import() + * @param [out] flags Packet flags. See enum #dspqueue_packet_flags + * @param [in] max_buffers The maximum number of buffer references that can fit in the "buffers" parameter + * @param [out] num_buffers The number of buffer references in the packet + * @param [out] buffers Buffer reference data from the packet + * @param [in] max_message_length Maximum message length that can fit in the "message" parameter + * @param [out] message_length Message length in bytes + * @param [out] message Packet message + * @param [in] timeout_us Timeout in microseconds; use DSPQUEUE_TIMEOUT_NONE to + * block indefinitely until a packet is available or + * zero for non-blocking behavior. + * + * @return 0 on success, error code on failure. + * - AEE_EBADPARM: Bad parameters, e.g. The packet is too large to fit in the provided buffers + * - AEE_ENOSUCHMAP: The packet refers to an unmapped buffer. Buffers must be mapped to the DSP + * with fastrpc_mmap() before they can be used in queue packets. + * - AEE_EBADITEM: The queue contains a corrupted packet. Internal error. + * - AEE_EBADSTATE: Bad internal state + * - AEE_EEXPIRED: Request timed out + * - AEE_EINTERRUPTED: The request was canceled + */ +AEEResult dspqueue_read(dspqueue_t queue, uint32_t *flags, + uint32_t max_buffers, uint32_t *num_buffers, struct dspqueue_buffer *buffers, + uint32_t max_message_length, uint32_t *message_length, uint8_t *message, + uint32_t timeout_us); + +/** + * Retrieve information for the next packet if available, without reading + * it from the queue and advancing the read pointer. This function + * will not block, but will instead return an error if the queue is + * empty. + * + * @param [in] queue Queue handle from dspqueue_create() or dspqueue_import(). + * @param [out] flags Packet flags. See enum #dspqueue_packet_flags + * @param [out] num_buffers Number of buffer references in packet + * @param [out] message_length Packet message length in bytes + * + * @return 0 on success, error code on failure. + * - AEE_EWOULDBLOCK: The queue is empty; try again later + * - AEE_EBADITEM: The queue contains a corrupted packet. Internal error. + * - AEE_EBADSTATE: Bad internal state + */ +AEEResult dspqueue_peek_noblock(dspqueue_t queue, uint32_t *flags, uint32_t *num_buffers, + uint32_t *message_length); + +/** + * Retrieve information for the next packet, without reading it from the + * queue and advancing the read pointer. If the queue is empty this + * function will block until a packet is available or the request + * times out. + * + * @param [in] queue Queue handle from dspqueue_create() or dspqueue_import(). + * @param [out] flags Packet flags. 
See enum #dspqueue_packet_flags
+ * @param [out] num_buffers Number of buffer references in packet
+ * @param [out] message_length Packet message length in bytes
+ * @param [in] timeout_us Timeout in microseconds; use DSPQUEUE_TIMEOUT_NONE to
+ * block indefinitely until a packet is available or
+ * zero for non-blocking behavior.
+ *
+ * @return 0 on success, error code on failure.
+ * - AEE_EEXPIRED: Request timed out
+ * - AEE_EINTERRUPTED: The request was canceled
+ * - AEE_EBADITEM: The queue contains a corrupted packet. Internal error.
+ * - AEE_EBADSTATE: Bad internal state
+ */
+AEEResult dspqueue_peek(dspqueue_t queue, uint32_t *flags, uint32_t *num_buffers,
+ uint32_t *message_length, uint32_t timeout_us);
+
+
+/**
+ * Write an early wakeup packet to the queue. Early wakeup packets are used
+ * to bring the recipient out of a low-power state in anticipation of a real
+ * message packet being available shortly, and are typically used from the DSP
+ * to signal that an operation is almost complete.
+ *
+ * This function will return immediately if the queue is full. There is no
+ * blocking variant of this function; if the queue is full the other endpoint
+ * should already be processing data and an early wakeup would not be useful.
+ *
+ *
+ * @param [in] queue Queue handle from dspqueue_create() or dspqueue_import()
+ * @param [in] wakeup_delay Wakeup time in microseconds; this indicates how soon
+ * the real message packet should be available. Zero if not known.
+ * The recipient can use this information to determine how to
+ * wait for the packet.
+ * @param [in] packet_flags Flags for the upcoming packet if known.
+ * The framework can use this information to optimize its
+ * behavior if the flags match the upcoming packet; if not known
+ * set to zero.
+ * See enum #dspqueue_packet_flags
+ *
+ * @return 0 on success, error code on failure.
+ * - AEE_EWOULDBLOCK: The queue is full
+ * - AEE_EBADSTATE: Bad internal state
+ */
+AEEResult dspqueue_write_early_wakeup_noblock(dspqueue_t queue, uint32_t wakeup_delay, uint32_t packet_flags);
+
+
+/**
+ * Retrieve statistics from a queue. Statistics are relative to the queue
+ * as viewed from the current endpoint (e.g. "read queue" refers to the
+ * queue as being read by the current endpoint).
+ *
+ * Reading an accumulating statistic (such as early wakeup wait time)
+ * will reset it to zero.
+ *
+ * Note that statistics values are only valid at the time when they're
+ * read. By the time this function returns the values may have
+ * changed due to actions from another thread or the other queue
+ * endpoint.
+ *
+ * @param [in] queue Queue handle from dspqueue_create() or dspqueue_import()
+ * @param [in] stat Statistic to read, see enum dspqueue_stat
+ * @param [out] value Statistic value. Reading a statistic will reset it to zero
+ *
+ * @return 0 on success, error code on failure.
+ * - AEE_EBADPARM: Invalid statistic
+ */
+
+AEEResult dspqueue_get_stat(dspqueue_t queue, enum dspqueue_stat stat, uint64_t *value);
+
+
+/** @}
+ */
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //DSPQUEUE_H
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/dspqueue.md b/prebuilts/Hexagon_SDK/6.2.0.1/incs/dspqueue.md
new file mode 100755
index 0000000000000..3f1ff3206156b
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/dspqueue.md
@@ -0,0 +1,36 @@
+# Asynchronous DSP Packet Queue
+
+## API Overview {#api-overview}
+
+The Asynchronous DSP Packet Queue is accessed through a simple C
+API. Most of the API is identical on both the host CPU and DSP with
+the exception that queues can only be created on the CPU.
+
+* dspqueue_create(): Create a new queue. Queues can only be created
+  on the host CPU.
+
+* dspqueue_close(): Close a queue
+
+* dspqueue_export(): Export a queue on the host CPU, creating a
+  handle that can be used with dspqueue_import() on the DSP.
+
+* dspqueue_import(): Import a queue for use on the DSP, using a
+  handle returned from dspqueue_export() on the CPU.
+
+* dspqueue_write() / dspqueue_write_noblock(): Write a packet to a
+  queue. Writes can either block if the queue is full or return an
+  error (dspqueue_write_noblock()); blocking writes can optionally
+  have a timeout.
+
+* dspqueue_read() / dspqueue_read_noblock(): Read a packet from a
+  queue.
+
+* dspqueue_peek() / dspqueue_peek_noblock(): Retrieve information
+  about the next packet without consuming it.
+
+* dspqueue_write_early_wakeup_noblock(): Write an early wakeup packet to the
+  queue.
+
+* dspqueue_get_stat(): Retrieve queue statistics, including the number
+  of packets queued and statistics about early wakeup.
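+
+As an illustrative sketch (not taken from the SDK documentation), a host-side
+client typically creates and exports a queue, hands the ID to its DSP
+counterpart through an application-defined FastRPC call (`my_skel_set_queue()`
+and `handle` below are hypothetical), and then exchanges packets:
+
+    dspqueue_t q;
+    uint64_t qid;
+    // Default queue sizes, no callbacks: use blocking reads/writes.
+    if (0 == dspqueue_create(CDSP_DOMAIN_ID, 0, 0, 0,
+                             NULL, NULL, NULL, &q)) {
+        dspqueue_export(q, &qid);
+        my_skel_set_queue(handle, qid);  // hypothetical FastRPC call
+        // ... dspqueue_write() requests, dspqueue_read() responses ...
+        dspqueue_close(q);
+    }
+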
Most of the API is identical on both the host CPU and DSP with +the exception that queues can only be created on the CPU. + +* dspqueue_create(): Create a new queue. Queues can only be created + on the host CPU. + +* dspqueue_close(): Close a queue + +* dspqueue_export(): Export a queue on the host CPU, creating a + handle that be used with dspqueue_import() on the DSP. + +* dspqueue_import(): Import a queue for use on the DSP, using a + handle returned from dspqueue_export() on the CPU. + +* dspqueue_write() / dspqueue_write_noblock(): Write a packet to a + queue. Writes can either block if the queue is full or return an + error (dspqueue_write_noblock()); blocking writes can optionally + have a timeout. + +* dspqueue_read() / dspqueue_read_noblock(): Read a packet from a + queue. + +* dspqueue_peek() / dspqueue_peek_noblock(): Retrieve information + about the next packet without consuming it. + +* dspqueue_write_early_wakeup_noblock(): Write an early wakeup packet to the + queue. + +* dspqueue_get_stat(): Retrieve queue statistics, including the number + of packets queued and statistics about early wakeup. + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/dynsymbols.lst b/prebuilts/Hexagon_SDK/6.2.0.1/incs/dynsymbols.lst new file mode 100755 index 0000000000000..17663f8ddb672 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/dynsymbols.lst @@ -0,0 +1,1364 @@ +{ +___dladdr; +___dlclose; +___dlerror; +___dlopen; +___dlsym; +__assert; +__builtin_mr_assignment; +__builtin_pseudo_barrier; +__builtinfunction_bitrev_update1_for_load; +__builtinfunction_bitrev_update1_for_store; +__builtinfunction_bitrev_update2_for_load; +__builtinfunction_bitrev_update2_for_store; +__builtinfunction_bitrev_update4_for_load; +__builtinfunction_bitrev_update4_for_store; +__builtinfunction_bitrev_update8_for_load; +__builtinfunction_bitrev_update8_for_store; +__builtinfunction_bitreverse; +__builtinfunction_bitrevupdate; +__builtinfunction_circular_update1_for_load; +__builtinfunction_circular_update1_for_store; +__builtinfunction_circular_update1I_for_load; +__builtinfunction_circular_update1I_for_store; +__builtinfunction_circular_update2_for_load; +__builtinfunction_circular_update2_for_store; +__builtinfunction_circular_update2I_for_load; +__builtinfunction_circular_update2I_for_store; +__builtinfunction_circular_update4_for_load; +__builtinfunction_circular_update4_for_store; +__builtinfunction_circular_update4I_for_load; +__builtinfunction_circular_update4I_for_store; +__builtinfunction_circular_update8_for_load; +__builtinfunction_circular_update8_for_store; +__builtinfunction_circular_update8I_for_load; +__builtinfunction_circular_update8I_for_store; +__builtinfunction_circupdate; +__CTOR_END__; +__cxa_atexit; +__cxa_finalize; +__cxa_finalize_stub; +__cxa_guard_abort; +__cxa_guard_acquire; +__cxa_guard_release; +__cxa_pure_virtual; +__deallocframe; +__default_hash; +__deregister_frame_info_bases; +__divdc3; +__divsc3; +__divxc3; +__dladdr; +__dlclose; +__dlerror; +__dlopen; +__dlsym; +__dso_handle; +__DTOR_LIST__; +__eh_nodes; +__hexagon_adddf3; +__hexagon_addsf3; +__hexagon_cmpdf2; +__hexagon_cmpsf2; +__hexagon_cmpxdf2; +__hexagon_cmpxsf2; +__hexagon_divdf3; +__hexagon_divdi3; +__hexagon_divsf3; +__hexagon_divsi3; +__hexagon_eqdf2; +__hexagon_eqsf2; +__hexagon_extendsfdf2; +__hexagon_fast2_adddf3; +__hexagon_fast2_divdf3; +__hexagon_fast2_divsf3; +__hexagon_fast2_muldf3; +__hexagon_fast2_sqrt; +__hexagon_fast2_sqrtdf2; +__hexagon_fast2_sqrtf; +__hexagon_fast2_subdf3; +__hexagon_fast_adddf3; 
+__hexagon_fast_divdf3; +__hexagon_fast_divsf3; +__hexagon_fast_gtdf2; +__hexagon_fast_ltdf2; +__hexagon_fast_muldf3; +__hexagon_fast_negdf2; +__hexagon_fast_sqrt; +__hexagon_fast_sqrtdf2; +__hexagon_fast_sqrtf; +__hexagon_fast_subdf3; +__hexagon_fixdfdi; +__hexagon_fixdfsi; +__hexagon_fixdfti; +__hexagon_fixsfdi; +__hexagon_fixsfsi; +__hexagon_fixsfti; +__hexagon_fixunsdfdi; +__hexagon_fixunsdfsi; +__hexagon_fixunsdfti; +__hexagon_fixunssfdi; +__hexagon_fixunssfsi; +__hexagon_fixunssfti; +__hexagon_floatdidf; +__hexagon_floatdisf; +__hexagon_floatsidf; +__hexagon_floatsisf; +__hexagon_floattidf; +__hexagon_floattisf; +__hexagon_floatundidf; +__hexagon_floatundisf; +__hexagon_floatunsidf; +__hexagon_floatunsisf; +__hexagon_fmadf4; +__hexagon_gedf2; +__hexagon_gesf2; +__hexagon_gtdf2; +__hexagon_gtsf2; +__hexagon_ledf2; +__hexagon_lesf2; +__hexagon_ltdf2; +__hexagon_ltsf2; +__hexagon_maxdf3; +__hexagon_memcpy_likely_aligned_min32bytes_mult8bytes; +__hexagon_mindf3; +__hexagon_moddi3; +__hexagon_modsi3; +__hexagon_muldf3; +__hexagon_mulsf3; +__hexagon_nedf2; +__hexagon_negdf2; +__hexagon_negsf2; +__hexagon_nesf2; +__hexagon_sqrt; +__hexagon_sqrtdf2; +__hexagon_sqrtf; +__hexagon_subdf3; +__hexagon_subsf3; +__hexagon_truncdfsf2; +__hexagon_udivdi3; +__hexagon_udivmoddi4; +__hexagon_udivmodsi4; +__hexagon_udivsi3; +__hexagon_umoddi3; +__hexagon_umodsi3; +__hexagon_unorddf2; +__hexagon_unordsf2; +__ieee754_j0; +__ieee754_j1; +__ieee754_jn; +__ieee754_log; +__ieee754_scalb; +__ieee754_y0; +__ieee754_y1; +__ieee754_yn; +__muldc3; +__mulsc3; +__mulxc3; +__qdsp_adddf3; +__qdsp_addsf3; +__qdsp_cmpdf2; +__qdsp_cmpsf2; +__qdsp_cmpxdf2; +__qdsp_cmpxsf2; +__qdsp_divdf3; +__qdsp_divdi3; +__qdsp_divsf3; +__qdsp_divsi3; +__qdsp_eqdf2; +__qdsp_eqsf2; +__qdsp_extendsfdf2; +__qdsp_fast_gtdf2; +__qdsp_fast_ltdf2; +__qdsp_fast_negdf2; +__qdsp_fixdfdi; +__qdsp_fixdfsi; +__qdsp_fixdfti; +__qdsp_fixsfdi; +__qdsp_fixsfsi; +__qdsp_fixsfti; +__qdsp_fixunsdfdi; +__qdsp_fixunsdfsi; +__qdsp_fixunsdfti; +__qdsp_fixunssfdi; +__qdsp_fixunssfsi; +__qdsp_fixunssfti; +__qdsp_floatdidf; +__qdsp_floatdisf; +__qdsp_floatsidf; +__qdsp_floatsisf; +__qdsp_floattidf; +__qdsp_floattisf; +__qdsp_floatundidf; +__qdsp_floatundisf; +__qdsp_floatunsidf; +__qdsp_floatunsisf; +__qdsp_fmadf5; +__qdsp_gedf2; +__qdsp_gesf2; +__qdsp_gtdf2; +__qdsp_gtsf2; +__qdsp_ledf2; +__qdsp_lesf2; +__qdsp_ltdf2; +__qdsp_ltsf2; +__qdsp_maxdf3; +__qdsp_memcpy_likely_aligned_min32bytes_mult8bytes; +__qdsp_mindf3; +__qdsp_moddi3; +__qdsp_modsi3; +__qdsp_muldf3; +__qdsp_mulsf3; +__qdsp_nedf2; +__qdsp_negdf2; +__qdsp_negsf2; +__qdsp_nesf2; +__qdsp_sqrt; +__qdsp_sqrtdf2; +__qdsp_sqrtf; +__qdsp_subdf3; +__qdsp_subsf3; +__qdsp_truncdfsf2; +__qdsp_udivdi3; +__qdsp_udivmoddi4; +__qdsp_udivmodsi4; +__qdsp_udivsi3; +__qdsp_umoddi3; +__qdsp_umodsi3; +__qdsp_unorddf2; +__qdsp_unordsf2; +__register_frame_info_bases; +__registerx; +__restore_r16_through_r17_and_deallocframe; +__restore_r16_through_r17_and_deallocframe_before_tailcall; +__restore_r16_through_r19_and_deallocframe; +__restore_r16_through_r19_and_deallocframe_before_tailcall; +__restore_r16_through_r21_and_deallocframe; +__restore_r16_through_r21_and_deallocframe_before_tailcall; +__restore_r16_through_r23_and_deallocframe; +__restore_r16_through_r23_and_deallocframe_before_tailcall; +__restore_r16_through_r25_and_deallocframe; +__restore_r16_through_r25_and_deallocframe_before_tailcall; +__restore_r16_through_r27_and_deallocframe; +__restore_r16_through_r27_and_deallocframe_before_tailcall; 
+__restore_r24_through_r25_and_deallocframe; +__restore_r24_through_r25_and_deallocframe_before_tailcall; +__restore_r24_through_r27_and_deallocframe; +__restore_r24_through_r27_and_deallocframe_before_tailcall; +__restore_r27_through_r16_and_deallocframe; +__restore_r27_through_r16_and_deallocframe_before_sibcall; +__restore_r27_through_r18_and_deallocframe; +__restore_r27_through_r18_and_deallocframe_before_sibcall; +__restore_r27_through_r20_and_deallocframe; +__restore_r27_through_r20_and_deallocframe_before_sibcall; +__restore_r27_through_r22_and_deallocframe; +__restore_r27_through_r22_and_deallocframe_before_sibcall; +__restore_r27_through_r24_and_deallocframe; +__restore_r27_through_r24_and_deallocframe_before_sibcall; +__restore_r27_through_r26_and_deallocframe; +__restore_r27_through_r26_and_deallocframe_before_sibcall; +__save_r16_through_r17; +__save_r16_through_r19; +__save_r16_through_r21; +__save_r16_through_r23; +__save_r16_through_r25; +__save_r16_through_r27; +__save_r24_through_r25; +__save_r24_through_r27; +__save_r27_through_r16; +__save_r27_through_r18; +__save_r27_through_r20; +__save_r27_through_r22; +__save_r27_through_r24; +__sqrtf; +__stack_chk_fail; +__stack_chk_guard; +__tls_get_addr; +__wrap_calloc; +__wrap_free; +__wrap_malloc; +__wrap_memalign; +__wrap_realloc; +_Aldata; +_Assert; +_Atan; +_AtcountPrivate; +_AtcountPublic; +_Atdata; +_Atexit; +_Atfuns; +_Atrealloc; +_Btowc; +_C_tolower_; +_C_toupper_; +_Caddcc; +_Caddcr; +_Cbuild; +_Cdivcc; +_Cdivcr; +_Clearlocks; +_Clocale; +_Closreg; +_Cmulcc; +_Cmulcr; +_Cosh; +_CStrftime; +_CStrxfrm; +_Csubcc; +_Csubcr; +_CTinfo; +_CurrentTimeLocale; +_CWcsxfrm; +_Daysto; +_Dbl; +_Dclass; +_DefaultTimeLocale; +_Defloc; +_Denorm; +_Dint; +_Dnorm; +_Dscale; +_Dsign; +_Dtentox; +_Dtest; +_Dunscale; +_Eps; +_Erf_one; +_Erf_small; +_Erfc; +_Exit; +_Exp; +_FAtan; +_FCaddcc; +_FCaddcr; +_FCbuild; +_FCdivcc; +_FCdivcr; +_FCmulcc; +_FCmulcr; +_FCosh; +_FCsubcc; +_FCsubcr; +_FDclass; +_FDenorm; +_FDint; +_FDnorm; +_FDscale; +_FDsign; +_FDtentox; +_FDtest; +_FDunscale; +_Fenv0; +_FEps; +_Feraise; +_FErf_one; +_FErf_small; +_FErfc; +_FExp; +_FFpcomp; +_FGamma_big; +_Fgpos; +_FHypot; +_Files; +_Findloc; +_FInf; +_fini; +_FLog; +_FLogpoly; +_Flt; +_Fltrounds; +_FNan; +_Fofind; +_Fofree; +_Fopen; +_Foprep; +_Force_raise; +_Fpcomp; +_FPoly; +_FPow; +_FQuad; +_FQuadph; +_Freeloc; +_FRint; +_Frprep; +_FRteps; +_FSin; +_FSinh; +_FSnan; +_Fspos; +_FTgamma; +_Fwprep; +_FXbig; +_FXp_addh; +_FXp_addx; +_FXp_getw; +_FXp_invx; +_FXp_ldexpx; +_FXp_movx; +_FXp_mulh; +_FXp_mulx; +_FXp_setw; +_FXp_sqrtx; +_FXp_subx; +_FZero; +_Gamma_big; +_Genld; +_Gentime; +_Get_eh_data; +_Getcloc; +_Getdst; +_Geterrno; +_Getfld; +_Getfloat; +_Getint; +_Getlname; +_Getloc; +_Getmbcurmax; +_Getmem; +_Getnloc; +_Getpcostate; +_Getpctype; +_Getpmbstate; +_Getptimes; +_Getptolower; +_Getptoupper; +_Getpwcostate; +_Getpwcstate; +_Getpwctrtab; +_Getpwctytab; +_Getstr; +_Gettime; +_Getzone; +_Hugeval; +_Hypot; +_Inf; +_init; +_Init_db; +_Initlocks; +_Isdst; +_Iswctype; +_LAtan; +_LCaddcc; +_LCaddcr; +_LCbuild; +_LCdivcc; +_LCdivcr; +_LCmulcc; +_LCmulcr; +_LCosh; +_LCsubcc; +_LCsubcr; +_Ldbl; +_LDclass; +_LDenorm; +_LDint; +_LDscale; +_LDsign; +_LDtentox; +_LDtest; +_Ldtob; +_LDunscale; +_LEps; +_LErf_one; +_LErf_small; +_LErfc; +_LExp; +_LFpcomp; +_LGamma_big; +_LHypot; +_LInf; +_Litob; +_LLog; +_LLogpoly; +_LNan; +_Lockfilelock; +_Locksyslock; +_Locsum; +_Loctab; +_Locterm; +_Locvar; +_Log; +_Logpoly; +_LPoly; +_LPow; +_LQuad; +_LQuadph; +_LRint; +_LRteps; +_LSin; 
+_LSinh; +_LSnan; +_LTgamma; +_LXbig; +_LXp_addh; +_LXp_addx; +_LXp_getw; +_LXp_invx; +_LXp_ldexpx; +_LXp_movx; +_LXp_mulh; +_LXp_mulx; +_LXp_setw; +_LXp_sqrtx; +_LXp_subx; +_LZero; +_Makeloc; +_Makestab; +_Makewct; +_Mbtowc; +_Mbtowcx; +_Nan; +_Nats; +_Nnl; +_Parse_cie; +_Parse_csd; +_Parse_fde; +_Parse_fde_instr; +_Parse_lsda; +_Poly; +_Pow; +_Printf; +_Putfld; +_Putstr; +_Puttxt; +_Quad; +_Quadph; +_Read_enc_ptr; +_Read_sleb; +_Read_uleb; +_Readloc; +_Rint; +_Rteps; +_Scanf; +_Setloc; +_Sin; +_Sinh; +_Size_block; +_Skip; +_Snan; +_start; +_Stderr; +_Stdin; +_Stdout; +_Stod; +_Stodx; +_Stof; +_Stoflt; +_Stofx; +_Stold; +_Stoldx; +_Stoll; +_Stollx; +_Stolx; +_Stopfx; +_Stoul; +_Stoull; +_Stoullx; +_Stoulx; +_Stoxflt; +_Strcollx; +_Strerror; +_Strxfrmx; +_Tgamma; +_tolower; +_tolower_tab_; +_toupper; +_toupper_tab_; +_Towctrans; +_Ttotm; +_Tzoff; +_Unlockfilelock; +_Unlocksyslock; +_Unwind_DeleteException; +_Unwind_ForcedUnwind; +_Unwind_GetDataRelBase; +_Unwind_GetGR; +_Unwind_GetIP; +_Unwind_GetLanguageSpecificData; +_Unwind_GetRegionStart; +_Unwind_GetTextRelBase; +_Unwind_RaiseException; +_Unwind_Resume; +_Unwind_Resume_or_Rethrow; +_Unwind_SetGR; +_Unwind_SetIP; +_Vacopy; +_Valbytes; +_Wcscollx; +_Wcsftime; +_Wcsxfrmx; +_Wctob; +_Wctomb; +_Wctombx; +_WFrprep; +_WFwprep; +_WGenld; +_WGetfld; +_WGetfloat; +_WGetint; +_WGetstr; +_WLdtob; +_WLitob; +_WPrintf; +_WPutfld; +_WPutstr; +_WPuttxt; +_WScanf; +_WStod; +_WStodx; +_WStof; +_WStoflt; +_WStofx; +_WStold; +_WStoldx; +_WStoll; +_WStopfx; +_WStoul; +_WStoull; +_WStoxflt; +_Xbig; +_Xp_addh; +_Xp_addx; +_Xp_getw; +_Xp_invx; +_Xp_ldexpx; +_Xp_movx; +_Xp_mulh; +_Xp_mulx; +_Xp_setw; +_Xp_sqrtx; +_Xp_subx; +_Zero; +a64l; +abort; +abs; +access; +acos; +acosf; +acosh; +acoshf; +acoshl; +acosl; +AHB_User_Base; +ahbb; +alarm; +alarm_handler; +alarmx; +asctime; +asctime_r; +asin; +asinf; +asinh; +asinhf; +asinhl; +asinl; +atan; +atan2; +atan2f; +atan2l; +atanf; +atanh; +atanhf; +atanhl; +atanl; +atexit; +atof; +atoi; +atol; +atoll; +bcmp; +bcopy; +bsearch; +btowc; +bzero; +c16rtomb; +c32rtomb; +cabs; +cabsf; +cabsl; +cacos; +cacosf; +cacosh; +cacoshf; +cacoshl; +cacosl; +carg; +cargf; +cargl; +casin; +casinf; +casinh; +casinhf; +casinhl; +casinl; +catan; +catanf; +catanh; +catanhf; +catanhl; +catanl; +cbrt; +cbrtf; +cbrtl; +ccos; +ccosf; +ccosh; +ccoshf; +ccoshl; +ccosl; +ceil; +ceilf; +ceill; +cexp; +cexpf; +cexpl; +cimag; +cimagf; +cimagl; +clearerr; +clock; +clog; +clog10; +clog10f; +clog10l; +clogf; +clogl; +close; +closedir; +conj; +conjf; +conjl; +copysign; +copysignf; +copysignl; +cos; +cosf; +cosh; +coshf; +coshl; +cosl; +cpow; +cpowf; +cpowl; +cproj; +cprojf; +cprojl; +creal; +crealf; +creall; +create_qdouble; +create_qdouble_li; +csin; +csinf; +csinh; +csinhf; +csinhl; +csinl; +csqrt; +csqrtf; +csqrtl; +ctan; +ctanf; +ctanh; +ctanhf; +ctanhl; +ctanl; +ctime; +ctime_r; +d2qd; +dadd; +dadd_asm; +daylight; +difftime; +div; +dladdr; +dlclose; +dlerror; +dlinfo; +dlopen; +dlopenbuf; +dlsym; +dmpy; +dmpy_asm; +drand48; +drecip; +drecipsqrt; +dsub; +dsub_asm; +ecvt; +environ; +erand48; +erf; +erfc; +erfcf; +erfcl; +erff; +erfl; +err_Fatal_internal0; +execve; +exit; +exp; +exp10f; +exp2; +exp2f; +exp2l; +expf; +expl; +expm1; +expm1f; +expm1l; +fabs; +fabsf; +fabsl; +fast2_d2qd; +fast2_d2qld; +fast2_dadd; +fast2_dadd_asm; +fast2_dmpy; +fast2_dmpy_asm; +fast2_drecip; +fast2_drecipsqrt; +fast2_dsub; +fast2_dsub_asm; +fast2_f2qd; +fast2_f2qd_asm; +fast2_ldadd; +fast2_ldadd_asm; +fast2_ldmpy; +fast2_ldmpy_asm; +fast2_ldrecip; +fast2_ldrecipsqrt; 
+fast2_ldsub; +fast2_ldsub_asm; +fast2_qd2f; +fast2_qd2f_asm; +fast2_qld2d; +fast2_recipsqrtTable_qd; +fast2_recipsqrtTable_qld; +fast2_recipTable_qd; +fast2_recipTable_qld; +fclose; +fcntl; +fcvt; +fdim; +fdimf; +fdiml; +fdopen; +feclearexcept; +fegetenv; +fegetexceptflag; +fegetround; +fegettrapenable; +feholdexcept; +feof; +feraiseexcept; +ferror; +fesetenv; +fesetexceptflag; +fesetround; +fesettrapenable; +fetestexcept; +feupdateenv; +fflush; +ffs; +fgetc; +fgetpos; +fgets; +fgetwc; +fgetws; +fileno; +floor; +floorf; +floorl; +fma; +fmaf; +fmax; +fmaxf; +fmaxl; +fmemcpy_asm; +fmin; +fminf; +fminl; +fmod; +fmodf; +fmodl; +fopen; +fork; +fprintf; +fputc; +fputs; +fputwc; +fputws; +fread; +freopen; +frexp; +frexpf; +frexpl; +fscanf; +fseek; +fseeko; +fsetpos; +fstat; +ftell; +ftello; +ftruncate; +fwide; +fwprintf; +fwrite; +fwscanf; +gcvt; +get_exp_qd; +get_mant_qd; +getc; +getc_unlocked; +getchar; +getchar_unlocked; +getcwd; +getenv; +getopt; +getpid; +gets; +getsubopt; +gettimeofday; +getw; +getwc; +getwchar; +gmtime; +gmtime_r; +h_acosf; +h_asinf; +h_atanf; +h_cosf; +h_exp10f; +h_exp2f; +h_expf; +h_log10f; +h_log2f; +h_logf; +h_sinf; +h_tanf; +hcreate; +hdestroy; +hexagon_buffer_clean; +hexagon_buffer_cleaninv; +hexagon_buffer_inv; +hexagon_cache_cleaninv; +hexagon_cache_inva; +hexagon_memcpy_forward_vp4cp4n2; +hexagon_reg_clear_timer; +hexagon_reg_end_timer; +hexagon_reg_init_timer; +hexagon_reg_prof_off; +hexagon_reg_prof_on; +hexagon_reg_read_pcycles; +hexagon_reg_read_rev; +hexagon_reg_read_syscfg; +hexagon_reg_show_timer; +hexagon_reg_start_timer; +hsearch; +hypot; +hypotf; +hypotl; +ilogb; +ilogbf; +ilogbl; +imaxabs; +imaxdiv; +index; +isalnum; +isalpha; +isascii; +isatty; +isblank; +iscntrl; +isdigit; +isgraph; +isinf; +islower; +isnan; +isprint; +ispunct; +isspace; +isupper; +iswalnum; +iswalpha; +iswblank; +iswcntrl; +iswctype; +iswdigit; +iswgraph; +iswlower; +iswprint; +iswpunct; +iswspace; +iswupper; +iswxdigit; +isxdigit; +j0; +j1; +jn; +jrand48; +l64a; +l64a_r; +labs; +lcong48; +ldexp; +ldexpf; +ldexpl; +ldiv; +lgamma; +lgammaf; +lgammal; +llabs; +lldiv; +llrint; +llrintf; +llrintl; +llround; +llroundf; +llroundl; +localeconv; +localtime; +localtime_r; +log; +log10; +log10f; +log10l; +log1p; +log1pf; +log1pl; +log2; +log2f; +log2l; +logb; +logbf; +logbl; +logf; +logl; +longjmp; +lrand48; +lrint; +lrintf; +lrintl; +lround; +lroundf; +lroundl; +lseek; +mblen; +mbrlen; +mbrtoc16; +mbrtoc32; +mbrtowc; +mbsinit; +mbsnrtowcs; +mbsrtowcs; +mbstowcs; +mbtowc; +memccpy; +memchr; +memcmp; +memcpy; +memcpy_c; +memcpy_v; +memmove; +memmove_c; +memmove_v; +memscpy; +memset; +memset_c; +memset_s; +memset_v; +memsmove; +mkdir; +mkstemp; +mktemp; +mktime; +modf; +modff; +modfl; +mrand48; +nan; +nanf; +nanl; +nearbyint; +nearbyintf; +nearbyintl; +nextafter; +nextafterf; +nextafterl; +nexttoward; +nexttowardf; +nexttowardl; +norm; +normf; +norml; +npa_query_by_name; +nrand48; +open; +opendir; +optarg; +opterr; +optind; +optopt; +perror; +pow; +powf; +powl; +printf; +putc; +putc_unlocked; +putchar; +putchar_unlocked; +putenv; +puts; +putw; +putwc; +putwchar; +q6_buffer_clean; +q6_buffer_cleaninv; +q6_buffer_inv; +q6reg_clear_timer; +q6reg_end_timer; +q6reg_init_timer; +q6reg_prof_off; +q6reg_prof_on; +q6reg_read_pcycles; +q6reg_read_rev; +q6reg_read_syscfg; +q6reg_show_timer; +q6reg_start_timer; +qd2d; +qd_add; +qd_add_dq; +qd_add_qd; +qd_div; +qd_div_dq; +qd_div_qd; +qd_fabs; +qd_gt; +qd_gt_dq; +qd_gt_qd; +qd_lt; +qd_lt_dq; +qd_lt_qd; +qd_mul; +qd_mul_dq; +qd_mul_qd; +qd_neg; +qd_self_div; 
+qd_self_div_dd; +qd_self_increment; +qd_self_increment_qd; +qd_self_mul; +qd_self_mul_qd; +qd_self_sub; +qd_self_sub_dd; +qd_sqrt; +qd_sub_dq; +qd_sub_qd; +qsort; +raise; +rand; +rand_r; +read; +readdir; +recipsqrtTable_qd; +recipTable_qd; +remainder; +remainderf; +remainderl; +remove; +remquo; +remquof; +remquol; +rename; +rewind; +rindex; +rint; +rintf; +rintl; +rmdir; +rmemcpy_asm; +round; +roundf; +roundl; +sbrk; +scalb; +scalbln; +scalblnf; +scalblnl; +scalbn; +scalbnf; +scalbnl; +scanf; +seed48; +set_exp_qd; +set_mant_qd; +setbuf; +setjmp; +setlocale; +setvbuf; +signal; +sin; +sinf; +sinh; +sinhf; +sinhl; +sinl; +sleep; +snprintf; +sprintf; +sqrt; +sqrtf; +sqrtl; +srand; +srand48; +sscanf; +start; +stat; +statvfs; +strcasecmp; +strcat; +strchr; +strcmp; +strcmp_c; +strcoll; +strcpy; +strcspn; +strdup; +strerror; +strerror_r; +strftime; +strlcat; +strlcpy; +strlen; +strncasecmp; +strncat; +strncmp; +strncpy; +strpbrk; +strptime; +strrchr; +strsep; +strspn; +strstr; +strtod; +strtof; +strtoimax; +strtok; +strtok_r; +strtol; +strtold; +strtoll; +strtoul; +strtoull; +strtoumax; +strxfrm; +suboptarg; +swab; +swprintf; +swscanf; +sys_Mtxinit; +sys_Mtxlock; +sys_Mtxunlock; +sys_Tlsalloc; +sys_Tlsget; +sys_Tlsset; +sysconf; +system; +tan; +tanf; +tanh; +tanhf; +tanhl; +tanl; +tdelete; +tempnam; +tfind; +tgamma; +tgammaf; +tgammal; +time; +times; +timezone; +tmpfile; +tmpnam; +toascii; +todouble; +tolower; +toqdouble; +toupper; +towctrans; +towlower; +towupper; +trunc; +truncf; +truncl; +tsearch; +twalk; +tzname; +tzset; +u2g_client_open; +ungetc; +ungetwc; +unlink; +vasprintf; +vfprintf; +vfscanf; +vfwprintf; +vfwscanf; +vprintf; +vscanf; +vsnprintf; +vsprintf; +vsscanf; +vswprintf; +vswscanf; +vwprintf; +vwscanf; +wcrtomb; +wcscat; +wcschr; +wcscmp; +wcscoll; +wcscpy; +wcscspn; +wcsftime; +wcslen; +wcsncat; +wcsncmp; +wcsncpy; +wcsnrtombs; +wcspbrk; +wcsrchr; +wcsrtombs; +wcsspn; +wcsstr; +wcstod; +wcstof; +wcstoimax; +wcstok; +wcstol; +wcstold; +wcstoll; +wcstombs; +wcstoul; +wcstoull; +wcstoumax; +wcsxfrm; +wctob; +wctomb; +wctrans; +wctype; +wmemchr; +wmemcmp; +wmemcpy; +wmemmove; +wmemset; +wprintf; +write; +wscanf; +y0; +y1; +yn; +}; diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/qtest_stdlib.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/qtest_stdlib.h new file mode 100755 index 0000000000000..0da356fd1a46c --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/qtest_stdlib.h @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2012-2013,2021 QUALCOMM Technologies Inc. All Rights Reserved. 
+ * Qualcomm Technologies Confidential and Proprietary
+ *
+ */
+#ifndef QTEST_STDLIB_H
+#define QTEST_STDLIB_H
+
+#include <assert.h>
+#include "rpcmem.h"
+
+#define WHILE(a) \
+__pragma(warning(suppress:4127)) while(a)
+
+#define FREEIF(pv) \
+ do {\
+ if(pv) { \
+ void* tmp = (void*)pv;\
+ pv = 0;\
+ FREE(tmp);\
+ } \
+ } WHILE(0)
+
+#define ALIGNED_FREEIF(pv) \
+ do {\
+ if(pv) { \
+ void* tmp = (void*)pv;\
+ pv = 0;\
+ ALIGNED_FREE(tmp);\
+ } \
+ } WHILE(0)
+
+#ifndef FASTRPC_DMA_FREE
+#define FASTRPC_DMA_FREE(pv) rpcmem_free(pv)
+#endif
+
+#define FASTRPC_DMA_FREEIF(pv) \
+ do {\
+ if(pv) { \
+ void* tmp = (void*)pv;\
+ pv = 0;\
+ FASTRPC_DMA_FREE(tmp);\
+ } \
+ } WHILE(0)
+
+#ifndef QASSERT
+#define QASSERT(st) assert(st)
+#endif
+
+// copied here so as not to bring in a1qtest headers
+#if (((defined __linux__) && !(defined ANDROID)) || (defined __APPLE__))
+#include <execinfo.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+static __inline char* stacktrace(void) {
+ int bufsz = 0, sz = 0;
+ char* buf = 0;
+ void* callstack[256];
+ int i, frames = backtrace(callstack, 256);
+ char** strs = backtrace_symbols(callstack, frames);
+ bufsz += snprintf(0, 0, "\n");
+ for (i = 0; i < frames; ++i) {
+ bufsz += snprintf(0, 0, "%s\n", strs[i]);
+ }
+ buf = malloc(bufsz);
+ assert(buf != 0);
+ sz += snprintf(buf + sz, bufsz, "\n");
+ bufsz -= sz;
+ for (i = 0; i < frames && bufsz > 0; ++i) {
+ sz += snprintf(buf + sz, bufsz, "%s\n", strs[i]);
+ bufsz -= sz;
+ }
+ free(strs);
+ return buf;
+}
+
+#else
+
+static __inline char* stacktrace(void) {
+ return 0;
+}
+
+
+#endif //ANDROID
+
+
+#ifndef QTEST
+// default implementation for stdlib
+#include <stdlib.h>
+
+
+#define IF_QTEST(vv) (void)0
+
+#ifndef QASSERT
+#define QASSERT(st) (void)0
+#endif
+
+#ifndef MALLOC
+#define MALLOC malloc
+#endif
+
+#ifndef FASTRPC_DMA_MALLOC
+#define FASTRPC_DMA_MALLOC(heapid, flags, size) rpcmem_alloc(heapid, flags, size)
+#endif
+
+#ifndef CALLOC
+#define CALLOC calloc
+#endif
+
+#ifndef FREE
+#define FREE free
+#endif
+
+#ifndef REALLOC
+#define REALLOC(pv, nsz, osz) realloc(pv, nsz)
+#endif
+
+#ifndef ALIGNED_REALLOC
+#define ALIGNED_REALLOC(pv, nsz, osz, aln) _aligned_realloc(pv, nsz, aln)
+#endif
+
+#ifndef FASTRPC_DMA_REALLOC
+#define FASTRPC_DMA_REALLOC(pv, nsz, osz, aln) fastrpc_dma_realloc(pv, nsz, osz)
+#endif
+
+#ifndef ALIGNED_FREE
+#define ALIGNED_FREE(pv) _aligned_free(pv)
+#endif
+
+#define qtest_set_failure_mask(mask) (void)mask
+#define qtest_get_failure_mask(mask) (void)mask
+#define qtest_set_pass_count(cnt) (void)cnt
+#define qtest_done() (void)0
+#define qtest_test_failure() 0
+#define qtest_atexit(pfn,ctx) (void)pfn; (void)ctx
+
+#else // QTEST
+
+#include "AEEStdDef.h"
+
+#define IF_QTEST(vv) do {\
+ vv \
+} while (0)
+
+// causes alloc to fail when mask & 0x1 is true
+// each test shifts the mask to the right
+void qtest_set_failure_mask(uint32 mask);
+void qtest_get_failure_mask(uint32* mask);
+
+// causes alloc to fail when count == 0
+// each test decrements the count
+void qtest_set_pass_count(int count);
+
+// returns 0 if it succeeds, and shifts the mask
+// useful for generating controlled failures in functions
+int qtest_test_failure(void);
+
+void qtest_atexit(void (*pfnAtExit)(void* pCtx), void* pvCxt);
+
+void qtest_done(void);
+
+void* qtest_malloc(const char* name, char* stack, int sz);
+
+void* qtest_calloc(const char* name, char* stack, int cnt, int sz);
+
+void* qtest_realloc(const char* name, char* stack, void* ptr, int sz);
+
+void qtest_free(const char* name, char* stack, void* rv);
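+
+/*
+ * Illustrative sketch (not part of this header): using the failure mask to
+ * force the third allocation inside a function under test to fail, e.g. to
+ * exercise an out-of-memory path. function_under_test() is hypothetical;
+ * the mask shifts right on every allocation check.
+ *
+ *   qtest_set_failure_mask(0x1 << 2);  // third alloc fails
+ *   int nErr = function_under_test();
+ *   QASSERT(0 != nErr);                // the OOM path must report an error
+ *   qtest_done();
+ */
+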
+#define MALLOC(sz) qtest_malloc(__FILE_LINE__, stacktrace(), sz)
+#define CALLOC(cnt, sz) qtest_calloc(__FILE_LINE__, stacktrace(), cnt, sz)
+#define REALLOC(ptr, sz) qtest_realloc(__FILE_LINE__, stacktrace(), ptr, sz)
+#define FREE(ptr) qtest_free(__FILE_LINE__, stacktrace(), ptr)
+
+#endif //QTEST
+#endif //QTEST_STDLIB_H
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/remote.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/remote.h
new file mode 100755
index 0000000000000..be2e864346b29
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/remote.h
@@ -0,0 +1,1430 @@
+/*
+ * Copyright (c) 2012-2014,2016,2017,2019-2022,2023 Qualcomm Technologies, Inc.
+ * All Rights Reserved.
+ * Confidential and Proprietary - Qualcomm Technologies, Inc.
+ */
+#ifndef REMOTE_H
+#define REMOTE_H
+
+#include <stdint.h>
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef __QAIC_REMOTE
+#define __QAIC_REMOTE(ff) ff
+#endif ///__QAIC_REMOTE
+
+#ifndef __QAIC_REMOTE_EXPORT
+#ifdef _WIN32
+#ifdef _USRDLL
+#define __QAIC_REMOTE_EXPORT __declspec(dllexport)
+#elif defined(STATIC_LIB)
+#define __QAIC_REMOTE_EXPORT /** Define for static lib */
+#else ///STATIC_LIB
+#define __QAIC_REMOTE_EXPORT __declspec(dllimport)
+#endif ///_USRDLL
+#else
+#define __QAIC_REMOTE_EXPORT
+#endif ///_WIN32
+#endif ///__QAIC_REMOTE_EXPORT
+
+#ifndef __QAIC_RETURN
+#ifdef _WIN32
+#define __QAIC_RETURN _Success_(return == 0)
+#else
+#define __QAIC_RETURN
+#endif ///_WIN32
+#endif ///__QAIC_RETURN
+
+#ifndef __QAIC_IN
+#ifdef _WIN32
+#define __QAIC_IN _In_
+#else
+#define __QAIC_IN
+#endif ///_WIN32
+#endif ///__QAIC_IN
+
+#ifndef __QAIC_IN_CHAR
+#ifdef _WIN32
+#define __QAIC_IN_CHAR _In_z_
+#else
+#define __QAIC_IN_CHAR
+#endif ///_WIN32
+#endif ///__QAIC_IN_CHAR
+
+#ifndef __QAIC_IN_LEN
+#ifdef _WIN32
+#define __QAIC_IN_LEN(len) _Inout_updates_bytes_all_(len)
+#else
+#define __QAIC_IN_LEN(len)
+#endif ///_WIN32
+#endif ///__QAIC_IN_LEN
+
+#ifndef __QAIC_OUT
+#ifdef _WIN32
+#define __QAIC_OUT _Out_
+#else
+#define __QAIC_OUT
+#endif ///_WIN32
+#endif ///__QAIC_OUT
+
+#ifndef __QAIC_INT64PTR
+#ifdef _WIN32
+#define __QAIC_INT64PTR uintptr_t
+#else
+#define __QAIC_INT64PTR uint64_t
+#endif ///_WIN32
+#endif ///__QAIC_INT64PTR
+
+#ifndef __QAIC_REMOTE_ATTRIBUTE
+#define __QAIC_REMOTE_ATTRIBUTE
+#endif ///__QAIC_REMOTE_ATTRIBUTE
+
+/** Retrieves method attribute from the scalars parameter */
+#define REMOTE_SCALARS_METHOD_ATTR(dwScalars) (((dwScalars) >> 29) & 0x7)
+
+/** Retrieves method index from the scalars parameter */
+#define REMOTE_SCALARS_METHOD(dwScalars) (((dwScalars) >> 24) & 0x1f)
+
+/** Retrieves number of input buffers from the scalars parameter */
+#define REMOTE_SCALARS_INBUFS(dwScalars) (((dwScalars) >> 16) & 0x0ff)
+
+/** Retrieves number of output buffers from the scalars parameter */
+#define REMOTE_SCALARS_OUTBUFS(dwScalars) (((dwScalars) >> 8) & 0x0ff)
+
+/** Retrieves number of input handles from the scalars parameter */
+#define REMOTE_SCALARS_INHANDLES(dwScalars) (((dwScalars) >> 4) & 0x0f)
+
+/** Retrieves number of output handles from the scalars parameter */
+#define REMOTE_SCALARS_OUTHANDLES(dwScalars) ((dwScalars) & 0x0f)
+
+/** Makes the scalar using the method attr, index and number of io buffers and handles */
+#define REMOTE_SCALARS_MAKEX(nAttr,nMethod,nIn,nOut,noIn,noOut) \
+ ((((uint32_t) (nAttr) & 0x7) << 29) | \
+ (((uint32_t) (nMethod) & 0x1f) << 24) | \
+ (((uint32_t) (nIn) & 0xff) << 16) | \
+ (((uint32_t) (nOut) & 0xff) << 8) | \
+ (((uint32_t) (noIn) & 0x0f) << 4) | \
+ ((uint32_t) (noOut) & 0x0f))
+
+#define REMOTE_SCALARS_MAKE(nMethod,nIn,nOut)
REMOTE_SCALARS_MAKEX(0,nMethod,nIn,nOut,0,0) + +/** Retrieves number of io buffers and handles */ +#define REMOTE_SCALARS_LENGTH(sc) (REMOTE_SCALARS_INBUFS(sc) +\ + REMOTE_SCALARS_OUTBUFS(sc) +\ + REMOTE_SCALARS_INHANDLES(sc) +\ + REMOTE_SCALARS_OUTHANDLES(sc)) + +/** Defines the domain IDs for supported DSPs */ +#define ADSP_DOMAIN_ID 0 +#define MDSP_DOMAIN_ID 1 +#define SDSP_DOMAIN_ID 2 +#define CDSP_DOMAIN_ID 3 +#define CDSP1_DOMAIN_ID 4 + +/** Defines the domain names for supported DSPs*/ +#define ADSP_DOMAIN_NAME "adsp" +#define MDSP_DOMAIN_NAME "mdsp" +#define SDSP_DOMAIN_NAME "sdsp" +#define CDSP_DOMAIN_NAME "cdsp" +#define CDSP1_DOMAIN_NAME "cdsp1" + +/** Defines to prepare URI for multi-domain calls */ +#define ADSP_DOMAIN "&_dom=adsp" +#define MDSP_DOMAIN "&_dom=mdsp" +#define SDSP_DOMAIN "&_dom=sdsp" +#define CDSP_DOMAIN "&_dom=cdsp" +#define CDSP1_DOMAIN "&_dom=cdsp1" + +/** Internal transport prefix */ +#define ITRANSPORT_PREFIX "'\":;./\\" + +/** Maximum length of URI for remote_handle_open() calls */ +#define MAX_DOMAIN_URI_SIZE 12 + +/** Token to specify the priority of a handle */ +#define FASTRPC_URI_PRIORITY_TOKEN "&_hpriority=" + +/** Macro to generate token string for priority */ +#define FASTRPC_HANDLE_PRIORITY_LEVEL(priority) \ + FASTRPC_URI_PRIORITY_TOKEN #priority + +/** + * The following defines are used to specify the priority level of a handle. + * Priority levels range from 1 to 7. Lower numbers indicate higher priority. + * For example, a priority of 1 indicates the highest priority while a priority + * of 7 indicates the lowest priority. + * + * If no priority level is specified, then handles are opened with highest + * priority. + */ +#define FASTRPC_HANDLE_PRIORITY_MIN 7 +#define FASTRPC_HANDLE_PRIORITY_MAX 1 + +/** Domain type for multi-domain RPC calls */ +typedef struct domain_t { + /** Domain ID */ + int id; + /** URI for remote_handle_open */ + char uri[MAX_DOMAIN_URI_SIZE]; +} domain; + +/** Remote handle parameter for RPC calls */ +typedef uint32_t remote_handle; + +/** Remote handle parameter for multi-domain RPC calls */ +typedef uint64_t remote_handle64; + +/** 32-bit Remote buffer parameter for RPC calls */ +typedef struct { + /** Address of a remote buffer */ + void *pv; + /** Size of a remote buffer */ + size_t nLen; +} remote_buf; + +/** 64-bit Remote buffer parameter for RPC calls */ +typedef struct { + /** Address of a remote buffer */ + uint64_t pv; + /** Size of a remote buffer */ + int64_t nLen; +} remote_buf64; + +/** 32-bit Remote DMA handle parameter for RPC calls */ +typedef struct { + /** File descriptor of a remote buffer */ + int32_t fd; + /** Offset of the file descriptor */ + uint32_t offset; +} remote_dma_handle; + +/** 64-bit Remote DMA handle parameter for RPC calls */ +typedef struct { + /** File descriptor of a remote buffer */ + int32_t fd; + /** Offset of the file descriptor */ + uint32_t offset; + /** Size of buffer */ + uint32_t len; +} remote_dma_handle64; + +/** 32-bit Remote Arg structure for RPC calls */ +typedef union { + /** 32-bit remote buffer */ + remote_buf buf; + /** non-domains remote handle */ + remote_handle h; + /** multi-domains remote handle */ + remote_handle64 h64; + /** 32-bit remote dma handle */ + remote_dma_handle dma; +} remote_arg; + +/** 64-bit Remote Arg structure for RPC calls */ +typedef union { + /** 64-bit remote buffer */ + remote_buf64 buf; + /** non-domains remote handle */ + remote_handle h; + /** multi-domains remote handle */ + remote_handle64 h64; + /** 64-bit remote dma handle */ + 
remote_dma_handle64 dma;
+} remote_arg64;
+
+/** Async response type */
+enum fastrpc_async_notify_type {
+ /** No notification required */
+ FASTRPC_ASYNC_NO_SYNC,
+
+ /** Callback notification using fastrpc_async_callback */
+ FASTRPC_ASYNC_CALLBACK,
+
+ /** User will poll for the notification */
+ FASTRPC_ASYNC_POLL,
+
+/** Update FASTRPC_ASYNC_TYPE_MAX when adding new value to this enum */
+};
+
+/** Job id of an async job queued to the DSP */
+typedef uint64_t fastrpc_async_jobid;
+
+/** Async callback response type, input structure */
+typedef struct fastrpc_async_callback {
+ /** Callback function for async notification */
+ void (*fn)(fastrpc_async_jobid jobid, void* context, int result);
+ /** Current context to identify the callback */
+ void *context;
+} fastrpc_async_callback_t;
+
+/** Async descriptor to submit an async job */
+typedef struct fastrpc_async_descriptor {
+ /** Async response type */
+ enum fastrpc_async_notify_type type;
+ /** Job id of the async job queued to the DSP */
+ fastrpc_async_jobid jobid;
+ /** Async callback response type */
+ fastrpc_async_callback_t cb;
+} fastrpc_async_descriptor_t;
+
+
+/**
+ * Flags used in struct remote_rpc_control_latency
+ * for request ID DSPRPC_CONTROL_LATENCY
+ * in the remote handle control interface
+ **/
+enum remote_rpc_latency_flags {
+
+ /** Flag to disable QOS */
+ RPC_DISABLE_QOS,
+
+ /** Control CPU low-power modes based on RPC activity in a 100 ms window.
+ * Recommended for latency-sensitive use cases.
+ */
+ RPC_PM_QOS,
+
+ /** The DSP driver predicts the completion time of a method and sends a CPU
+ * wake-up signal to reduce wake-up latency.
+ * Recommended for moderately latency-sensitive use cases. It is more power
+ * efficient compared to the PM QoS control.
+ */
+ RPC_ADAPTIVE_QOS,
+
+ /**
+ * After sending an invocation to the DSP, the CPU will enter polling mode
+ * instead of waiting for a glink response. This will boost fastrpc
+ * performance by reducing the CPU wakeup and scheduling times. Enabled only
+ * for sync RPC calls. Using this option also enables PM QoS with a latency
+ * of 100 us.
+ */
+ RPC_POLL_QOS,
+};
+
+/**
+ * Structure used for request ID `DSPRPC_CONTROL_LATENCY`
+ * in the remote handle control interface
+ **/
+struct remote_rpc_control_latency {
+/** Enable latency optimization techniques to meet the requested latency. Use remote_rpc_latency_flags */
+ uint32_t enable;
+
+/**
+ * Latency in microseconds.
+ *
+ * When used with RPC_PM_QOS or RPC_ADAPTIVE_QOS, the user should pass the
+ * maximum RPC latency that can be tolerated. It is not guaranteed that fastrpc
+ * will meet this requirement. A latency of 0 us is ignored. The recommended
+ * value is 100.
+ *
+ * When used with RPC_POLL_QOS, the user needs to pass the expected execution
+ * time of the method on the DSP. The CPU will poll for a DSP response for that
+ * specified duration, after which it times out and falls back to waiting for
+ * a glink response.
+ * The max value that can be passed is 10000 (10 ms).
+ */
+ uint32_t latency;
+};
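+
+/*
+ * Illustrative sketch (not part of this header): enabling PM QoS with a
+ * 100 us latency target on an open multi-domain handle. This assumes the
+ * remote_handle64_control() declaration that appears later in this file and
+ * the DSPRPC_CONTROL_LATENCY request ID defined below; `handle` is a
+ * hypothetical handle from remote_handle64_open().
+ *
+ *   struct remote_rpc_control_latency lat = { RPC_PM_QOS, 100 };
+ *   remote_handle64_control(handle, DSPRPC_CONTROL_LATENCY,
+ *                           &lat, sizeof(lat));
+ */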
+ * Max value that can be passed is 10000 (10 ms) + */ + uint32_t latency; +}; + +/** + * @struct fastrpc_capability + * @brief Argument to query DSP capability with request ID DSPRPC_GET_DSP_INFO + */ +typedef struct remote_dsp_capability { + /** @param[in] : DSP domain ADSP_DOMAIN_ID, SDSP_DOMAIN_ID, or CDSP_DOMAIN_ID */ + uint32_t domain; + /** @param[in] : One of the DSP/kernel attributes from enum remote_dsp_attributes */ + uint32_t attribute_ID; + /** @param[out] : Result of the DSP/kernel capability query based on attribute_ID */ + uint32_t capability; +}fastrpc_capability; + + +/** + * @enum remote_dsp_attributes + * @brief Different types of DSP capabilities queried via remote_handle_control + * using DSPRPC_GET_DSP_INFO request id. + * DSPRPC_GET_DSP_INFO should only be used with remote_handle_control() as a handle + * is not needed to query DSP capabilities. + * To query DSP capabilities fill out 'domain' and 'attribute_ID' from structure + * remote_dsp_capability. DSP capability will be returned on variable 'capability'. + */ +enum remote_dsp_attributes { + /** Check if DSP supported: supported = 1, + unsupported = 0 */ + DOMAIN_SUPPORT, + + /** DSP unsigned PD support: supported = 1, + unsupported = 0 */ + UNSIGNED_PD_SUPPORT, + + /** Number of HVX 64B support */ + HVX_SUPPORT_64B, + + /** Number of HVX 128B support */ + HVX_SUPPORT_128B, + + /** Max page size allocation possible in VTCM */ + VTCM_PAGE, + + /** Number of page_size blocks available */ + VTCM_COUNT, + + /** Hexagon processor architecture version */ + ARCH_VER, + + /** HMX Support Depth */ + HMX_SUPPORT_DEPTH, + + /** HMX Support Spatial */ + HMX_SUPPORT_SPATIAL, + + /** Async FastRPC Support */ + ASYNC_FASTRPC_SUPPORT, + + /** DSP User PD status notification Support */ + STATUS_NOTIFICATION_SUPPORT , + + /** Multicast widget programming */ + MCID_MULTICAST, + + /** Mapping in extended address space on DSP */ + EXTENDED_MAP_SUPPORT, + + /** DSP support for handle priority */ + HANDLE_PRIORITY_SUPPORT , + + /** Update FASTRPC_MAX_DSP_ATTRIBUTES when adding new value to this enum */ +}; + +/** Macro for backward compatibility. Clients can compile wakelock request code + * in their app only when this is defined + */ +#define FASTRPC_WAKELOCK_CONTROL_SUPPORTED 1 + +/** + * Structure used for request ID `DSPRPC_CONTROL_WAKELOCK` + * in remote handle control interface + **/ +struct remote_rpc_control_wakelock { + /** enable control of wake lock */ + uint32_t enable; +}; + +/** + * Structure used for request ID `DSPRPC_GET_DOMAIN` + * in remote handle control interface. + * Get domain ID associated with an opened handle to remote interface of type remote_handle64. + * remote_handle64_control() returns domain for a given handle + * remote_handle_control() API returns default domain ID + */ +typedef struct remote_rpc_get_domain { + /** @param[out] : domain ID associcated with handle */ + int domain; +} remote_rpc_get_domain_t; + +/** + * Structure used for request IDs `DSPRPC_SET_PATH` and + * `DSPRPC_GET_PATH` in remote handle control interface. 
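+ *
+ * A minimal usage sketch (editor's illustration, not part of the original
+ * SDK header; the key "mydir" and the path value are hypothetical):
+ *
+ *   char value[] = "/data/local/tmp";
+ *   struct remote_control_custom_path p = { sizeof(value), "mydir", value };
+ *   int err = remote_handle_control(DSPRPC_SET_PATH, (void *)&p, sizeof(p));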
+ */
+struct remote_control_custom_path {
+    /** value size including the NULL char */
+    int32_t value_size;
+    /** key used for storing the path */
+    const char* path;
+    /** value which will be used for file operations when the corresponding key is specified in the file URI */
+    char* value;
+};
+
+/**
+ * Request IDs for remote handle control interface
+ **/
+enum handle_control_req_id {
+    /** Reserved */
+    DSPRPC_RESERVED,
+
+    /** Request ID to enable/disable QOS */
+    DSPRPC_CONTROL_LATENCY,
+
+    /** Request ID to get DSP capabilities from kernel and Hexagon */
+    DSPRPC_GET_DSP_INFO,
+
+    /** Request ID to enable wakelock for the given domain */
+    DSPRPC_CONTROL_WAKELOCK,
+
+    /** Request ID to get the default domain or the domain associated with an existing handle */
+    DSPRPC_GET_DOMAIN,
+
+    /** Request ID to add a custom path to the hash table */
+    DSPRPC_SET_PATH,
+
+    /** Request ID to read a custom path from the hash table */
+    DSPRPC_GET_PATH,
+
+};
+
+/**
+ * Structure used for request ID `FASTRPC_THREAD_PARAMS`
+ * in remote session control interface
+ **/
+struct remote_rpc_thread_params {
+    /** Remote subsystem domain ID, pass -1 to set params for all domains */
+    int domain;
+    /** User thread priority (1 to 255), pass -1 to use default */
+    int prio;
+    /** User thread stack size, pass -1 to use default */
+    int stack_size;
+};
+
+/**
+ * Structure used for request ID `DSPRPC_CONTROL_UNSIGNED_MODULE`
+ * in remote session control interface
+ **/
+struct remote_rpc_control_unsigned_module {
+    /** Remote subsystem domain ID, -1 to set params for all domains */
+    int domain;
+    /** Enable unsigned module loading */
+    int enable;
+};
+
+/**
+ * Structure used for request ID `FASTRPC_RELATIVE_THREAD_PRIORITY`
+ * in remote session control interface
+ **/
+struct remote_rpc_relative_thread_priority {
+    /** Remote subsystem domain ID, pass -1 to update priority for all domains */
+    int domain;
+    /** The value by which the default thread priority needs to increase/decrease.
+     * DSP thread priorities run from 1 to 255, with 1 being the highest thread priority.
+     * So a negative relative thread priority value will 'increase' the thread priority,
+     * a positive value will 'decrease' the thread priority.
+     */
+    int relative_thread_priority;
+};
+
+/**
+ * When a remote invocation does not return,
+ * call "remote_session_control" with the FASTRPC_REMOTE_PROCESS_KILL request ID
+ * and the appropriate remote domain ID. Once the remote process is successfully
+ * killed, and before attempting to create a new session, the user is expected to
+ * close all open handles for shared objects in the case of domains.
+ * The user is also expected to unload all shared objects, including
+ * libcdsprpc.so/libadsprpc.so/libmdsprpc.so/libsdsprpc.so, in the case of non-domains.
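+ *
+ * A minimal recovery sketch (editor's illustration, not part of the
+ * original SDK header; error handling elided):
+ *
+ *   struct remote_rpc_process_clean_params kp = { CDSP_DOMAIN_ID };
+ *   int err = remote_session_control(FASTRPC_REMOTE_PROCESS_KILL,
+ *                                    (void *)&kp, sizeof(kp));
+ *   // on success, close all open handles before creating a new session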
+ */ +struct remote_rpc_process_clean_params { + /** Domain ID to recover process */ + int domain; +}; + +/** + * Structure used for request ID `FASTRPC_SESSION_CLOSE` + * in remote session control interface + **/ +struct remote_rpc_session_close { + /** Remote subsystem domain ID, -1 to close all handles for all domains */ + int domain; +}; + +/** + * Structure used for request ID `FASTRPC_CONTROL_PD_DUMP` + * in remote session control interface + * This is used to enable/disable PD dump for userPDs on the DSP + **/ +struct remote_rpc_control_pd_dump { + /** Remote subsystem domain ID, -1 to set params for all domains */ + int domain; + /** Enable PD dump of user PD on the DSP */ + int enable; +}; + +/** + * Structure used for request ID `FASTRPC_REMOTE_PROCESS_EXCEPTION` + * in remote session control interface + * This is used to trigger exception in the userPDs running on the DSP + **/ +typedef struct remote_rpc_process_clean_params remote_rpc_process_exception; + +/** + * Process types + * Return values for FASTRPC_REMOTE_PROCESS_TYPE control req ID for remote_handle_control + * Return values denote the type of process on remote subsystem +**/ +enum fastrpc_process_type { + /** Signed PD running on the DSP */ + PROCESS_TYPE_SIGNED, + + /** Unsigned PD running on the DSP */ + PROCESS_TYPE_UNSIGNED, + +}; + +/** + * Structure for remote_session_control, + * used with FASTRPC_REMOTE_PROCESS_TYPE request ID + * to query the type of PD running defined by enum fastrpc_process_type + * @param[in] : Domain of process + * @param[out] : Process_type belonging to enum fastrpc_process_type + */ +struct remote_process_type { + /** @param[in] : Domain of process */ + int domain; + /** @param[out] : Process_type belonging to enum fastrpc_process_type */ + int process_type; +}; + +/** + * DSP user PD status notification flags + * Status flags for the user PD on the DSP returned by the status notification function + * +**/ +typedef enum remote_rpc_status_flags { + /** DSP user process is up */ + FASTRPC_USER_PD_UP, + + /** DSP user process exited */ + FASTRPC_USER_PD_EXIT, + + /** DSP user process forcefully killed. Happens when DSP resources needs to be freed. */ + FASTRPC_USER_PD_FORCE_KILL, + + /** Exception in the user process of DSP. */ + FASTRPC_USER_PD_EXCEPTION, + + /** Subsystem restart of the DSP, where user process is running. 
*/
+    FASTRPC_DSP_SSR,
+
+} remote_rpc_status_flags_t;
+
+/**
+ * fastrpc_notif_fn_t
+ * Notification callback function
+ *
+ * @param context, context used in the registration
+ * @param domain, domain of the user process
+ * @param session, session id of user process
+ * @param status, status of user process
+ * @retval, 0 on success
+ */
+typedef int (*fastrpc_notif_fn_t)(void *context, int domain, int session, remote_rpc_status_flags_t status);
+
+
+/**
+ * Structure for remote_session_control,
+ * used with FASTRPC_REGISTER_STATUS_NOTIFICATIONS request ID
+ * to receive status notifications of the user PD on the DSP
+**/
+typedef struct remote_rpc_notif_register {
+    /** @param[in] : Context of the client */
+    void *context;
+    /** @param[in] : DSP domain ADSP_DOMAIN_ID, SDSP_DOMAIN_ID, or CDSP_DOMAIN_ID */
+    int domain;
+    /** @param[in] : Notification function pointer */
+    fastrpc_notif_fn_t notifier_fn;
+} remote_rpc_notif_register_t;
+
+/**
+ * Structure for remote_session_control,
+ * used with FASTRPC_PD_INITMEM_SIZE request ID
+ * to set the signed userpd initial memory size
+ **/
+struct remote_rpc_pd_initmem_size {
+    /** Remote subsystem domain ID, pass -1 to set params for all domains **/
+    int domain;
+    /** Initial memory allocated for remote userpd, minimum value: 3MB, maximum value: 200MB **/
+    /** Unsupported for unsigned user PD; for unsigned user PD the init mem size is fixed at 5MB **/
+    uint32_t pd_initmem_size;
+};
+
+/**
+ * Structure for remote_session_control,
+ * used with FASTRPC_RESERVE_SESSION request ID
+ * to reserve a new fastrpc session of the user PD on the DSP.
+ * Default session is always 0 and remains available for any module opened without a Session ID.
+ * New session reservation starts with session ID 1.
+**/
+typedef struct remote_rpc_reserve_new_session {
+    /** @param[in] : Domain name of the DSP on which the session needs to be reserved */
+    char *domain_name;
+    /** @param[in] : Domain name length, without NULL character */
+    uint32_t domain_name_len;
+    /** @param[in] : Session name of the reserved session */
+    char *session_name;
+    /** @param[in] : Session name length, without NULL character */
+    uint32_t session_name_len;
+    /** @param[out] : Effective Domain ID is the identifier of the session.
+     * Effective Domain ID is the unique identifier representing the session(PD) on the DSP.
+     * Effective Domain ID needs to be used in place of Domain ID when an application has multiple sessions.
+     */
+    uint32_t effective_domain_id;
+    /** @param[out] : Session ID of the reserved session.
+     * An application can have multiple sessions(PDs) created on the DSP.
+     * session_id 0 is the default session. Clients can reserve sessions starting from 1.
+     * Currently only 2 sessions are supported: session_id 0 and session_id 1.
+     */
+    uint32_t session_id;
+} remote_rpc_reserve_new_session_t;
+
+/**
+ * Structure for remote_session_control,
+ * used with FASTRPC_GET_EFFECTIVE_DOMAIN_ID request ID
+ * to get the effective domain id of a fastrpc session on the user PD of the DSP
+**/
+typedef struct remote_rpc_effective_domain_id {
+    /** @param[in] : Domain name of DSP */
+    char *domain_name;
+    /** @param[in] : Domain name length, without NULL character */
+    uint32_t domain_name_len;
+    /** @param[in] : Session ID of the reserved session.
0 can be used for Default session */ + uint32_t session_id; + /** @param[out] : Effective Domain ID of session */ + uint32_t effective_domain_id; +} remote_rpc_effective_domain_id_t; + +/** + * Structure for remote_session_control, + * used with FASTRPC_GET_URI request ID + * to get the URI needed to load the module in the fastrpc user PD on the DSP +**/ +typedef struct remote_rpc_get_uri { + /** @param[in] : Domain name of DSP */ + char *domain_name; + /** @param[in] : Domain name length, without NULL character */ + uint32_t domain_name_len; + /** @param[in] : Session ID of the reserved session. 0 can be used for Default session */ + uint32_t session_id; + /** @param[in] : URI of the module, found in the auto-generated header file*/ + char *module_uri ; + /** @param[in] : Module URI length, without NULL character */ + uint32_t module_uri_len; + /** @param[out] : URI containing module, domain and session. + * Memory for uri need to be pre-allocated with session_uri_len size. + * Typically session_uri_len is 30 characters more than Module URI length. + * If size of uri is beyond session_uri_len, remote_session_control fails with AEE_EBADSIZE + */ + char *uri ; + /** @param[in] : URI length */ + uint32_t uri_len; +} remote_rpc_get_uri_t; + +/** + * Structure for remote_session_control, used with FASTRPC_CONTEXT_CREATE request, + * to create a multidomain fastrpc context +**/ +typedef struct fastrpc_context_create { + /** @param[in] : List of effective domain IDs on which session needs to be + created. Needs to be allocated and populated by user. + A new effective domain id CANNOT be added to an existing context. */ + uint32_t *effec_domain_ids; + + /** @param[in] : Number of domain ids. + Size of effective domain ID array. */ + uint32_t num_domain_ids; + + /** @param[in] : Type of create request (unused) */ + uint64_t flags; + + /** @param[out] : Multi-domain context handle */ + uint64_t ctx; +} fastrpc_context_create; + +/** struct to be used with FASTRPC_CONTEXT_DESTROY request ID */ +typedef struct fastrpc_context_destroy { + /** @param[in] : Fastrpc multi-domain context */ + uint64_t ctx; + + /** @param[in] : Type of destroy request (unused) */ + uint64_t flags; +} fastrpc_context_destroy; + +/** + * Structure used for request ID `FASTRPC_MAX_THREAD_PARAM` + * in remote session control interface, to set max threads for + * unsigned PD. + **/ +struct remote_rpc_set_max_thread { +/** @param[in] : CDSP_DOMAIN_ID */ + int domain; +/** @param[in] : Max thread config for unsigned PD Minimum value : 128, maximum value 256. 
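+ *
+ * A minimal usage sketch (editor's illustration, not part of the original
+ * SDK header), raising the limit before the first remote_handle_open():
+ *
+ *   struct remote_rpc_set_max_thread cfg = { CDSP_DOMAIN_ID, 192 };
+ *   int err = remote_session_control(FASTRPC_MAX_THREAD_PARAM,
+ *                                    (void *)&cfg, sizeof(cfg));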
*/ + unsigned int max_num_threads; +}; + +/** + * Request IDs for remote session control interface + **/ +enum session_control_req_id { + /** Reserved */ + FASTRPC_RESERVED_1, + + /** Set thread parameters like priority and stack size */ + FASTRPC_THREAD_PARAMS, + + /** Handle the unsigned module offload request, to be called before remote_handle_open() */ + DSPRPC_CONTROL_UNSIGNED_MODULE, + + /** Reserved */ + FASTRPC_RESERVED_2, + + /** To increase/decrease default thread priority */ + FASTRPC_RELATIVE_THREAD_PRIORITY, + + /** Reserved */ + FASTRPC_RESERVED_3, + + /** Kill remote process */ + FASTRPC_REMOTE_PROCESS_KILL, + + /** Close all open handles of requested domain */ + FASTRPC_SESSION_CLOSE, + + /** Enable PD dump feature */ + FASTRPC_CONTROL_PD_DUMP, + + /** Trigger Exception in the remote process */ + FASTRPC_REMOTE_PROCESS_EXCEPTION, + + /** Query type of process defined by enum fastrpc_process_type */ + FASTRPC_REMOTE_PROCESS_TYPE, + + /** Enable DSP User process status notifications */ + FASTRPC_REGISTER_STATUS_NOTIFICATIONS, + + /** Set signed userpd initial memory size */ + FASTRPC_PD_INITMEM_SIZE, + + /** Reserve new FastRPC session */ + FASTRPC_RESERVE_NEW_SESSION, + + /** Get effective domain ID of a FastRPC session */ + FASTRPC_GET_EFFECTIVE_DOMAIN_ID, + + /** Creates the URI needed to load a module in the DSP User PD */ + FASTRPC_GET_URI, + + /** Set max thread value for unsigned PD */ + FASTRPC_MAX_THREAD_PARAM, + + /** Create or attaches to remote session(s) on one or more domains */ + FASTRPC_CONTEXT_CREATE, + + /** Destroy or detach from remote sessions */ + FASTRPC_CONTEXT_DESTROY, +}; + + +/** + * Memory map control flags for using with remote_mem_map() and remote_mem_unmap() + **/ +enum remote_mem_map_flags { +/** + * Create static memory map on remote process with default cache configuration (writeback). + * Same remoteVirtAddr will be assigned on remote process when fastrpc call made with local virtual address. + * @Map lifetime + * Life time of this mapping is until user unmap using remote_mem_unmap or session close. + * No reference counts are used. Behavior of mapping multiple times without unmap is undefined. + * @Cache maintenance + * Driver clean caches when virtual address passed through RPC calls defined in IDL as a pointer. + * User is responsible for cleaning cache when remoteVirtAddr shared to DSP and accessed out of fastrpc method invocations on DSP. + * @recommended usage + * Map buffers which are reused for long time or until session close. This helps to reduce fastrpc latency. + * Memory shared with remote process and accessed only by DSP. + */ + REMOTE_MAP_MEM_STATIC, + +/** Update REMOTE_MAP_MAX_FLAG when adding new value to this enum **/ + }; + +/** + * @enum fastrpc_map_flags for fastrpc_mmap and fastrpc_munmap + * @brief Types of maps with cache maintenance + */ +enum fastrpc_map_flags { + /** + * Map memory pages with RW- permission and CACHE WRITEBACK. + * Driver will clean cache when buffer passed in a FastRPC call. + * Same remote virtual address will be assigned for subsequent + * FastRPC calls. + */ + FASTRPC_MAP_STATIC, + + /** Reserved for compatibility with deprecated flag */ + FASTRPC_MAP_RESERVED, + + /** + * Map memory pages with RW- permission and CACHE WRITEBACK. + * Mapping tagged with a file descriptor. User is responsible for + * maintenance of CPU and DSP caches for the buffer. Get virtual address + * of buffer on DSP using HAP_mmap_get() and HAP_mmap_put() functions. 
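+ *
+ * A minimal usage sketch (editor's illustration, not part of the original
+ * SDK header; buf_fd, buf_va and buf_len are hypothetical values obtained
+ * from the buffer allocator):
+ *
+ *   int err = fastrpc_mmap(CDSP_DOMAIN_ID, buf_fd, buf_va, 0, buf_len,
+ *                          FASTRPC_MAP_FD);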
+ */ + FASTRPC_MAP_FD, + + /** + * Mapping delayed until user calls HAP_mmap() and HAP_munmap() + * functions on DSP. User is responsible for maintenance of CPU and DSP + * caches for the buffer. Delayed mapping is useful for users to map + * buffer on DSP with other than default permissions and cache modes + * using HAP_mmap() and HAP_munmap() functions. + */ + FASTRPC_MAP_FD_DELAYED, + + /** Reserved for compatibility **/ + FASTRPC_MAP_RESERVED_4, + FASTRPC_MAP_RESERVED_5, + FASTRPC_MAP_RESERVED_6, + FASTRPC_MAP_RESERVED_7, + FASTRPC_MAP_RESERVED_8, + FASTRPC_MAP_RESERVED_9, + FASTRPC_MAP_RESERVED_10, + FASTRPC_MAP_RESERVED_11, + FASTRPC_MAP_RESERVED_12, + FASTRPC_MAP_RESERVED_13, + FASTRPC_MAP_RESERVED_14, + FASTRPC_MAP_RESERVED_15, + + /** + * This flag is used to skip CPU mapping, + * otherwise behaves similar to FASTRPC_MAP_FD_DELAYED flag. + */ + FASTRPC_MAP_FD_NOMAP, + + /** + * The below two flags work the same as FASTRPC_MAP_FD and FASTRPC_MAP_FD_DELAYED + * but allow the user to map into the extended address space on DSP + */ + + FASTRPC_MAP_FD_EXTENDED, + FASTRPC_MAP_FD_DELAYED_EXTENDED, + + /** Update FASTRPC_MAP_MAX when adding new value to this enum **/ +}; + +#define MAX_DOMAIN_NAMELEN 30 + +/* Position of domain type in flags */ +#define DOMAINS_LIST_FLAGS_TYPE_POS 5 + +/* Helper macro to set domain type in flags */ +#define DOMAINS_LIST_FLAGS_SET_TYPE(flags, type) (flags | (type & ((1 << DOMAINS_LIST_FLAGS_TYPE_POS) - 1))) + +/** + * @enum fastrpc_domain_type + * @brief Indicates the type of domains (DSPs) present in the system + */ +typedef enum { + /** Flag to be used to query list of all available domains */ + ALL_DOMAINS, + NSP, + LPASS, + SDSP, + MODEM, + HPASS, +} fastrpc_domain_type; + +/** + * @struct fastrpc_domain + * @brief Describes the details of each domain + */ +typedef struct { + /** + * @param : Logical domain id of the dsp. + * This can be used to query the capabilities of the dsp and + * can change with every reboot of device depending on the order + * of domain enumeration. + * This is NOT the same as effective domain id. To get the effective + * domain id of a particular session on this domain, pass the corresponding + * domain name with the `FASTRPC_GET_EFFECTIVE_DOMAIN_ID` request. + */ + int id; + + /** + * @param : Name of domain. + * To be appended with module uri while opening remote handle, + * or for querying the effective domain id on a specific session + * on this domain. + */ + char name[MAX_DOMAIN_NAMELEN]; + + /** + * @param : Type of DSP, of 'fastrpc_domain_type'. + */ + fastrpc_domain_type type; + + /** + * @param : Status of domain: 0 if domain is down + * non-zero if domain is up + */ + int status; + + /** + * @param : Card on which domain is present (for future use). + */ + uint32_t card; + + /** + * @param : SoC on which domain is present (for future use). + */ + uint32_t soc_id; +} fastrpc_domain; + +/** + * @struct fastrpc_domain_info + * @brief Structure used with 'FASTRPC_GET_DOMAINS' request id + * to query the list of available domains in the system. + */ +typedef struct { + /** + * @param[in/out] : Domains-info array pointer. + * Array needs to be allocated by client with size of array specified + * in 'max_domains'. Array will be populated by fastrpc with list of + * available domains. + * To query number of domains first, pass NULL pointer. + */ + fastrpc_domain *domains; + + /** + * @param[in] : Size of the 'domains' array allocated by user. 
+     * This has to be greater than or equal to the actual number of available
+     * domains. To query the number of domains first, pass 0 in this field.
+     */
+    int max_domains;
+
+    /**
+     * @param[out] : This field will be populated with the total number
+     * of available domains. While reading the domains-info in the array,
+     * read only until 'num_domains' elements.
+     */
+    int num_domains;
+
+    /**
+     * @param[in] : Bit-mask for the type of request, to be populated by the client.
+     * Bits 0-4 : Type of domains to be queried, of 'fastrpc_domain_type'.
+     * Only domains of this type will be returned in the 'domains' array.
+     * To get the list of all available domains, use the 'ALL_DOMAINS' type.
+     * Other bits are reserved for future use.
+     */
+    uint64_t flags;
+} fastrpc_domains_info;
+
+/**
+ * @enum system_req_id
+ * @brief Request ID to obtain information about available domains
+ */
+typedef enum {
+    /** Query list of available domains */
+    FASTRPC_GET_DOMAINS = 0
+} system_req_id;
+
+/**
+ * @struct system_req_payload
+ * @brief Payload for remote_system_request API
+ */
+typedef struct {
+    system_req_id id;
+    union {
+        fastrpc_domains_info sys;
+    };
+} system_req_payload;
+
+/**
+ * remote_system_request
+ * API to get system info like the list of available domains
+ * @param req, payload containing system info and request ID
+ * @return, 0 on Success
+ */
+int remote_system_request(system_req_payload *req);
+
+/**
+ * Attributes for remote_register_buf_attr/remote_register_buf_attr2
+ **/
+#define FASTRPC_ATTR_NONE 0 /** No attribute to set.*/
+#define FASTRPC_ATTR_NON_COHERENT 2 /** Attribute to map a buffer as dma non-coherent;
+                                        the driver performs cache maintenance.*/
+#define FASTRPC_ATTR_COHERENT 4 /** Attribute to map a buffer as dma coherent;
+                                        the driver skips cache maintenance.
+                                        It will be ignored if a device is marked as dma-coherent in the device tree.*/
+#define FASTRPC_ATTR_KEEP_MAP 8 /** Attribute to keep the buffer persistent
+                                        until unmap is called explicitly.*/
+#define FASTRPC_ATTR_NOMAP 16 /** Attribute for secure buffers to skip
+                                        smmu mapping in the fastrpc driver*/
+#define FASTRPC_ATTR_FORCE_NOFLUSH 32 /** Attribute to map a buffer such that flush by the driver is skipped for that particular buffer;
+                                        the client has to perform cache maintenance*/
+#define FASTRPC_ATTR_FORCE_NOINVALIDATE 64 /** Attribute to map a buffer such that invalidate by the driver is skipped for that particular buffer;
+                                        the client has to perform cache maintenance */
+#define FASTRPC_ATTR_TRY_MAP_STATIC 128 /** Attribute for persistently mapping a buffer
+                                        to the remote DSP process during buffer registration
+                                        with the FastRPC driver. This buffer will be automatically
+                                        mapped during fastrpc session open and unmapped either
+                                        at unregister or session close. The FastRPC library tries
+                                        to map buffers and ignores errors in case of failure.
+                                        Pre-mapping a buffer reduces the FastRPC latency.
+                                        This flag is recommended only for buffers used with
+                                        latency critical rpc calls */
+
+
+/**
+ * REMOTE_MODE_PARALLEL used with remote_set_mode
+ * This is the default mode for the driver. While the driver is in parallel
+ * mode it will try to invalidate output buffers after it transfers control
+ * to the dsp. This allows the invalidate operations to overlap with the
+ * dsp processing the call. This mode should be used when output buffers
+ * are only read on the application processor and only written on the aDSP.
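+ *
+ * A minimal usage sketch (editor's illustration, not part of the original
+ * SDK header); REMOTE_MODE_SERIAL is the safer choice when the CPU also
+ * writes to output buffers:
+ *
+ *   int err = remote_set_mode(REMOTE_MODE_SERIAL);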
+ */
+#define REMOTE_MODE_PARALLEL 0
+
+/**
+ * REMOTE_MODE_SERIAL used with remote_set_mode
+ * When operating in SERIAL mode the driver will invalidate output buffers
+ * before calling into the dsp. This mode should be used when output
+ * buffers have been written to somewhere besides the aDSP.
+ */
+#define REMOTE_MODE_SERIAL 1
+
+
+#ifdef _WIN32
+#include "remote_wos_ext.h" /** For function pointers of remote APIs */
+#endif
+
+
+/**
+ * remote_handle()_open
+ * Opens a remote_handle "name"
+ * returns 0 on success
+ **/
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_handle_open)(__QAIC_IN_CHAR const char* name, __QAIC_OUT remote_handle *ph) __QAIC_REMOTE_ATTRIBUTE;
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_handle64_open)( __QAIC_IN_CHAR const char* name, __QAIC_OUT remote_handle64 *ph) __QAIC_REMOTE_ATTRIBUTE;
+
+
+/**
+ * invokes the remote handle
+ * see the retrieve macros above for the dwScalars format
+ * pra, contains the arguments in the following order: inbufs, outbufs, inhandles, outhandles.
+ * implementors should ignore and pass along, as is, any values that the transport doesn't understand.
+ */
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_handle_invoke)(__QAIC_IN remote_handle h, __QAIC_IN uint32_t dwScalars, __QAIC_IN remote_arg *pra) __QAIC_REMOTE_ATTRIBUTE;
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_handle64_invoke)(__QAIC_IN remote_handle64 h, __QAIC_IN uint32_t dwScalars, __QAIC_IN remote_arg *pra) __QAIC_REMOTE_ATTRIBUTE;
+
+
+/**
+ * remote_handle()_close
+ * closes the remote handle
+ */
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_handle_close)(__QAIC_IN remote_handle h) __QAIC_REMOTE_ATTRIBUTE;
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_handle64_close)(__QAIC_IN remote_handle64 h) __QAIC_REMOTE_ATTRIBUTE;
+
+
+/**
+ * remote_handle_control
+ * Set remote handle control parameters
+ *
+ * @param req, request ID defined by handle_control_req_id
+ * @param data, address of structure with parameters
+ * @param datalen, length of data
+ * @retval, 0 on success
+ */
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_handle_control)(__QAIC_IN uint32_t req, __QAIC_IN_LEN(datalen) void* data, __QAIC_IN uint32_t datalen) __QAIC_REMOTE_ATTRIBUTE;
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_handle64_control)(__QAIC_IN remote_handle64 h, __QAIC_IN uint32_t req, __QAIC_IN_LEN(datalen) void* data, __QAIC_IN uint32_t datalen) __QAIC_REMOTE_ATTRIBUTE;
+
+
+/**
+ * remote_session_control
+ * Set remote session parameters
+ *
+ * @param req, request ID
+ * @param data, address of structure with parameters
+ * @param datalen, length of data
+ * @retval, 0 on success
+ * For remote_session_control with the FASTRPC_REMOTE_PROCESS_KILL req ID, the possible error codes
+ * are AEE_ENOSUCH, AEE_EBADPARM, AEE_EINVALIDDOMAIN. Error codes other than these are treated as
+ * returned from the FastRPC framework.
+ */
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_session_control)(__QAIC_IN uint32_t req, __QAIC_IN_LEN(datalen) void *data, __QAIC_IN uint32_t datalen) __QAIC_REMOTE_ATTRIBUTE;
+
+
+/**
+ * remote_handle()_invoke_async
+ * invokes the remote handle asynchronously
+ *
+ * desc, descriptor containing the type of async job, context and callback function (if any)
+ * see the retrieve macros above for the dwScalars format
+ * pra, contains the arguments in the following order: inbufs, outbufs, inhandles, outhandles.
+ * all outbufs need to be either allocated using rpcmem_alloc or registered ION buffers using register_buf
+ * implementors should ignore and pass along, as is, any values that the transport doesn't understand.
+ */
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_handle_invoke_async)(__QAIC_IN remote_handle h, __QAIC_IN fastrpc_async_descriptor_t *desc, __QAIC_IN uint32_t dwScalars, __QAIC_IN remote_arg *pra) __QAIC_REMOTE_ATTRIBUTE;
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_handle64_invoke_async)(__QAIC_IN remote_handle64 h, __QAIC_IN fastrpc_async_descriptor_t *desc, __QAIC_IN uint32_t dwScalars, __QAIC_IN remote_arg *pra) __QAIC_REMOTE_ATTRIBUTE;
+
+
+/**
+ * fastrpc_async_get_status
+ * Get the status of an async job. This can be used to query the status of an async job
+ *
+ * @param jobid, jobid returned during async job submission.
+ * @param timeout_us, timeout in microseconds
+ *                    timeout = 0, returns immediately with status/result
+ *                    timeout > 0, waits for the specified time and then returns with status/result
+ *                    timeout < 0, waits indefinitely until the job completes
+ * @param result, integer pointer for the result of the job
+ *                0 on success
+ *                error code on failure
+ * @retval, 0 on job completion, and the result of the job is part of @param result
+ *          AEE_EBUSY, if the job status is pending and is not returned from the DSP
+ *          AEE_EBADPARM, if the job id is invalid
+ *          AEE_EFAILED, FastRPC internal error
+ *
+ */
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(fastrpc_async_get_status)(__QAIC_IN fastrpc_async_jobid jobid,__QAIC_IN int timeout_us,__QAIC_OUT int *result);
+
+
+/**
+ * fastrpc_release_async_job
+ * Release an async job after receiving its status either through callback/poll
+ *
+ * @param jobid, jobid returned during async job submission.
+ * @retval, 0 on success
+ *          AEE_EBUSY, if the job status is pending and is not returned from the DSP
+ *          AEE_EBADPARM, if the job id is invalid
+ *
+ */
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(fastrpc_release_async_job)(__QAIC_IN fastrpc_async_jobid jobid);
+
+
+/**
+ * remote_mmap
+ * map memory to the remote domain
+ *
+ * @param fd, fd associated with this memory
+ * @param flags, flags to be used for the mapping
+ * @param vaddrin, input address
+ * @param size, size of buffer
+ * @param vaddrout, output address
+ * @retval, 0 on success
+ */
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_mmap)(__QAIC_IN int fd, __QAIC_IN uint32_t flags, __QAIC_IN uint32_t vaddrin, __QAIC_IN int size, __QAIC_OUT uint32_t* vaddrout) __QAIC_REMOTE_ATTRIBUTE;
+
+
+/**
+ * remote_munmap
+ * unmap memory from the remote domain
+ *
+ * @param vaddrout, remote address mapped
+ * @param size, size to unmap. Unmapping a range partially may not be supported.
+ * @retval, 0 on success, may fail if memory is still mapped
+ */
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_munmap)(__QAIC_IN uint32_t vaddrout, __QAIC_IN int size) __QAIC_REMOTE_ATTRIBUTE;
+
+
+/**
+ * remote_mem_map
+ * Map memory to the remote process on a selected DSP domain
+ *
+ * @domain: DSP domain ID. Use -1 for the default domain.
+ *          The default domain is selected based on the lib(a/m/s/c)dsprpc.so library linked to the application.
+ * @fd: file descriptor of memory
+ * @flags: enum remote_mem_map_flags type of flag
+ * @virtAddr: virtual address of buffer
+ * @size: buffer length
+ * @remoteVirtAddr[out]: remote process virtual address
+ * @retval, 0 on success
+ */
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_mem_map)(__QAIC_IN int domain, __QAIC_IN int fd, __QAIC_IN int flags, __QAIC_IN uint64_t virtAddr, __QAIC_IN size_t size, __QAIC_OUT uint64_t* remoteVirtAddr) __QAIC_REMOTE_ATTRIBUTE;
+
+
+/**
+ * remote_mem_unmap
+ * Unmap memory from the remote process on a selected DSP domain
+ *
+ * @domain: DSP domain ID. Use -1 for the default domain. Get the domain from a multi-domain handle if required.
+ * @remoteVirtAddr: remote process virtual address received from remote_mem_map
+ * @size: buffer length
+ * @retval, 0 on success
+ */
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_mem_unmap)(__QAIC_IN int domain, __QAIC_IN uint64_t remoteVirtAddr, __QAIC_IN size_t size) __QAIC_REMOTE_ATTRIBUTE;
+
+
+/**
+ * remote_mmap64
+ * map memory to the remote domain
+ *
+ * @param fd, fd associated with this memory
+ * @param flags, flags to be used for the mapping
+ * @param vaddrin, input address
+ * @param size, size of buffer
+ * @param vaddrout, output address
+ * @retval, 0 on success
+ */
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_mmap64)(__QAIC_IN int fd, __QAIC_IN uint32_t flags, __QAIC_IN __QAIC_INT64PTR vaddrin, __QAIC_IN int64_t size, __QAIC_OUT __QAIC_INT64PTR *vaddrout) __QAIC_REMOTE_ATTRIBUTE;
+
+
+/**
+ * remote_munmap64
+ * unmap memory from the remote domain
+ *
+ * @param vaddrout, remote address mapped
+ * @param size, size to unmap. Unmapping a range partially may not be supported.
+ * @retval, 0 on success, may fail if memory is still mapped
+ */
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_munmap64)(__QAIC_IN __QAIC_INT64PTR vaddrout, __QAIC_IN int64_t size) __QAIC_REMOTE_ATTRIBUTE;
+
+
+/**
+ * fastrpc_mmap
+ * Creates a mapping on the remote process for an ION buffer with a file descriptor. A new fastrpc session
+ * will be opened if not already opened for the domain.
+ *
+ * @param domain, DSP domain ID of a fastrpc session
+ * @param fd, ION memory file descriptor
+ * @param addr, buffer virtual address on the cpu
+ * @param offset, offset from the beginning of the buffer
+ * @param length, size of buffer in bytes
+ * @param flags, controls mapping functionality on the DSP. Refer to the fastrpc_map_flags enum definition for more information.
+ *
+ * @return, 0 on success, error code on failure.
+ *          AEE_EALREADY Buffer already mapped. Multiple mappings for the same buffer are not supported.
+ *          AEE_EBADPARM Bad parameters
+ *          AEE_EFAILED Failed to map buffer
+ *          AEE_ENOMEMORY Out of memory (internal error)
+ *          AEE_EUNSUPPORTED Unsupported API on the target
+ */
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(fastrpc_mmap)(__QAIC_IN int domain, __QAIC_IN int fd, __QAIC_IN void *addr, __QAIC_IN int offset, __QAIC_IN size_t length, __QAIC_IN enum fastrpc_map_flags flags)__QAIC_REMOTE_ATTRIBUTE;
+
+
+/**
+ * fastrpc_munmap
+ * Removes a mapping associated with a file descriptor.
+ *
+ * @param domain, DSP domain ID of a fastrpc session
+ * @param fd, file descriptor
+ * @param addr, buffer virtual address used for mapping creation
+ * @param length, buffer length
+ *
+ * @return, 0 on success, error code on failure.
+ * AEE_EBADPARM Bad parameters + * AEE_EINVALIDFD Mapping not found for specified fd + * AEE_EFAILED Failed to map buffer + * AEE_EUNSUPPORTED Unsupported API on the target + */ +__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(fastrpc_munmap)(__QAIC_IN int domain, __QAIC_IN int fd, __QAIC_IN void *addr, __QAIC_IN size_t length)__QAIC_REMOTE_ATTRIBUTE; + + +/** + * remote_register_buf/remote_register_buf_attr + * Register a file descriptor for a buffer. + * Users of fastrpc should register zero-copy buffer to enable + * sharing that buffer to the dsp via the SMMU. The API is limited + * to register buffer less than 2 GB only. Recommendation is to use + * remote_register_buf_attr2 instead. API remote_register_buf_attr2 + * can now accept size up to 2 power(8*sizeof(size_t)). + * + * Some versions of libcdsprpc.so lack this + * function, so users should set this symbol as weak. + * + * #pragma weak remote_register_buf + * #pragma weak remote_register_buf_attr + * + * @param buf, virtual address of the buffer + * @param size, size of the buffer + * @fd, the file descriptor, callers can use -1 to deregister. + * @attr, map buffer as coherent or non-coherent + */ +__QAIC_REMOTE_EXPORT __QAIC_RETURN void __QAIC_REMOTE(remote_register_buf)(__QAIC_IN_LEN(size) void* buf, __QAIC_IN int size, __QAIC_IN int fd) __QAIC_REMOTE_ATTRIBUTE; +__QAIC_REMOTE_EXPORT __QAIC_RETURN void __QAIC_REMOTE(remote_register_buf_attr)(__QAIC_IN_LEN(size) void* buf, __QAIC_IN int size, __QAIC_IN int fd, __QAIC_IN int attr) __QAIC_REMOTE_ATTRIBUTE; + + +/** + * remote_register_buf_attr2 + * Register a file descriptor for a buffer. Users of fastrpc should + * register zero-copy buffer to enable sharing that buffer to the + * dsp via the SMMU. + * + * Some versions of libcdsprpc.so lack this + * function, so users should set this symbol as weak. + * + * #pragma weak remote_register_buf_attr2 + * + * @param buf, virtual address of the buffer + * @param size, size of the buffer + * @fd, the file descriptor, callers can use -1 to deregister. + * @attr, setting attribute for the mapped buffer + * refer to "Attributes for remote_register_buf_attr/remote_register_buf_attr2" + * to set the required attribute value. + */ +__QAIC_REMOTE_EXPORT __QAIC_RETURN void __QAIC_REMOTE(remote_register_buf_attr2)(__QAIC_IN_LEN(size) void* buf, __QAIC_IN size_t size, __QAIC_IN int fd, __QAIC_IN int attr) __QAIC_REMOTE_ATTRIBUTE; + + +/** + * remote_register_dma_handle/remote_register_dma_handle_attr + * Register a dma handle with fastrpc. + * This is only valid on Android with ION allocated memory. + * Users of fastrpc should register a file descriptor allocated with + * ION to enable sharing that memory to the dsp via the SMMU. + * + * Some versions of libadsprpc.so lack this function, + * so users should set this symbol as weak. + * + * #pragma weak remote_register_dma_handle + * #pragma weak remote_register_dma_handle_attr + * + * @fd, the file descriptor, callers can use -1 to deregister. + * @param len, size of the buffer + * @attr, map buffer as coherent or non-coherent or no-map + */ +__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_register_dma_handle)(__QAIC_IN int fd,__QAIC_IN uint32_t len) __QAIC_REMOTE_ATTRIBUTE; +__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_register_dma_handle_attr)(__QAIC_IN int fd,__QAIC_IN uint32_t len,__QAIC_IN uint32_t attr) __QAIC_REMOTE_ATTRIBUTE; + + +/** + * remote_set_mode + * Set the mode of operation. 
+ *
+ * Some versions of libadsprpc.so lack this function,
+ * so users should set this symbol as weak.
+ *
+ * #pragma weak remote_set_mode
+ *
+ * @param mode, the mode
+ * @retval, 0 on success
+ */
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_set_mode)(__QAIC_IN uint32_t mode) __QAIC_REMOTE_ATTRIBUTE;
+
+
+/**
+ * remote_register_fd
+ * Register a file descriptor.
+ * This can be used when users do not have a mapping to pass to the
+ * RPC layer. The generated address is a mapping with PROT_NONE, any
+ * access to this memory will fail, so it should only be used as an
+ * ID to identify this file descriptor to the RPC layer. This API is
+ * limited to buffer sizes less than 2 GB. The recommendation is to use
+ * remote_register_fd2 for larger buffers, up to 2 power(8*sizeof(size_t)).
+ *
+ * To deregister use remote_register_buf(addr, size, -1).
+ *
+ * #pragma weak remote_register_fd
+ *
+ * @param fd, the file descriptor.
+ * @param size, size of the buffer
+ * @retval, (void*)-1 on failure, address on success.
+ */
+__QAIC_REMOTE_EXPORT __QAIC_RETURN void *__QAIC_REMOTE(remote_register_fd)(__QAIC_IN int fd,__QAIC_IN int size) __QAIC_REMOTE_ATTRIBUTE;
+
+/**
+ * remote_register_fd2
+ * Register a file descriptor.
+ * This can be used when users do not have a mapping to pass to
+ * the RPC layer. The generated address is a mapping with PROT_NONE,
+ * any access to this memory will fail, so it should only be used
+ * as an ID to identify this file descriptor to the RPC layer.
+ *
+ * To deregister use remote_register_buf(addr, size, -1).
+ *
+ * #pragma weak remote_register_fd2
+ *
+ * @param fd, the file descriptor.
+ * @param size, size of the buffer
+ * @retval, (void*)-1 on failure, address on success.
+ */
+__QAIC_REMOTE_EXPORT __QAIC_RETURN void *__QAIC_REMOTE(remote_register_fd2)(__QAIC_IN int fd,__QAIC_IN size_t size) __QAIC_REMOTE_ATTRIBUTE;
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /// REMOTE_H
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/remote.idl b/prebuilts/Hexagon_SDK/6.2.0.1/incs/remote.idl
new file mode 100755
index 0000000000000..65e5b162660b6
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/remote.idl
@@ -0,0 +1,32 @@
+interface remote_handle64 {
+   /**
+    * Opens the handle in the specified domain. If this is the first
+    * handle, this creates the session. Typically this means opening
+    * the device, aka open("/dev/adsprpc-smd"), then calling ioctl
+    * device APIs to create a PD on the DSP to execute our code in,
+    * then asking that PD to dlopen the .so and dlsym the skel function.
+    *
+    * @param uri, _URI"&_dom=aDSP"
+    *             _URI is a QAIC generated uri, or
+    *             "file:///<sofilename>?<interface>_skel_handle_invoke&_modver=1.0"
+    *             If the _dom parameter is not present, _dom=DEFAULT is assumed
+    *             but not forwarded.
+    *             Reserved uri keys:
+    *               [0]: first unnamed argument is the skel invoke function
+    *               _dom: execution domain name, _dom=mDSP/aDSP/DEFAULT
+    *               _modver: module version, _modver=1.0
+    *               _*: any other key name starting with an _ is reserved
+    *             Unknown uri keys/values are forwarded as is.
+    * @param h, resulting handle
+    * @retval, 0 on success
+    */
+   long open(in string uri, rout remote_handle64 h);
+   /**
+    * Closes a handle. If this is the last handle to close, the session
+    * is closed as well, releasing all the allocated resources.
+
+    * @param h, the handle to close
+    * @retval, 0 on success, should always succeed
+    */
+   long close(in remote_handle64 h);
+};
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/remote.md b/prebuilts/Hexagon_SDK/6.2.0.1/incs/remote.md
new file mode 100755
index 0000000000000..0c3d9daca2aca
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/remote.md
@@ -0,0 +1,120 @@
+# Remote session API to interface with FastRPC
+
+
+## Overview
+
+FastRPC exposes a set of APIs enabling the following functionality:
+
+ - open, configure and close a remote session on the DSP
+ - enable unsigned PD offload to the compute DSP
+ - enable and manage QoS mode
+ - make synchronous or asynchronous remote calls
+ - query DSP capabilities
+ - map or unmap pages onto the DSP
+
+The 64-bit version of the API (`handle64`) enables multi-domain modules.
+It is recommended for applications to use the multi-domain framework,
+which provides multiple benefits over the older single-domain framework. The remote_handle_*
+APIs should be used for single-domain applications. For more information on multi-domain support,
+refer to the RPC section in the Hexagon SDK documentation.
+
+# `remote_handle_open`, `remote_handle64_open`
+Loads the shared object on the remote process domain.
+
+# `remote_handle_invoke`, `remote_handle64_invoke`
+Executes a method on the remote domain.
+
+# `remote_handle_close`, `remote_handle64_close`
+Closes a remote handle opened with the corresponding open call.
+
+# `remote_handle_control`, `remote_handle64_control`
+Manages the remote session.
+This API allows the user to control or query the remote session:
+- Control latency
+ The user can vote for a specific latency requirement per session. This latency is not guaranteed by the driver. The driver will try to
+ meet this requirement with the options available on a given target. Based on the arguments, either PM-QoS [Power Management] or adaptive
+ QoS can be enabled.
+
+ PM-QoS is recommended for latency-sensitive use cases, whereas adaptive QoS is recommended for moderately latency-sensitive use cases.
+ Adaptive QoS is more power-efficient than PM-QoS.
+
+ If PM-QoS is enabled, CPU low-power modes will be disabled.
+
+ If adaptive QoS is enabled, the remote DSP starts keeping track of the method execution times for that process. Once enough data is available,
+ the DSP will try to predict when the method will finish executing and will send a "ping" to wake up the CPU prior to the completion of the
+ DSP task so that there is no extra overhead due to CPU wakeup time.
+
+- Enable wake lock
+ Keeps the CPU up until a response from the remote invocation call is received. Disabling the wake-lock feature allows the CPU to enter suspend mode.
+
+- Query DSP Capabilities
+ Queries DSP support for:
+
+ * domains available
+ * unsigned PD
+ * HVX, VTCM, HMX
+ * async FastRPC
+ * remote PD status notification
+
+- Get DSP domain
+ Returns the current DSP domain.
+
+# `remote_session_control`
+
+Sets remote session parameters such as thread stack size or unsigned PD mode. It can also kill a remote process, close sessions on the DSP,
+generate a PD dump, or trigger remote process exceptions.
+
+- [Stack thread parameters](structremote__rpc__thread__params.html)
+ Parameters to configure a thread: priority and stack size. + +- [Unsigned PD](structremote__rpc__control__unsigned__module.html)
+ Flag to configure the session as unsigned. This allows third-party applications to run
+ compute-intensive tasks on the compute DSP for better performance.
+
+- [Kill remote process](structremote__rpc__process__clean__params.html)
+ Kills the remote process running on the DSP. + +- [Session close](structremote__rpc__session__close.html)
+ Closes all sessions open on a given domain. + +- [PD dump](structremote__rpc__control__pd__dump.html)
+ Enables PD dump feature. + +- [Remote process exception](structremote__rpc__process__clean__params.html)
+ Introduces an exception in the remote process. + +- [Query process type](structremote__process__type.html)
+ Queries the type of process (signed or unsigned) running on the remote DSP.
+
+- [Relative thread priority](structremote__rpc__relative__thread__priority.html)
+ Set a lower or higher priority than the default thread priority, for the user threads on the DSP. + +# `fastrpc_mmap`, `fastrpc_munmap` +Creates a new mapping of an ION memory into the virtual address space of a remote process on the DSP and associates the mapping with the +provided file descriptor. The parameter `flags` of type `fastrpc_map_flags` allows the user to control the page permissions and other +properties of the memory map. These mappings can be destroyed with `fastrpc_munmap()` API using the file descriptor. APIs `fastrpc_mmap` +and `fastrpc_munmap` are available and their use is recommended for Lahaina and later chipsets. + +# `remote_mem_map`, `remote_mem_unmap` +Maps/unmaps large buffers statically on a given DSP. +Mapping the buffers statically saves the latency for the corresponding remote calls associated with these buffers. +These APIs are available only on SM8250 (Kona) or later targets. + +# `remote_handle_invoke_async`, `remote_handle64_invoke_async` +Make remote invocations asynchronous. Running asynchronously does not improve the latency but improves the throughput by enabling the DSP +to run successive tasks continuously. This feature is supported on Lahaina and onward targets. + +# `fastrpc_async_get_status` +Queries the status of the asynchronous job. + +# `fastrpc_release_async_job` +Releases the asynchronous job after receiving the status either through callback or poll. + +# `remote_register_buf`, `remote_register_buf_attr` +Registers a file descriptor for a buffer allocated with ION memory to share the memory with the DSP via SMMU. + +# `remote_register_dma_handle`, `remote_register_dma_handle_attr` +Registers a DMA handle allocated with ION memory to share the memory with the DSP via SMMU. + +Header file: @b remote.h + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/remote64.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/remote64.h new file mode 100755 index 0000000000000..07cf3d5d6e769 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/remote64.h @@ -0,0 +1,14 @@ +/* + * Copyright (c) 2014, 2022 Qualcomm Technologies, Inc. + * All Rights Reserved. + * Confidential and Proprietary - Qualcomm Technologies, Inc. + */ +#ifndef REMOTE64_H +#define REMOTE64_H + +#include "remote.h" + +/* +All the functions declared here are moved to remote.h, remote64.h will be deleted in future. +*/ +#endif // REMOTE64_H diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/stddef/AEEStd.md b/prebuilts/Hexagon_SDK/6.2.0.1/incs/stddef/AEEStd.md new file mode 100755 index 0000000000000..3a02dd209389b --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/stddef/AEEStd.md @@ -0,0 +1,11 @@ +# Standard definitions and error codes + +## Standard definitions + +AEEStdDef.h contains definitions of common data types used on the Hexagon DSPs and the application processor. It also has definitions of MIN, MAX values for common data types. + +## Standard error codes + +AEEStdErr.h file contains error codes returned by functions running on the DSPs and the application processor. +The application invoking these APIs is expected to check for the error codes and implement appropriate error handling. Each API in the header files will have required documentation on the error codes it can return. 
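+
+As an illustrative sketch (editor's example: `AEE_SUCCESS` comes from
+AEEStdErr.h, while `my_dsp_call` is a hypothetical stub function), a caller
+is expected to check the returned code explicitly:
+
+```c
+#include "AEEStdErr.h"
+
+int my_dsp_call(void);                  // hypothetical remote stub
+
+int run(void) {
+    int nErr = my_dsp_call();           // returns an AEEStdErr code
+    if (nErr != AEE_SUCCESS) {
+        return nErr;                    // log or map the error, then propagate
+    }
+    return AEE_SUCCESS;
+}
+```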
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/stddef/AEEStdDef.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/stddef/AEEStdDef.h
new file mode 100755
index 0000000000000..fe252dd6f0590
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/stddef/AEEStdDef.h
@@ -0,0 +1,464 @@
+#ifndef AEESTDDEF_H
+#define AEESTDDEF_H
+/*
+=======================================================================
+
+FILE: AEEStdDef.h
+
+DESCRIPTION: definition of basic types, constants,
+             preprocessor macros
+
+=======================================================================
+*/
+/*==============================================================================
+ Copyright (c) 2005,2007,2012-2013, 2020 Qualcomm Technologies, Inc.
+ All rights reserved.
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ 3. Neither the name of the copyright holder nor the names of its contributors
+ may be used to endorse or promote products derived from this software without
+ specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+==============================================================================*/
+
+#include <stdint.h>
+
+#if defined(COMDEF_H) /* guards against a known re-definer */
+#define _BOOLEAN_DEFINED
+#define _UINT32_DEFINED
+#define _UINT16_DEFINED
+#define _UINT8_DEFINED
+#define _INT32_DEFINED
+#define _INT16_DEFINED
+#define _INT8_DEFINED
+#define _UINT64_DEFINED
+#define _INT64_DEFINED
+#define _BYTE_DEFINED
+#endif /* #if defined(COMDEF_H) */
+
+/* -----------------------------------------------------------------------
+** Standard Types
+** ----------------------------------------------------------------------- */
+
+/* The following definitions are the same across platforms. This first
+** group are the sanctioned types.
+*/
+/** @defgroup stddef standard data type definitions
+* @{
+*/
+#ifndef _BOOLEAN_DEFINED
+typedef unsigned char boolean; /**< Boolean value type.
*/ +#define _BOOLEAN_DEFINED +#endif + +#ifndef _UINT32_DEFINED +typedef uint32_t uint32; /**< Unsigned 32-bit value */ +#define _UINT32_DEFINED +#endif + +#ifndef _UINT16_DEFINED +typedef unsigned short uint16; /**< Unsigned 16-bit value */ +#define _UINT16_DEFINED +#endif + +#ifndef _UINT8_DEFINED +typedef unsigned char uint8; /**< Unsigned 8-bit value */ +#define _UINT8_DEFINED +#endif + +#ifndef _INT32_DEFINED +typedef int32_t int32; /**< Signed 32-bit value */ +#define _INT32_DEFINED +#endif + +#ifndef _INT16_DEFINED +typedef signed short int16; /**< Signed 16-bit value */ +#define _INT16_DEFINED +#endif + +#ifndef _INT8_DEFINED +typedef signed char int8; /**< Signed 8-bit value */ +#define _INT8_DEFINED +#endif + +#ifndef _INT64_DEFINED +#if defined(__GNUC__) +#define __int64 long long +#endif +typedef __int64 int64; /**< Signed 64-bit value */ +#define _INT64_DEFINED +#endif + +#ifndef _UINT64_DEFINED +typedef unsigned __int64 uint64; /**< Unsigned 64-bit value */ +#define _UINT64_DEFINED +#endif + +#ifndef _BYTE_DEFINED +typedef unsigned char byte; /**< byte type */ +#define _BYTE_DEFINED +#endif + +/** + * @} + */ + + /** @defgroup stdret standard return values +* @{ +*/ + +//! @cond Doxygen_Suppress +#ifndef _AEEUID_DEFINED +typedef uint32 AEEUID; +#define _AEEUID_DEFINED +#endif + +#ifndef _AEEIID_DEFINED +typedef uint32 AEEIID; +#define _AEEIID_DEFINED +#endif + +#ifndef _AEECLSID_DEFINED +typedef uint32 AEECLSID; +#define _AEECLSID_DEFINED +#endif + +#ifndef _AEEPRIVID_DEFINED +typedef uint32 AEEPRIVID; +#define _AEEPRIVID_DEFINED +#endif + +#ifndef _AECHAR_DEFINED +typedef uint16 AECHAR; +#define _AECHAR_DEFINED +#endif +//! @endcond + +/** + * @brief Return value of functions indicating success or failure. return value 0 indicates success. A non zero value indicates a failure. Any data in rout parameters is not propagated back. + */ +#ifndef _AEERESULT_DEFINED +typedef int AEEResult; +#define _AEERESULT_DEFINED +#endif + +/** + * @} + */ + + +/* ----------------------------------------------------------------------- +** Function Calling Conventions +** ----------------------------------------------------------------------- */ + +#ifndef CDECL +#ifdef _MSC_VER +#define CDECL __cdecl +#else +#define CDECL +#endif /* _MSC_VER */ +#endif /* CDECL */ + +/* ----------------------------------------------------------------------- +** Constants +** ----------------------------------------------------------------------- */ + /** @defgroup stdminmax Standard Min and Max for all data types +* @{ +*/ + +#ifndef TRUE +#define TRUE 1 /**< Boolean true value. */ +#endif + +#ifndef FALSE +#define FALSE 0 /**< Boolean false value. */ +#endif + +#ifndef NULL +#define NULL 0 /**< NULL = 0. 
*/ +#endif + +#ifndef MIN_INT8 +#define MIN_INT8 -128 /**< MIN 8-bit integer */ +#endif +#ifndef MIN_INT16 +#define MIN_INT16 -32768 /**< MIN 16-bit integer */ +#endif +#ifndef MIN_INT32 +#define MIN_INT32 (~0x7fffffff) /**< MIN 32-bit unsigned */ +#endif +#ifndef MIN_INT64 +#define MIN_INT64 (~0x7fffffffffffffffLL) /**< MIN 64-bit integer */ +#endif + +#ifndef MAX_INT8 +#define MAX_INT8 127 /**< MAX 8-bit integer */ +#endif +#ifndef MAX_INT16 +#define MAX_INT16 32767 /**< MAX 16-bit integer */ +#endif +#ifndef MAX_INT32 +#define MAX_INT32 2147483647 /**< MAX 32-bit integer */ +#endif +#ifndef MAX_INT64 +#define MAX_INT64 9223372036854775807LL /**< MAX 64-bit integer */ +#endif + +#ifndef MAX_UINT8 +#define MAX_UINT8 255 /**< MAX 8-bit unsigned integer */ +#endif +#ifndef MAX_UINT16 +#define MAX_UINT16 65535 /**< MAX 16-bit unsigned integer */ +#endif +#ifndef MAX_UINT32 +#define MAX_UINT32 4294967295u /**< MAX 32-bit unsigned integer */ +#endif +#ifndef MAX_UINT64 +#define MAX_UINT64 18446744073709551615uLL /**< MAX 64-bit unsigned integer */ +#endif + +//! @cond Doxygen_Suppress +#ifndef MIN_AECHAR +#define MIN_AECHAR 0 +#endif + +#ifndef MAX_AECHAR +#define MAX_AECHAR 65535 +#endif + +//! @endcond + +/** + * @} + */ + +/* ----------------------------------------------------------------------- +** Preprocessor helpers +** ----------------------------------------------------------------------- */ +#define __STR__(x) #x +#define __TOSTR__(x) __STR__(x) +#define __FILE_LINE__ __FILE__ ":" __TOSTR__(__LINE__) + +/* ----------------------------------------------------------------------- +** Types for code generated from IDL +** ----------------------------------------------------------------------- */ + + /** @defgroup QIDL data types +* @{ +*/ +//! @cond Doxygen_Suppress +#ifndef __QIDL_WCHAR_T_DEFINED__ +#define __QIDL_WCHAR_T_DEFINED__ +typedef uint16 _wchar_t; +#endif + + +/* __STRING_OBJECT__ will be deprecated in the future */ + + +#if !defined(__QIDL_STRING_OBJECT_DEFINED__) && !defined(__STRING_OBJECT__) +#define __QIDL_STRING_OBJECT_DEFINED__ +#define __STRING_OBJECT__ + +/** + * @brief This structure is used to represent an IDL string when used inside a + sequence or union. + */ +typedef struct _cstring_s { + char* data; + int dataLen; + int dataLenReq; +} _cstring_t; + +/** + * @brief This structure is used to represent an IDL wstring when used inside a + sequence or union. + */ + +typedef struct _wstring_s { + _wchar_t* data; + int dataLen; + int dataLenReq; +} _wstring_t; +#endif /* __QIDL_STRING_OBJECT_DEFINED__ */ +//! @endcond +/** + * @} + */ +/* +======================================================================= + DATA STRUCTURES DOCUMENTATION +======================================================================= + +======================================================================= + +AEEUID + +Description: + This is a BREW unique ID. Used to express unique types, interfaces, classes + groups and privileges. The BREW ClassID Generator generates + unique IDs that can be used anywhere you need a new AEEIID, AEECLSID, + or AEEPRIVID. + +Definition: + typedef uint32 AEEUID + +======================================================================= + +AEEIID + +Description: + This is an interface ID type, used to denote a BREW interface. It is a special case + of AEEUID. + +Definition: + typedef uint32 AEEIID + +======================================================================= + +AEECLSID + +Description: + This is a classe ID type, used to denote a BREW class. 
+   case of AEEUID.
+
+Definition:
+   typedef uint32 AEECLSID
+
+=======================================================================
+
+AEEPRIVID
+
+Description:
+   This is a privilege ID type, used to express a privilege. It is a
+   special case of AEEUID.
+
+Definition:
+   typedef uint32 AEEPRIVID
+
+=======================================================================
+
+AECHAR
+
+Description:
+   This is a 16-bit character type.
+
+Definition:
+   typedef uint16 AECHAR
+
+=======================================================================
+
+AEEResult
+
+Description:
+   This is the standard result type.
+
+Definition:
+   typedef int AEEResult
+
+=======================================================================
+
+_wchar_t
+
+Description:
+   This is a 16-bit character type corresponding to the IDL 'wchar'
+   type.
+
+Definition:
+   typedef uint16 _wchar_t
+
+See Also:
+   _cstring_t
+   _wstring_t
+
+=======================================================================
+
+_cstring_t
+
+Description:
+   This structure is used to represent an IDL string when used inside a
+   sequence or union.
+
+Definition:
+   typedef struct _cstring_s {
+      char* data;
+      int dataLen;
+      int dataLenReq;
+   } _cstring_t;
+
+Members:
+   data       : A pointer to the NULL-terminated string.
+   dataLen    : The size, in chars, of the buffer pointed to by 'data',
+                including the NULL terminator. This member is only used
+                when the structure is part of a rout or inrout
+                parameter, but must be supplied by the caller as an
+                input in these cases.
+   dataLenReq : The size that would have been required to store the
+                entire result string. This member is only used when the
+                structure is part of a rout or inrout parameter, when
+                it is an output value set by the callee. The length of
+                the returned string (including the NULL terminator)
+                after a call is the minimum of dataLen and dataLenReq.
+
+See Also:
+   _wchar_t
+   _wstring_t
+
+=======================================================================
+
+_wstring_t
+
+Description:
+   This structure is used to represent an IDL wstring when used inside a
+   sequence or union.
+
+Definition:
+   typedef struct _wstring_s {
+      _wchar_t* data;
+      int dataLen;
+      int dataLenReq;
+   } _wstring_t;
+
+Members:
+   data       : A pointer to the NULL-terminated wide string.
+   dataLen    : The size, in 16-bit characters, of the buffer pointed to
+                by 'data', including the NULL terminator. This member
+                is only used when the structure is part of a rout or
+                inrout parameter, but must be supplied by the caller as
+                an input in these cases.
+   dataLenReq : The number of 16-bit characters that would have been
+                required to store the entire result string. This member
+                is only used when the structure is part of a rout or
+                inrout parameter, when it is an output value set by the
+                callee. The length of the returned wstring (including
+                the NULL terminator) after a call is the minimum of
+                dataLen and dataLenReq.
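+
+Example:
+   A caller passing a _wstring_t as a rout parameter supplies 'data' and
+   'dataLen', then checks 'dataLenReq' to detect truncation. A minimal
+   sketch (IFoo_GetName() and 'h' are hypothetical):
+
+      _wchar_t buf[32];
+      _wstring_t ws;
+      ws.data = buf;
+      ws.dataLen = 32;    /* capacity in 16-bit chars, incl. NULL terminator */
+      ws.dataLenReq = 0;  /* set by the callee */
+      if (0 == IFoo_GetName(h, &ws) && ws.dataLenReq > ws.dataLen) {
+         /* result truncated; retry with a buffer of ws.dataLenReq chars */
+      }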
+ +See Also: + _cstring_t + _wchar_t + +======================================================================= +*/ + +#endif /* #ifndef AEESTDDEF_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/stddef/AEEStdDef.idl b/prebuilts/Hexagon_SDK/6.2.0.1/incs/stddef/AEEStdDef.idl new file mode 100755 index 0000000000000..ac224152bcaf9 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/stddef/AEEStdDef.idl @@ -0,0 +1,91 @@ +#ifndef AEESTDDEF_IDL +#define AEESTDDEF_IDL +//============================================================================ +/// @file AEEStdDef.idl +/// +/// This file contains definitions of primitive types. + //qidl copyright +//% Copyright (c) 2006-2014, 2020 Qualcomm Technologies, Inc. + //qidl nested=false +//% All Rights Reserved. +//% Redistribution and use in source and binary forms, with or without +//% modification, are permitted provided that the following conditions are met: +//% +//% 1. Redistributions of source code must retain the above copyright notice, +//% this list of conditions and the following disclaimer. +//% +//% 2. Redistributions in binary form must reproduce the above copyright notice, +//% this list of conditions and the following disclaimer in the documentation +//% and/or other materials provided with the distribution. +//% +//% 3. Neither the name of the copyright holder nor the names of its contributors +//% may be used to endorse or promote products derived from this software without +//% specific prior written permission. +//% +//% THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +//% AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +//% IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +//% ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +//% LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +//% CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +//% SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +//% INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +//% CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +//% ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +//% POSSIBILITY OF SUCH DAMAGE. +//============================================================================ + +/* NOTE: THIS FILE SHOULD NEVER BE COMPILED DIRECTLY. That is, code should + * never be generated from these definitions, as they will conflict with the + * "real" hand-written AEEStdDef.h. Note also that if the definitions here + * become out of sync with the hand-written AEEStdDef.h, bad things will + * happen. + */ + +/** + * @name Primitive Types + */ +/*@{*/ + +typedef octet byte; ///< Alternate alias for an unsigned + ///< 8-bit integer +/*@}*/ + +/** + * @name Types + */ +/*@{*/ + +/** + * This is a unique ID type. Used to express types, + * interfaces, classes, and privileges. The class ID generator generates + * unique IDs that can be used anywhere a new #AEEIID, #AEECLSID, or + * #AEEPRIVID is needed. + */ +typedef uint32 AEEUID; + +/** + * This is an interface ID type, used to denote an interface. It is a special + case of #AEEUID. + */ +typedef uint32 AEEIID; + +/** + * This is a class ID type, used to denote a class. It is a special case of + #AEEUID. + */ +typedef uint32 AEECLSID; + +/** + * This is a privilege ID type, used to express a privilege. It is a special + * case of #AEEUID. 
+ */ +typedef uint32 AEEPRIVID; + +typedef wchar AECHAR; ///< Wide character type + +typedef long AEEResult; ///< Common return type + +/*@}*/ + +#endif /* #ifndef AEESTDDEF_IDL */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/stddef/AEEStdErr.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/stddef/AEEStdErr.h new file mode 100755 index 0000000000000..bc1706abe5886 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/stddef/AEEStdErr.h @@ -0,0 +1,339 @@ +/* + * Copyright (c) 2005-2007, 2012-2013, 2019-2020 Qualcomm Technologies, Inc. + * All Rights Reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifndef AEESTDERR_H
+#define AEESTDERR_H
+//
+// Basic Error Codes
+//
+//
+#if defined(__hexagon__)
+   #define AEE_EOFFSET 0x80000400
+#else
+   #define AEE_EOFFSET 0x00000000
+#endif
+/** @defgroup stdbasicerror Basic error codes
+ * @{
+ */
+#define AEE_SUCCESS              0                     ///< No error
+#define AEE_EUNKNOWN             -1                    ///< Unknown error (should not use this)
+
+#define AEE_EFAILED              (AEE_EOFFSET + 0x001) ///< General failure
+#define AEE_ENOMEMORY            (AEE_EOFFSET + 0x002) ///< Memory allocation failed because of insufficient RAM
+#define AEE_ECLASSNOTSUPPORT     (AEE_EOFFSET + 0x003) ///< Specified class unsupported
+#define AEE_EVERSIONNOTSUPPORT   (AEE_EOFFSET + 0x004) ///< Version not supported
+#define AEE_EALREADYLOADED       (AEE_EOFFSET + 0x005) ///< Object already loaded
+#define AEE_EUNABLETOLOAD        (AEE_EOFFSET + 0x006) ///< Unable to load object/applet
+#define AEE_EUNABLETOUNLOAD      (AEE_EOFFSET + 0x007) ///< Unable to unload
+                                                       ///< object/applet
+#define AEE_EALARMPENDING        (AEE_EOFFSET + 0x008) ///< Alarm is pending
+#define AEE_EINVALIDTIME         (AEE_EOFFSET + 0x009) ///< Invalid time
+#define AEE_EBADCLASS            (AEE_EOFFSET + 0x00A) ///< NULL class object
+#define AEE_EBADMETRIC           (AEE_EOFFSET + 0x00B) ///< Invalid metric specified
+#define AEE_EEXPIRED             (AEE_EOFFSET + 0x00C) ///< App/Component Expired
+#define AEE_EBADSTATE            (AEE_EOFFSET + 0x00D) ///< Process or thread is not in expected state
+#define AEE_EBADPARM             (AEE_EOFFSET + 0x00E) ///< Invalid parameter
+#define AEE_ESCHEMENOTSUPPORTED  (AEE_EOFFSET + 0x00F) ///< Invalid URL scheme
+#define AEE_EBADITEM             (AEE_EOFFSET + 0x010) ///< Value out of range
+#define AEE_EINVALIDFORMAT       (AEE_EOFFSET + 0x011) ///< Invalid format
+#define AEE_EINCOMPLETEITEM      (AEE_EOFFSET + 0x012) ///< Incomplete item, e.g. the length of a string is less than expected
+#define AEE_ENOPERSISTMEMORY     (AEE_EOFFSET + 0x013) ///< Insufficient flash
+#define AEE_EUNSUPPORTED         (AEE_EOFFSET + 0x014) ///< API not implemented
+#define AEE_EPRIVLEVEL           (AEE_EOFFSET + 0x015) ///< Privileges are insufficient
+                                                       ///< for this operation
+#define AEE_ERESOURCENOTFOUND    (AEE_EOFFSET + 0x016) ///< Unable to find specified
+                                                       ///< resource
+#define AEE_EREENTERED           (AEE_EOFFSET + 0x017) ///< Non re-entrant API
+                                                       ///< re-entered
+#define AEE_EBADTASK             (AEE_EOFFSET + 0x018) ///< API called in wrong task
+                                                       ///< context
+#define AEE_EALLOCATED           (AEE_EOFFSET + 0x019) ///< App/Module left memory
+                                                       ///< allocated when released.
+#define AEE_EALREADY             (AEE_EOFFSET + 0x01A) ///< Operation is already in
+                                                       ///< progress
+#define AEE_EADSAUTHBAD          (AEE_EOFFSET + 0x01B) ///< ADS mutual authorization
+                                                       ///< failed
+#define AEE_ENEEDSERVICEPROG     (AEE_EOFFSET + 0x01C) ///< Need service programming
+#define AEE_EMEMPTR              (AEE_EOFFSET + 0x01D) ///< Bad memory pointer; expected to be NULL
+#define AEE_EHEAP                (AEE_EOFFSET + 0x01E) ///< An internal heap error was detected
+#define AEE_EIDLE                (AEE_EOFFSET + 0x01F) ///< Context (system, interface,
+                                                       ///< etc.) is idle
+#define AEE_EITEMBUSY            (AEE_EOFFSET + 0x020) ///< Context (system, interface,
+                                                       ///< etc.) is busy
+#define AEE_EBADSID              (AEE_EOFFSET + 0x021) ///< Invalid subscriber ID
+#define AEE_ENOTYPE              (AEE_EOFFSET + 0x022) ///< No type detected/found
+#define AEE_ENEEDMORE            (AEE_EOFFSET + 0x023) ///< Need more data/info
+#define AEE_EADSCAPS             (AEE_EOFFSET + 0x024) ///< ADS Capabilities do not
+                                                       ///< match those required for phone
+#define AEE_EBADSHUTDOWN         (AEE_EOFFSET + 0x025) ///< App failed to close properly
+#define AEE_EBUFFERTOOSMALL      (AEE_EOFFSET + 0x026) ///< Destination buffer given is
+                                                       ///< too small
+#define AEE_ENOSUCH              (AEE_EOFFSET + 0x027) ///< No such name, port, socket
+                                                       ///< or service exists or is
+                                                       ///< valid
+#define AEE_EACKPENDING          (AEE_EOFFSET + 0x028) ///< ACK pending on application
+#define AEE_ENOTOWNER            (AEE_EOFFSET + 0x029) ///< Not an owner authorized to
+                                                       ///< perform the operation
+#define AEE_EINVALIDITEM         (AEE_EOFFSET + 0x02A) ///< Current item is invalid, it can be a switch case or a pointer to memory
+#define AEE_ENOTALLOWED          (AEE_EOFFSET + 0x02B) ///< Not allowed to perform the
+                                                       ///< operation
+#define AEE_EINVHANDLE           (AEE_EOFFSET + 0x02C) ///< Invalid handle - adding here as its defined in vendor AEEStdErr.h - needed to check valid handle in stub.c
+#define AEE_EOUTOFHANDLES        (AEE_EOFFSET + 0x02D) ///< Out of handles (Handle list is already full)
+//Hole here
+#define AEE_ENOMORE              (AEE_EOFFSET + 0x02F) ///< No more items available --
+                                                       ///< reached end
+#define AEE_ECPUEXCEPTION        (AEE_EOFFSET + 0x030) ///< A CPU exception occurred
+#define AEE_EREADONLY            (AEE_EOFFSET + 0x031) ///< Cannot change read-only
+                                                       ///< object or parameter (parameter is in protected mode)
+#define AEE_ERPC                 (AEE_EOFFSET + 0x200) ///< Error due to fastrpc implementation
+#define AEE_EFILE                (AEE_EOFFSET + 0x201) ///= 200000 && !defined(__APCS_ADSABI)) || \
+    (defined(__GNUC__) && defined(__arm__) && defined(__ARM_EABI__))
+
+# define __AEEVA_ATPCS 0
+
+#else
+
+# define __AEEVA_ATPCS 1
+
+#endif
+
+typedef void* AEEVaList;
+
+#define __AEEVA_ARGALIGN(t) (((char*)(&((struct{char c;t x;}*)1)->x))-((char*)1))
+#define __AEEVA_ARGSIZE(t) ((sizeof(t)+sizeof(int)-1) & ~(sizeof(int)-1))
+
+static __inline void __cpy(char*d, const char*s, int len)
+{
+   while (len-- > 0) *d++ = *s++;
+}
+
+static __inline AEEVaList __AEEVa_Arg(AEEVaList args, void* pv, int nVSize,
+                                      int nArgSize, int nArgAlign)
+{
+   int nArgs = (int)args & ~1;
+   char* pcArgs = (char*)args;
+   int bATPCS = (int)args & 1;
+   int nArgsOffset = 0;
+   int nVOffset = 0;
+
+   if (!bATPCS) { /* caller was compiled with AAPCS */
+
+      if (nArgAlign > (int)sizeof(int)) {
+         nArgAlign--; /* make a mask */
+         pcArgs += ((nArgs + nArgAlign) & (int)~(unsigned)nArgAlign) - nArgs;
+         /* move pv to next alignment */
+      }
+   }
+
+#if defined(AEE_BIGENDIAN)
+   if (nArgSize < (int)sizeof(int)) {
+      nArgsOffset = (int)sizeof(int) - nArgSize;
+   }
+   nVOffset = nVSize - nArgSize;
+#else
+   (void)nVSize;
+#endif /* AEE_BIGENDIAN */
+
+   __cpy((char*)pv + nVOffset, (pcArgs - bATPCS) + nArgsOffset, nArgSize);
+
+   /* round up */
+   nArgSize = (nArgSize+(int)sizeof(int)-1) & ~((int)sizeof(int)-1);
+
+   return pcArgs + nArgSize; /* increment va */
+}
+
+#define AEEVA_START(va,v) ((va) = (char*)&(v) + __AEEVA_ARGSIZE(v) + __AEEVA_ATPCS)
+#define AEEVA_ARG(va,v,t) ((void)((va) = __AEEVa_Arg(va,&v,sizeof(v),sizeof(t),__AEEVA_ARGALIGN(t))))
+#define AEEVA_END(va) ((va) = (AEEVaList)0)
+#define AEEVA_COPY(dest, src) ((void)((dest) = (src)))
+
+#else /* !defined(__clang__) && (defined(__ARMCC_VERSION) || (defined(__GNUC__) && defined(__arm__))) */
+
+#include <stdarg.h>
+
+typedef va_list AEEVaList;
+
+#define AEEVA_START(va,v) (va_start((va), (v)))
+#define AEEVA_ARG(va,v,t) ((v) = va_arg((va),t))
+#define AEEVA_END(va) (va_end((va)))
+#define AEEVA_COPY(dest, src) (va_copy((dest),(src)))
+
+#endif/* !defined(__clang__) && (defined(__ARMCC_VERSION) || (defined(__GNUC__) && defined(__arm__))) */
+
+#endif /* #ifndef AEEVALIST_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/stringl.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/stringl.h
new file mode 100755
index 0000000000000..94bcba0a701a4
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/stringl.h
@@ -0,0 +1,695 @@
+/*
+ * $Header: //components/rel/core.qdsp6/8.2/api/common/kernel/libstd/stringl/stringl.h#1 $
+ * $DateTime: 2023/05/10 09:48:16 $
+ */
+
+/* $OpenBSD: string.h,v 1.17 2006/01/06 18:53:04 millert Exp $ */
+/* $NetBSD: string.h,v 1.6 1994/10/26 00:56:30 cgd Exp $ */
+
+/*-
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *  @(#)string.h 5.10 (Berkeley) 3/9/91
+ */
+
+#ifndef _STRINGL_H_
+#define _STRINGL_H_
+
+#include <stddef.h>
+#include <string.h>
+#include <stdio.h>
+
+/** @defgroup error_codes Error Codes
+ * @{
+ */
+//
+// AEEstd.h header error codes
+//
+#ifndef STD_NODIGITS
+  #define STD_NODIGITS 1 /**< See std_scanul(). */
+#endif
+
+#ifndef STD_NEGATIVE
+  #define STD_NEGATIVE 2 /**< See std_scanul(). */
+#endif
+
+#ifndef STD_OVERFLOW
+  #define STD_OVERFLOW 3 /**< See std_scanul(). */
+#endif
+
+#ifndef STD_BADPARAM
+  #define STD_BADPARAM 4 /**< See std_scanul(). */
+#endif
+
+/**
+ * @}
+ */
+
+/**< UTF-16 2-byte wide char type */
+typedef unsigned short wchar;
+
+#ifdef __cplusplus
+namespace std
+{
+  extern "C"
+  {
+#endif //__cplusplus
+
+/**
+Added these macros to support compilation in Windows-based software
+development environments such as VC and .NET.
+*/
+#ifdef _WIN32
+  #define snprintf  _snprintf
+  #define vsnprintf _vsnprintf
+#endif
+
+/** @defgroup str_apis String Operation APIs
+ * @{
+ */
+
+/**
+  strlcat - Size bounded string concatenation.
+
+  Concatenates the source string to the destination string.
+
+  This function ensures that the destination string will
+  not be improperly terminated and that there will be
+  no concatenation beyond the size of the destination buffer.
+
+  @param[in,out] dst Destination buffer.
+  @param[in] src Source string.
+  @param[in] siz Size of the destination buffer in bytes.
+
+  @return
+  The length of the string that was attempted to be created,
+  i.e., the sum of the lengths of the source and destination strings.
+
+  @dependencies
+  None.
+*/
+size_t strlcat(char *dst, const char *src, size_t siz);
+
+/**
+ * @}
+ */
+
+ /** @defgroup wstr_apis Wide Char String Operation APIs
+ * @{
+ */
+/**
+  wcslcat - Size bounded wide string concatenation using
+  C standard wide character data type wchar_t.
+
+  Concatenates the source string to the destination string.
+
+  This function ensures that the destination string will
+  not be improperly terminated and that there will be
+  no concatenation beyond the size of the destination buffer.
+
+  @param[in,out] dst Destination buffer.
+  @param[in] src Source string.
+  @param[in] siz Size of the destination buffer in units of wchar_t.
+
+  @return
+  The length of the string that was attempted to be created,
+  i.e., the sum of the lengths of the source and destination strings.
+
+  @note It has been observed that wchar_t on some platforms is
+  2 bytes wide (UTF-16) and on others is 4 bytes wide (UTF-32).
+  So carefully consider this when using the data type wchar_t
+  and this API in your application.
+
+  @dependencies
+  None.
+*/
+
+size_t wcslcat(wchar_t *dst, const wchar_t *src, size_t siz);
+
+/**
+  wstrlcat - Size bounded wide string concatenation using 2 byte
+  wide (UTF-16) character data type wchar.
+
+  Concatenates the source string to the destination string.
+
+  This function ensures that the destination string will
+  not be improperly terminated and that there will be
+  no concatenation beyond the size of the destination buffer.
+
+  @param[in,out] dst Destination buffer.
+  @param[in] src Source string.
+  @param[in] siz Size of the destination buffer in units of wchar.
+
+  @return
+  The length of the string that was attempted to be created,
+  i.e., the sum of the lengths of the source and destination strings.
+
+  @dependencies
+  None.
+*/
+size_t wstrlcat(wchar* dst, const wchar* src, size_t siz);
+
+/**
+ * @}
+ */
+
+ /** @addtogroup str_apis
+ @{ */
+
+/**
+  strlcpy - Size bounded string copy.
+
+  Copies the source string to the destination buffer.
+
+  This function ensures that the destination buffer will always
+  be NULL terminated and that there will not be a copy beyond
+  the size of the destination buffer.
+
+  @param[out] dst Destination buffer.
+  @param[in] src Source String.
+  @param[in] siz Size of the destination buffer in bytes.
+
+  @return
+  The length of the source string.
+
+  @dependencies
+  None.
+*/
+size_t strlcpy(char *dst, const char *src, size_t siz);
+
+/** @} */
+
+ /** @addtogroup wstr_apis
+ @{ */
+/**
+  wcslcpy - Size bounded wide string copy using
+  C standard wide character data type wchar_t.
+
+  Copies the source string to the destination buffer.
+
+  This function ensures that the destination buffer will always
+  be NULL terminated and that there will not be a copy beyond
+  the size of the destination buffer.
+
+  @param[out] dst Destination buffer.
+  @param[in] src Source String.
+  @param[in] siz Size of the destination buffer in units of wchar_t.
+
+  @return
+  The length of the source string.
+
+  @note It has been observed that wchar_t on some platforms is
+  2 bytes wide (UTF-16) and on others is 4 bytes wide (UTF-32).
+  So carefully consider this when using the data type wchar_t
+  and this API in your application.
+
+  @dependencies
+  None.
+*/
+
+size_t wcslcpy(wchar_t *dst, const wchar_t *src, size_t siz);
+
+/**
+  wstrlcpy - Size bounded wide string copy using 2 byte
+  wide (UTF-16) character data type wchar.
+
+  Copies the source string to the destination buffer.
+
+  This function ensures that the destination buffer will always
+  be NULL terminated and that there will not be a copy beyond
+  the size of the destination buffer.
+
+  @param[out] dst Destination buffer.
+  @param[in] src Source String.
+  @param[in] siz Size of the destination buffer in units of wchar.
+
+  @return
+  The length of the source string.
+
+  @dependencies
+  None.
+*/
+size_t wstrlcpy(wchar* dst, const wchar* src, size_t siz);
+
+/**
+  wstrlen - Returns the number of characters in the source string.
+  Used for strings based on the wchar data type, i.e. 2 byte wide (UTF-16)
+  characters.
+
+  @param[in] src Source String.
+
+  @return
+  The number of characters in the source string.
+
+  @dependencies
+  None.
+*/
+size_t wstrlen(const wchar *src);
+
+/**
+  wstrcmp - Compares wchar (UTF-16) string s1 to the wchar string s2.
+
+  This function starts comparing the first character of each string.
+  If they are equal to each other, it continues with the following
+  pairs until the characters differ or until a terminating
+  null-character is reached.
+
+  @param[in] s1 String to be compared.
+  @param[in] s2 String to be compared against.
+
+  @return
+  0  - Indicates that the strings are equal.
+  >0 - Indicates that the strings are not equal and a character in s1 is
+       greater than the corresponding character in s2.
+  <0 - Indicates that the strings are not equal and a character in s1 is
+       less than the corresponding character in s2.
+
+  @dependencies
+  None.
+*/
+int wstrcmp(const wchar *s1, const wchar *s2);
+
+/**
+  wstrncmp - Compares up to n wchar (UTF-16) characters in string s1
+  to the wchar string s2.
+
+  This function starts comparing the first character of each string.
+  If they are equal to each other, it continues with the following
+  pairs until the characters differ, until a terminating
+  null-character is reached, or until n comparisons have been performed.
+
+  @param[in] s1 String to be compared.
+  @param[in] s2 String to be compared against.
+  @param[in] n  Number of characters to be compared.
+
+  @return
+  0  - Indicates that the strings are equal.
+  >0 - Indicates that the strings are not equal and a character in s1 is
+       greater than the corresponding character in s2.
+  <0 - Indicates that the strings are not equal and a character in s1 is
+       less than the corresponding character in s2.
+
+  @dependencies
+  None.
+*/
+int wstrncmp(const wchar *s1, const wchar *s2, size_t n);
+
+/** @} */
+
+ /** @addtogroup str_apis
+ @{ */
+
+/**
+  strcasecmp - Compare two strings ignoring case.
+
+  @param[in] s1 First string.
+  @param[in] s2 Second string.
+
+  @return
+  The strcasecmp() and strncasecmp() functions return an integer
+  less than, equal to, or greater than zero if s1 (or the first
+  n bytes thereof) is found, respectively, to be less than, to
+  match, or be greater than s2.
+
+  @dependencies
+  None.
+*/
+int strcasecmp(const char * s1, const char * s2);
+
+/**
+  strncasecmp - Compare two strings ignoring case (sized).
+
+  @param[in] s1 First string.
+  @param[in] s2 Second string.
+  @param[in] n  The number of characters to compare (from the
+                beginning).
+
+  @return
+  The strcasecmp() and strncasecmp() functions return an integer
+  less than, equal to, or greater than zero if s1 (or the first
+  n bytes thereof) is found, respectively, to be less than, to
+  match, or be greater than s2.
+
+  @dependencies
+  None.
+*/
+int strncasecmp(const char * s1, const char * s2, size_t n);
+
+/**
+std_scanul()
+
+Description:
+
+  std_scanul() converts an ASCII representation of a number
+  to an unsigned long. It expects strings that match the
+  following pattern:
+
+      spaces [+|-] digits
+
+  'Spaces' is zero or more ASCII space or tab characters.
+
+  'Digits' is any number of digits valid in the radix. Letters
+  'A' through 'Z' are treated as digits with values 10 through
+  35. 'Digits' may begin with "0x" when a radix of 0 or 16 is
+  specified.
+
+  Upper and lower case letters can be used interchangeably.
+
+  @param[in] pchBuf The start of the string to scan.
+
+  @param[in] nRadix The numeric radix (or base) of the
+             number. Valid values are 2 through 36 or zero,
+             which implies auto-detection. Auto-detection
+             examines the digits field. If it begins with
+             "0x", radix 16 is selected. Otherwise, if it
+             begins with "0" radix 8 is selected.
+             Otherwise, radix 10 is selected.
+
+  @param[out] ppchEnd If ppchEnd is not NULL, *ppchEnd
+              points to the first character that did not
+              match the expected pattern shown above,
+              except on STD_BADPARAM and STD_OVERFLOW when
+              it is set to the start of the string.
+
+  @param[out] pnError If pnError is not NULL, *pnError
+              holds the error code, which is one of the
+              following:
+
+              0 : Numeric value is from 0 to
+              MAX_UINT32.
+
+              STD_NEGATIVE : The scanned value was negative and its absolute value was
+              from 1 to MAX_UINT32. The result is the negated value
+              (cast to a uint32).
+
+              STD_NODIGITS : No digits were found. The result is zero.
+
+              STD_OVERFLOW : The absolute value exceeded MAX_UINT32. The result
+              is set to MAX_UINT32 and *ppchEnd is set to pchBuf.
+
+              STD_BADPARAM : An improper value for nRadix was received. The result
+              is set to zero, and *ppchEnd is set to pchBuf.
+
+  @return
+  The converted numeric result.
+
+  @dependencies
+  None.
+
+*/
+unsigned int std_scanul(const char * pchBuf, int nRadix, const char ** ppchEnd, int *pnError);
+
+/** @} */
+
+/** @defgroup mem_ops Memory Operation APIs
+ * @{
+ */
+
+/**
+  memscpy - Size bounded memory copy.
+
+  Copies bytes from the source buffer to the destination buffer.
+
+  This function ensures that there will not be a copy beyond
+  the size of the destination buffer.
+
+  The result of calling this on overlapping source and destination
+  buffers is undefined.
+
+  @param[out] dst      Destination buffer.
+  @param[in]  dst_size Size of the destination buffer in bytes.
+  @param[in]  src      Source buffer.
+  @param[in]  src_size Number of bytes to copy from source buffer.
+
+  @return
+  The number of bytes copied to the destination buffer. It is the
+  caller's responsibility to check for truncation if it cares about it -
+  truncation has occurred if the return value is less than src_size.
+
+  @dependencies
+  None.
+*/
+
+size_t memscpy(void *dst, size_t dst_size, const void *src, size_t src_size);
+
+/**
+  memscpy_i - Inline function for size bounded memory copy.
+
+  @see memscpy()
+*/
+
+static __inline size_t memscpy_i
+(
+  void       *dst,
+  size_t     dst_size,
+  const void *src,
+  size_t     src_size
+)
+{
+  size_t copy_size = (dst_size <= src_size) ? dst_size : src_size;
+
+  memcpy(dst, src, copy_size);
+
+  return copy_size;
+}
+
+/**
+  memsmove - Size bounded memory move.
+
+  Moves bytes from the source buffer to the destination buffer.
+
+  This function ensures that there will not be a copy beyond
+  the size of the destination buffer.
+
+  This function should be used in preference to memscpy() if there
+  is the possibility of source and destination buffers overlapping.
+  The result of the operation is defined to be as if the copy were from
+  the source to a temporary buffer that overlaps neither source nor
+  destination, followed by a copy from that temporary buffer to the
+  destination.
+
+  @param[out] dst      Destination buffer.
+  @param[in]  dst_size Size of the destination buffer in bytes.
+  @param[in]  src      Source buffer.
+  @param[in]  src_size Number of bytes to copy from source buffer.
+
+  @return
+  The number of bytes copied to the destination buffer. It is the
+  caller's responsibility to check for truncation if it cares about it -
+  truncation has occurred if the return value is less than src_size.
+
+  @dependencies
+  None.
+*/
+
+size_t memsmove(void *dst, size_t dst_size, const void *src, size_t src_size);
+
+/**
+  memsmove_i - Inline function for size bounded memory move.
+
+  @see memsmove()
+*/
+
+static __inline size_t memsmove_i
+(
+  void       *dst,
+  size_t     dst_size,
+  const void *src,
+  size_t     src_size
+)
+{
+  size_t copy_size = (dst_size <= src_size) ? dst_size : src_size;
+
+  memmove(dst, src, copy_size);
+
+  return copy_size;
+}
+
+/**
+  secure_memset - Memset functionality that won't be optimized away by the compiler.
+
+  Memsets a memory location to a given value in a way that is unlikely to be
+  removed by the compiler.
+
+  A classic compiler optimization is to remove references to instructions that
+  assign a value to a variable where that variable is never used after the
+  assignment. However, this means that compilers will often remove memset
+  instructions which are used to "zero" sensitive information in stack or heap
+  memory, which can cause a security risk. This function performs a basic
+  memset operation, but should always be instantiated in its own file; this
+  means file-level optimizers will not be able to optimize its use, and linkers
+  do not have sufficient intelligence to optimize calls between files.
+
+  This function should be used when clearing sensitive information in memory.
+
+  @param[in] ptr   Points to the memory area to be set.
+  @param[in] value The value to be set.
+  @param[in] len   The number of bytes to be set.
+
+  @return
+  This function returns the pointer to the memory area ptr.
+
+  @dependencies
+  None.
+*/
+
+void* secure_memset(void* ptr, int value, size_t len);
+
+/**
+ * @}
+ */
+
+
+/** @defgroup time_safe_ops Time Safe Memory Operation APIs
+ * @{
+ */
+
+/**
+  timesafe_memcmp - Constant-time memory comparison.
+
+  Compares bytes at two different sections in memory in constant time.
+
+  This function compares the len bytes starting at ptr1 with the len
+  bytes starting at ptr2. The function returns 1 if the two sections
+  of memory are different and 0 if the two sections of memory are
+  identical. The function always scans the entire range of memory to
+  ensure the function runs in constant time.
+
+  This function should be used when comparing confidential information
+  in memory as it prevents timing attacks. A traditional memcmp() exits
+  after finding non-equal bytes and this can be used to determine the value
+  of confidential data in memory. Example uses include password checks,
+  MAC checks, decryption checks, and checks on private user information.
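+
+  For example, a constant-time MAC check might look as follows
+  (illustrative sketch; 'expected' and 'computed' are hypothetical
+  32-byte buffers):
+
+  @code
+  if (timesafe_memcmp(expected, computed, 32) != 0) {
+      // reject the message; timing reveals nothing about 'expected'
+  }
+  @endcode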
+
+  @param[in] ptr1 Points to the first memory bytes to be compared.
+  @param[in] ptr2 Points to the second memory bytes to be compared.
+  @param[in] len  The number of bytes to be compared.
+
+  @return
+  This function returns 1 if the two buffers are different and
+  0 if the two buffers are identical.
+
+  @dependencies
+  None.
+*/
+
+int timesafe_memcmp(const void* ptr1, const void* ptr2, size_t len);
+
+/**
+  timesafe_strncmp - Constant-time string comparison.
+
+  Compares bytes in two different string buffers in constant time.
+
+  This function compares the contents of the string buffer at ptr1 with
+  the string buffer at ptr2 up to a maximum of len bytes. The function
+  does not compare bytes beyond the first occurrence of a NULL byte. The
+  function returns 1 if the two strings are different and 0 if the strings
+  are identical. The function always scans the entire range of memory to
+  ensure the function runs in constant time.
+
+  This function should be used when comparing strings that contain confidential
+  information as it prevents timing attacks. A traditional strncmp() exits
+  after finding non-equal bytes or a NULL byte and this can be used to
+  determine the value of confidential data in memory.
+
+  @param[in] ptr1 Points to the first string to be compared.
+  @param[in] ptr2 Points to the second string to be compared.
+  @param[in] len  The number of bytes to be compared.
+
+  @return
+  This function returns 1 if the strings are different and
+  0 if the strings are the same.
+
+  @dependencies
+  None.
+*/
+
+int timesafe_strncmp(const char* ptr1, const char* ptr2, size_t len);
+/**
+ * @}
+ */
+
+ /** @addtogroup str_apis
+ @{ */
+/**
+  strnlen - Determine the length of a fixed-size string.
+
+  This function takes a maxlen length parameter and stops looking for a NULL
+  character once it has examined maxlen bytes. It is considered safer than
+  strlen because it will not read beyond maxlen bytes if the source string
+  is a bad string without NULL termination.
+
+  @param[in] str    Points to the source string.
+  @param[in] maxlen The maximum number of bytes to count.
+
+  @return
+  This function returns the number of bytes in the string pointed to by str,
+  excluding the terminating NULL byte ('\0'), but at most maxlen.
+
+  @dependencies
+  None.
+*/
+#ifndef _WIN32
+size_t strnlen(const char *str, size_t maxlen);
+#endif
+
+/**
+ * @}
+ */
+#ifdef __cplusplus
+  } //extern "C"
+} //namespace std
+#endif //__cplusplus
+
+//Explicit export of the libstd implemented functions
+#ifdef __cplusplus
+#ifdef _WIN32
+  using std::strlcat;
+  using std::strlcpy;
+  using std::strcasecmp;
+  using std::strncasecmp;
+#endif
+  using std::wcslcat;
+  using std::wstrlcat;
+  using std::wcslcpy;
+  using std::wstrlcpy;
+  using std::wstrcmp;
+  using std::wstrncmp;
+  using std::wstrlen;
+  using std::memscpy;
+  using std::memsmove;
+  using std::secure_memset;
+  using std::timesafe_memcmp;
+  using std::timesafe_strncmp;
+#ifndef _WIN32
+  using std::strnlen;
+#endif
+#endif //__cplusplus
+
+#endif /* _STRINGL_H_ */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/stringl.md b/prebuilts/Hexagon_SDK/6.2.0.1/incs/stringl.md
new file mode 100755
index 0000000000000..a80878d828e28
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/stringl.md
@@ -0,0 +1,51 @@
+# Prototypes for string manipulation functions
+
+## Introduction {#introduction}
+
+`stringl.h` contains function prototypes for various string manipulation functions.
+These are considered safer than the standard C string manipulation functions and were developed as part of the [OpenBSD project](https://www.openbsd.org/).
+
+## API Overview {#api-overview}
+
+The stringl.h APIs include the following functions:
+
+* ::strlcat
+
+* ::wcslcat
+
+* ::wstrlcat
+
+* ::strlcpy
+
+* ::wcslcpy
+
+* ::wstrlcpy
+
+* ::wstrlen
+
+* ::wstrcmp
+
+* ::wstrncmp
+
+* ::strcasecmp
+
+* ::strncasecmp
+
+* ::std_scanul
+
+* ::memscpy
+
+* ::memscpy_i
+
+* ::memsmove
+
+* ::memsmove_i
+
+* ::secure_memset
+
+* ::timesafe_memcmp
+
+* ::timesafe_strncmp
+
+* ::strnlen
+
+Header file: @b stringl.h
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/synx.md b/prebuilts/Hexagon_SDK/6.2.0.1/incs/synx.md
new file mode 100755
index 0000000000000..21bc9a9a18386
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/synx.md
@@ -0,0 +1,7 @@
+## Overview
+
+Heterogeneous systems involve more than one core to efficiently process a task. Nowadays, there are many advanced use cases that require computation across multiple cores. One example of such a use case is for the camera core to capture an image, pass it on to the DSP and/or GPU cores for post-processing, and send the final output to the display subsystem for rendering. Such use cases involve transferring control points and sharing buffers across multiple cores. This type of application use case drives the need for a generic synchronization framework which explicitly describes dependencies between different asynchronous operations across the SoC.
+
+The synx framework helps to capture such dependencies across cores. It notifies task completion and/or buffer-ready information between a producer and consumers.
+
+***Note***: There are many synchronization/fence mechanisms available today, but those work best within a single core/device. In Android systems, buffers are usually allocated as ION buffers so that these buffers can be shared across various components (UMD, KMD, and HW). If another core needs to access such buffers, we need to synchronously transfer control (along with the data payload) from one core to the other. This explicit transfer of control from one core to the other eliminates possibilities for optimization and hence drives the idea of introducing a synx handle for synchronization.
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/sysmon_cachelock.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/sysmon_cachelock.h
new file mode 100755
index 0000000000000..e46e7d7f25a29
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/sysmon_cachelock.h
@@ -0,0 +1,109 @@
+/*-----------------------------------------------------------------------------
+   Copyright (c) 2017-2020 QUALCOMM Technologies, Incorporated.
+   All Rights Reserved.
+   QUALCOMM Proprietary.
+-----------------------------------------------------------------------------*/
+
+#ifndef SYSMON_CACHELOCK_H_
+#define SYSMON_CACHELOCK_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @file sysmon_cachelock.h
+ * @brief CDSP L2 cache locking manager API
+ */
+
+/**
+ * Allocates a memory buffer, locks it in L2 cache, and returns the locked
+ * virtual address.
+ *
+ * @param[in] size Memory size (in bytes) to lock.
+ * @param[out] paddr_ptr Pointer to @c unsigned @c long @c long
+ *                       variable to get the locked 64-bit physical address upon
+ *                       success. NULL if the allocation and cache lock failed.
+ *
+ * @return
+ * @c void* Virtual address of the locked memory region. \n
+ * 0 if the requested buffer size could not be allocated and locked in the L2 cache.
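+ *
+ * A minimal usage sketch (illustrative; error handling elided):
+ * @code
+ * unsigned long long paddr = 0;
+ * void *vaddr = HAP_cache_lock(4096, &paddr);
+ * if (vaddr) {
+ *     // ... access the locked buffer ...
+ *     HAP_cache_unlock(vaddr);
+ * }
+ * @endcode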
+ */
+void* HAP_cache_lock(unsigned int size, unsigned long long *paddr_ptr);
+
+
+/**
+ * Unlocks cache and deallocates memory for a virtual address returned by
+ * the corresponding HAP_cache_lock() call.
+ *
+ * @param[in] vaddr_ptr Virtual address of the memory block to unlock.
+ *
+ * @return
+ * 0 upon success. \n
+ * Other values upon failure.
+ */
+int HAP_cache_unlock(void* vaddr_ptr);
+
+/**
+ * Locks the cache for a given virtual address and memory size (in bytes).
+ *
+ * Align the address and size to 32 bytes. The size should not be more
+ * than 64 KB, and at any point of time, only one such request is honored
+ * (this restriction has been removed from SM8250 onwards).
+ *
+ * Use this function to lock an existing memory block, for example,
+ * to lock a code segment or data buffer. Note that whenever possible, it is
+ * preferable to let the driver allocate the memory to be locked in L2 via the
+ * HAP_cache_lock API, as it can often avoid the fragmentation likely to occur
+ * when the user provides the memory ranges to be locked.
+ *
+ * @param[in] vaddr_ptr Virtual address of the memory block to lock; should be
+ *                      32-byte aligned.
+ * @param[in] size Memory size (in bytes) to lock; should be 32-byte aligned.
+ *                 The maximum size limit is 64 KB. From SM8250, this size limit is
+ *                 the same as HAP_cache_lock().
+ *
+ * @return
+ * 0 upon success. \n
+ * Other values upon failure.
+ */
+int HAP_cache_lock_addr(void* vaddr_ptr, unsigned int size);
+
+/**
+ * Unlocks the cache for a given virtual address.
+ *
+ * Use this function together with HAP_cache_lock_addr().
+ *
+ * @param[in] vaddr_ptr Virtual address of the memory block to unlock.
+ *
+ * @return
+ * 0 upon success. \n
+ * Other values upon failure.
+ */
+int HAP_cache_unlock_addr(void* vaddr_ptr);
+
+/**
+ * Queries the size of the largest contiguous memory block available for
+ * cache locking.
+ *
+ * @return
+ * Available size in bytes upon success. \n
+ * -1 upon failure.
+ */
+int HAP_query_avail_cachelock(void);
+
+/**
+ * Queries the total locked cache size.
+ *
+ * @return
+ * Total locked cache size in bytes upon success. \n
+ * -1 upon failure.
+ */
+int HAP_query_total_cachelock(void);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* SYSMON_CACHELOCK_H_ */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/sysmon_cachelock.md b/prebuilts/Hexagon_SDK/6.2.0.1/incs/sysmon_cachelock.md
new file mode 100755
index 0000000000000..44efd95724811
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/sysmon_cachelock.md
@@ -0,0 +1,25 @@
+# Cache locking manager
+
+The cache locking manager locks a section of the L2 cache from the
+cDSP and subsequently releases this lock.
+
+The cache locking manager replaces the HAP_power_set APIs that are now deprecated.
+This new cache locking manager utilizes the available L2 cache by
+allocating memory with an appropriately aligned address based on L2 cache
+availability and the request size. The cache locking manager also limits the
+maximum cache that can be locked to guarantee performance of the guest OS and
+FastRPC threads.
+
+The cache locking manager monitors cache locking usage by providing
+APIs to get the maximum available cache size for locking and the total
+currently locked cache.
+
+Finally, a set of APIs passes the address of the memory to lock
+along with its size information. These APIs are useful for applications where a
+linker-defined section (code/library) must be locked into cache.
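+
+For example, locking an existing, 32-byte-aligned buffer might look as
+follows (illustrative sketch; `buf` and `size` are caller-supplied and
+meet the alignment rules above):
+
+    if (HAP_cache_lock_addr(buf, size) == 0) {
+        // ... run the latency-critical code ...
+        HAP_cache_unlock_addr(buf);
+    }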
+
+The cache locking manager APIs are not accessible from an unsigned PD.
+
+## Framework APIs
+
+Header file: @b sysmon_cachelock.h
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/sysmon_marker.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/sysmon_marker.h
new file mode 100755
index 0000000000000..bcf34d1f80059
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/sysmon_marker.h
@@ -0,0 +1,46 @@
+/*-----------------------------------------------------------------------
+   Copyright (c) 2017-2020 QUALCOMM Technologies, Incorporated.
+   All Rights Reserved.
+   QUALCOMM Proprietary.
+-----------------------------------------------------------------------*/
+#ifndef SYSMON_MARKER_H
+#define SYSMON_MARKER_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @file sysmon_marker.h
+ * @brief Sysmon profiling marker API
+ *        Allows the user to profile a piece of code or
+ *        algorithm of interest.
+ */
+
+/**
+ * Enables or disables a profiling marker.
+
+ * @param[in] marker Any unique, customer-defined, unsigned number to identify profiling
+                     data mapped to a section of code.
+ * @param[in] enable Flag to enable (1) or disable (0) the profiling marker.
+ *
+ * For example:
+ * @code
+ * #include <sysmon_marker.h>
+ * // or, alternatively,
+ * // extern void HP_profile(unsigned int marker, unsigned char enable);
+ *
+ * HP_profile(10, 1);
+ * // ...
+ * // User code to profile
+ * // ...
+ * HP_profile(10, 0);
+ * @endcode
+ */
+void HP_profile(unsigned int marker, unsigned char enable);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*SYSMON_MARKER_H*/
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/sysmon_marker.md b/prebuilts/Hexagon_SDK/6.2.0.1/incs/sysmon_marker.md
new file mode 100755
index 0000000000000..d25fbdd30f102
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/sysmon_marker.md
@@ -0,0 +1,36 @@
+# sysMon marker
+
+The sysMon marker API profiles a specific code region to study its load on the processor compute resources and the bus bandwidth, and captures various other profiling metrics associated with that specific code region.
+This approach is useful when measuring performance, debugging performance-related issues, or identifying possible optimizations for a specific code region instead of profiling the entire application.
+
+This API is not supported in unsigned PD and CPZ.
+
+## Collect profiling data
+
+Once the code has been instrumented with the sysMon marker APIs to enable and disable profiling of specific code regions, the [sysMonApp profiler](../../tools/sysmon_app.html#profiler-service)
+must run to collect sysMon marker data.
+
+If DCVS is enabled by the user, the decisions taken by DCVS with profiling markers enabled might not be the same as without markers.
+
+## Parsing profiling data with sysMon marker
+
+Profiling data captured using the [sysMonApp profiler](../../tools/sysmon_app.html#profiler-service) can be parsed using the [sysMon parser](../../tools/sysmon_parser.html).
+Refer to the [STID and markers data](../../tools/sysmon_parser.html#stid-and-markers-data) section for the output files generated by the sysMon parser when sysMon markers are enabled.
+
+## Limitations
+
+* Nested markers are not supported.
+
+    For example, the following piece of code does not produce the expected profiling data; its behavior is undefined.
+
+        HP_profile(10, 1);
+        // ... user code ...
+        HP_profile(11, 1);
+        // ... user code ...
+        HP_profile(11, 0);
+        // ... user code ...
+        HP_profile(10, 0);
+
+* Enabling profiling markers forces collection of profiling data on all hardware threads.
+
+    Profiling statistics are collected for any entity running in parallel with the piece of code where markers are defined.
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/utils.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/utils.h
new file mode 100755
index 0000000000000..57a5dd96b2930
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/utils.h
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2020 QUALCOMM Technologies Inc. All Rights Reserved.
+ * Qualcomm Technologies Confidential and Proprietary
+ *
+ */
+
+#include <stddef.h> /* for size_t */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+size_t memscpy(void* dst, size_t dst_size, const void* src, size_t src_size);
+
+#ifdef __cplusplus
+}
+#endif
\ No newline at end of file
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/verify.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/verify.h
new file mode 100755
index 0000000000000..dea9f22172089
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/verify.h
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2012-2013, 2020 QUALCOMM Technologies Inc.
+ * All Rights Reserved.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef VERIFY_H
+#define VERIFY_H
+
+
+#ifndef _WIN32
+#define C_ASSERT(test) \
+    switch(0) {\
+      case 0:\
+      case test:;\
+    }
+#endif // _WIN32
+
+#ifndef __V_STR__
+  #define __V_STR__(x) #x ":"
+#endif //__STR__
+#ifndef __V_TOSTR__
+  #define __V_TOSTR__(x) __V_STR__(x)
+#endif // __TOSTR__
+#ifndef __V_FILE_LINE__
+  #define __V_FILE_LINE__ __FILE__ ":" __V_TOSTR__(__LINE__)
+#endif /*__FILE_LINE__*/
+
+
+#ifdef __ANDROID__
+/*android */
+#if (defined VERIFY_PRINT_INFO) || (defined VERIFY_PRINT_ERROR)
+#include <android/log.h>
+#endif
+
+#ifdef VERIFY_PRINT_INFO
+#define VERIFY_IPRINTF(format, ...) __android_log_print(ANDROID_LOG_DEBUG , "adsprpc", __V_FILE_LINE__ format, ##__VA_ARGS__)
+#endif
+
+#ifdef VERIFY_PRINT_ERROR
+#define VERIFY_EPRINTF(format, ...) __android_log_print(ANDROID_LOG_ERROR , "adsprpc", __V_FILE_LINE__ format, ##__VA_ARGS__)
+#endif
+
+/* end android */
+#elif (defined __hexagon__) || (defined __qdsp6__)
+/* q6 */
+
+#ifdef VERIFY_PRINT_INFO
+  #define FARF_VERIFY_LOW 1
+  #define FARF_VERIFY_LOW_LEVEL HAP_LEVEL_LOW
+  #define VERIFY_IPRINTF(args...) FARF(VERIFY_LOW, args)
+#endif
+
+#ifdef VERIFY_PRINT_ERROR
+  #define FARF_VERIFY_ERROR 1
+  #define FARF_VERIFY_ERROR_LEVEL HAP_LEVEL_ERROR
+  #define VERIFY_EPRINTF(args...) FARF(VERIFY_ERROR, args)
+#endif
+
+#if (defined VERIFY_PRINT_INFO) || (defined VERIFY_PRINT_ERROR)
+  #include "HAP_farf.h"
+#endif
+
+/* end q6 */
+#elif (defined USE_SYSLOG)
+/* syslog */
+#if (defined VERIFY_PRINT_INFO) || (defined VERIFY_PRINT_ERROR)
+#include <syslog.h>
+#endif
+
+#ifdef VERIFY_PRINT_INFO
+#define VERIFY_IPRINTF(format, ...) syslog(LOG_USER|LOG_INFO, __V_FILE_LINE__ format, ##__VA_ARGS__)
+#endif
+
+#ifdef VERIFY_PRINT_ERROR
+#define VERIFY_EPRINTF(format, ...) syslog(LOG_USER|LOG_ERR, __V_FILE_LINE__ format, ##__VA_ARGS__)
+#endif
+
+/* end syslog */
+#else
+/* generic */
+
+#if (defined VERIFY_PRINT_INFO) || (defined VERIFY_PRINT_ERROR)
+#include <stdio.h>
+#endif
+
+#ifdef VERIFY_PRINT_INFO
+#define VERIFY_IPRINTF(format, ...) printf(__V_FILE_LINE__ format "\n", ##__VA_ARGS__)
+#endif
+
+#ifdef VERIFY_PRINT_ERROR
+#define VERIFY_EPRINTF(format, ...) printf(__V_FILE_LINE__ format "\n", ##__VA_ARGS__)
+#endif
+
+/* end generic */
+#endif
+
+#ifndef VERIFY_PRINT_INFO
+#define VERIFY_IPRINTF(format, ...) (void)0
+#endif
+
+#ifndef VERIFY_PRINT_ERROR
+#define VERIFY_EPRINTF(format, ...) (void)0
+#endif
+
+#ifndef VERIFYC
+  #define VERIFYC(val,err_code) \
+    do {\
+      VERIFY_IPRINTF(":info: calling: %s", #val);\
+      if(0 == (val)) {\
+        nErr = err_code;\
+        VERIFY_EPRINTF(":error: %d: %s", nErr, #val);\
+        goto bail;\
+      } else {\
+        VERIFY_IPRINTF(":info: passed: %s", #val);\
+      }\
+    } while(0)
+#endif //VERIFYC
+
+#ifndef VERIFY
+  #define VERIFY(val) \
+    do {\
+      VERIFY_IPRINTF(":info: calling: %s", #val);\
+      if(0 == (val)) {\
+        nErr = nErr == 0 ? -1 : nErr;\
+        VERIFY_EPRINTF(":error: %d: %s", nErr, #val);\
+        goto bail;\
+      } else {\
+        VERIFY_IPRINTF(":info: passed: %s", #val);\
+      }\
+    } while(0)
+#endif //VERIFY
+
+#endif //VERIFY_H
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/version.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/version.h
new file mode 100755
index 0000000000000..2e3f0ad3278ff
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/version.h
@@ -0,0 +1,91 @@
+#ifndef VERSION_H
+#define VERSION_H
+/*===========================================================================
+
+FILE: version.h
+
+SERVICES: Hexagon Access Program (HAP) SDK version_string
+
+GENERAL DESCRIPTION:
+   Definitions for versioning
+
+   Copyright © 2012 QUALCOMM Incorporated.
+   All Rights Reserved.
+ QUALCOMM Proprietary/GTDR +===========================================================================*/ + +#define VERSION_MAJOR 6 +#define VERSION_MINOR 2 +#define VERSION_MAINT 0 +#define VERSION_BUILD 1 + +#define VERSION_STRING "HAP SDK 6.2.0.1 (srvr=qtcp406;br=main;cl=1242374)" + + +/* +======================================================================= +MACROS DOCUMENTATION +======================================================================= + +VERSION_MAJOR + +Description: + Defines the major release number + +Comments: + It has to be a valid numerical value +======================================================================= + +VERSION_MINOR + +Description: + Defines the minor release number + +Comments: + It has to be a valid numerical value +======================================================================= + +VERSION_MAINT + +Description: + Defines the maintenance release + +Comments: + It has to be a valid numerical value +======================================================================= + +VERSION_BUILD + +Description: + Defines the build ID + +Comments: + It has to be a valid numerical value +======================================================================= + +VERSION_STRING + +Description: + Defines the version string + +Definition: + + #define VERSION_STRING "a.b.c.d (name=value;name=value;...)" + where a=major release number + b=minor release number + c=maintenance release number + d=build number + + name=value pair provides additional information about the build. + Example: + patch/feature=comma separated list of features/patches that have been installed. + br=p4 branch that was used for the build + cl=p4 change list number + machine=hostname of the machine that was used for the build. + +Comments: + +======================================================================= +*/ + +#endif // VERSION_H diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/version_note.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/version_note.h new file mode 100755 index 0000000000000..10e498546cc98 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/version_note.h @@ -0,0 +1,19 @@ +/*============================================================================== + Copyright (c) 2022, 2023 Qualcomm Technologies, Inc. + All rights reserved. Qualcomm Proprietary and Confidential. +==============================================================================*/ + +#ifndef VERSION_NOTE_H +#define VERSION_NOTE_H +#define VERSION_NOTE_LENGTH 100 + + typedef struct { + int sizename; //Size of the NOTE section + int sizedesc; // Size of the descriptor(unused) + int type; // Type of section(unused)//stores version and library name + char name[VERSION_NOTE_LENGTH]; // Name of NOTE section(version of shared object) + int desc[3]; // used for labeling note segment version (lib.ver.V1.V2.V3) + } lib_ver_note_t; + +#endif //VERSION_NOTE_H + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/incs/mmdefs.h b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/incs/mmdefs.h new file mode 100755 index 0000000000000..3fdc11dcb810e --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/incs/mmdefs.h @@ -0,0 +1,48 @@ +#ifndef _MMDEFS_H +#define _MMDEFS_H +/*============================================================================== + Copyright (c) 2012-2013 Qualcomm Technologies Incorporated. + All Rights Reserved Qualcomm Technologies Proprietary + + Export of this technology or software is regulated by the U.S. + Government. Diversion contrary to U.S. law prohibited. 
+==============================================================================*/ + +/*------------------------------------------------------------------------------ + Standard Integer Types +------------------------------------------------------------------------------*/ + +#include "stdint.h" + +/*------------------------------------------------------------------------------ + Constants +------------------------------------------------------------------------------*/ + +#undef TRUE +#undef FALSE + +#define TRUE (1) /* Boolean true value */ +#define FALSE (0) /* Boolean false value */ + +#ifndef NULL + #define NULL (0) +#endif + +/*------------------------------------------------------------------------------ + Character and boolean +------------------------------------------------------------------------------*/ + +typedef char char_t; /* Character type */ +typedef unsigned char bool_t; /* Boolean value type */ + +/*============================================================================== + FUNCTION : align_to_8_byte + DESCRIPTION: Ceil to the next multiple of 8 +==============================================================================*/ +static inline uint32_t align_to_8_byte(const uint32_t num) +{ + return ((num + 7) & (0xFFFFFFF8)); +} + +#endif /* _MMDEFS_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libadsprpc.so b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libadsprpc.so new file mode 100755 index 0000000000000..572796d03718b Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libadsprpc.so differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libadsprpc_system.so b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libadsprpc_system.so new file mode 100755 index 0000000000000..e6b5daae76dbb Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libadsprpc_system.so differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libcdsprpc.so b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libcdsprpc.so new file mode 100755 index 0000000000000..266cb63c8ded7 Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libcdsprpc.so differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libcdsprpc_system.so b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libcdsprpc_system.so new file mode 100755 index 0000000000000..c9a05c047d815 Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libcdsprpc_system.so differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libmdsprpc.so b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libmdsprpc.so new file mode 100755 index 0000000000000..78fa7b7e84b9d Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libmdsprpc.so differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libmdsprpc_system.so b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libmdsprpc_system.so new file mode 100755 index 0000000000000..3b9783f2bc1d3 Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libmdsprpc_system.so differ diff --git 
a/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libsdsprpc.so b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libsdsprpc.so new file mode 100755 index 0000000000000..d7e3d46f9b141 Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libsdsprpc.so differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libsdsprpc_system.so b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libsdsprpc_system.so new file mode 100755 index 0000000000000..588ed0b72f2fb Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libsdsprpc_system.so differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/rpcmem/inc/rpcmem.h b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/rpcmem/inc/rpcmem.h new file mode 100755 index 0000000000000..281890ef5fbf0 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/rpcmem/inc/rpcmem.h @@ -0,0 +1,248 @@ +/*============================================================================== + Copyright (c) 2012-2013, 2020 Qualcomm Technologies, Inc. + All rights reserved. + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + 3. Neither the name of the copyright holder nor the names of its contributors + may be used to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. +==============================================================================*/ + +#ifndef RPCMEM_H +#define RPCMEM_H + +#include "AEEStdDef.h" +#include "stddef.h" + +/** + * @file rpcmem.h + * @brief APIs used to manage memory allocated by the application processor and shared with the DSP. + */ + +/** @defgroup rpcmem_const RPCMEM API macros and enumerations + * @{ + */ + +/** + * Allocate memory with the same properties as the ION_FLAG_CACHED flag. + */ +#ifdef ION_FLAG_CACHED +#define RPCMEM_DEFAULT_FLAGS ION_FLAG_CACHED +#else +#define RPCMEM_DEFAULT_FLAGS 1 +#endif + +/** + * The FastRPC library tries to map buffers allocated with this flag to the remote process of all current and new + * FastRPC sessions. In case of failure to map, the FastRPC library ignores the error and continues to open the session + * without pre-mapping the buffer. 
In case of success, buffers allocated with this flag will be pre-mapped to reduce
+ * the latency of upcoming FastRPC calls. This flag is recommended only for buffers that are used with latency-critical
+ * FastRPC methods. Pre-mapped buffers will be unmapped during either buffer free or session close.
+ */
+#define RPCMEM_TRY_MAP_STATIC 0x04000000
+
+/**
+ * Supported RPCMEM heap IDs.
+ *
+ * If you are not using any of the RPCMEM-defined heap IDs,
+ * you are responsible for ensuring that you are passing
+ * a valid ION heap ID.
+ */
+enum rpc_heap_ids {
+/**
+ * Memory for secure use cases only.
+ * * Secure heap is to be used only by clients migrating to CPZ
+ */
+    RPCMEM_HEAP_ID_SECURE  = 9,
+/**
+ * Contiguous physical memory:
+ * * Very limited memory is available (< 8 MB)
+ * * Recommended for subsystems without SMMU (sDSP and mDSP)
+ * * Contiguous heap memory will be deprecated from archs after v73
+ */
+    RPCMEM_HEAP_ID_CONTIG  = 22,
+/**
+ * Non-contiguous system physical memory.
+ * * Recommended for all use cases that do not require using a specific heap
+ * * Used with subsystems with SMMU (cDSP and aDSP)
+ */
+    RPCMEM_HEAP_ID_SYSTEM  = 25,
+ };
+
+/**
+ * Use uncached memory.
+ */
+#define RPCMEM_FLAG_UNCACHED 0
+
+/**
+ * Use cached memory.
+ */
+#define RPCMEM_FLAG_CACHED RPCMEM_DEFAULT_FLAGS
+
+/**
+ * @}
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @defgroup rpcmem_api RPCMEM API functions
+ * @{
+ */
+
+/**
+ * Initialize the RPCMEM Library.
+ *
+ * Only call this function once before using the RPCMEM Library.
+ *
+ * This API is mandatory on pre-Lahaina targets IF the client has linked to the
+ * rpcmem.a static library. If the client has only linked libadsprpc.so,
+ * libcdsprpc.so, or libsdsprpc.so, then the rpcmem_init call is not required
+ * on any target and other rpcmem APIs such as rpcmem_alloc can be called
+ * directly.
+ *
+ * NOTE: This function is not thread safe.
+ */
+void rpcmem_init(void);
+
+/**
+ * Deinitialize the RPCMEM Library.
+ *
+ * Only call this function once when the RPCMEM Library is no longer required.
+ *
+ * This API is mandatory on pre-Lahaina targets IF the client has linked to the
+ * rpcmem.a static library. If the client has only linked libadsprpc.so,
+ * libcdsprpc.so, or libsdsprpc.so, then the rpcmem_deinit call is not required
+ * on any target.
+ *
+ * NOTE: This function is not thread safe.
+ */
+void rpcmem_deinit(void);
+
+/**
+ * Allocate a zero-copy buffer of up to 2 GB with the FastRPC framework.
+ * Buffers larger than 2 GB must be allocated with rpcmem_alloc2.
+ * @param[in] heapid Heap ID to use for memory allocation.
+ * @param[in] flags  ION flags to use for memory allocation.
+ * @param[in] size   Buffer size to allocate.
+ * @return Pointer to the buffer on success; NULL on failure.
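+ *
+ * A minimal allocation lifecycle, as an illustrative sketch (error handling
+ * and the actual FastRPC call are elided; the 4096-byte size is arbitrary):
+ * @code
+ * void *buf = rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, 4096);
+ * if (buf != NULL) {
+ *     int fd = rpcmem_to_fd(buf);  // descriptor that can be shared with the DSP
+ *     // ... use buf / fd with FastRPC methods ...
+ *     rpcmem_free(buf);
+ * }
+ * @endcode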
+ * + * Examples: + * + * * Default memory attributes, 2 KB + * @code + * rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, 2048); + * @endcode + * Or + * @code + * rpcmem_alloc_def(2048); + * @endcode + * + * * Heap 22, uncached, 1 KB + * @code + * rpcmem_alloc(22, 0, 1024); + * @endcode + * Or + * @code + * rpcmem_alloc(22, RPCMEM_FLAG_UNCACHED, 1024); + * @endcode + * + * * Heap 21, cached, 2 KB + * @code + * rpcmem_alloc(21, RPCMEM_FLAG_CACHED, 2048); + * @endcode + * Or + * @code + * #include + * rpcmem_alloc(21, ION_FLAG_CACHED, 2048); + * @endcode + * + * * Default memory attributes but from heap 18, 4 KB + * @code + * rpcmem_alloc(18, RPCMEM_DEFAULT_FLAGS, 4096); + * @endcode + */ +void* rpcmem_alloc(int heapid, uint32 flags, int size); + +/** + * Allocate a zero-copy buffer with the FastRPC framework. + * @param[in] heapid Heap ID to use for memory allocation. + * @param[in] flags ION flags to use for memory allocation. + * @param[in] size Buffer size to allocate. + * @return Pointer to the buffer on success; NULL on failure. + * + * Examples: + * + * * The usage examples are same as rpcmem_alloc. + */ +void* rpcmem_alloc2(int heapid, uint32 flags, size_t size); + +/** + * Allocate a buffer with default settings. + * @param[in] size Size of the buffer to be allocated. + * @return Pointer to the allocated memory buffer. + */ + #if !defined(WINNT) && !defined (_WIN32_WINNT) +__attribute__((unused)) +#endif +static __inline void* rpcmem_alloc_def(int size) { + return rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, size); +} + +/** + * Free a buffer and ignore invalid buffers. + */ +void rpcmem_free(void* po); + +/** + * Return an associated file descriptor. + * @param[in] po Data pointer for an RPCMEM-allocated buffer. + * @return Buffer file descriptor. + */ +int rpcmem_to_fd(void* po); + +/** + * @} + */ + +#ifdef __cplusplus +} +#endif + +//! @cond Doxygen_Suppress +/** These macros are deprecated. + */ +#define RPCMEM_DEFAULT_HEAP -1 +#define RPCMEM_HEAP_DEFAULT 0x80000000 +#define RPCMEM_HEAP_NOREG 0x40000000 +#define RPCMEM_HEAP_UNCACHED 0x20000000 +#define RPCMEM_HEAP_NOVA 0x10000000 +#define RPCMEM_HEAP_NONCOHERENT 0x08000000 +#define RPCMEM_FORCE_NOFLUSH 0x01000000 +#define RPCMEM_FORCE_NOINVALIDATE 0x02000000 +// Use macros from libion instead +#define ION_SECURE_FLAGS ((1 << 31) | (1 << 19)) +//! @endcond + +#endif //RPCMEM_H diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/rpcmem/src/verify.h b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/rpcmem/src/verify.h new file mode 100755 index 0000000000000..71b37828c0646 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/rpcmem/src/verify.h @@ -0,0 +1,164 @@ +/** + * Copyright (c) 2012-2020 Qualcomm Technologies, Inc. + * All Rights Reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef VERIFY_H
+#define VERIFY_H
+
+
+//#define VERIFY_PRINT_ERROR
+//#define VERIFY_PRINT_INFO
+
+
+#ifndef _WIN32
+#define C_ASSERT(test) \
+    switch(0) {\
+      case 0:\
+      case test:;\
+    }
+#endif // _WIN32
+
+#ifndef __V_STR__
+   #define __V_STR__(x) #x ":"
+#endif //__STR__
+#ifndef __V_TOSTR__
+   #define __V_TOSTR__(x) __V_STR__(x)
+#endif // __TOSTR__
+#ifndef __V_FILE_LINE__
+   #define __V_FILE_LINE__ __FILE__ ":" __V_TOSTR__(__LINE__)
+#endif /*__FILE_LINE__*/
+
+
+// TODO: sunny - enable extra prints
+#define VERIFY_PRINT_INFO
+#define VERIFY_PRINT_ERROR
+
+
+#ifdef __ANDROID__
+/* android */
+#if (defined VERIFY_PRINT_INFO) || (defined VERIFY_PRINT_ERROR)
+#include <android/log.h>
+#endif
+
+extern const char* __progname;
+#ifdef VERIFY_PRINT_INFO
+#ifdef __ANDROID__
+#define VERIFY_IPRINTF(format, ...) __android_log_print(ANDROID_LOG_DEBUG , __progname, __V_FILE_LINE__ format, ##__VA_ARGS__)
+#else /* !__ANDROID__ */
+#define VERIFY_IPRINTF(format, ...) fprintf(stderr,"%s:%d " format "\n", __func__, __LINE__ , ##__VA_ARGS__)
+#endif /* __ANDROID__ */
+#endif
+
+#ifdef VERIFY_PRINT_ERROR
+#ifdef __ANDROID__
+#define VERIFY_EPRINTF(format, ...) __android_log_print(ANDROID_LOG_ERROR , __progname, __V_FILE_LINE__ format, ##__VA_ARGS__)
+#else /* !__ANDROID__ */
+#define VERIFY_EPRINTF(format, ...) fprintf(stderr,"%s:%d " format "\n", __func__, __LINE__ , ##__VA_ARGS__)
+#endif /* __ANDROID__ */
+#endif
+
+/* end android */
+#elif (defined __hexagon__) || (defined __qdsp6__)
+/* q6 */
+
+#ifdef VERIFY_PRINT_INFO
+   #define FARF_VERIFY_LOW 1
+   #define FARF_VERIFY_LOW_LEVEL HAP_LEVEL_LOW
+   #define VERIFY_IPRINTF(args...) FARF(VERIFY_LOW, args)
+#endif
+
+#ifdef VERIFY_PRINT_ERROR
+   #define FARF_VERIFY_ERROR 1
+   #define FARF_VERIFY_ERROR_LEVEL HAP_LEVEL_ERROR
+   #define VERIFY_EPRINTF(args...) FARF(VERIFY_ERROR, args)
+#endif
+
+#if (defined VERIFY_PRINT_INFO) || (defined VERIFY_PRINT_ERROR)
+   #include "HAP_farf.h"
+#endif
+
+/* end q6 */
+#else
+/* generic */
+
+#if (defined VERIFY_PRINT_INFO) || (defined VERIFY_PRINT_ERROR)
+#include <stdio.h>
+#endif
+
+#ifdef VERIFY_PRINT_INFO
+#define VERIFY_IPRINTF(format, ...) printf(__V_FILE_LINE__ format, ##__VA_ARGS__)
+#endif
+
+#ifdef VERIFY_PRINT_ERROR
+#define VERIFY_EPRINTF(format, ...) printf(__V_FILE_LINE__ format, ##__VA_ARGS__)
+#endif
+
+/* end generic */
+#endif
+
+#ifndef VERIFY_PRINT_INFO
+#define VERIFY_IPRINTF(format, ...) (void)0
+#endif
+
+#ifndef VERIFY_PRINT_ERROR
+#define VERIFY_EPRINTF(format, ...) (void)0
+#endif
+
+#ifndef VERIFY
+   #define VERIFY(val) \
+      do {\
+         VERIFY_IPRINTF(":info: calling: " #val "\n");\
+         if(0 == (val)) {\
+            nErr = nErr == 0 ? -1 : nErr;\
+            VERIFY_EPRINTF(":error: %d: " #val "\n", nErr);\
+            goto bail;\
+         } else {\
+            VERIFY_IPRINTF(":info: passed: " #val "\n");\
+         }\
+      } while(0)
+#endif //VERIFY
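+
+/* Usage sketch (illustrative, not part of the original header): VERIFY
+ * expects the enclosing function to declare `int nErr = 0;` and to provide
+ * a `bail:` cleanup label, e.g.:
+ *
+ *    int read_config(const char *path) {   // hypothetical caller
+ *       int nErr = 0;
+ *       FILE *fp = fopen(path, "r");
+ *       VERIFY(fp != NULL);                // on failure: nErr = -1, goto bail
+ *       // ... use fp ...
+ *    bail:
+ *       if (fp) fclose(fp);
+ *       return nErr;
+ *    }
+ *
+ * VERIFYC (below) behaves the same but records a caller-supplied error code.
+ */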
+
+#ifndef VERIFYC
+   #define VERIFYC(val,err_code) \
+      do {\
+         VERIFY_IPRINTF(":info: calling: " #val "\n");\
+         if(0 == (val)) {\
+            nErr = err_code;\
+            VERIFY_EPRINTF(":error: %x: " #val "\n", nErr);\
+            goto bail;\
+         } else {\
+            VERIFY_IPRINTF(":info: passed: " #val "\n");\
+         }\
+      } while(0)
+#endif //VERIFYC
+
+
+#endif /* VERIFY_H */
+
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/libs/atomic/inc/AEEatomic.h b/prebuilts/Hexagon_SDK/6.2.0.1/libs/atomic/inc/AEEatomic.h
new file mode 100755
index 0000000000000..0b4a7b9cb6be9
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/libs/atomic/inc/AEEatomic.h
@@ -0,0 +1,173 @@
+#ifndef AEEATOMIC_H
+#define AEEATOMIC_H
+/*
+=======================================================================
+
+FILE:         AEEatomic.h
+
+SERVICES:     atomic
+
+DESCRIPTION:  Fast atomic ops
+
+=======================================================================
+   Copyright 2005, 2007 Qualcomm Technologies Incorporated.
+   All Rights Reserved.
+   QUALCOMM Confidential and Proprietary
+=======================================================================
+*/
+
+#include "AEEStdDef.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* #ifdef __cplusplus */
+
+uint32 atomic_Add(uint32 * volatile puDest, int nAdd);
+uint32 atomic_Exchange(uint32 * volatile puDest, uint32 uVal);
+uint32 atomic_CompareAndExchange(uint32 * volatile puDest, uint32 uExchange, uint32 uCompare);
+uint32 atomic_CompareOrAdd(uint32 * volatile puDest, uint32 uCompare, int nAdd);
+
+uint64 atomic_CompareAndExchange64(uint64 * volatile puDest, uint64 uExchange, uint64 uCompare);
+uintptr_t atomic_CompareAndExchangeUP(uintptr_t * volatile puDest, uintptr_t uExchange, uintptr_t uCompare);
+#ifdef __cplusplus
+}
+#endif /* #ifdef __cplusplus */
+
+/*=====================================================================
+INTERFACE DOCUMENTATION
+=======================================================================
+atomic Interface
+
+   The atomic interface provides fast "atomic" operations. The
+   operations are defined to be atomic with respect to each other.
+
+=======================================================================
+
+=======================================================================
+
+atomic_Add()
+
+Description:
+
+   Performs an atomic add operation.
+
+Prototype:
+
+   uint32 atomic_Add(uint32 * volatile puDest, int nAdd);
+
+Parameters:
+   puDest [in|out] : Points to the unsigned number to which nAdd is added; the sum is stored back to *puDest.
+   nAdd            : Value to add.
+
+Return Value:
+   The resulting (new) value at *puDest.
+
+Comments:
+   None
+
+Side Effects:
+   None
+
+See Also:
+   None
+
+=======================================================================
+
+atomic_Exchange()
+
+Description:
+
+   Atomic exchange of a 32-bit value. Performs an atomic operation of:
+      write uVal to *puDest
+      return the previous value in *puDest
+
+Prototype:
+
+   uint32 atomic_Exchange(uint32 * volatile puDest, uint32 uVal);
+
+Parameters:
+   puDest [in|out] : Points to the unsigned number to be exchanged.
+   uVal            : New value to write.
+
+Return Value:
+   Previous value at *puDest.
+
+Comments:
+   None
+
+Side Effects:
+   May cause an exception if puDest is not a 32-bit aligned address.
+
+See Also:
+   None
+=======================================================================
+
+atomic_CompareAndExchange()
+
+Description:
+
+   Performs an atomic operation of:
+      if (*puDest == uCompare) {
+         *puDest = uExchange;
+      }
+
+   returns the previous value in *puDest
+
+Prototype:
+
+   uint32 atomic_CompareAndExchange(uint32 * volatile puDest, uint32 uExchange,
+                                    uint32 uCompare);
+
+Parameters:
+   puDest [in|out] : Points to the unsigned number.
+   uExchange       : New value to write to *puDest.
+   uCompare        : Comparand.
+
+Return Value:
+   Previous value at *puDest.
+
+Comments:
+   None
+
+Side Effects:
+   May cause an exception if puDest is not a 32-bit aligned address.
+
+See Also:
+   None
+
+=======================================================================
+atomic_CompareOrAdd()
+
+Description:
+
+   Performs an atomic operation of:
+      if (*puDest != uCompare) {
+         *puDest += nAdd;
+      }
+
+   returns the new value in *puDest
+
+Prototype:
+
+   uint32 atomic_CompareOrAdd(uint32 * volatile puDest, uint32 uCompare, int nAdd);
+
+Parameters:
+   puDest [in|out] : Points to the unsigned number.
+   uCompare        : Comparand.
+   nAdd            : Value to add to *puDest.
+
+Return Value:
+   New value at *puDest.
+
+Comments:
+   None
+
+Side Effects:
+   May cause an exception if puDest is not a 32-bit aligned address.
+
+See Also:
+   None
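+
+=======================================================================
+
+Usage sketch
+
+   (An illustrative sketch, not normative.) Lock-free updates are
+   typically built as a compare-and-exchange retry loop. For example,
+   incrementing a shared counter puCounter (a hypothetical
+   uint32 * volatile):
+
+      uint32 uPrev;
+      do {
+         uPrev = *puCounter;
+      } while (atomic_CompareAndExchange(puCounter, uPrev + 1, uPrev) != uPrev);
+
+   The loop retries whenever another thread changed *puCounter between
+   the read and the exchange; this particular case is equivalent to
+   atomic_Add(puCounter, 1).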
+=======================================================================*/
+
+#endif /* #ifndef AEEATOMIC_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/libs/qprintf/inc/qprintf.h b/prebuilts/Hexagon_SDK/6.2.0.1/libs/qprintf/inc/qprintf.h
new file mode 100755
index 0000000000000..cfa7b98d050bc
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/libs/qprintf/inc/qprintf.h
@@ -0,0 +1,170 @@
+/**=============================================================================
+
+@file
+   qprintf.h
+
+@brief
+   API, macros and struct definitions for qprintf utilities available from C.
+
+Copyright (c) 2017, 2020 QUALCOMM Technologies Incorporated.
+All Rights Reserved Qualcomm Proprietary
+=============================================================================**/
+
+#ifndef qprintf_H
+#define qprintf_H
+
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#include "hexagon_types.h"
+#include "stdlib.h"
+
+#ifdef BUILDING_SO
+/// MACRO enables function to be visible in shared-library case.
+#define qprintf_API __attribute__ ((visibility ("default")))
+#else
+/// MACRO empty for non-shared-library case.
+#define qprintf_API
+#endif
+
+/**
+ * @defgroup Masks Common masks controlling which bytes to display.
+ * @{
+ */
+
+/// Display all bytes
+#define QPRINTF_MASK_ALL -1ull
+
+/// Display no bytes
+#define QPRINTF_MASK_NONE 0ull
+
+/// Display even bytes
+#define QPRINTF_MASK_EVEN_8 0x5555555555555555ull
+
+/// Display odd bytes
+#define QPRINTF_MASK_ODD_8 0xaaaaaaaaaaaaaaaaull
+
+/// Display even 16-bit elements
+#define QPRINTF_MASK_EVEN_16 0x3333333333333333ull
+
+/// Display odd 16-bit elements
+#define QPRINTF_MASK_ODD_16 0xccccccccccccccccull
+
+/// Display even 32-bit elements
+#define QPRINTF_MASK_EVEN_32 0x0f0f0f0f0f0f0f0full
+
+/// Display odd 32-bit elements
+#define QPRINTF_MASK_ODD_32 0xf0f0f0f0f0f0f0f0ull
+
+/**
+ * @}
+ */
+
+/**
+ * @defgroup C_functions qprintf functions
+ * @{
+ */
+//---------------------------------------------------------------------------
+/// @brief
+/// Set the mask controlling which bytes to display when printing out an HVX
+/// register.
+///
+/// If the nth bit of mask is set, the nth byte of HVX will be displayed.
+/// When printing HVX as 16-bit or 32-bit elements, only the bit corresponding +/// to the lowest byte of the element controls whether the element will be +/// printed out or not. +/// +/// @param high +/// Mask for upper 64 bytes of HVX vector. +/// +/// @param low +/// Mask for lower 64 bytes of HVX vector. +/// +/// @return +/// None. +/// +/// Example: +/// +/// * Display the 32-bit odd elements of the 64 most significant bytes and the even +/// bytes of the 64 least significant bytes of HVX vectors printed with option %%m. +/// @code +/// // From C before invoking your assembly routine +/// qprintf_set_mask(QPRINTF_MASK_ODD_32,QPRINTF_MASK_EVEN_8); +/// +/// // From assembly +/// qprintf("v0: %mx",v0); +/// @endcode +/// +/// See also \ref assembly-hvx-registers for more assembly examples using %%m. +//--------------------------------------------------------------------------- +qprintf_API void qprintf_set_mask(unsigned long long high, unsigned long long low); + +//--------------------------------------------------------------------------- +/// @brief +/// Print a V register. +/// +/// @param msg +/// Character string used to display V register. +/// +/// @param V +/// HVX vector register to display. +/// +/// @return +/// None. +/// +/// Example: See \ref c-hvx-registers for usage examples. +//--------------------------------------------------------------------------- +void qprintf_V(const char* msg, HVX_Vector V); + +//--------------------------------------------------------------------------- +/// @brief +/// Print a Q register. see documentation for details on supported format. +/// +/// @param msg +/// Character string used to display Q register. +/// +/// @param Q +/// HVX predicate register to display. +/// +/// @return +/// None. +/// +/// Example: See \ref c-predicate-registers for usage examples. +//--------------------------------------------------------------------------- +void qprintf_Q(const char* msg, HVX_VectorPred Q); + +//--------------------------------------------------------------------------- +/// @brief +/// Display all HVX registers. +/// +/// @return +/// None. +/// +/// Example: See \ref c-register-dump for usage examples. +//--------------------------------------------------------------------------- +extern qprintf_API void qprintf_V_all(void); + + +//--------------------------------------------------------------------------- +/// @brief +/// Display all scalar registers. +/// +/// @return +/// None. +/// +/// Example: See \ref c-register-dump for usage examples. +//--------------------------------------------------------------------------- +extern qprintf_API void qprintf_R_all(void); + +/** + * @} + */ + +#ifdef __cplusplus +} +#endif + +#endif // #ifndef qprintf_H diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/libs/qprintf/inc/qprintf_asm.h b/prebuilts/Hexagon_SDK/6.2.0.1/libs/qprintf/inc/qprintf_asm.h new file mode 100755 index 0000000000000..9fb27cc8ff8d6 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/libs/qprintf/inc/qprintf_asm.h @@ -0,0 +1,73 @@ +/**============================================================================= + +@file + qprintf_asm.h + +@brief + Extend printf support to assembly. + +Copyright (c) 2017,2020 QUALCOMM Technologies Incorporated. 
+All Rights Reserved Qualcomm Proprietary
+=============================================================================**/
+
+/**
+ * @defgroup ASM_function qprintf routine assembly-callable
+ * @{
+ */
+
+//---------------------------------------------------------------------------
+/// @brief
+/// Assembly macro for displaying registers along with a message and
+/// filename[linenumber].
+///
+/// @param MSG
+/// Message to display.
+///
+/// @return
+/// none.
+///
+/// Example: See \ref assembly-support for usage examples.
+//---------------------------------------------------------------------------
+#define qprintf(MSG,...) qprintf_macro #__FILE__, #__LINE__, MSG, #__VA_ARGS__
+
+/**
+ * @}
+ */
+
+//! @cond Doxygen_Suppress
+
+.set STACK_SIZE, 24
+.macro qprintf_macro FILE_NAME LINE_NUMBER MSG ARGS
+.data
+1:
+.string "\MSG\()\0"
+2:
+.string "\ARGS\()\0"
+3:
+.string "\LINE_NUMBER\()\0"
+4:
+.string "\FILE_NAME\()\0"
+.text
+    {
+        allocframe(#STACK_SIZE)                // sp[STACK_SIZE]=r31:30 (sp referring to sp after stack allocation)
+        memd(r29 + #(-STACK_SIZE-8)) = r29:28  // sp[0]=r29:28
+        r28 = ADD(PC,##1b@PCREL)
+    } {
+        memw(r29 + #8) = r28                   // sp[8]=&msg
+        r28 = ADD(PC,##2b@PCREL)
+    } {
+        memw(r29 + #12) = r28                  // sp[12]=&args
+        r28 = #\LINE_NUMBER\()
+    } {
+        memw(r29 + #16) = r28                  // sp[16]=line_number
+        r28 = ADD(PC,##4b@PCREL)
+    } {
+        memw(r29 + #20) = r28                  // sp[20]=&file_name
+        call qprintf_asm
+    } {
+        r28 = memw(r29 + #0)
+        deallocframe
+    }
+.endm
+
+//! @endcond
\ No newline at end of file
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/bits/confname.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/bits/confname.h
new file mode 100755
index 0000000000000..d9ca3135501e3
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/bits/confname.h
@@ -0,0 +1,528 @@
+#ifndef CONFNAME_H
+#define CONFNAME_H
+/**
+  @file confname.h
+  @brief Named literals for the 'name' argument of sysconf, pathconf
+
+EXTERNAL FUNCTIONS
+  None
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  Don't include this header directly. Instead include unistd.h. For now, since
+  the toolchain doesn't provide a hook by including bits/confname.h, we stick
+  this header in QuRT's sys/types.h.
+
+Copyright (c) 2018, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+==============================================================================*/
+/* Values for the NAME argument to `pathconf' and `fpathconf'.
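   For example, a caller
+   might query the longest permitted filename with pathconf("/", _PC_NAME_MAX) and
+   fall back to a fixed buffer size if the call returns -1 (an illustrative
+   sketch; the path argument is arbitrary).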
*/ +enum +{ + _PC_LINK_MAX, +#define _PC_LINK_MAX _PC_LINK_MAX + _PC_MAX_CANON, +#define _PC_MAX_CANON _PC_MAX_CANON + _PC_MAX_INPUT, +#define _PC_MAX_INPUT _PC_MAX_INPUT + _PC_NAME_MAX, +#define _PC_NAME_MAX _PC_NAME_MAX + _PC_PATH_MAX, +#define _PC_PATH_MAX _PC_PATH_MAX + _PC_PIPE_BUF, +#define _PC_PIPE_BUF _PC_PIPE_BUF + _PC_CHOWN_RESTRICTED, +#define _PC_CHOWN_RESTRICTED _PC_CHOWN_RESTRICTED + _PC_NO_TRUNC, +#define _PC_NO_TRUNC _PC_NO_TRUNC + _PC_VDISABLE, +#define _PC_VDISABLE _PC_VDISABLE + _PC_SYNC_IO, +#define _PC_SYNC_IO _PC_SYNC_IO + _PC_ASYNC_IO, +#define _PC_ASYNC_IO _PC_ASYNC_IO + _PC_PRIO_IO, +#define _PC_PRIO_IO _PC_PRIO_IO + _PC_SOCK_MAXBUF, +#define _PC_SOCK_MAXBUF _PC_SOCK_MAXBUF + _PC_FILESIZEBITS, +#define _PC_FILESIZEBITS _PC_FILESIZEBITS + _PC_REC_INCR_XFER_SIZE, +#define _PC_REC_INCR_XFER_SIZE _PC_REC_INCR_XFER_SIZE + _PC_REC_MAX_XFER_SIZE, +#define _PC_REC_MAX_XFER_SIZE _PC_REC_MAX_XFER_SIZE + _PC_REC_MIN_XFER_SIZE, +#define _PC_REC_MIN_XFER_SIZE _PC_REC_MIN_XFER_SIZE + _PC_REC_XFER_ALIGN, +#define _PC_REC_XFER_ALIGN _PC_REC_XFER_ALIGN + _PC_ALLOC_SIZE_MIN, +#define _PC_ALLOC_SIZE_MIN _PC_ALLOC_SIZE_MIN + _PC_SYMLINK_MAX, +#define _PC_SYMLINK_MAX _PC_SYMLINK_MAX + _PC_2_SYMLINKS +#define _PC_2_SYMLINKS _PC_2_SYMLINKS +}; + +/* Values for the argument to `sysconf'. */ +enum +{ + _SC_ARG_MAX, +#define _SC_ARG_MAX _SC_ARG_MAX + _SC_CHILD_MAX, +#define _SC_CHILD_MAX _SC_CHILD_MAX + _SC_CLK_TCK, +#define _SC_CLK_TCK _SC_CLK_TCK + _SC_NGROUPS_MAX, +#define _SC_NGROUPS_MAX _SC_NGROUPS_MAX + _SC_OPEN_MAX, +#define _SC_OPEN_MAX _SC_OPEN_MAX + _SC_STREAM_MAX, +#define _SC_STREAM_MAX _SC_STREAM_MAX + _SC_TZNAME_MAX, +#define _SC_TZNAME_MAX _SC_TZNAME_MAX + _SC_JOB_CONTROL, +#define _SC_JOB_CONTROL _SC_JOB_CONTROL + _SC_SAVED_IDS, +#define _SC_SAVED_IDS _SC_SAVED_IDS + _SC_REALTIME_SIGNALS, +#define _SC_REALTIME_SIGNALS _SC_REALTIME_SIGNALS + _SC_PRIORITY_SCHEDULING, +#define _SC_PRIORITY_SCHEDULING _SC_PRIORITY_SCHEDULING + _SC_TIMERS, +#define _SC_TIMERS _SC_TIMERS + _SC_ASYNCHRONOUS_IO, +#define _SC_ASYNCHRONOUS_IO _SC_ASYNCHRONOUS_IO + _SC_PRIORITIZED_IO, +#define _SC_PRIORITIZED_IO _SC_PRIORITIZED_IO + _SC_SYNCHRONIZED_IO, +#define _SC_SYNCHRONIZED_IO _SC_SYNCHRONIZED_IO + _SC_FSYNC, +#define _SC_FSYNC _SC_FSYNC + _SC_MAPPED_FILES, +#define _SC_MAPPED_FILES _SC_MAPPED_FILES + _SC_MEMLOCK, +#define _SC_MEMLOCK _SC_MEMLOCK + _SC_MEMLOCK_RANGE, +#define _SC_MEMLOCK_RANGE _SC_MEMLOCK_RANGE + _SC_MEMORY_PROTECTION, +#define _SC_MEMORY_PROTECTION _SC_MEMORY_PROTECTION + _SC_MESSAGE_PASSING, +#define _SC_MESSAGE_PASSING _SC_MESSAGE_PASSING + _SC_SEMAPHORES, +#define _SC_SEMAPHORES _SC_SEMAPHORES + _SC_SHARED_MEMORY_OBJECTS, +#define _SC_SHARED_MEMORY_OBJECTS _SC_SHARED_MEMORY_OBJECTS + _SC_AIO_LISTIO_MAX, +#define _SC_AIO_LISTIO_MAX _SC_AIO_LISTIO_MAX + _SC_AIO_MAX, +#define _SC_AIO_MAX _SC_AIO_MAX + _SC_AIO_PRIO_DELTA_MAX, +#define _SC_AIO_PRIO_DELTA_MAX _SC_AIO_PRIO_DELTA_MAX + _SC_DELAYTIMER_MAX, +#define _SC_DELAYTIMER_MAX _SC_DELAYTIMER_MAX + _SC_MQ_OPEN_MAX, +#define _SC_MQ_OPEN_MAX _SC_MQ_OPEN_MAX + _SC_MQ_PRIO_MAX, +#define _SC_MQ_PRIO_MAX _SC_MQ_PRIO_MAX + _SC_VERSION, +#define _SC_VERSION _SC_VERSION + _SC_PAGESIZE, +#define _SC_PAGESIZE _SC_PAGESIZE +#define _SC_PAGE_SIZE _SC_PAGESIZE + _SC_RTSIG_MAX, +#define _SC_RTSIG_MAX _SC_RTSIG_MAX + _SC_SEM_NSEMS_MAX, +#define _SC_SEM_NSEMS_MAX _SC_SEM_NSEMS_MAX + _SC_SEM_VALUE_MAX, +#define _SC_SEM_VALUE_MAX _SC_SEM_VALUE_MAX + _SC_SIGQUEUE_MAX, +#define _SC_SIGQUEUE_MAX _SC_SIGQUEUE_MAX + _SC_TIMER_MAX, +#define _SC_TIMER_MAX 
_SC_TIMER_MAX + + /* Values for the argument to `sysconf' + corresponding to _POSIX2_* symbols. */ + _SC_BC_BASE_MAX, +#define _SC_BC_BASE_MAX _SC_BC_BASE_MAX + _SC_BC_DIM_MAX, +#define _SC_BC_DIM_MAX _SC_BC_DIM_MAX + _SC_BC_SCALE_MAX, +#define _SC_BC_SCALE_MAX _SC_BC_SCALE_MAX + _SC_BC_STRING_MAX, +#define _SC_BC_STRING_MAX _SC_BC_STRING_MAX + _SC_COLL_WEIGHTS_MAX, +#define _SC_COLL_WEIGHTS_MAX _SC_COLL_WEIGHTS_MAX + _SC_EQUIV_CLASS_MAX, +#define _SC_EQUIV_CLASS_MAX _SC_EQUIV_CLASS_MAX + _SC_EXPR_NEST_MAX, +#define _SC_EXPR_NEST_MAX _SC_EXPR_NEST_MAX + _SC_LINE_MAX, +#define _SC_LINE_MAX _SC_LINE_MAX + _SC_RE_DUP_MAX, +#define _SC_RE_DUP_MAX _SC_RE_DUP_MAX + _SC_CHARCLASS_NAME_MAX, +#define _SC_CHARCLASS_NAME_MAX _SC_CHARCLASS_NAME_MAX + + _SC_2_VERSION, +#define _SC_2_VERSION _SC_2_VERSION + _SC_2_C_BIND, +#define _SC_2_C_BIND _SC_2_C_BIND + _SC_2_C_DEV, +#define _SC_2_C_DEV _SC_2_C_DEV + _SC_2_FORT_DEV, +#define _SC_2_FORT_DEV _SC_2_FORT_DEV + _SC_2_FORT_RUN, +#define _SC_2_FORT_RUN _SC_2_FORT_RUN + _SC_2_SW_DEV, +#define _SC_2_SW_DEV _SC_2_SW_DEV + _SC_2_LOCALEDEF, +#define _SC_2_LOCALEDEF _SC_2_LOCALEDEF + + _SC_PII, +#define _SC_PII _SC_PII + _SC_PII_XTI, +#define _SC_PII_XTI _SC_PII_XTI + _SC_PII_SOCKET, +#define _SC_PII_SOCKET _SC_PII_SOCKET + _SC_PII_INTERNET, +#define _SC_PII_INTERNET _SC_PII_INTERNET + _SC_PII_OSI, +#define _SC_PII_OSI _SC_PII_OSI + _SC_POLL, +#define _SC_POLL _SC_POLL + _SC_SELECT, +#define _SC_SELECT _SC_SELECT + _SC_UIO_MAXIOV, +#define _SC_UIO_MAXIOV _SC_UIO_MAXIOV + _SC_IOV_MAX = _SC_UIO_MAXIOV, +#define _SC_IOV_MAX _SC_IOV_MAX + _SC_PII_INTERNET_STREAM, +#define _SC_PII_INTERNET_STREAM _SC_PII_INTERNET_STREAM + _SC_PII_INTERNET_DGRAM, +#define _SC_PII_INTERNET_DGRAM _SC_PII_INTERNET_DGRAM + _SC_PII_OSI_COTS, +#define _SC_PII_OSI_COTS _SC_PII_OSI_COTS + _SC_PII_OSI_CLTS, +#define _SC_PII_OSI_CLTS _SC_PII_OSI_CLTS + _SC_PII_OSI_M, +#define _SC_PII_OSI_M _SC_PII_OSI_M + _SC_T_IOV_MAX, +#define _SC_T_IOV_MAX _SC_T_IOV_MAX + + /* Values according to POSIX 1003.1c (POSIX threads). 
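   For instance,
+      sysconf(_SC_THREAD_STACK_MIN) reports the smallest stack with which a
+      thread may be created (an illustrative sketch).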
*/ + _SC_THREADS, +#define _SC_THREADS _SC_THREADS + _SC_THREAD_SAFE_FUNCTIONS, +#define _SC_THREAD_SAFE_FUNCTIONS _SC_THREAD_SAFE_FUNCTIONS + _SC_GETGR_R_SIZE_MAX, +#define _SC_GETGR_R_SIZE_MAX _SC_GETGR_R_SIZE_MAX + _SC_GETPW_R_SIZE_MAX, +#define _SC_GETPW_R_SIZE_MAX _SC_GETPW_R_SIZE_MAX + _SC_LOGIN_NAME_MAX, +#define _SC_LOGIN_NAME_MAX _SC_LOGIN_NAME_MAX + _SC_TTY_NAME_MAX, +#define _SC_TTY_NAME_MAX _SC_TTY_NAME_MAX + _SC_THREAD_DESTRUCTOR_ITERATIONS, +#define _SC_THREAD_DESTRUCTOR_ITERATIONS _SC_THREAD_DESTRUCTOR_ITERATIONS + _SC_THREAD_KEYS_MAX, +#define _SC_THREAD_KEYS_MAX _SC_THREAD_KEYS_MAX + _SC_THREAD_STACK_MIN, +#define _SC_THREAD_STACK_MIN _SC_THREAD_STACK_MIN + _SC_THREAD_THREADS_MAX, +#define _SC_THREAD_THREADS_MAX _SC_THREAD_THREADS_MAX + _SC_THREAD_ATTR_STACKADDR, +#define _SC_THREAD_ATTR_STACKADDR _SC_THREAD_ATTR_STACKADDR + _SC_THREAD_ATTR_STACKSIZE, +#define _SC_THREAD_ATTR_STACKSIZE _SC_THREAD_ATTR_STACKSIZE + _SC_THREAD_PRIORITY_SCHEDULING, +#define _SC_THREAD_PRIORITY_SCHEDULING _SC_THREAD_PRIORITY_SCHEDULING + _SC_THREAD_PRIO_INHERIT, +#define _SC_THREAD_PRIO_INHERIT _SC_THREAD_PRIO_INHERIT + _SC_THREAD_PRIO_PROTECT, +#define _SC_THREAD_PRIO_PROTECT _SC_THREAD_PRIO_PROTECT + _SC_THREAD_PROCESS_SHARED, +#define _SC_THREAD_PROCESS_SHARED _SC_THREAD_PROCESS_SHARED + + _SC_NPROCESSORS_CONF, +#define _SC_NPROCESSORS_CONF _SC_NPROCESSORS_CONF + _SC_NPROCESSORS_ONLN, +#define _SC_NPROCESSORS_ONLN _SC_NPROCESSORS_ONLN + _SC_PHYS_PAGES, +#define _SC_PHYS_PAGES _SC_PHYS_PAGES + _SC_AVPHYS_PAGES, +#define _SC_AVPHYS_PAGES _SC_AVPHYS_PAGES + _SC_ATEXIT_MAX, +#define _SC_ATEXIT_MAX _SC_ATEXIT_MAX + _SC_PASS_MAX, +#define _SC_PASS_MAX _SC_PASS_MAX + + _SC_XOPEN_VERSION, +#define _SC_XOPEN_VERSION _SC_XOPEN_VERSION + _SC_XOPEN_XCU_VERSION, +#define _SC_XOPEN_XCU_VERSION _SC_XOPEN_XCU_VERSION + _SC_XOPEN_UNIX, +#define _SC_XOPEN_UNIX _SC_XOPEN_UNIX + _SC_XOPEN_CRYPT, +#define _SC_XOPEN_CRYPT _SC_XOPEN_CRYPT + _SC_XOPEN_ENH_I18N, +#define _SC_XOPEN_ENH_I18N _SC_XOPEN_ENH_I18N + _SC_XOPEN_SHM, +#define _SC_XOPEN_SHM _SC_XOPEN_SHM + + _SC_2_CHAR_TERM, +#define _SC_2_CHAR_TERM _SC_2_CHAR_TERM + _SC_2_C_VERSION, +#define _SC_2_C_VERSION _SC_2_C_VERSION + _SC_2_UPE, +#define _SC_2_UPE _SC_2_UPE + + _SC_XOPEN_XPG2, +#define _SC_XOPEN_XPG2 _SC_XOPEN_XPG2 + _SC_XOPEN_XPG3, +#define _SC_XOPEN_XPG3 _SC_XOPEN_XPG3 + _SC_XOPEN_XPG4, +#define _SC_XOPEN_XPG4 _SC_XOPEN_XPG4 + + _SC_CHAR_BIT, +#define _SC_CHAR_BIT _SC_CHAR_BIT + _SC_CHAR_MAX, +#define _SC_CHAR_MAX _SC_CHAR_MAX + _SC_CHAR_MIN, +#define _SC_CHAR_MIN _SC_CHAR_MIN + _SC_INT_MAX, +#define _SC_INT_MAX _SC_INT_MAX + _SC_INT_MIN, +#define _SC_INT_MIN _SC_INT_MIN + _SC_LONG_BIT, +#define _SC_LONG_BIT _SC_LONG_BIT + _SC_WORD_BIT, +#define _SC_WORD_BIT _SC_WORD_BIT + _SC_MB_LEN_MAX, +#define _SC_MB_LEN_MAX _SC_MB_LEN_MAX + _SC_NZERO, +#define _SC_NZERO _SC_NZERO + _SC_SSIZE_MAX, +#define _SC_SSIZE_MAX _SC_SSIZE_MAX + _SC_SCHAR_MAX, +#define _SC_SCHAR_MAX _SC_SCHAR_MAX + _SC_SCHAR_MIN, +#define _SC_SCHAR_MIN _SC_SCHAR_MIN + _SC_SHRT_MAX, +#define _SC_SHRT_MAX _SC_SHRT_MAX + _SC_SHRT_MIN, +#define _SC_SHRT_MIN _SC_SHRT_MIN + _SC_UCHAR_MAX, +#define _SC_UCHAR_MAX _SC_UCHAR_MAX + _SC_UINT_MAX, +#define _SC_UINT_MAX _SC_UINT_MAX + _SC_ULONG_MAX, +#define _SC_ULONG_MAX _SC_ULONG_MAX + _SC_USHRT_MAX, +#define _SC_USHRT_MAX _SC_USHRT_MAX + + _SC_NL_ARGMAX, +#define _SC_NL_ARGMAX _SC_NL_ARGMAX + _SC_NL_LANGMAX, +#define _SC_NL_LANGMAX _SC_NL_LANGMAX + _SC_NL_MSGMAX, +#define _SC_NL_MSGMAX _SC_NL_MSGMAX + _SC_NL_NMAX, +#define _SC_NL_NMAX _SC_NL_NMAX + 
_SC_NL_SETMAX, +#define _SC_NL_SETMAX _SC_NL_SETMAX + _SC_NL_TEXTMAX, +#define _SC_NL_TEXTMAX _SC_NL_TEXTMAX + + _SC_XBS5_ILP32_OFF32, +#define _SC_XBS5_ILP32_OFF32 _SC_XBS5_ILP32_OFF32 + _SC_XBS5_ILP32_OFFBIG, +#define _SC_XBS5_ILP32_OFFBIG _SC_XBS5_ILP32_OFFBIG + _SC_XBS5_LP64_OFF64, +#define _SC_XBS5_LP64_OFF64 _SC_XBS5_LP64_OFF64 + _SC_XBS5_LPBIG_OFFBIG, +#define _SC_XBS5_LPBIG_OFFBIG _SC_XBS5_LPBIG_OFFBIG + + _SC_XOPEN_LEGACY, +#define _SC_XOPEN_LEGACY _SC_XOPEN_LEGACY + _SC_XOPEN_REALTIME, +#define _SC_XOPEN_REALTIME _SC_XOPEN_REALTIME + _SC_XOPEN_REALTIME_THREADS, +#define _SC_XOPEN_REALTIME_THREADS _SC_XOPEN_REALTIME_THREADS + + _SC_ADVISORY_INFO, +#define _SC_ADVISORY_INFO _SC_ADVISORY_INFO + _SC_BARRIERS, +#define _SC_BARRIERS _SC_BARRIERS + _SC_BASE, +#define _SC_BASE _SC_BASE + _SC_C_LANG_SUPPORT, +#define _SC_C_LANG_SUPPORT _SC_C_LANG_SUPPORT + _SC_C_LANG_SUPPORT_R, +#define _SC_C_LANG_SUPPORT_R _SC_C_LANG_SUPPORT_R + _SC_CLOCK_SELECTION, +#define _SC_CLOCK_SELECTION _SC_CLOCK_SELECTION + _SC_CPUTIME, +#define _SC_CPUTIME _SC_CPUTIME + _SC_THREAD_CPUTIME, +#define _SC_THREAD_CPUTIME _SC_THREAD_CPUTIME + _SC_DEVICE_IO, +#define _SC_DEVICE_IO _SC_DEVICE_IO + _SC_DEVICE_SPECIFIC, +#define _SC_DEVICE_SPECIFIC _SC_DEVICE_SPECIFIC + _SC_DEVICE_SPECIFIC_R, +#define _SC_DEVICE_SPECIFIC_R _SC_DEVICE_SPECIFIC_R + _SC_FD_MGMT, +#define _SC_FD_MGMT _SC_FD_MGMT + _SC_FIFO, +#define _SC_FIFO _SC_FIFO + _SC_PIPE, +#define _SC_PIPE _SC_PIPE + _SC_FILE_ATTRIBUTES, +#define _SC_FILE_ATTRIBUTES _SC_FILE_ATTRIBUTES + _SC_FILE_LOCKING, +#define _SC_FILE_LOCKING _SC_FILE_LOCKING + _SC_FILE_SYSTEM, +#define _SC_FILE_SYSTEM _SC_FILE_SYSTEM + _SC_MONOTONIC_CLOCK, +#define _SC_MONOTONIC_CLOCK _SC_MONOTONIC_CLOCK + _SC_MULTI_PROCESS, +#define _SC_MULTI_PROCESS _SC_MULTI_PROCESS + _SC_SINGLE_PROCESS, +#define _SC_SINGLE_PROCESS _SC_SINGLE_PROCESS + _SC_NETWORKING, +#define _SC_NETWORKING _SC_NETWORKING + _SC_READER_WRITER_LOCKS, +#define _SC_READER_WRITER_LOCKS _SC_READER_WRITER_LOCKS + _SC_SPIN_LOCKS, +#define _SC_SPIN_LOCKS _SC_SPIN_LOCKS + _SC_REGEXP, +#define _SC_REGEXP _SC_REGEXP + _SC_REGEX_VERSION, +#define _SC_REGEX_VERSION _SC_REGEX_VERSION + _SC_SHELL, +#define _SC_SHELL _SC_SHELL + _SC_SIGNALS, +#define _SC_SIGNALS _SC_SIGNALS + _SC_SPAWN, +#define _SC_SPAWN _SC_SPAWN + _SC_SPORADIC_SERVER, +#define _SC_SPORADIC_SERVER _SC_SPORADIC_SERVER + _SC_THREAD_SPORADIC_SERVER, +#define _SC_THREAD_SPORADIC_SERVER _SC_THREAD_SPORADIC_SERVER + _SC_SYSTEM_DATABASE, +#define _SC_SYSTEM_DATABASE _SC_SYSTEM_DATABASE + _SC_SYSTEM_DATABASE_R, +#define _SC_SYSTEM_DATABASE_R _SC_SYSTEM_DATABASE_R + _SC_TIMEOUTS, +#define _SC_TIMEOUTS _SC_TIMEOUTS + _SC_TYPED_MEMORY_OBJECTS, +#define _SC_TYPED_MEMORY_OBJECTS _SC_TYPED_MEMORY_OBJECTS + _SC_USER_GROUPS, +#define _SC_USER_GROUPS _SC_USER_GROUPS + _SC_USER_GROUPS_R, +#define _SC_USER_GROUPS_R _SC_USER_GROUPS_R + _SC_2_PBS, +#define _SC_2_PBS _SC_2_PBS + _SC_2_PBS_ACCOUNTING, +#define _SC_2_PBS_ACCOUNTING _SC_2_PBS_ACCOUNTING + _SC_2_PBS_LOCATE, +#define _SC_2_PBS_LOCATE _SC_2_PBS_LOCATE + _SC_2_PBS_MESSAGE, +#define _SC_2_PBS_MESSAGE _SC_2_PBS_MESSAGE + _SC_2_PBS_TRACK, +#define _SC_2_PBS_TRACK _SC_2_PBS_TRACK + _SC_SYMLOOP_MAX, +#define _SC_SYMLOOP_MAX _SC_SYMLOOP_MAX + _SC_STREAMS, +#define _SC_STREAMS _SC_STREAMS + _SC_2_PBS_CHECKPOINT, +#define _SC_2_PBS_CHECKPOINT _SC_2_PBS_CHECKPOINT + + _SC_V6_ILP32_OFF32, +#define _SC_V6_ILP32_OFF32 _SC_V6_ILP32_OFF32 + _SC_V6_ILP32_OFFBIG, +#define _SC_V6_ILP32_OFFBIG _SC_V6_ILP32_OFFBIG + _SC_V6_LP64_OFF64, +#define 
_SC_V6_LP64_OFF64 _SC_V6_LP64_OFF64 + _SC_V6_LPBIG_OFFBIG, +#define _SC_V6_LPBIG_OFFBIG _SC_V6_LPBIG_OFFBIG + + _SC_HOST_NAME_MAX, +#define _SC_HOST_NAME_MAX _SC_HOST_NAME_MAX + _SC_TRACE, +#define _SC_TRACE _SC_TRACE + _SC_TRACE_EVENT_FILTER, +#define _SC_TRACE_EVENT_FILTER _SC_TRACE_EVENT_FILTER + _SC_TRACE_INHERIT, +#define _SC_TRACE_INHERIT _SC_TRACE_INHERIT + _SC_TRACE_LOG, +#define _SC_TRACE_LOG _SC_TRACE_LOG + + _SC_LEVEL1_ICACHE_SIZE, +#define _SC_LEVEL1_ICACHE_SIZE _SC_LEVEL1_ICACHE_SIZE + _SC_LEVEL1_ICACHE_ASSOC, +#define _SC_LEVEL1_ICACHE_ASSOC _SC_LEVEL1_ICACHE_ASSOC + _SC_LEVEL1_ICACHE_LINESIZE, +#define _SC_LEVEL1_ICACHE_LINESIZE _SC_LEVEL1_ICACHE_LINESIZE + _SC_LEVEL1_DCACHE_SIZE, +#define _SC_LEVEL1_DCACHE_SIZE _SC_LEVEL1_DCACHE_SIZE + _SC_LEVEL1_DCACHE_ASSOC, +#define _SC_LEVEL1_DCACHE_ASSOC _SC_LEVEL1_DCACHE_ASSOC + _SC_LEVEL1_DCACHE_LINESIZE, +#define _SC_LEVEL1_DCACHE_LINESIZE _SC_LEVEL1_DCACHE_LINESIZE + _SC_LEVEL2_CACHE_SIZE, +#define _SC_LEVEL2_CACHE_SIZE _SC_LEVEL2_CACHE_SIZE + _SC_LEVEL2_CACHE_ASSOC, +#define _SC_LEVEL2_CACHE_ASSOC _SC_LEVEL2_CACHE_ASSOC + _SC_LEVEL2_CACHE_LINESIZE, +#define _SC_LEVEL2_CACHE_LINESIZE _SC_LEVEL2_CACHE_LINESIZE + _SC_LEVEL3_CACHE_SIZE, +#define _SC_LEVEL3_CACHE_SIZE _SC_LEVEL3_CACHE_SIZE + _SC_LEVEL3_CACHE_ASSOC, +#define _SC_LEVEL3_CACHE_ASSOC _SC_LEVEL3_CACHE_ASSOC + _SC_LEVEL3_CACHE_LINESIZE, +#define _SC_LEVEL3_CACHE_LINESIZE _SC_LEVEL3_CACHE_LINESIZE + _SC_LEVEL4_CACHE_SIZE, +#define _SC_LEVEL4_CACHE_SIZE _SC_LEVEL4_CACHE_SIZE + _SC_LEVEL4_CACHE_ASSOC, +#define _SC_LEVEL4_CACHE_ASSOC _SC_LEVEL4_CACHE_ASSOC + _SC_LEVEL4_CACHE_LINESIZE, +#define _SC_LEVEL4_CACHE_LINESIZE _SC_LEVEL4_CACHE_LINESIZE + /* Leave room here, maybe we need a few more cache levels some day. */ + + _SC_IPV6 = _SC_LEVEL1_ICACHE_SIZE + 50, +#define _SC_IPV6 _SC_IPV6 + _SC_RAW_SOCKETS, +#define _SC_RAW_SOCKETS _SC_RAW_SOCKETS + + _SC_V7_ILP32_OFF32, +#define _SC_V7_ILP32_OFF32 _SC_V7_ILP32_OFF32 + _SC_V7_ILP32_OFFBIG, +#define _SC_V7_ILP32_OFFBIG _SC_V7_ILP32_OFFBIG + _SC_V7_LP64_OFF64, +#define _SC_V7_LP64_OFF64 _SC_V7_LP64_OFF64 + _SC_V7_LPBIG_OFFBIG, +#define _SC_V7_LPBIG_OFFBIG _SC_V7_LPBIG_OFFBIG + + _SC_SS_REPL_MAX, +#define _SC_SS_REPL_MAX _SC_SS_REPL_MAX + + _SC_TRACE_EVENT_NAME_MAX, +#define _SC_TRACE_EVENT_NAME_MAX _SC_TRACE_EVENT_NAME_MAX + _SC_TRACE_NAME_MAX, +#define _SC_TRACE_NAME_MAX _SC_TRACE_NAME_MAX + _SC_TRACE_SYS_MAX, +#define _SC_TRACE_SYS_MAX _SC_TRACE_SYS_MAX + _SC_TRACE_USER_EVENT_MAX, +#define _SC_TRACE_USER_EVENT_MAX _SC_TRACE_USER_EVENT_MAX + + _SC_XOPEN_STREAMS, +#define _SC_XOPEN_STREAMS _SC_XOPEN_STREAMS + + _SC_THREAD_ROBUST_PRIO_INHERIT, +#define _SC_THREAD_ROBUST_PRIO_INHERIT _SC_THREAD_ROBUST_PRIO_INHERIT + _SC_THREAD_ROBUST_PRIO_PROTECT +#define _SC_THREAD_ROBUST_PRIO_PROTECT _SC_THREAD_ROBUST_PRIO_PROTECT + +}; +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/bits/posix1_lim.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/bits/posix1_lim.h new file mode 100755 index 0000000000000..0739958c5a6c4 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/bits/posix1_lim.h @@ -0,0 +1,34 @@ +#ifndef POSIX1_LIM_H +#define POSIX1_LIM_H +/** + @file posix1_lim.h + @brief POSIX Minimum values + +EXTERNAL FUNCTIONS + None + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None + +TODO + This header should be ideally relocated under api/posix/bits (something that + doesnt exist today) and be included from api/posix/bits/limits.h which inturn + should be 
included from toolchain's limits.h + +Copyright (c) 2018, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +==============================================================================*/ + +#ifndef _POSIX_PATH_MAX +/** @brief Maximum number of bytes in a pathname, including the terminating + nul character */ +#define _POSIX_PATH_MAX 256 +#endif + +#ifndef _POSIX_SEM_NSEMS_MAX +/** @brief Maximum number of semaphores that a process may have */ +#define _POSIX_SEM_NSEMS_MAX 16 +#endif +#endif + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/common/time.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/common/time.h new file mode 100755 index 0000000000000..76b0d39ab7039 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/common/time.h @@ -0,0 +1 @@ +#include \ No newline at end of file diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/fcntl.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/fcntl.h new file mode 100755 index 0000000000000..c80ec98a449b6 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/fcntl.h @@ -0,0 +1,51 @@ +#ifndef _FCNTL_H +#define _FCNTL_H + +/*========================================================================== + * FILE: fcntl.h + * + * SERVICES: POSIX fcntl.h + * + * DESCRIPTION: The header is needed by the open() and fcntl() + * system calls, which have a variety of parameters and + * flags. They are described here. + * + * The formats of the calls to each of these are: + * + * open(path, oflag [,mode]) open a file + * fcntl(fd, cmd [,arg]) get or set file attributes + * + * Copyright (c) 2013,2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential. + + *==========================================================================*/ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* Oflag values for open(). POSIX Table 6-4. */ +#define POSIX_O_CREAT 0x100 /* creat file if it doesn't exist */ +#define POSIX_O_EXCL 0x200 /* exclusive use flag */ +#define POSIX_O_NOCTTY 0x400 /* do not assign a controlling terminal */ +#define POSIX_O_TRUNC 0x1000 /* truncate flag */ + +/* File status flags for open() and fcntl(). POSIX Table 6-5. */ +#define POSIX_O_APPEND 0x2000 /* set append mode */ +#define POSIX_O_NONBLOCK 0x4000 /* no delay */ + +/* File access modes for open() and fcntl(). POSIX Table 6-6. */ +#define POSIX_O_RDONLY 0 /* open(name, POSIX_O_RDONLY) opens read only */ +#define POSIX_O_WRONLY 1 /* open(name, POSIX_O_WRONLY) opens write only */ +#define POSIX_O_RDWR 2 /* open(name, POSIX_O_RDWR) opens read/write */ + +/* Mask for use with file access modes. POSIX Table 6-7. 
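   For example,
+   (oflag & POSIX_O_ACCMODE) == POSIX_O_RDONLY tests whether a descriptor was
+   opened read-only (an illustrative sketch).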
*/
+#define POSIX_O_ACCMODE 0x3 /* mask for file access modes */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _FCNTL_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/hooks/unistd.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/hooks/unistd.h
new file mode 100755
index 0000000000000..1c618bfe36b4f
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/hooks/unistd.h
@@ -0,0 +1,115 @@
+#ifndef UNISTD_H
+#define UNISTD_H
+/**
+  @file posix/hooks/unistd.h
+  @brief POSIX related declarations that are missing in the toolchain
+  header
+
+EXTERNAL FUNCTIONS
+  None
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  DON'T include this header directly! Instead include unistd.h.
+
+Copyright (c) 2018, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+==============================================================================*/
+#include <sys/types.h> /* For various POSIX ID types from toolchain headers */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern long pathconf (char const * path, int name);
+
+/* Process */
+
+/** The getppid() function shall return the parent process ID of the calling process.
+ * Please refer to POSIX standard for details.
+ * @return the parent process ID
+ */
+pid_t getppid(void);
+
+/** The getpgid() function shall return the process group ID of the process whose process ID is equal to pid.
+ * Please refer to POSIX standard for details.
+ * @param pid [in] process ID
+ * @return process group ID
+ */
+pid_t getpgid(pid_t pid);
+
+/** The getpgrp() function shall return the process group ID of the calling process.
+ * Please refer to POSIX standard for details.
+ * @return process group ID of the calling process
+ */
+pid_t getpgrp(void);
+
+/** The getuid() function shall return the real user ID of the calling process.
+ * Please refer to POSIX standard for details.
+ * @return the real user ID of the calling process
+ */
+uid_t getuid(void);
+
+/** The geteuid() function shall return the effective user ID of the calling process.
+ * Please refer to POSIX standard for details.
+ * @return effective user ID of the calling process
+ */
+uid_t geteuid(void);
+
+/** The getegid() function shall return the effective group ID of the calling process.
+ * Please refer to POSIX standard for details.
+ * @return effective group ID of the calling process
+ */
+gid_t getegid(void);
+
+/** The getgid() function shall return the real group ID of the calling process.
+ * Please refer to POSIX standard for details.
+ * @return real group ID of the calling process
+ */
+gid_t getgid(void);
+
+/** seteuid - set the effective user ID.
+ * Please refer to POSIX standard for details.
+ * @param uid [in] effective user ID
+ * @return Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error.
+ */
+int seteuid(uid_t uid);
+
+/** setpgrp - set the process group ID.
+ * Please refer to POSIX standard for details.
+ * @return Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error.
+ */
+pid_t setpgrp(void);
+
+/** setuid - set the user ID.
+ * Please refer to POSIX standard for details.
+ * @param uid [in] user ID
+ * @return Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error.
+ */
+int setuid(uid_t uid);
+
+/** setpgid - set the process group ID for job control.
+ * Please refer to POSIX standard for details.
+ * @param pid [in] PID of the process
+ * @param pgid [in] PGID to be set
+ * @return Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error.
+ */
+int setpgid(pid_t pid, pid_t pgid);
+
+/** setsid - create a session and set the process group ID.
+ * Please refer to POSIX standard for details.
+ * @return Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error.
+ */
+pid_t setsid(void);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/mqueue.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/mqueue.h
new file mode 100755
index 0000000000000..74dcc2fa202c6
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/mqueue.h
@@ -0,0 +1,203 @@
+#ifndef _POSIX_MQUEUE_H_
+#define _POSIX_MQUEUE_H_
+
+/*==========================================================================
+ * FILE:         mqueue.h
+ *
+ * SERVICES:     POSIX Message Queue API interface
+ *
+ * DESCRIPTION:  POSIX Message Queue API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013, 2016, 2023 Qualcomm Technologies, Inc.
+ * All Rights Reserved.
+ * Confidential and Proprietary - Qualcomm Technologies, Inc.
+ *==========================================================================*/
+
+#include <sys/types.h> /* ssize_t */
+#include <time.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MQ_PRIO_MAX     255 /* max priority */
+#define MQ_PRIO_DEFAULT 0   /* default priority */
+
+typedef int mqd_t;
+
+struct mq_attr
+{
+    long mq_flags;   /* message queue flags */
+    long mq_maxmsg;  /* maximum number of messages */
+    long mq_msgsize; /* maximum message size */
+    long mq_curmsgs; /* number of messages currently queued */
+};
+
+typedef struct mq_attr mqueue_attr;
+
+/** \details
+ * This provides the POSIX Message Queue API.
+ *
+ * mq_notify is not supported.
+ *
+ * Since this implementation of the POSIX kernel API is a subset of PSE51,
+ * it only supports message sending and receiving within one process.
+ * Message sending and receiving among processes are not supported.
+ */
+
+/** \defgroup mqueue POSIX Message Queue API */
+/** \ingroup mqueue */
+/** @{ */
+
+/** Open a message queue.
+ * Please refer to POSIX standard for details.
+ */
+mqd_t mq_open(const char *name, int oflag, /* mode_t mode, struct mq_attr *attr */...);
+
+/** Close a message queue.
+ * Please refer to POSIX standard for details.
+ */
+int mq_close(mqd_t mq_desc);
+
+/** Remove a message queue.
+ * Please refer to POSIX standard for details.
+ */
+int mq_unlink(const char *name);
+
+/** Send a message to a message queue.
+ * Please refer to POSIX standard for details.
+ *
+ * If the queue is full, instead of blocking the sender, this function
+ * will return -1 with errno EAGAIN, in this implementation. This behavior
+ * may change in the future.
+ */
+int mq_send(mqd_t mqdes, const char *msg_ptr, size_t msg_len, unsigned int msg_prio);
+
+/** Send a message to a message queue with timeout.
+ * Please refer to POSIX standard for details.
+ * @param abs_timeout [in] Only abs_timeout={0,0} is supported in this
+ *    implementation. This behavior may change in the future.
+ */
+int mq_timedsend(mqd_t mqdes, const char *msg_ptr, size_t msg_len, unsigned int msg_prio, const struct timespec *abs_timeout);
+
+/** Receive a message from a message queue.
+ * Please refer to POSIX standard for details.
+ */
+ssize_t mq_receive(mqd_t mqdes, char *msg_ptr, size_t msg_len, unsigned int *msg_prio);
+
+/** Receive a message from a message queue with timeout.
+ * Please refer to POSIX standard for details.
+ * @param abs_timeout [in] Only abs_timeout={0,0} is supported in this
+ *    implementation. This behavior may change in the future.
+ */
+ssize_t mq_timedreceive(mqd_t mqdes, char *restrict msg_ptr, size_t msg_len, unsigned int *restrict msg_prio, const struct timespec *restrict abs_timeout);
+
+/** Get message queue attributes.
+ * Please refer to POSIX standard for details.
+ */
+int mq_getattr(mqd_t mqdes, struct mq_attr *mqstat);
+
+/** Set message queue attributes.
+ * Please refer to POSIX standard for details.
+ */
+int mq_setattr(mqd_t mqdes, const struct mq_attr *restrict mqstat, struct mq_attr *restrict omqstat);
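+
+/** A minimal round trip, as an illustrative sketch (the queue name, sizes,
+ * and flag choice are arbitrary and error handling is elided; the oflag
+ * values assume the POSIX_O_* constants from this SDK's fcntl.h):
+ * @code
+ * struct mq_attr attr = { 0, 8, 64, 0 };  // flags, maxmsg, msgsize, curmsgs
+ * mqd_t mq = mq_open("/demo_q", POSIX_O_CREAT | POSIX_O_RDWR, 0644, &attr);
+ * (void)mq_send(mq, "ping", 5, MQ_PRIO_DEFAULT);
+ * char buf[64];
+ * (void)mq_receive(mq, buf, sizeof(buf), NULL);
+ * mq_close(mq);
+ * mq_unlink("/demo_q");
+ * @endcode
+ */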
+
+/** @} */
+
+#define NBBY 8U /* number of bits in a byte */
+
+/*
+ * Select uses bit masks of file descriptors in longs. These macros
+ * manipulate such bit fields (the filesystem macros use chars).
+ * FD_SETSIZE may be defined by the user, but the default here should
+ * be enough for most uses.
+ */
+#ifndef FD_SETSIZE
+#define FD_SETSIZE 256U
+#endif
+
+typedef unsigned long fd_mask;
+#define NFDBITS (sizeof(fd_mask) * (unsigned int)NBBY) /* bits per mask */
+
+#ifndef howmany
+#define howmany(x, y) (((x) + ((y) - 1U)) / (y))
+#endif
+
+// equivalent of fd_set for the WINNT environment
+typedef struct fd_set
+{
+    fd_mask fds_bits[howmany(FD_SETSIZE, NFDBITS)];
+} fd_set;
+
+/** \addtogroup mqueue */
+/** @{ */
+
+/** Sets the bit for the file descriptor fd in the file descriptor set fdset.
+ */
+#define FD_SET(n, p) ((p)->fds_bits[((unsigned int) (n)) / NFDBITS] |= (1UL << (((unsigned int) (n)) % NFDBITS)))
+
+/** Clears the bit for the file descriptor fd in the file descriptor set fdset.
+ */
+#define FD_CLR(n, p) ((p)->fds_bits[((unsigned int) (n)) / NFDBITS] &= ~(1UL << (((unsigned int) (n)) % NFDBITS)))
+
+/** Returns a non-zero value if the bit for the file descriptor fd is set in the file descriptor set pointed to by fdset, and 0 otherwise.
+ */
+#define FD_ISSET(n, p) ((unsigned long)(p)->fds_bits[((unsigned int) (n)) / NFDBITS] & (unsigned long)((unsigned)1U << (((unsigned int) (n)) % NFDBITS)))
+
+/** Copies the file descriptor set.
+ */
+#define FD_COPY(f, t) (void)(memcpy)((t), (f), sizeof(*(f)))
+
+/** Initializes the file descriptor set fdset to have zero bits for all file descriptors.
+ */
+#define FD_ZERO(p) (void)memset((p), 0, sizeof(*(p)))
+
+/** Error check the file descriptor set.
+ */
+#define FD_BAD(fd) ((fd) < 0 /*|| fd >= fd_arraylen || fd_array[fd].obj == 0*/)
+
+/*! Wait for both message queues and signals. In this implementation, only
+ * message queue file descriptors are supported.
+ * @param nfds [in] This is an integer one more than the maximum of any file
+ *    descriptor in any of the sets. In other words, while you are busy
+ *    adding file descriptors to your sets, you must calculate the maximum
+ *    integer value of all of them, then increment this value by one, and
+ *    then pass this as nfds to select().
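+ *    For instance, if the only descriptor in any set is mq, nfds is
+ *    mq + 1 (illustrative).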
+ * @param readfds [in] the file descriptor set on all message queues. + * @param writefds [in] ignored in this implementation. + * @param errorfds [in] ignored in this implementation. + * @param timeout [in] Only timeout={0,0} is supported in this + * implementation. This behavior may change in the future. + */ +int pselect(int nfds, fd_set *restrict readfds, + fd_set *restrict writefds, fd_set *restrict errorfds, + const struct timespec *restrict timeout, + const sigset_t *restrict sigmask); + +/*! Wait for multiple message queues. In this implementation, only + * message queue file descriptors are supported. + * @param nfds [in] This is an integer one more than the maximum of any file + * descriptor in any of the sets. In other words, while you are busy + * adding file descriptors to your sets, you must calculate the maximum + * integer value of all of them, then increment this value by one, and + * then pass this as nfds to select(). + * @param readfds [in] the file descriptor set on all message queues. + * @param writefds [in] ignored in this implementation. + * @param errorfds [in] ignored in this implementation. + * @param timeout [in] Only timeout={0,0} is supported in this + * implementation. This behavior may change in the future. + */ +int select(int nfds, fd_set *restrict readfds, + fd_set *restrict writefds, fd_set *restrict errorfds, + struct timeval *restrict timeout); + +/** @} */ + +/* this function is needed for test framework which needs to clean up memory when teardown */ +void _mq_teardown(void); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/pthread.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/pthread.h new file mode 100755 index 0000000000000..f64242e8dc683 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/pthread.h @@ -0,0 +1,287 @@ +#ifndef QURT_PTHREAD_H +#define QURT_PTHREAD_H + +/*========================================================================== + * FILE: pthread.h + * + * SERVICES: POSIX pthread API interface + * + * DESCRIPTION: POSIX pthread API interface based upon POSIX 1003.1-2004 + * + * Copyright (c) 2013,2016,2023 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential. + *========================================================================== + * + * EDIT HISTORY FOR MODULE + * + * This section contains comments describing changes made to the module. + * Notice that changes are listed in reverse chronological order. + * + * + * + * when who what, where, why + * -------- --- ------------------------------------------------------- + * 10/13/08 cz Initial version. + *==========================================================================*/ + +#include +#include "sys/sched.h" /* For struct sched_param */ +#include "sys/errno.h" /* error values */ +#include +#include +#include +#include +#include +#include "pthread_types.h" +#ifdef __cplusplus +extern "C" { +#endif + +/* the range of the set supported by the kernel data type used to represent CPU sets. */ +#define CONFIG_NR_CPUS QURT_THREAD_CFG_BITMASK_ALL + +#define UNIMPLEMENTED(FUNC, RETURNTYPE, ARGS) static inline RETURNTYPE FUNC ARGS { qurt_printf("Unimplemented: %s... 
exiting\n", __FUNCTION__); exit(1); }
+
+/** @brief Magic (non-portable) value for a stack's address to enable usage
+ of auto-stack feature (if available) */
+#define PTHREAD_AUTO_STACK_MAGIC_ADDR_NP ((void *)0xFFF)
+
+/** \details
+ * This provides the POSIX thread API.
+ *
+ */
+
+/** \defgroup pthread POSIX pthread API */
+/** \ingroup pthread */
+/** @{ */
+
+/** Compare Two Threads.
+ * Please refer to POSIX standard for details.
+ */
+static inline int pthread_equal(pthread_t t1, pthread_t t2)
+{
+    return (t1 == t2) ? 1 : 0;
+}
+
+/** Create Thread.
+ * Please refer to POSIX standard for details.
+ */
+int pthread_create(pthread_t * tid, const pthread_attr_t * attr, void *(*start)(void *), void *arg);
+
+/** Terminate Calling Thread.
+ * Please refer to POSIX standard for details.
+ */
+void pthread_exit(void *value_ptr);
+
+/** Wait for thread termination.
+ * Please refer to POSIX standard for details.
+ * @param thread [in] the thread to be joined
+ * @param value_ptr [out] the pointer of the exit status
+ */
+int pthread_join(pthread_t thread, void **value_ptr);
+
+/** Detach a joinable thread.
+ * Please refer to POSIX standard for details.
+ * @param id [in] id of the thread to be detached.
+ */
+int pthread_detach(pthread_t id);
+
+/** Dynamic package initialisation.
+ * Please refer to POSIX standard for details.
+ */
+int pthread_once(pthread_once_t *once_control, void (*init_routine)(void));
+
+pthread_t pthread_self(void);
+int pthread_cancel(pthread_t thread);
+static inline void pthread_yield(void)
+{
+    return;
+}
+
+int pthread_kill(pthread_t thread, int sig);
+
+/**
+ * @brief Return name of thread
+ * @warning Do not call this in the error handling path as it may cause deadlock
+ * due to underlying OS calls
+ * @param thread [in] thread Thread whose name is to be retrieved
+ * @param name [out] name Buffer used to return thread name
+ * @param len [in] len Number of bytes available in name
+ * @return 0 on success, ESRCH, ERANGE on failure
+ */
+extern int pthread_getname_np (pthread_t thread, char * name, size_t len);
+
+int pthread_getschedparam(pthread_t thread, int *restrict policy, struct sched_param *restrict param);
+int pthread_setschedparam(pthread_t thread, int policy, const struct sched_param *param);
+int pthread_setschedprio(pthread_t thread, int prio);
+int pthread_setcancelstate(int state, int *oldstate);
+int pthread_setcanceltype(int type, int *oldtype);
+
+/* Attribute functions */
+int pthread_attr_init(pthread_attr_t *attr);
+int pthread_attr_destroy(pthread_attr_t *attr);
+int pthread_attr_setschedparam(pthread_attr_t *restrict attr, const sched_param *restrict param);
+int pthread_attr_getschedparam(const pthread_attr_t *restrict attr, sched_param *restrict param);
+int pthread_attr_setstacksize(pthread_attr_t *attr, size_t stacksize);
+int pthread_attr_getstacksize(const pthread_attr_t *attr, size_t *stacksize);
+int pthread_attr_setstackaddr(pthread_attr_t *attr, void * stackaddr);
+int pthread_attr_getstackaddr(const pthread_attr_t *attr, void ** stackaddr);
+int pthread_attr_getdetachstate(const pthread_attr_t *attr, int *detachstate);
+int pthread_attr_setdetachstate(pthread_attr_t *attr, int detachstate);
+int pthread_attr_setstack(pthread_attr_t *attr, void *stackaddr, size_t stacksize);
+int pthread_attr_getstack(const pthread_attr_t *attr, void **stackaddr, size_t *stacksize);
+int pthread_attr_setscope(pthread_attr_t *attr, int scope);
+int pthread_attr_getscope(const pthread_attr_t *attr, int *scope);
+int
pthread_attr_setinheritsched(pthread_attr_t *attr, int inheritsched);
+int pthread_attr_getinheritsched(const pthread_attr_t *attr, int *inheritsched);
+int pthread_attr_getguardsize(const pthread_attr_t * attr, size_t * guardsize);
+int pthread_attr_setautostack(pthread_attr_t *attr);
+int pthread_attr_setbuspriority(pthread_attr_t *attr, unsigned short bus_priority);
+
+/* Qualcomm additions to pthread get/set attribute functions */
+int pthread_attr_setthreadname(pthread_attr_t *attr, const char * name);
+int pthread_attr_getthreadname(const pthread_attr_t *attr, char * name, int size);
+int pthread_attr_settimetestid(pthread_attr_t *attr, unsigned int tid);
+int pthread_attr_gettimetestid(const pthread_attr_t *attr, unsigned int* tid);
+
+/* Mutexes */
+int pthread_mutex_init(pthread_mutex_t *mutex, pthread_mutexattr_t *attr);
+int pthread_mutex_lock(pthread_mutex_t *mutex);
+int pthread_mutex_unlock(pthread_mutex_t *mutex);
+int pthread_mutex_trylock(pthread_mutex_t *mutex);
+int pthread_mutex_destroy(pthread_mutex_t *mutex);
+int pthread_mutex_getprioceiling(const pthread_mutex_t *restrict mutex, int *restrict prioceiling);
+int pthread_mutex_setprioceiling(pthread_mutex_t *restrict mutex, int prioceiling, int *restrict old_ceiling);
+
+/* For a mutex of type PTHREAD_MUTEX_NORMAL, Priority Inheritance is not
+ * supported even if PTHREAD_PRIO_INHERIT is defined, since QURT does not
+ * support this kind of mutex */
+int pthread_mutexattr_init(pthread_mutexattr_t *attr);
+int pthread_mutexattr_destroy(pthread_mutexattr_t *attr);
+int pthread_mutexattr_gettype(const pthread_mutexattr_t *restrict, int *restrict);
+int pthread_mutexattr_settype(pthread_mutexattr_t *attr, int type);
+int pthread_mutexattr_getprotocol(const pthread_mutexattr_t *restrict, int *restrict);
+int pthread_mutexattr_setprotocol(pthread_mutexattr_t *attr, int protocol);
+int pthread_mutexattr_getpshared(const pthread_mutexattr_t *restrict, int *restrict);
+int pthread_mutexattr_setpshared(pthread_mutexattr_t *, int);
+int pthread_mutexattr_getprioceiling(const pthread_mutexattr_t *restrict attr, int *restrict prioceiling);
+int pthread_mutexattr_setprioceiling(pthread_mutexattr_t *attr, int prioceiling);
+
+/* Spinlocks */
+int pthread_spin_init(pthread_spinlock_t *lock, int pshared);
+int pthread_spin_destroy(pthread_spinlock_t *lock);
+int pthread_spin_lock(pthread_spinlock_t *lock);
+int pthread_spin_trylock(pthread_spinlock_t *lock);
+int pthread_spin_unlock(pthread_spinlock_t *lock);
+
+/* Condition variables */
+int pthread_condattr_init(pthread_condattr_t *attr);
+int pthread_condattr_destroy(pthread_condattr_t *attr);
+int pthread_condattr_setpshared(pthread_condattr_t *attr, int pshared);
+int pthread_condattr_getpshared(const pthread_condattr_t *restrict attr, int *restrict pshared);
+int pthread_condattr_setclock(pthread_condattr_t *attr, clockid_t clock);
+int pthread_condattr_getclock(const pthread_condattr_t *restrict attr, clockid_t *restrict clock);
+int pthread_cond_init(pthread_cond_t *cond, pthread_condattr_t *attr);
+int pthread_cond_destroy(pthread_cond_t *cond);
+int pthread_cond_signal(pthread_cond_t *cond);
+int pthread_cond_broadcast(pthread_cond_t *cond);
+int pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex);
+int pthread_cond_timedwait(pthread_cond_t * cond, pthread_mutex_t * mutex, const struct timespec *time);
+
+/* Barriers */
+int pthread_barrier_init(pthread_barrier_t *restrict barrier, const pthread_barrierattr_t *restrict attr, unsigned count);
+int
pthread_barrier_destroy(pthread_barrier_t *barrier);
+int pthread_barrier_wait(pthread_barrier_t *barrier);
+int pthread_barrierattr_init(pthread_barrierattr_t *attr);
+int pthread_barrierattr_destroy(pthread_barrierattr_t *attr);
+int pthread_barrierattr_getpshared(const pthread_barrierattr_t *restrict attr, int *restrict pshared);
+
+
+/* Read-Write locks */
+int pthread_rwlock_init(pthread_rwlock_t *, const pthread_rwlockattr_t *);
+int pthread_rwlock_destroy(pthread_rwlock_t *);
+int pthread_rwlockattr_init(pthread_rwlockattr_t *);
+int pthread_rwlockattr_destroy(pthread_rwlockattr_t *);
+int pthread_rwlockattr_getpshared(const pthread_rwlockattr_t *, int *);
+int pthread_rwlockattr_setpshared(pthread_rwlockattr_t *, int);
+int pthread_rwlock_rdlock(pthread_rwlock_t *);
+int pthread_rwlock_tryrdlock(pthread_rwlock_t *);
+int pthread_rwlock_wrlock(pthread_rwlock_t *);
+int pthread_rwlock_trywrlock(pthread_rwlock_t *);
+int pthread_rwlock_unlock(pthread_rwlock_t *);
+
+
+/** Please refer to POSIX standard document.
+ */
+int pthread_barrierattr_setpshared(pthread_barrierattr_t *attr, int pshared);
+
+/** Set the CPU affinity attribute in a thread attributes object.
+
+ * @param attr [in] pthread attributes
+ * @param cpusetsize [in] The argument cpusetsize is the length (in bytes)
+                          of the buffer pointed to by cpuset. Typically,
+                          this argument would be specified as
+                          sizeof(cpu_set_t).
+ * @param cpuset [in] This data set is a bitset where each bit represents
+                      a CPU (hw thread). How the system's CPUs are mapped
+                      to bits in the bitset is system dependent.
+                      For the QURT kernel, bit 0 corresponds to hw
+                      thread 0, and so on. If the corresponding bit is
+                      set to 1, the software thread is eligible to run on
+                      that hw thread. 0x3f means it can run on any hw
+                      thread; 0x0 also means it can run on any hw thread.
+   @return On success, this function returns 0; on error, it returns a
+           non-zero error number.
+           EINVAL - cpuset specified a CPU that was outside the set supported
+                    by the kernel. (The kernel configuration option
+                    CONFIG_NR_CPUS defines the range of the set supported by
+                    the kernel data type used to represent CPU sets.)
+ * @note This function is a non-standard GNU extension; hence the suffix "_np"
+        (non-portable) in the name.
+ */
+int pthread_attr_setaffinity_np(pthread_attr_t *attr, size_t cpusetsize, const cpu_set_t *cpuset);
+
+/** Get the CPU affinity attribute from a thread attributes object.
+ * @param attr [in] pthread attributes
+ * @param cpusetsize [in] The argument cpusetsize is the length (in bytes)
+                          of the buffer pointed to by cpuset. Typically,
+                          this argument would be specified as
+                          sizeof(cpu_set_t).
+ * @param cpuset [out] This data set is a bitset where each bit represents
+                       a CPU (hw thread). How the system's CPUs are mapped
+                       to bits in the bitset is system dependent.
+                       For the QURT kernel, bit 0 corresponds to hw
+                       thread 0, and so on. If the corresponding bit is
+                       set to 1, the software thread is eligible to run on
+                       that hw thread. 0x3f means it can run on any hw
+                       thread; 0x0 also means it can run on any hw thread.
+   @return On success, this function returns 0; on error, it returns a
+           non-zero error number.
+           EINVAL - cpusetsize is smaller than the size of the affinity mask
+                    used by the kernel.
+ * @note This function is a non-standard GNU extension; hence the suffix "_np"
+        (non-portable) in the name.
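+ *
+ * An illustrative sketch (not part of the original header), pinning a
+ * hypothetical worker thread to hw threads 0 and 1:
+ * @code
+ *     pthread_attr_t attr;
+ *     cpu_set_t mask = 0x3;                  // bits 0 and 1 set
+ *     pthread_attr_init(&attr);
+ *     pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &mask);
+ *     // ... pass &attr to pthread_create(), then query it back:
+ *     pthread_attr_getaffinity_np(&attr, sizeof(cpu_set_t), &mask);
+ * @endcode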
+ */
+int pthread_attr_getaffinity_np(pthread_attr_t *attr, size_t cpusetsize, cpu_set_t *cpuset);
+
+/* TLS */
+int pthread_key_create(pthread_key_t *key, void (*destructor)(void*));
+int pthread_key_delete(pthread_key_t key);
+int pthread_setspecific(pthread_key_t key, const void *value);
+void *pthread_getspecific(pthread_key_t key);
+int pthread_getattr_np(pthread_t thread, pthread_attr_t * restrict attr);
+
+/** @} */
+
+/* Non-pthread callers call this function to create a pthread TCB without creating an actual thread */
+int pthread_fake(pthread_t * restrict thread, const pthread_attr_t * restrict attr);
+int pthread_fake_destroy(pthread_t thread);
+
+//amitkulk: move these to unistd.h after we move that header within qurt
+int posix_memalign(void **memptr, size_t alignment, size_t size);
+void exit(int status);
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* QURT_PTHREAD_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/pthread_types.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/pthread_types.h
new file mode 100755
index 0000000000000..51c3b9dbca243
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/pthread_types.h
@@ -0,0 +1,193 @@
+#ifndef _PTHREAD_TYPES_H_
+#define _PTHREAD_TYPES_H_
+
+/*==========================================================================
+ * FILE: pthread_types.h
+ *
+ * SERVICES: types used in the POSIX API interface
+ *
+ * DESCRIPTION: POSIX API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2016, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+
+ *==========================================================================*/
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __GNUC__
+#define restrict __restrict__
+#else
+#define restrict
+#endif
+
+#define _SSIZE_T
+
+#ifndef TRUE
+#define TRUE 1
+#endif
+
+#ifndef FALSE
+#define FALSE 0
+#endif
+
+#define PTHREAD_MAX_THREADS 512U
+
+#define PTHREAD_NAME_LEN 16
+#define PTHREAD_MIN_STACKSIZE 512 //4096
+#define PTHREAD_MAX_STACKSIZE 1048576
+#define PTHREAD_DEFAULT_STACKSIZE 16384
+
+#define PTHREAD_STACK_MIN (4096U*2U)
+#define PTHREAD_MIN_PRIORITY 0U
+#define PTHREAD_MAX_PRIORITY 255U
+#define PTHREAD_DEFAULT_PRIORITY 1
+
+/* Mutex initialization status */
+#define PTHREAD_MUTEX_ATTR_UNINITIALIZED 0
+#define PTHREAD_MUTEX_ATTR_INITIALIZED 1
+
+/* Condition attributes initialization status */
+#define PTHREAD_COND_ATTR_UNINITIALIZED 0
+#define PTHREAD_COND_ATTR_INITIALIZED 1
+
+#define PTHREAD_DEFAULT_NAME "Anonymous"
+
+#define PTHREAD_MUTEX_INITIALIZER ((pthread_mutex_t) 0xFFFFFFFFU)
+
+#define PTHREAD_COND_INITIALIZER ((pthread_cond_t) 0xFFFFFFFFU)
+
+/* mutex and cond_var shared */
+#define PTHREAD_PROCESS_PRIVATE 0
+#define PTHREAD_PROCESS_SHARED 1
+
+/* mutex type */
+#define PTHREAD_MUTEX_ERRORCHECK 0
+#define PTHREAD_MUTEX_NORMAL 1
+#define PTHREAD_MUTEX_RECURSIVE 2
+#define PTHREAD_MUTEX_DEFAULT 3
+
+/* mutex protocol */
+#define PTHREAD_PRIO_NONE 0
+#define PTHREAD_PRIO_INHERIT 1
+#define PTHREAD_PRIO_PROTECT 2
+
+#define PTHREAD_SPINLOCK_UNLOCKED 0
+#define PTHREAD_SPINLOCK_LOCKED 1
+
+#define PTHREAD_ONCE_INIT (0)
+
+#define PTHREAD_MUTEX_OPAQUE //ToDo: amitkulk: debug
+
+typedef signed int ssize_t;
+
+/* detachstate of a pthread */
+#define PTHREAD_CREATE_JOINABLE 1
+#define PTHREAD_CREATE_DETACHED 0
+
+/* contention scope */
+#define PTHREAD_SCOPE_PROCESS 1
+#define PTHREAD_SCOPE_SYSTEM 0
+
+/* scheduler */
+#define PTHREAD_INHERIT_SCHED 1
+#define PTHREAD_EXPLICIT_SCHED 0 + +/* + * Types and structure definitions + * + */ +typedef unsigned int cpu_set_t; + +typedef unsigned int pthread_t; + +typedef struct pthread_attr_t +{ + void *stackaddr; + int internal_stack; /* this flag==1 means the stack needs to be freed by posix */ + size_t stacksize; + int priority; + unsigned short timetest_id; + /* This flag indicate if thread will be autostack thread*/ + unsigned short autostack:1; + /* This flag is to indicate thread's bus_priority high/low + bus_priority = 0 -- Bus_priority is low + bus_priority = 1 -- Bus_priority is high + bus_priority = 3 -- Bus_priority is default (takes the default set for the process) + */ + unsigned short bus_priority:2; + unsigned short reserved:13; + cpu_set_t cpumask; + char name[PTHREAD_NAME_LEN]; + /* This flag indicates whether pthread lib should create thread contexts for other OSALs */ + /* This is used internally by POSIX and not available for general usage */ + int ext_context; + int detachstate; +} pthread_attr_t; + +//mutex attr +typedef struct pthread_mutexattr_t pthread_mutexattr_t; +struct pthread_mutexattr_t +{ + int is_initialized; + int type; + int pshared; + int protocol; +}; + +typedef unsigned int pthread_mutex_t; + +typedef unsigned int pthread_spinlock_t; + +typedef struct pthread_condattr_t +{ + int is_initialized; + int pshared; + clockid_t clock_id; +} pthread_condattr_t; + +typedef unsigned int pthread_cond_t; + +typedef struct pthread_barrierattr_t +{ + int is_initialized; + int pshared; +} pthread_barrierattr_t; + +typedef unsigned int pthread_barrier_t; + +typedef int pthread_key_t; + +typedef int pthread_once_t; + + +/*Read-Write locks*/ +#define PTW32_RWLOCK_MAGIC 0xfacade2 +#define PTHREAD_RWLOCK_INITIALIZER ((pthread_rwlock_t)(size_t) -1) + +struct pthread_rwlockattr_t_ +{ + int pshared; +}; + +struct pthread_rwlock_t_ +{ + pthread_mutex_t mtxExclusiveAccess; + pthread_mutex_t mtxSharedAccessCompleted; + pthread_cond_t cndSharedAccessCompleted; + int nSharedAccessCount; + int nExclusiveAccessCount; + int nCompletedSharedAccessCount; + int nMagic; +}; + +typedef struct pthread_rwlock_t_ * pthread_rwlock_t; +typedef struct pthread_rwlockattr_t_ * pthread_rwlockattr_t; +#ifdef __cplusplus +} +#endif + +#endif /* _PTHERAD_TYPES_H_ */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/sched.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/sched.h new file mode 100755 index 0000000000000..faf3365be9f82 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/sched.h @@ -0,0 +1,21 @@ +/*============================================================================= + + sched.h + +GENERAL DESCRIPTION + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2013,2016 by Qualcomm Technologies, Inc. All Rights Reserved. 
+=============================================================================*/ +#ifndef __SCHED_H__ +#define __SCHED_H__ + +#include "sys/sched.h" + +#endif //__SCHED_H__ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/semaphore.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/semaphore.h new file mode 100755 index 0000000000000..d9145b295ae62 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/semaphore.h @@ -0,0 +1,114 @@ +#ifndef SEMAPHORE_H +#define SEMAPHORE_H + +/*========================================================================== + * FILE: semaphore.h + * + * SERVICES: POSIX semaphore API interface + * + * DESCRIPTION: POSIX semaphore API interface based upon POSIX 1003.1-2004 + * + * Copyright (c) 2013, 2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential. + + *==========================================================================*/ +#include // Get all C sys types - includes POSIX specific +#include "sys/errno.h" // error values + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + TYPEDEFS +=============================================================================*/ +/** User facing semaphore container with opaque pointer to implementation */ +typedef struct +{ + unsigned int *opaque; +} sem_t; +#define _SEM_T + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/* constant definitions */ +#define SEM_FAILED ((sem_t*) 0) + +/* @todo siqbal Should we put such configuration items in a common place + instead of this user-facing header? */ +#define SEM_VALUE_MAX ((unsigned int) 30) // If need be increase this + +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/** \details + * POSIX standard comes with two kinds of semaphores: named and unnamed + * semaphores. + * + * This implementation of POSIX kernel API provide unnamed & named semaphore. + * + * + * sem_timedwait() is not provided. + */ + +/** \defgroup semaphore POSIX Semaphore API */ + +/** \ingroup semaphore */ +/** @{ */ + +/** Initialize an unnamed semaphore. + * Please refer to POSIX standard for details. + * @param pshared [in] This implementation does not support non-zero value, + * i.e., semaphore cannot be shared between processes in this implementation. + */ +int sem_init(sem_t *sem, int pshared, unsigned int value); + +/** Lock a semaphore. + * Please refer to POSIX standard for details. + */ +int sem_wait(sem_t *sem); + +/** Lock a semaphore. + * Please refer to POSIX standard for details. + */ +int sem_trywait(sem_t *sem); + +/** Unlock a semaphore. + * Please refer to POSIX standard for details. + */ +int sem_post(sem_t *sem); + +/** Get the value of a semaphore. + * Please refer to POSIX standard for details. + */ +int sem_getvalue(sem_t *sem, int *value); + +/** Destroy an unnamed semaphore. + * Please refer to POSIX standard for details. + */ +int sem_destroy(sem_t *sem); + +/** creates and initializes a named semaphore. + * Please refer to POSIX standard for details. + */ +sem_t * sem_open(const char* name , int oflag , ...); + +/** closes a semaphore. + * Please refer to POSIX standard for details. 
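+ *
+ * An illustrative sketch of the unnamed-semaphore calls above (not part of
+ * the original header):
+ * @code
+ *     sem_t s;
+ *     sem_init(&s, 0, 1);    // pshared must be 0 in this implementation
+ *     sem_wait(&s);
+ *     // ... critical section ...
+ *     sem_post(&s);
+ *     sem_destroy(&s);
+ * @endcode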
+ */
+int sem_close(sem_t *sem);
+
+/** unlinks a named semaphore.
+ * Please refer to POSIX standard for details.
+ */
+int sem_unlink(const char *name);
+/** @} */
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* SEMAPHORE_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/signal.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/signal.h
new file mode 100755
index 0000000000000..35cb1f1a9a319
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/signal.h
@@ -0,0 +1,201 @@
+#ifndef _SIGNAL_H_
+#define _SIGNAL_H_
+
+/*==========================================================================
+ * FILE: signal.h
+ *
+ * SERVICES: POSIX Signal API interface
+ *
+ * DESCRIPTION: POSIX Signal API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013, 2016, 2023 Qualcomm Technologies, Inc.
+ * All Rights Reserved.
+ * Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+ *==========================================================================*/
+
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* POSIX signal bits */
+
+#define POSIX_MSG 7 /* POSIX msg type used in Qube API */
+#define POSIX_NOTIF 8 /* POSIX msg type used in Qube API */
+#define SIGKILL 9 /* kill (cannot be caught or ignored) */
+
+#define SIGRTMIN 10
+#define SIGRTMAX 32
+
+/* Notification Types. */
+/* No asynchronous notification is delivered when the event of interest occurs. */
+#define SIGEV_NONE 0
+/* The signal specified in sigev_signo shall be generated for the process when
+   the event of interest occurs. */
+#define SIGEV_SIGNAL 1
+/* A notification function is called to perform notification. */
+#define SIGEV_THREAD 2
+#define SA_SIGINFO 1
+
+/*
+ * Flags for sigprocmask:
+ */
+#define SIG_BLOCK 1 /* block specified signal set */
+#define SIG_UNBLOCK 2 /* unblock specified signal set */
+#define SIG_SETMASK 3 /* set specified signal set */
+
+typedef unsigned long int sigset_t;
+
+union sigval
+{
+    int sival_int; /* Integer signal value. */
+    void *sival_ptr; /* Pointer signal value. */
+};
+
+typedef struct sigevent sigevent;
+struct sigevent
+{
+    int sigev_notify; /* Notification type. */
+    int sigev_signo; /* Signal number. */
+    union sigval sigev_value; /* Signal value. */
+    void (*sigev_notify_function)(union sigval); /* Notification function. */
+    pthread_attr_t *sigev_notify_attributes;
+};
+
+typedef struct siginfo_t siginfo_t;
+struct siginfo_t
+{
+    int si_signo;
+    int si_code;
+    union sigval si_value;
+/*  int si_errno;
+    pid_t si_pid;
+    uid_t si_uid;
+    void *si_addr;
+    int si_status;
+    long si_band;*/
+};
+struct sigaction
+{
+    void (*sa_handler)(int);
+    sigset_t sa_mask;
+    int sa_flags;
+    void (*sa_sigaction)(int, siginfo_t *, void *);
+};
+
+/* Signal functions */
+
+/** \details
+ * This provides the POSIX Signal API. Please note that this
+ * implementation does not fully comply with the POSIX standard.
+ *
+ * In the POSIX standard, a signal can be used as an 'interrupt', which means
+ * an incoming signal will interrupt a running thread. After the
+ * registered signal handler is executed, the thread will resume.
+ * This behavior cannot be implemented without modifying the L4 or QURT kernel.
+ * On the other hand, applications need to be carefully written to avoid
+ * problems caused by 'interrupting' signals.
+ * + * Therefore, in this implementation of POSIX signal, thread will + * only receive signals when it explicitly waits for signals, i.e., when + * the thread calls either sigwait() or sigsuspend(). + * + * Therefore, pthread_sigmask(), which set or get signal mask for a thread, + * is not supported, since the signal mask will be set by sigwait() and + * sigsuspend(). + * + * Since this implementation of POSIX kernel API is a subset of PSE51, + * only threads can send and receive signals. The functions related to + * signal operations with processes, such as kill(), sigqueue(), + * sigprocmask(), are not provided. + * + * Queued signal is not supported. + * + * Applications will use signals from SIGRTMIN to SIGRTMAX. + * + * SIGEV_SIGNAL and SIGEV_THREAD are supported. SIGEV_NONE is not + * supported. + * + */ + +/** \defgroup signal POSIX Signal API */ +/** \ingroup signal */ +/** @{ */ + +/** Wait for signals. This implementation does not support queued signals. + * + * Please refer to POSIX standard for details. + */ +int sigwait(const sigset_t *restrict set, int *restrict sig); + +/** Examine and Change Signal Action. + * Please refer to POSIX standard for details. + * + * @param act [in] A pointer to the sigaction structure that describes the + * action to be taken for the signal. Can be NULL. + * The following flags for sa_flags field in struct sigaction are not + * supported: SA_NOCLDSTOP, SA_ONSTACK, SA_RESETHAND, SA_RESTART, + * SA_NOCLDWAIT and SA_NODEFER. Only flag SA_SIGINFO is supported. + * + * @note Define sigaction as macro to avoid a warning when included from + * C++ code - it's causing a "sigaction(...) hides constructor for + * 'struct sigaction'" warning. + */ +/*lint -esym(123,sigaction) Suppress "macro used with no arguments" */ +#define sigaction(sig,act,oact) _sigaction((sig),(act),(oact)) + +/** Wait for signals. + * Please refer to POSIX standard for details. + */ +int sigsuspend(const sigset_t *sigmask); + +/** Add Signal to Signal Set. + * Please refer to POSIX standard for details. + */ +int sigaddset(sigset_t *set, int signo); + +/** Delete Signal from Signal Set. + * Please refer to POSIX standard for details. + */ +int sigdelset(sigset_t *set, int signo); + +/** Initialize and Empty Signal Set. + * Please refer to POSIX standard for details. + */ +int sigemptyset(sigset_t *set); + +/** Initialize and Fill Signal Set. + * Please refer to POSIX standard for details. + */ +int sigfillset(sigset_t *set); + +/** Test for Signal in Signal Set. + * Please refer to POSIX standard for details. + */ +int sigismember(const sigset_t *set, int signo); + +/** @} */ + +/* this is not a public api function */ +int _sigaction(int sig, const struct sigaction *act, struct sigaction *oact); + +/* have to move #include here to solve circular include problems between time.h and signal.h */ +#include + +/** Wait for the time interval specified in the timespec structure referenced + * by timeout. This implementation does not support queued signals. + * For struct siginfo_t, si_code and si_value are ignored in this implementation. + * + * Please refer to POSIX standard for details. 
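+ *
+ * An illustrative sketch (not part of the original header), waiting up to
+ * one second for an application signal using the set operations above:
+ * @code
+ *     sigset_t set;
+ *     siginfo_t info;
+ *     struct timespec ts = {1, 0};
+ *     sigemptyset(&set);
+ *     sigaddset(&set, SIGRTMIN);
+ *     sigtimedwait(&set, &info, &ts);
+ * @endcode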
+ */ +int sigtimedwait(const sigset_t *restrict set, siginfo_t *restrict info, + const struct timespec *restrict timeout); + +#ifdef __cplusplus +} +#endif + +#endif /* _POSIX_SIGNAL_H_ */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/sys/errno.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/sys/errno.h new file mode 100755 index 0000000000000..b9edf57bab6c3 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/sys/errno.h @@ -0,0 +1,20 @@ +#ifndef _SYS_ERRNO_H_ +#define _SYS_ERRNO_H_ + +/*========================================================================== + * FILE: errno.h + * + * SERVICES: POSIX errno header file + * + * DESCRIPTION: POSIX errno based upon POSIX 1003.1-2004 + * + * Copyright (c) 2013, 2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential. + + *==========================================================================*/ + +#include +#ifndef EOK +#define EOK 0 +#endif + +#endif /* _SYS_ERRNO_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/sys/sched.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/sys/sched.h new file mode 100755 index 0000000000000..2acc34d821725 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/sys/sched.h @@ -0,0 +1,67 @@ +#ifndef _POSIX_SCHED_H_ +#define _POSIX_SCHED_H_ + +/*========================================================================== + * FILE: sched.c + * + * SERVICES: POSIX Thread sched API interface + * + * DESCRIPTION: POSIX Thread sched API interface based upon POSIX 1003.1-2004 + * + * Copyright (c) 2013, 2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential. + + + *==========================================================================*/ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define SCHED_FIFO 0 /* First in, first out (FIFO) scheduling policy. */ +#define SCHED_RR 1 /* Round robin scheduling policy. */ +#define SCHED_SPORADIC 2 /* Sporadic server scheduling policy. */ +#define SCHED_OTHER 3 /* Another scheduling policy. */ + +typedef struct sched_param sched_param; +struct sched_param +{ + void *unimplemented; + int sched_priority; +}; + +/** \details + * This provides POSIX sched API. + */ + +/** \defgroup sched POSIX sched API */ +/** \ingroup sched */ +/** @{ */ + +/** Relinquish the CPU. + * Please refer to POSIX standard for details. + */ +static inline int sched_yield(void) +{ + return 0; +} + +/** Get the maximum priority. + * Please refer to POSIX standard for details. + * @param policy [in] SCHED_FIFO is the only valid input for this implementation. + */ +int sched_get_priority_max(int policy); + +/** Get the minimum priority. + * Please refer to POSIX standard for details. + * @param policy [in] SCHED_FIFO is the only valid input for this implementation. 
+ */
+int sched_get_priority_min(int policy);
+
+/** @} */
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _POSIX_SCHED_H_ */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/sys/types.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/sys/types.h
new file mode 100755
index 0000000000000..700026f9f9e4e
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/sys/types.h
@@ -0,0 +1,35 @@
+#ifndef _SYS_TYPES_H_
+#define _SYS_TYPES_H_
+
+/*==========================================================================
+ * FILE: types.h
+ *
+ * SERVICES: types used in the POSIX API interface
+ *
+ * DESCRIPTION: POSIX API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013, 2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+
+ *==========================================================================*/
+
+#if !defined( _PID_T ) || !defined( __pid_t_defined )
+/* POSIX defines pid_t as a signed 32-bit type. The Hexagon toolchain's header
+   defines it as an unsigned 32-bit type, citing a conflict with QuRT POSIX
+   compatibility later. If any such conflicts exist, we should fix them.
+   pid_t is being defined *BEFORE* inclusion of generic/sys/types.h
+   *INTENTIONALLY* to fix this */
+typedef int pid_t;
+#define _PID_T
+#define __pid_t_defined
+#endif
+#include
+#include
+#include
+#include
+
+#ifndef __DEFINED_off_t
+typedef long off_t;
+#define __DEFINED_off_t
+#endif
+
+#endif /* _SYS_TYPES_H_ */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/time.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/time.h
new file mode 100755
index 0000000000000..13aeb1ea9920d
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/time.h
@@ -0,0 +1,142 @@
+#ifndef _POSIX_TIME_H_
+#define _POSIX_TIME_H_
+
+/*==========================================================================
+ * FILE: time.h
+ *
+ * SERVICES: POSIX Timer API interface
+ *
+ * DESCRIPTION: POSIX Timer API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013,2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+ *==========================================================================*/
+
+
+#include
+
+typedef int clockid_t; /* ignored */
+#define _CLOCKID_T
+#define _PROVIDE_POSIX_TIME_DECLS 1
+#include
+/* @todo anandj sys/time.h has a definition for struct timeval but is not
+   included by generic/time.h */
+#include
+
+#define CLOCK_FREQ_NOT_DEFINED -1
+/* Frequency of the sclk used */
+#define TIME_CONV_SCLK_FREQ 19200000
+
+#define RES_CONV_FACTOR1 1
+#define RES_CONV_FACTOR2 1000000000
+
+#if !defined(CLOCK_REALTIME)
+# define CLOCK_REALTIME 0
+#endif
+
+#if !defined(CLOCK_MONOTONIC)
+# define CLOCK_MONOTONIC 1
+#endif
+
+#if !defined(CLOCK_THREAD_CPUTIME_ID)
+# define CLOCK_THREAD_CPUTIME_ID 2
+#endif
+
+#if !defined(CLOCK_PROCESS_CPUTIME_ID)
+# define CLOCK_PROCESS_CPUTIME_ID 3
+#endif
+
+#if !defined(CLOCK_MONOTONIC_RAW)
+# define CLOCK_MONOTONIC_RAW 4
+#endif
+
+#if !defined(CLOCK_REALTIME_COARSE)
+# define CLOCK_REALTIME_COARSE 5
+#endif
+
+#if !defined(CLOCK_MONOTONIC_COARSE)
+# define CLOCK_MONOTONIC_COARSE 6
+#endif
+
+#if !defined(CLOCK_BOOTTIME)
+# define CLOCK_BOOTTIME 7
+#endif
+
+struct itimerspec
+{
+    struct timespec it_interval; /* Timer period. */
+    struct timespec it_value; /* Timer expiration.
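+       Illustrative note, not part of the original header: a one-shot
+       timer uses it_interval = {0,0} and it_value = the first expiry,
+       while a periodic timer sets both, e.g.
+
+           struct itimerspec its = { {1, 0}, {1, 0} };  // first after 1 s, then every 1 s
+           timer_settime(tid, 0, &its, 0);              // tid from timer_create()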
*/ +}; + +/* have to move #include here to solve circular include problems between time.h and signal.h */ +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* Timer functions */ + +/** \details + * POSIX timers can be either of two types: a one-shot type or a periodic + * type. + * + * A one-shot is an armed timer that is set to an expiration time relative + * to either a current time or an absolute time. The timer expires once and + * is disarmed. + * + * A periodic timer is armed with an initial expiration time and a repetition + * interval. Every time the interval timer + * expires, the timer is reloaded with the repetition interval. The timer + * is then rearmed. + */ + +/** \defgroup timer POSIX Timer API */ + +/** \ingroup timer */ +/** @{ */ + +/** Create a POSIX timer. + * Please refer to POSIX standard for details. + * @param clockid [in] ignored in this implementation + * @param evp [in] if non-NULL, points to a sigevent structure. This + * structure, allocated by the application, defines the asynchronous + * notification to occur when the timer expires. If the evp argument is + * NULL, the effect is as if the evp argument pointed to a sigevent + * structure with the sigev_notify member having the value SIGEV_SIGNAL, + * the sigev_signo having a default signal number (SIGALRM), and the + * sigev_value member having the value of the timer ID. + */ +int timer_create(clockid_t clockid, struct sigevent *restrict evp, + timer_t *restrict timerid); + +/** Delete a POSIX timer. + * Please refer to POSIX standard for details. + */ +int timer_delete(timer_t timerid); + +/** Get the time remaining on a POSIX timer. + * Please refer to POSIX standard for details. + */ +int timer_gettime(timer_t timerid, struct itimerspec *value); + + +/** Set the time remaining on a POSIX timer. + * Please refer to POSIX standard for details. + * @param flags [in] ignored in this implementation + */ +int timer_settime(timer_t timerid, int flags, + const struct itimerspec *restrict value, + struct itimerspec *restrict ovalue); +/** Obtain ID of a process CPU-time clock + * @param pid [in] Process ID + * @param clock_id [out] Clock ID + * @return Error values as per POSIX standard + */ +int clock_getcpuclockid (pid_t pid, clockid_t * clock_id); +/** @} */ + +#ifdef __cplusplus +} +#endif + +#endif /* _POSIX_TIME_H_ */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qube/qube.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qube/qube.h new file mode 100755 index 0000000000000..1e31e2deedb38 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qube/qube.h @@ -0,0 +1,51 @@ +#ifndef QUBE_H +#define QUBE_H +/*============================================================================= + + qube.h -- H E A D E R F I L E + +GENERAL DESCRIPTION + Prototypes of qpd API + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2013 by Qualcomm Technologies, Inc. All Rights Reserved. 
+ +=============================================================================*/ + + + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/* Define Error codes as QuRT error codes preceed with QURT_ */ +#ifndef EOK +#define EOK QURT_EOK +#endif /* EOK */ +#ifndef EVAL +#define EVAL QURT_EVAL +#endif /* EVAL */ +#ifndef EMEM +#define EMEM QURT_EMEM +#endif /* EMEM */ +#ifndef EINVALID +#define EINVALID QURT_EINVALID +#endif /* EINVALID */ + + +/*============================================================================= + FUNCTION DECLARATIONS +=============================================================================*/ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QUBE_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/atomic_ops.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/atomic_ops.h new file mode 100755 index 0000000000000..0a9a9f8ba7db5 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/atomic_ops.h @@ -0,0 +1,197 @@ +#ifndef ATOMIC_OPS_H +#define ATOMIC_OPS_H +/** + @file atomic_ops.h + + @brief Type definitions backwards compatible. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved +Confidential and Proprietary - Qualcomm Technologies, Inc. +=============================================================================*/ + + +/* + * Australian Public Licence B (OZPLB) + * + * Version 1-0 + * + * Copyright (c) 2007, Open Kernel Labs, Inc. + * + * All rights reserved. + * + * Developed by: Embedded, Real-time and Operating Systems Program (ERTOS) + * National ICT Australia + * http://www.ertos.nicta.com.au + * + * Permission is granted by National ICT Australia, free of charge, to + * any person obtaining a copy of this software and any associated + * documentation files (the "Software") to deal with the Software without + * restriction, including (without limitation) the rights to use, copy, + * modify, adapt, merge, publish, distribute, communicate to the public, + * sublicense, and/or sell, lend or rent out copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject + * to the following conditions: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimers. + * + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimers in the documentation and/or other materials provided + * with the distribution. + * + * * Neither the name of National ICT Australia, nor the names of its + * contributors, may be used to endorse or promote products derived + * from this Software without specific prior written permission. + * + * EXCEPT AS EXPRESSLY STATED IN THIS LICENCE AND TO THE FULL EXTENT + * PERMITTED BY APPLICABLE LAW, THE SOFTWARE IS PROVIDED "AS-IS", AND + * NATIONAL ICT AUSTRALIA AND ITS CONTRIBUTORS MAKE NO REPRESENTATIONS, + * WARRANTIES OR CONDITIONS OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO ANY REPRESENTATIONS, WARRANTIES OR CONDITIONS + * REGARDING THE CONTENTS OR ACCURACY OF THE SOFTWARE, OR OF TITLE, + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, + * THE ABSENCE OF LATENT OR OTHER DEFECTS, OR THE PRESENCE OR ABSENCE OF + * ERRORS, WHETHER OR NOT DISCOVERABLE. 
+ * + * TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL + * NATIONAL ICT AUSTRALIA OR ITS CONTRIBUTORS BE LIABLE ON ANY LEGAL + * THEORY (INCLUDING, WITHOUT LIMITATION, IN AN ACTION OF CONTRACT, + * NEGLIGENCE OR OTHERWISE) FOR ANY CLAIM, LOSS, DAMAGES OR OTHER + * LIABILITY, INCLUDING (WITHOUT LIMITATION) LOSS OF PRODUCTION OR + * OPERATION TIME, LOSS, DAMAGE OR CORRUPTION OF DATA OR RECORDS; OR LOSS + * OF ANTICIPATED SAVINGS, OPPORTUNITY, REVENUE, PROFIT OR GOODWILL, OR + * OTHER ECONOMIC LOSS; OR ANY SPECIAL, INCIDENTAL, INDIRECT, + * CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES, ARISING OUT OF OR IN + * CONNECTION WITH THIS LICENCE, THE SOFTWARE OR THE USE OF OR OTHER + * DEALINGS WITH THE SOFTWARE, EVEN IF NATIONAL ICT AUSTRALIA OR ITS + * CONTRIBUTORS HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH CLAIM, LOSS, + * DAMAGES OR OTHER LIABILITY. + * + * If applicable legislation implies representations, warranties, or + * conditions, or imposes obligations or liability on National ICT + * Australia or one of its contributors in respect of the Software that + * cannot be wholly or partly excluded, restricted or modified, the + * liability of National ICT Australia or the contributor is limited, to + * the full extent permitted by the applicable legislation, at its + * option, to: + * a. in the case of goods, any one or more of the following: + * i. the replacement of the goods or the supply of equivalent goods; + * ii. the repair of the goods; + * iii. the payment of the cost of replacing the goods or of acquiring + * equivalent goods; + * iv. the payment of the cost of having the goods repaired; or + * b. in the case of services: + * i. the supplying of the services again; or + * ii. the payment of the cost of having the services supplied again. + * + * The construction, validity and performance of this licence is governed + * by the laws in force in New South Wales, Australia. + */ + +/* + * Author: Malcolm Purvis + * Author: Carlos Dyonisio + */ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef unsigned int atomic_plain_word_t; + +/*-------------------------------------------------------------------------*/ + /* Atomic Ops API. */ + +/* + * IMPORTANT! + * If you plan to change the structure atomic_word_t, please add the new + * elements after value. For more information, read the comment in + * arch/arm/libs/atomic_ops/v5/src/arm_atomic_ops.spp:66 + */ + +typedef struct { + volatile atomic_plain_word_t value; +} atomic_word_t; + +#define ATOMIC_INIT(i) { (i) } + +static inline void +atomic_init(atomic_word_t *a, atomic_plain_word_t v) +{ + a->value = v; +} + +#if defined(ARCH_ARM) && defined(ARCH_VER) && (ARCH_VER < 6) && \ + (!defined(__ATOMIC_OPS_IN_KERNEL__) || defined(MACHINE_SMP)) + +/* + * If it is ARMv4/v5, the function declarations may change + * and are defined in the arch specific header file, + * as some of then cannot be declared static because of + * the assembler implementation. + */ + +#else + +/* Arithmetic operations. */ + +void atomic_sub(atomic_word_t *target, atomic_plain_word_t v); + +/* Architecture independent definitions. 
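+
+   Illustrative sketch (not part of the original header):
+
+       static atomic_word_t counter = ATOMIC_INIT(0);
+       atomic_init(&counter, 5u);                       // overwrite the stored value
+       atomic_plain_word_t v = atomic_read(&counter);   // v == 5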
*/ + +static inline atomic_plain_word_t atomic_read(atomic_word_t *target) +{ + return target->value; +} + +typedef unsigned long long atomic64_plain_word_t; + +typedef struct { + volatile atomic64_plain_word_t value; +} atomic64_word_t; + +static inline void +atomic64_init(atomic64_word_t *a, atomic64_plain_word_t v) +{ + a->value = v; +} + +/********************* + Support 64-bit + *********************/ + +atomic64_plain_word_t atomic64_set(atomic64_word_t* target, + atomic64_plain_word_t value); + +void atomic64_xor(atomic64_word_t* target, + atomic64_plain_word_t mask); + +/*---------------------------------------------------------------------------*/ + +/* Architecture independent definitions. */ + +static inline atomic64_plain_word_t atomic64_read(atomic64_word_t *target) +{ + return target->value; +} + +#endif + + +/* Architecture dependent definitions. */ +#include + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* ATOMIC_OPS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/atomic_ops_plat.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/atomic_ops_plat.h new file mode 100755 index 0000000000000..b54b3ff83d978 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/atomic_ops_plat.h @@ -0,0 +1,86 @@ +#ifndef ATOMIC_OPS_PLAT_H +#define ATOMIC_OPS_PLAT_H +/** + @file atomic_ops_plat.h + + @brief Prototypes of atomic operations API backwards compatible. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2013, 2023 Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. +=============================================================================*/ + + +#include + +#ifdef __cplusplus +extern "C" { +#endif +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +#define atomic_set(a,b) qurt_atomic_set((unsigned int *)(a),(unsigned int)(b)) +#define atomic_and(a,b) qurt_atomic_and((unsigned int *)(a),(unsigned int)(b)) +#define atomic_and_return(a,b) qurt_atomic_and_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_or(a,b) qurt_atomic_or((unsigned int *)(a),(unsigned int)(b)) +#define atomic_or_return(a,b) qurt_atomic_or_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_xor(a,b) qurt_atomic_xor((unsigned int *)(a),(unsigned int)(b)) +#define atomic_xor_return(a,b) qurt_atomic_xor_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_set_bit(a,b) qurt_atomic_set_bit((unsigned int *)(a),(unsigned int)(b)) +#define atomic_clear_bit(a,b) qurt_atomic_clear_bit((unsigned int *)(a),(unsigned int)(b)) +#define atomic_change_bit(a,b) qurt_atomic_change_bit((unsigned int *)(a),(unsigned int)(b)) +#define atomic_add(a,b) qurt_atomic_add((unsigned int *)(a),(unsigned int)(b)) +#define atomic_add_return(a,b) qurt_atomic_add_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_add_unless(a,b,c) qurt_atomic_add_unless((unsigned int *)(a),(unsigned int)(b),(unsigned int)(c)) +#define atomic_sub(a,b) qurt_atomic_sub((unsigned int *)(a),(unsigned int)(b)) +#define atomic_sub_return(a,b) qurt_atomic_sub_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_inc(a) qurt_atomic_inc((unsigned int *)(a)) +#define atomic_inc_return(a) qurt_atomic_inc_return((unsigned int *)(a)) +#define atomic_dec(a) qurt_atomic_dec((unsigned 
int *)(a)) +#define atomic_dec_return(a) qurt_atomic_dec_return((unsigned int *)(a)) +#define atomic_compare_and_set(a,b,c) qurt_atomic_compare_and_set((unsigned int *)(a),(unsigned int)(b),(unsigned int)(c)) +#define atomic_barrier qurt_atomic_barrier +#define atomic_barrier_write qurt_atomic_barrier_write +#define atomic_barrier_write_smp qurt_atomic_barrier_write_smp +#define atomic_barrier_read_smp qurt_atomic_barrier_read_smp +#define atomic_barrier_smp qurt_atomic_barrier_smp + +/*============================ + * 64 bits support + *============================ */ +#define atomic64_set(a,b) qurt_atomic64_set((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_and(a,b) qurt_atomic64_and((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_and_return(a,b) qurt_atomic64_and_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_or(a,b) qurt_atomic64_or((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_or_return(a,b) qurt_atomic64_or_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_xor(a,b) qurt_atomic64_xor((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_xor_return(a,b) qurt_atomic64_xor_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_set_bit(a,b) qurt_atomic64_set_bit((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_clear_bit(a,b) qurt_atomic64_clear_bit((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_change_bit(a,b) qurt_atomic64_change_bit((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_add(a,b) qurt_atomic64_add((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_add_return(a,b) qurt_atomic64_add_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_sub(a,b) qurt_atomic64_sub((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_sub_return(a,b) qurt_atomic64_sub_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_inc(a) qurt_atomic64_inc((unsigned long long *)(a)) +#define atomic64_inc_return(a) qurt_atomic64_inc_return((unsigned long long *)(a)) +#define atomic64_dec(a) qurt_atomic64_dec((unsigned long long *)(a)) +#define atomic64_dec_return(a) qurt_atomic64_dec_return((unsigned long long *)(a)) +#define atomic64_compare_and_set(a,b,c) qurt_atomic64_compare_and_set((unsigned long long *)(a),(unsigned long long )(b),(unsigned long long )(c)) +#define atomic64_barrier qurt_atomic64_barrier +#define atomic64_barrier_write qurt_atomic64_barrier_write +#define atomic64_barrier_write_smp qurt_atomic64_barrier_write_smp +#define atomic64_barrier_read_smp qurt_atomic64_barrier_read_smp +#define atomic64_barrier_smp qurt_atomic64_barrier_smp + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* ATOMIC_OPS_PLAT_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt.h new file mode 100755 index 0000000000000..4d25c9b2b6243 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt.h @@ -0,0 +1,111 @@ +#ifndef QURT_H +#define QURT_H + +/** + @file qurt.h + @brief Contains kernel header files that provide kernel OS API functions, constants, and + definitions + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013,2021,2023 by Qualcomm Technologies, Inc. All Rights Reserved. 
+ Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ +/*====================================================================== + * + * EDIT HISTORY FOR FILE + * + * This section contains comments describing changes made to the + * module. Notice that changes are listed in reverse chronological + * order. + * + * + * + * + * when who what, where, why + * ---------- --- ------------------------------------------------ + * 2011-02-25 op Add Header file + 2012-12-16 cm (Tech Pubs) Edited/added Doxygen comments and markup. + ======================================================================*/ + + +#ifdef __cplusplus +extern "C" { +#endif + +#include "qurt_consts.h" +#include "qurt_api_version.h" +#include "qurt_alloc.h" +#include "qurt_futex.h" +#include "qurt_mutex.h" +#include "qurt_pipe.h" +#include "qurt_printf.h" +#include "qurt_assert.h" +#include "qurt_thread.h" +#include "qurt_trace.h" +#include "qurt_cycles.h" +#include "qurt_profile.h" +#include "qurt_sem.h" +#include "qurt_cond.h" +#include "qurt_barrier.h" +#include "qurt_fastint.h" +#include "qurt_allsignal.h" +#include "qurt_anysignal.h" +#include "qurt_signal.h" +#include "qurt_rmutex.h" +#include "qurt_pimutex.h" +#include "qurt_signal2.h" +#include "qurt_rmutex2.h" +#include "qurt_pimutex2.h" +#include "qurt_int.h" +#include "qurt_lifo.h" +#include "qurt_power.h" +#include "qurt_event.h" +#include "qurt_pmu.h" +#include "qurt_stid.h" +//#include "qurt_version.h" +#include "qurt_tlb.h" +#include "qurt_vtlb.h" +#include "qurt_memory.h" +#include "qurt_qdi.h" +#include "qurt_sclk.h" +#include "qurt_space.h" +#include "qurt_process.h" +#include "qurt_timer.h" +#include "qurt_tls.h" +#include "qurt_thread_context.h" +#include "qurt_hvx.h" +#include "qurt_hmx.h" +#include "qurt_mailbox.h" +#include "qurt_island.h" +#include "qurt_qdi_proxy.h" +#include "qurt_l2cfg.h" +#include "qurt_mmap.h" +#include "qurt_isr.h" +#include "qurt_busywait.h" +#include "qurt_ecc.h" +#include "qurt_callback.h" +#include "qurt_error.h" +#include "qurt_except.h" +#include "qurt_mq.h" +#include "qurt_user_dma.h" +#include "qurt_fs_hub.h" +#include "qurt_os_services.h" + +#ifndef MAIN_ONLY +#define INCLUDE_ISLAND_CONTENTS +#endif +#ifndef ISLAND_ONLY +#define INCLUDE_MAIN_CONTENTS +#endif + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_alloc.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_alloc.h new file mode 100755 index 0000000000000..da37a4c0a714e --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_alloc.h @@ -0,0 +1,145 @@ +#ifndef QURT_ALLOC_H +#define QURT_ALLOC_H + +/** + @file qurt_alloc.h + @brief Prototypes of kernel memory allocation API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021, 2023 Qualcomm Technologies, Inc. + All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +/*======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup func_qurt_malloc + Dynamically allocates the specified array on the QuRT system heap. + The return value is the address of the allocated memory area. 
+
+   @note1hang The allocated memory area is automatically initialized to zero.
+
+   @param[in] size Size (in bytes) of the memory area.
+
+   @return
+   Nonzero -- Pointer to the allocated memory area. \n
+   0 -- Not enough memory in heap to allocate the memory area.
+
+   @dependencies
+   None.
+
+ */
+/* ======================================================================*/
+void *qurt_malloc( unsigned int size);
+
+/*======================================================================*/
+/**@ingroup func_qurt_calloc
+   Dynamically allocates the specified array on the QuRT system heap.
+   The return value is the address of the allocated array.
+
+   @note1hang The allocated memory area is automatically initialized to zero.
+
+   @param[in] elsize Size (in bytes) of each array element.
+   @param[in] num Number of array elements.
+
+   @return
+   Nonzero -- Pointer to allocated array.\n
+   Zero -- Not enough memory in heap to allocate array.
+
+   @dependencies
+   None.
+
+ */
+ /* ======================================================================*/
+void *qurt_calloc(unsigned int elsize, unsigned int num);
+
+/*======================================================================*/
+/**@ingroup func_qurt_realloc
+   Reallocates memory on the heap. \n
+   Changes the size of a memory area that is already allocated on the QuRT system heap.
+   The reallocate memory operation is functionally similar to realloc. It accepts a pointer
+   to an existing memory area on the heap, and resizes the memory area to the specified size
+   while preserving the original contents of the memory area.
+
+   @note1hang This function might change the address of the memory area.
+   If the value of ptr is NULL, this function is equivalent to
+   qurt_malloc().
+   If the value of newsize is 0, it is equivalent to qurt_free().
+   If the memory area is expanded, the added memory is not initialized.
+
+   @param[in] *ptr Pointer to the address of the memory area.
+   @param[in] newsize Size (in bytes) of the reallocated memory area.
+
+   @return
+   Nonzero -- Pointer to reallocated memory area. \n
+   0 -- Not enough memory in heap to reallocate the memory area.
+
+   @dependencies
+   None.
+
+ */
+ /* ======================================================================*/
+void *qurt_realloc(void *ptr, int newsize);
+
+/*======================================================================*/
+/**@ingroup func_qurt_free
+   Frees allocated memory from the heap.\n
+   Deallocates the specified memory from the QuRT system heap.
+
+   @param[in] *ptr Pointer to the address of the memory to deallocate.
+
+   @return
+   None.
+
+   @dependencies
+   The memory item that the ptr value specifies must have been previously
+   allocated using one of the qurt_calloc(),
+   qurt_malloc(), or qurt_realloc() memory allocation functions.
+   Otherwise the behavior of QuRT is undefined.
+
+ */
+ /* ======================================================================*/
+void qurt_free( void *ptr);
+
+
+void *qurt_memalign(unsigned int alignment, unsigned int size);
+
+/*
+|| Macro to define a static heap for a QuRT program.
+||
+|| Usage:
+|| Declare at the top level of any C source file that
+|| is part of the build (and is guaranteed
+|| to actually be pulled into the build). Place
+|| it in the same file with main():
+||
+|| QURT_DECLARE_STATIC_HEAP(512000);
+||
+|| The only argument is the size in bytes, and it is
+|| rounded up to the nearest 64 bytes (size of an
+|| L2 cache block).
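+||
+|| Illustrative sketch (not part of the original header), e.g. in the
+|| file that defines main():
+||
+||     QURT_DECLARE_STATIC_HEAP(512000);   // rounded up to a 64-byte multiple
+||
+||     int main(void)
+||     {
+||         char *buf = qurt_malloc(128);   // served from the heap declared above
+||         if (buf != 0) qurt_free(buf);
+||         return 0;
+||     }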
+|| +*/ + +#define QURT_DECLARE_STATIC_HEAP(sz) \ + static struct qurt_static_heap { \ + char space[(sz)] __attribute__((aligned(64))); \ + } static_heap[1]; \ + void * const override_heap_Base = &static_heap[0]; \ + void * const override_heap_Limit = &static_heap[1] + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ALLOC_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_allsignal.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_allsignal.h new file mode 100755 index 0000000000000..5dc89e495130d --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_allsignal.h @@ -0,0 +1,176 @@ + +#ifndef QURT_ALLSIGNAL_H +#define QURT_ALLSIGNAL_H + +/** + @file qurt_allsignal.h + @brief Prototypes of kernel signal API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup all_signal_types +@{ */ +/*===================================================================== + Typedefs + ======================================================================*/ + +/** +qurt_signal_t supersedes qurt_allsignal_t. This type definition was added for backwards compatibility. */ +typedef union { + /** @cond */ + unsigned long long int raw; + struct { + unsigned int waiting; /**< */ + unsigned int signals_in; /**< */ + unsigned int queue; /**< */ + unsigned int reserved; /**< */ + }X; + /** @endcond */ +} qurt_allsignal_t; +/** @} */ /* end_addtogroup all_signal_types */ + +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_init + Initializes an all-signal object.\n + The all-signal object is initially cleared. + + @datatypes + #qurt_allsignal_t + + @param[out] signal Pointer to the all-signal object to initialize. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_allsignal_init(qurt_allsignal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_destroy + Destroys the specified all-signal object.\n + @note1hang All-signal objects must be destroyed when they are no longer in use. + Failure to do this causes resource leaks in the QuRT kernel. \n + @note1cont All-signal objects must not be destroyed while they are still in use. + If this occurs, the behavior of QuRT is undefined. + + @datatypes + #qurt_allsignal_t + + @param[in] signal Pointer to the all-signal object to destroy. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_allsignal_destroy(qurt_allsignal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_get + Gets signal values from the all-signal object. + + Returns the current signal values of the specified all-signal object. + + @datatypes + #qurt_allsignal_t + + @param[in] signal Pointer to the all-signal object to access. 
+ + @return + Bitmask with current signal values. + + @dependencies + None. +*/ +/* ======================================================================*/ +static inline unsigned int qurt_allsignal_get(qurt_allsignal_t *signal) +{ return signal->X.signals_in; } + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_wait + Waits on the all-signal object.\n + Suspends the current thread until all of the specified signals are set. + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be waited on, and 0 that it is not to be waited on. + + If a signal is set in an all-signal object, and a thread is waiting on the all-signal object for + that signal, the thread is awakened. If the awakened thread has higher priority than + the current thread, a context switch can occur. + + Unlike any-signals, all-signals do not need to explicitly clear any set signals in an all-signal + object before waiting on them again -- clearing is done automatically by the wait + operation. + + @note1hang At most, one thread can wait on an all-signal object at any given time. + Because signal clearing is done by the wait operation, no clear operation is + defined for all-signals. + + @datatypes + #qurt_allsignal_t + + @param[in] signal Pointer to the all-signal object to wait on. + @param[in] mask Signal mask value, which identifies the individual signals in the all-signal object + to wait on. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_allsignal_wait(qurt_allsignal_t *signal, unsigned int mask); + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_set + Set signals in the specified all-signal object. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit + value of 1 indicates that a signal must be set, and 0 indicates not to set the signal. + + @datatypes + #qurt_allsignal_t + + @param[in] signal Pointer to the all-signal object to modify. + @param[in] mask Signal mask value identifying the individual signals to + set in the all-signal object. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_allsignal_set(qurt_allsignal_t *signal, unsigned int mask); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ALLSIGNAL_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_anysignal.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_anysignal.h new file mode 100755 index 0000000000000..9619e2de562b4 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_anysignal.h @@ -0,0 +1,225 @@ +#ifndef QURT_ANYSIGNAL_H +#define QURT_ANYSIGNAL_H +/** + @file qurt_anysignal.h + Prototypes of kernel signal API functions. + + EXTERNALIZED FUNCTIONS + None + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None + +Copyright (c) 2021 Qualcomm Technologies, Inc. +All rights reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. 
+ ======================================================================*/
+
+#include "qurt_signal.h"  /* Assumed include (name lost in extraction): this file
+                             uses qurt_signal_t and the qurt_signal_* wrappers below. */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=====================================================================
+ Typedefs
+======================================================================*/
+
+/**@ingroup anysignals_types
+  qurt_signal_t supersedes qurt_anysignal_t. This type definition was added for backwards compatibility. */
+typedef qurt_signal_t qurt_anysignal_t;
+
+/*=====================================================================
+  Functions
+======================================================================*/
+
+/*======================================================================*/
+/**@ingroup func_qurt_anysignal_init
+  Initializes an any-signal object.\n
+  The any-signal object is initially cleared.
+
+  @datatypes
+  #qurt_anysignal_t
+
+  @param[out] signal Pointer to the any-signal object to initialize.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+/* ======================================================================*/
+static inline void qurt_anysignal_init(qurt_anysignal_t *signal)
+{
+    qurt_signal_init(signal);
+}
+
+/*======================================================================*/
+/**@ingroup func_qurt_anysignal_destroy
+  Destroys the specified any-signal object.
+
+  @note1hang Any-signal objects must be destroyed when they are no longer in use. Failure
+  to do this causes resource leaks in the QuRT kernel.\n
+  @note1cont Any-signal objects must not be destroyed while they are still in use. If this
+  occurs, the behavior of QuRT is undefined.
+
+  @datatypes
+  #qurt_anysignal_t
+
+  @param[in] signal Pointer to the any-signal object to destroy.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+/* ======================================================================*/
+static inline void qurt_anysignal_destroy(qurt_anysignal_t *signal)
+{
+    qurt_signal_destroy(signal);
+}
+
+/*======================================================================*/
+/**@ingroup func_qurt_anysignal_wait
+  Waits on the any-signal object. \n
+  Suspends the current thread until any one of the specified signals is set.
+
+  Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1
+  indicates that a signal must be waited on, and 0 indicates not to wait on the signal.
+  If a signal is set in an any-signal object, and a thread is waiting on the any-signal object for
+  that signal, the thread is awakened. If the awakened thread has higher priority than
+  the current thread, a context switch can occur.
+
+  @note1hang At most, one thread can wait on an any-signal object at any given time.
+
+  @datatypes
+  #qurt_anysignal_t
+
+  @param[in] signal Pointer to the any-signal object to wait on.
+  @param[in] mask   Signal mask value, which specifies the individual signals in the any-signal
+                    object to wait on.
+
+  @return
+  Bitmask of current signal values.
+
+  @dependencies
+  None.
+ */
+/* ======================================================================*/
+static inline unsigned int qurt_anysignal_wait(qurt_anysignal_t *signal, unsigned int mask)
+{
+    return qurt_signal_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ANY);
+}
+
+/*======================================================================*/
+/**@ingroup func_qurt_anysignal_set
+  Sets signals in the specified any-signal object. \n
+  Signals are represented as bits 0 through 31 in the 32-bit mask value.
A mask bit value of 1
+  indicates that a signal must be set, and 0 indicates not to set the signal.
+
+  @datatypes
+  #qurt_anysignal_t
+
+  @param[in] signal Pointer to the any-signal object to modify.
+  @param[in] mask   Signal mask value identifying the individual signals to
+                    set in the any-signal object.
+
+  @return
+  Bitmask of old signal values (before set).
+
+  @dependencies
+  None.
+ */
+/* ======================================================================*/
+unsigned int qurt_anysignal_set(qurt_anysignal_t *signal, unsigned int mask);
+
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_anysignal_get
+  Gets signal values from the any-signal object.\n
+  Returns the current signal values of the specified any-signal object.
+
+  @datatypes
+  #qurt_anysignal_t
+
+  @param[in] signal Pointer to the any-signal object to access.
+
+  @return
+  A bitmask with the current signal values of the specified any-signal object.
+
+  @dependencies
+  None.
+ */
+/* ======================================================================*/
+static inline unsigned int qurt_anysignal_get(qurt_anysignal_t *signal)
+{
+    return qurt_signal_get(signal);
+}
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_anysignal_clear
+  @xreflabel{sec:anysignal_clear}
+  Clears signals in the specified any-signal object.\n
+  Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1
+  indicates that a signal must be cleared, and 0 indicates not to clear the signal.
+
+  @datatypes
+  #qurt_anysignal_t
+
+  @param[in] signal Pointer to the any-signal object to modify.
+  @param[in] mask   Signal mask value identifying the individual signals to
+                    clear in the any-signal object.
+
+  @return
+  Bitmask -- Old signal values (before clear).
+
+  @dependencies
+  None.
+ */
+/* ======================================================================*/
+unsigned int qurt_anysignal_clear(qurt_anysignal_t *signal, unsigned int mask);
+
+/*======================================================================*/
+/**@ingroup func_qurt_anysignal_wait_timed
+  Waits on the any-signal object. \n
+  Suspends the current thread until any of the specified signals is set or the
+  timeout expires.
+
+  Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1
+  indicates that a signal must be waited on, and 0 indicates not to wait on the signal.
+  If a signal is set in an any-signal object, and a thread is waiting on the any-signal object for
+  that signal, the thread is awakened. If the awakened thread has higher priority than
+  the current thread, a context switch can occur.
+
+  @note1hang At most, one thread can wait on an any-signal object at any given time.
+
+  @datatypes
+  #qurt_anysignal_t
+
+  @param[in]  signal   Pointer to the any-signal object to wait on.
+  @param[in]  mask     Signal mask value, which specifies the individual signals in the any-signal
+                       object to wait on.
+  @param[out] signals  Bitmask of current signal values.
+  @param[in]  duration Interval (in microseconds); the duration value must be between
+                       #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_ETIMEDOUT -- Timeout. \n
+  #QURT_EINVALID -- Duration out of range.
+
+  @dependencies
+  None.
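+
+  A usage sketch (illustrative only; the signal bit and the 10 ms timeout are
+  arbitrary values chosen for the example):
+  @code
+  qurt_anysignal_t sig;
+  unsigned int observed;
+
+  qurt_anysignal_init(&sig);
+  // ... another thread eventually calls qurt_anysignal_set(&sig, 1U << 0) ...
+  if (qurt_anysignal_wait_timed(&sig, 1U << 0, &observed, 10000ULL) == QURT_EOK) {
+      qurt_anysignal_clear(&sig, observed);  // Acknowledge the signal.
+  }
+  qurt_anysignal_destroy(&sig);
+  @endcode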
+ */ +/* ======================================================================*/ + +int qurt_anysignal_wait_timed(qurt_anysignal_t *signal, unsigned int mask, unsigned int *signals, unsigned long long int duration); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ANYSIGNAL_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_api_version.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_api_version.h new file mode 100755 index 0000000000000..dfe53ae755054 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_api_version.h @@ -0,0 +1,77 @@ +#ifndef QURT_API_VERSION_H +#define QURT_API_VERSION_H +/*============================================================================== + +qurt_api_version.h + +GENERAL DESCRIPTION + API version file + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +==============================================================================*/ + +/*============================================================================== + CONSTANTS AND DEFINITIONS +==============================================================================*/ +/** + * Each field of the QURT_API_VERSION definitions is an 8-bit unsigned integer. + * Main release has first 3 fields updated - Major, Minor and Release. + * - QURT_API_VERSION = Major, Minor, Release. + * Patch releases are supported by adding the extra field. + * - QURT_API_VERSION = Major, Minor, Release, Patch. + */ +// Major version is incremented for incompatible API changes. +#define QURT_API_VER_MAJOR 1 + +// Minor version is incremented for backward-compatible enhancements in the API +// set. +#define QURT_API_VER_MINOR 4 + +// RELEASE version is incremented for each release within a `MAJOR.MINOR` +// release. +#define QURT_API_VER_RELEASE 1 + +// Patch version is incremented when new API content is introduced on older LTS +// release. +#define QURT_API_VER_PATCH 0 + +/* Update the QURT_API_VERSION function macro. */ +#define QURT_API_VERSION_ENCODE(major, minor, release, patch) \ + ((((major) & 0xFF) << 24) | (((minor) & 0xFF) << 16) | \ + (((release) & 0xFF) << 8) | ((patch) & 0xFF)) + +/* Update the QURT_API_VERSION Macro. */ +#define QURT_API_VERSION \ + QURT_API_VERSION_ENCODE(QURT_API_VER_MAJOR, QURT_API_VER_MINOR, \ + QURT_API_VER_RELEASE, QURT_API_VER_PATCH) + +/** Usage: + * + * #if QURT_API_VERSION >= QURT_API_VERSION_ENCODE(1,4,0,0) + * qurt_func_2(a,b,c); + * #else + * qurt_func(a); + * #endif + * + */ +/* + Gets the QuRT API version. + + @return + QuRT API version. + + @dependencies + None. + */ +unsigned int qurt_api_version(void); + +#endif /* QURT_API_VERSION_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_assert.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_assert.h new file mode 100755 index 0000000000000..13cc2afd2e973 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_assert.h @@ -0,0 +1,51 @@ +#ifndef QURT_ASSERT_H +#define QURT_ASSERT_H +/** + @file qurt_assert.h + @brief Prototypes of qurt_assert API + + EXTERNAL FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. 
+ Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/**@ingroup func_qurt_assert_error + Writes diagnostic information to the debug buffer, and raises an error to the QuRT kernel. + + @datatypes + None. + + @param[in] filename Pointer to the file name string. + @param[in] lineno Line number. + + @return + None. + + @dependencies + None. + */ +void qurt_assert_error(const char *filename, int lineno) __attribute__((noreturn)); + +#define qurt_assert(cond) ((cond)?(void)0:qurt_assert_error(__QURTFILENAME__,__LINE__)) + +/** @} */ /* end_ingroup func_qurt_assert */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ASSERT_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_atomic_ops.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_atomic_ops.h new file mode 100755 index 0000000000000..d9b2cff7d737c --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_atomic_ops.h @@ -0,0 +1,1298 @@ +#ifndef QURT_ATOMIC_OPS_H +#define QURT_ATOMIC_OPS_H +/** + @file qurt_atomic_ops.h + @brief Prototypes of kernel atomic operations API. + + EXTERNAL FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021, 2022 by Qualcomm Technologies, Inc. All Rights Reserved +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +/* + * Australian Public Licence B (OZPLB) + * + * Version 1-0 + * + * Copyright (c) 2007, Open Kernel Labs, Inc. + * + * All rights reserved. + * + * Developed by: Embedded, Real-time and Operating Systems Program (ERTOS) + * National ICT Australia + * http://www.ertos.nicta.com.au + * + * Permission is granted by National ICT Australia, free of charge, to + * any person obtaining a copy of this software and any associated + * documentation files (the "Software") to deal with the Software without + * restriction, including (without limitation) the rights to use, copy, + * modify, adapt, merge, publish, distribute, communicate to the public, + * sublicense, and/or sell, lend or rent out copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject + * to the following conditions: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimers. + * + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimers in the documentation and/or other materials provided + * with the distribution. + * + * * Neither the name of National ICT Australia, nor the names of its + * contributors, may be used to endorse or promote products derived + * from this Software without specific prior written permission. 
+ * + * EXCEPT AS EXPRESSLY STATED IN THIS LICENCE AND TO THE FULL EXTENT + * PERMITTED BY APPLICABLE LAW, THE SOFTWARE IS PROVIDED "AS-IS", AND + * NATIONAL ICT AUSTRALIA AND ITS CONTRIBUTORS MAKE NO REPRESENTATIONS, + * WARRANTIES OR CONDITIONS OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO ANY REPRESENTATIONS, WARRANTIES OR CONDITIONS + * REGARDING THE CONTENTS OR ACCURACY OF THE SOFTWARE, OR OF TITLE, + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, + * THE ABSENCE OF LATENT OR OTHER DEFECTS, OR THE PRESENCE OR ABSENCE OF + * ERRORS, WHETHER OR NOT DISCOVERABLE. + * + * TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL + * NATIONAL ICT AUSTRALIA OR ITS CONTRIBUTORS BE LIABLE ON ANY LEGAL + * THEORY (INCLUDING, WITHOUT LIMITATION, IN AN ACTION OF CONTRACT, + * NEGLIGENCE OR OTHERWISE) FOR ANY CLAIM, LOSS, DAMAGES OR OTHER + * LIABILITY, INCLUDING (WITHOUT LIMITATION) LOSS OF PRODUCTION OR + * OPERATION TIME, LOSS, DAMAGE OR CORRUPTION OF DATA OR RECORDS; OR LOSS + * OF ANTICIPATED SAVINGS, OPPORTUNITY, REVENUE, PROFIT OR GOODWILL, OR + * OTHER ECONOMIC LOSS; OR ANY SPECIAL, INCIDENTAL, INDIRECT, + * CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES, ARISING OUT OF OR IN + * CONNECTION WITH THIS LICENCE, THE SOFTWARE OR THE USE OF OR OTHER + * DEALINGS WITH THE SOFTWARE, EVEN IF NATIONAL ICT AUSTRALIA OR ITS + * CONTRIBUTORS HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH CLAIM, LOSS, + * DAMAGES OR OTHER LIABILITY. + * + * If applicable legislation implies representations, warranties, or + * conditions, or imposes obligations or liability on National ICT + * Australia or one of its contributors in respect of the Software that + * cannot be wholly or partly excluded, restricted or modified, the + * liability of National ICT Australia or the contributor is limited, to + * the full extent permitted by the applicable legislation, at its + * option, to: + * a. in the case of goods, any one or more of the following: + * i. the replacement of the goods or the supply of equivalent goods; + * ii. the repair of the goods; + * iii. the payment of the cost of replacing the goods or of acquiring + * equivalent goods; + * iv. the payment of the cost of having the goods repaired; or + * b. in the case of services: + * i. the supplying of the services again; or + * ii. the payment of the cost of having the services supplied again. + * + * The construction, validity and performance of this licence is governed + * by the laws in force in New South Wales, Australia. + */ + +/* + * Author: Malcolm Purvis + * + * This file is only included by the main atomic_ops.h, so all of that + * file's definitions are available. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ + +///* Sanity check to ensure the smp flag is set in machines.py */ +//#if defined(__ATOMIC_OPS_IN_KERNEL__) && !defined(MACHINE_SMP) && CONFIG_NUM_UNITS > 1 +//#error CONFIG_NUM_UNITS > 1 but smp not defined in machines.py. +//#endif +#define QURT_INLINE __attribute__((always_inline)) + +/*============================================================================= + FUNCTIONS +=============================================================================*/ +/**@ingroup func_qurt_atomic_set + Sets the atomic variable with the specified value. 
+ + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] value Value to set. + + @return + Value successfuly set. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_set(unsigned int* target, unsigned int value) +{ + unsigned long tmp; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " memw_locked(%2, p0) = %3\n" + " if !p0 jump 1b\n" + : "=&r" (tmp),"+m" (*target) + : "r" (target), "r" (value) + : "p0"); + return value; +} + +/**@ingroup func_qurt_atomic_and + Bitwise AND operation of the atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask Mask for bitwise AND. + + @return + None + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_and(unsigned int* target, unsigned int mask) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = and(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target),"r" (mask) + : "p0"); +} + +/**@ingroup func_qurt_atomic_and_return + Bitwise AND operation of the atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask Mask for bitwise AND. + + @return + AND result of atomic variable with mask. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_and_return(unsigned int* target, unsigned int mask) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = and(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_or + Bitwise OR operation of the atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask Mask for bitwise OR. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_or(unsigned int* target, unsigned int mask) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = or(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); +} + +/**@ingroup func_qurt_atomic_or_return + Bitwise OR operation of the atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask Mask for bitwise OR. + + @return + Returns the OR result of the atomic variable with mask. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_or_return(unsigned int* target, unsigned int mask) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = or(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_xor + Bitwise XOR operation of the atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. 
+ + @param[in,out] target Pointer to the atomic variable. + @param[in] mask Mask for bitwise XOR. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_xor(unsigned int* target, unsigned int mask) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = xor(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); +} + +/**@ingroup func_qurt_atomic_xor_return + Bitwise XOR operation of the atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask Mask for bitwise XOR. + + @return + XOR result of atomic variable with mask. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_xor_return(unsigned int* target, unsigned int mask) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = xor(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_set_bit + Sets a bit in the atomic variable at a specified position. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] bit Bit position to set. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_set_bit(unsigned int *target, unsigned int bit) +{ + unsigned int result; + unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int sbit = bit % ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int *wtarget= (unsigned int *)&target[aword]; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = setbit(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*wtarget) + : "r" (wtarget), "r" (sbit) + : "p0"); +} + +/**@ingroup func_qurt_atomic_clear_bit + Clears a bit in the atomic variable at a specified position. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] bit Bit position to clear. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_clear_bit(unsigned int *target, unsigned int bit) +{ + unsigned int result; + unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int sbit = bit % ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int *wtarget= (unsigned int *)&target[aword]; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = clrbit(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*wtarget) + : "r" (wtarget), "r" (sbit) + : "p0"); +} + +/**@ingroup func_qurt_atomic_change_bit + Toggles a bit in a atomic variable at a bit position. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] bit Bit position to toggle. + + @return + None. + + @dependencies + None. 
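+
+   A usage sketch (illustrative only; the bit positions are arbitrary):
+   @code
+   static unsigned int flags = 0;
+
+   qurt_atomic_set_bit(&flags, 3U);     // Bit 3 is now 1.
+   qurt_atomic_change_bit(&flags, 3U);  // Bit 3 toggles back to 0.
+   qurt_atomic_clear_bit(&flags, 7U);   // Clearing an already-clear bit is a no-op.
+   @endcode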
+*/ +static inline QURT_INLINE void +qurt_atomic_change_bit(unsigned int *target, unsigned int bit) +{ + unsigned int result; + unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int sbit = bit & 0x1fU; + unsigned int *wtarget= (unsigned int *)&target[aword]; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = togglebit(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*wtarget) + : "r" (wtarget),"r" (sbit) + : "p0"); +} + +/**@ingroup func_qurt_atomic_add + Adds an integer to atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v Integer value to add. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_add(unsigned int *target, unsigned int v) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); +} + +/**@ingroup func_qurt_atomic_add_return + Adds an integer to atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v Integer value to add. + + @return + Result of arithmetic sum. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_add_return(unsigned int *target, unsigned int v) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_add_unless + Adds the delta value to an atomic variable unless the current value in the target + matches the unless variable. + + @note1hang The function retries until load lock and store conditional + are successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] delta Value to add to the current value. + @param[in] unless Perform the addition only when the current value is not + equal to this unless value. + @return + TRUE -- 1 - Addition was performed. \n + FALSE -- 0 - Addition was not done. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_add_unless(unsigned int* target, + unsigned int delta, + unsigned int unless) +{ + unsigned int current_val; + unsigned int new_val; + + __asm__ __volatile__( + "1: %0 = memw_locked(%3)\n" + " p0 = cmp.eq(%0, %5)\n" + " if p0 jump 2f\n" + " %1 = add(%0, %4)\n" + " memw_locked(%3, p0) = %1\n" + " if !p0 jump 1b\n" + "2:\n" + : "=&r" (current_val),"=&r" (new_val),"+m" (*target) + : "r" (target), "r" (delta), "r" (unless) + : "p0"); + + return (unsigned int)(current_val != unless); +} + +/**@ingroup func_qurt_atomic_sub + Subtracts an integer from an atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v Integer value to subtract. + + @return + None. + + @dependencies + None. 
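+
+   A usage sketch (illustrative only; in_flight is a hypothetical shared counter):
+   @code
+   static unsigned int in_flight = 0;
+
+   qurt_atomic_add(&in_flight, 4U);  // Account for four queued requests.
+   qurt_atomic_sub(&in_flight, 1U);  // One request completed.
+   @endcode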
+*/ +static inline QURT_INLINE void +qurt_atomic_sub(unsigned int *target, unsigned int v) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = sub(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); +} + +/**@ingroup func_qurt_atomic_sub_return + Subtracts an integer from an atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v Integer value to subtract. + + @return + Result of arithmetic subtraction. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_sub_return(unsigned int *target, unsigned int v) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = sub(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_inc + Increments an atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_inc(unsigned int *target) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, #1)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target) + : "p0"); +} + +/**@ingroup func_qurt_atomic_inc_return + Increments an atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + Incremented value. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_inc_return(unsigned int *target) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, #1)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_dec + Decrements an atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_dec(unsigned int *target) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, #-1)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target) + : "p0"); +} + +/**@ingroup func_qurt_atomic_dec_return + Decrements an atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + Decremented value. + + @dependencies + None. 
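+
+   A reference-counting sketch (illustrative only; refcount and
+   release_object() are hypothetical):
+   @code
+   qurt_atomic_inc(&refcount);                // Take a reference.
+   if (qurt_atomic_dec_return(&refcount) == 0U) {
+       release_object();                      // Last reference dropped.
+   }
+   @endcode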
+*/ +static inline QURT_INLINE unsigned int +qurt_atomic_dec_return(unsigned int *target) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, #-1)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_compare_and_set + Compares the current value of the atomic variable with the + specified value and set to a new value when compare is successful. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] old_val Old value to compare. + @param[in] new_val New value to set. + + @return + FALSE -- Specified value is not equal to the current value. \n + TRUE --Specified value is equal to the current value. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_compare_and_set(unsigned int* target, + unsigned int old_val, + unsigned int new_val) +{ + unsigned int current_val; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " p0 = cmp.eq(%0, %3)\n" + " if !p0 jump 2f\n" + " memw_locked(%2, p0) = %4\n" + " if !p0 jump 1b\n" + "2:\n" + : "=&r" (current_val),"+m" (*target) + : "r" (target), "r" (old_val), "r" (new_val) + : "p0"); + + return (unsigned int)(current_val == old_val); +} + +/**@ingroup func_qurt_atomic_barrier + Allows the compiler to enforce an ordering constraint on memory operation issued + before and after the function. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_barrier(void) +{ + __asm__ __volatile__ ( + "" + : + : + : + "memory"); +} + + +/**@ingroup func_qurt_atomic64_set + Sets the 64-bit atomic variable with the specified value. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] value 64-bit value to set. + + @return + Successfuly set value. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned long long +qurt_atomic64_set(unsigned long long* target, unsigned long long value) +{ + unsigned long long tmp; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " memd_locked(%2, p0) = %3\n" + " if !p0 jump 1b\n" + : "=&r" (tmp),"+m" (*target) + : "r" (target), "r" (value) + : "p0"); + return value; +} + +/**@ingroup func_qurt_atomic64_and_return + Bitwise AND operation of a 64-bit atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask 64-bit mask for bitwise AND. + + @return + AND result of 64-bit atomic variable with mask. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned long long +qurt_atomic64_and_return(unsigned long long* target, unsigned long long mask) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = and(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic64_or + Bitwise OR operation of a 64-bit atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask 64-bit mask for bitwise OR. + + @return + None. + + @dependencies + None. 
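+
+   A usage sketch (illustrative only; the event bits are arbitrary):
+   @code
+   static unsigned long long event_mask = 0ULL;
+
+   // Publish events 0 and 40 in one atomic update; readers observe either
+   // the old mask or the new mask, never a partial update.
+   qurt_atomic64_or(&event_mask, (1ULL << 0) | (1ULL << 40));
+   @endcode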
+*/ +static inline QURT_INLINE void +qurt_atomic64_or(unsigned long long* target, unsigned long long mask) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = or(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); +} + +/**@ingroup func_qurt_atomic64_or_return + Bitwise OR operation of a 64-bit atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask 64-bit mask for bitwise OR. + + @return + OR result of the atomic variable with mask. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned long long +qurt_atomic64_or_return(unsigned long long* target, unsigned long long mask) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = or(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic64_xor_return + Bitwise XOR operation of 64-bit atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask 64-bit mask for bitwise XOR. + + @return + XOR result of atomic variable with mask. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned long long +qurt_atomic64_xor_return(unsigned long long* target, unsigned long long mask) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = xor(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic64_set_bit + Sets a bit in a 64-bit atomic variable at a specified position. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] bit Bit position to set. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic64_set_bit(unsigned long long *target, unsigned int bit) +{ + unsigned int result; + unsigned int *wtarget; + unsigned int *pwtarget = (unsigned int *)target; + unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int sbit = bit & 0x1FU; + wtarget = (unsigned int *)&pwtarget[aword]; + + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = setbit(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*wtarget) + : "r" (wtarget), "r" (sbit) + : "p0"); +} + +/**@ingroup func_qurt_atomic64_clear_bit + Clears a bit in a 64-bit atomic variable at a specified position. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] bit Bit position to clear. + + @return + None. + + @dependencies + None. 
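+
+   A usage sketch (illustrative only; slot 40 is an arbitrary position):
+   @code
+   static unsigned long long busy_map = ~0ULL;  // All 64 slots start busy.
+
+   qurt_atomic64_clear_bit(&busy_map, 40U);     // Mark slot 40 as free.
+   @endcode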
+*/ +static inline QURT_INLINE void +qurt_atomic64_clear_bit(unsigned long long *target, unsigned int bit) +{ + unsigned int result; + unsigned int *wtarget; + unsigned int *pwtarget = (unsigned int *)target; + unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int sbit = bit & 0x1FU; + wtarget = (unsigned int *)&pwtarget[aword]; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = clrbit(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*wtarget) + : "r" (wtarget), "r" (sbit) + : "p0"); +} + +/**@ingroup func_qurt_atomic64_change_bit + Toggles a bit in a 64-bit atomic variable at a bit position. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] bit Bit position to toggle. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic64_change_bit(unsigned long long *target, unsigned int bit) +{ + unsigned int result; + unsigned int *wtarget; + unsigned int *pwtarget = (unsigned int *)target; + unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int sbit = bit & 0x1FU; + wtarget = (unsigned int *)&pwtarget[aword]; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = togglebit(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*wtarget) + : "r" (wtarget),"r" (sbit) + : "p0"); +} + +/**@ingroup func_qurt_atomic64_add + Adds a 64-bit integer to 64-bit atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v 64-bit integer value to add. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic64_add(unsigned long long *target, unsigned long long v) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = add(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); +} + +/**@ingroup func_qurt_atomic64_add_return + Adds a 64-bit integer to 64-bit atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v 64-bit integer value to add. + + @return + Result of arithmetic sum. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned long long +qurt_atomic64_add_return(unsigned long long *target, unsigned long long v) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = add(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic64_sub_return + Subtracts a 64-bit integer from an atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v 64-bit integer value to subtract. + + @return + Result of arithmetic subtraction. + + @dependencies + None. 
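+
+   A usage sketch (illustrative only; the byte budget is a hypothetical value):
+   @code
+   static unsigned long long bytes_left = 1048576ULL;  // 1 MB budget.
+
+   // Reserve 4 KB and observe how much budget remains after the subtraction.
+   unsigned long long remaining = qurt_atomic64_sub_return(&bytes_left, 4096ULL);
+   @endcode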
+*/ +static inline QURT_INLINE unsigned long long +qurt_atomic64_sub_return(unsigned long long *target, unsigned long long v) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = sub(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic64_inc + Increments a 64-bit atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic64_inc(unsigned long long *target) +{ + unsigned long long result; + unsigned long long inc =1; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = add(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target),"r" (inc) + : "p0"); +} + +/**@ingroup func_qurt_atomic64_inc_return + Increments a 64-bit atomic variable by one + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + Incremented value. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned long long +qurt_atomic64_inc_return(unsigned long long *target) +{ + unsigned long long result; + unsigned long long inc =1; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = add(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target),"r" (inc) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic64_dec_return + Decrements a 64-bit atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + Decremented value. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned long long +qurt_atomic64_dec_return(unsigned long long *target) +{ + unsigned long long result; + long long minus1 = 0xFFFFFFFFFFFFFFFFLL; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = add(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target),"r" (minus1) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic64_compare_and_set + Compares the current value of an 64-bit atomic variable with + the specified value and sets to a new value when compare is successful. + + @note1hang The function keep retrying until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] old_val 64-bit old value to compare. + @param[in] new_val 64-bit new value to set. + + @return + FALSE -- Specified value is not equal to the current value. \n + TRUE -- Specified value is equal to the current value. + + @dependencies + None. 
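+
+   A lock-free update sketch (illustrative only; shared_val and transform()
+   are hypothetical, with transform() a pure function of the old value):
+   @code
+   static unsigned long long shared_val;
+   unsigned long long old_val, new_val;
+
+   do {
+       old_val = shared_val;          // Snapshot the current value.
+       new_val = transform(old_val);  // Compute the replacement.
+   } while (!qurt_atomic64_compare_and_set(&shared_val, old_val, new_val));
+   @endcode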
+*/ +static inline QURT_INLINE int +qurt_atomic64_compare_and_set(unsigned long long *target, + unsigned long long old_val, + unsigned long long new_val) +{ + unsigned long long current_val; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " p0 = cmp.eq(%0, %3)\n" + " if !p0 jump 2f\n" + " memd_locked(%2, p0) = %4\n" + " if !p0 jump 1b\n" + "2:\n" + : "=&r" (current_val),"+m" (*target) + : "r" (target), "r" (old_val), "r" (new_val) + : "p0"); + + return (int)(current_val == old_val); +} + +/**@ingroup func_qurt_atomic64_barrier + Allows compiler to enforce an ordering constraint on memory operation issued + before and after the function. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic64_barrier(void) +{ + /** @cond */ + __asm__ __volatile__ ( + "" + : + : + : + "memory"); + /** @endcond */ +} + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ATOMIC_OPS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_barrier.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_barrier.h new file mode 100755 index 0000000000000..7c6f787d43bc2 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_barrier.h @@ -0,0 +1,140 @@ +#ifndef QURT_BARRIER_H +#define QURT_BARRIER_H + +/** + @file qurt_barrier.h + @brief Prototypes of Kernel barrier API functions. + + EXTERNALIZED FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2021 Qualcomm Technologies, Inc. All rights reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup barrier_types +@{ */ +/*===================================================================== + Constants and macros +======================================================================*/ +#define QURT_BARRIER_SERIAL_THREAD 1 /**< Serial thread. */ +#define QURT_BARRIER_OTHER 0 /**< Other. */ + +#ifndef ASM +#include + +/*===================================================================== +Typedefs +======================================================================*/ + +/** QuRT barrier type. + */ +typedef union { + /** @cond */ + struct { + unsigned short threads_left; + unsigned short count; + unsigned int threads_total; + unsigned int queue; + unsigned int reserved; + }; + unsigned long long int raw; + /** @endcond */ +} qurt_barrier_t; + +/** @} */ /* end_addtogroup barrier_types */ + +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_barrier_init + Initializes a barrier object. + + @datatypes + #qurt_barrier_t + + @param[out] barrier Pointer to the barrier object to initialize. + @param[in] threads_total Total number of threads to synchronize on the barrier. + + + @return + Unused integer value. + + @dependencies + None. +*/ +/* ======================================================================*/ +int qurt_barrier_init(qurt_barrier_t *barrier, unsigned int threads_total); + +/*======================================================================*/ +/**@ingroup func_qurt_barrier_destroy + Destroys the specified barrier. + + @note1hang Barriers must be destroyed when they are no longer in use. 
Failure + to do this causes resource leaks in the QuRT kernel.\n + @note1cont Barriers must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + + @datatypes + #qurt_barrier_t + + @param[in] barrier Pointer to the barrier object to destroy. + + @return + Unused integer value. + + @dependencies + None. +*/ +/* ======================================================================*/ +int qurt_barrier_destroy(qurt_barrier_t *barrier); + +/*======================================================================*/ +/**@ingroup func_qurt_barrier_wait + Waits on the barrier.\n + Suspends the current thread on the specified barrier. \n + The function return value indicates whether the thread was the last one to + synchronize on the barrier. + When a thread waits on a barrier, it is suspended on the barrier: \n + - If the total number of threads waiting on the barrier is less than the assigned value + of the barrier, no other action occurs. \n + - If the total number of threads waiting on the barrier equals the assigned value of the + barrier, all threads currently waiting on the barrier are awakened, allowing them to + execute past the barrier. + + @note1hang After its waiting threads are awakened, a barrier is automatically reset + and can be used again in the program without the need for re-initialization. + + @datatypes + #qurt_barrier_t + + @param[in] barrier Pointer to the barrier object to wait on. + + @return + #QURT_BARRIER_OTHER -- Current thread awakened from barrier. \n + #QURT_BARRIER_SERIAL_THREAD -- Current thread is last caller of barrier. + + @dependencies + None. +*/ +/* ======================================================================*/ +int qurt_barrier_wait(qurt_barrier_t *barrier); + + +#endif + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_BARRIER_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_busywait.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_busywait.h new file mode 100755 index 0000000000000..a4dab80a2520a --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_busywait.h @@ -0,0 +1,62 @@ +#ifndef QURT_BUSYWAIT_H +#define QURT_BUSYWAIT_H + +/** + @file qurt_busywait.h + @brief Implementation of the busywait() function for + hardware based blocking waits that use the QTIMER as a reference. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ============================================================================*/ +/*============================================================================= + * + * EDIT HISTORY FOR FILE + * + * This section contains comments describing changes made to the + * module. Changes are listed in reverse chronological + * order. 
+ * + * + * when who what, where, why + * ---------- --- ------------------------------------------------------- + * 2018-03-20 pg Add Header file + ============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_busywait + Pauses the execution of a thread for a specified time.\n + Use for small microsecond delays. + + @note1hang The function does not return to the caller until + the time duration has expired. + + @param[in] pause_time_us Time to pause in microseconds. + + @return + None. + + @dependencies + None. + */ +void qurt_busywait (unsigned int pause_time_us); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_BUSYWAIT_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_callback.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_callback.h new file mode 100755 index 0000000000000..dc9b896c63454 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_callback.h @@ -0,0 +1,235 @@ +#ifndef QURT_CALLBACK_H +#define QURT_CALLBACK_H + +/** + @file qurt_callback.h + Definitions, macros, and prototypes for QuRT callback framework. + + QDI framework allows the development of root process drivers and services that + a user process client can interact with in a secure manner. QDI framework does + this by elevating the priviledge of user process thread, temporarily allowing + the thread execute in root context and letting it fall back to user context once + the QDI invocation is finished. + + The QuRT callback framework provides a safe mechanism for root process drivers + to execute callback functions in a user process. The framework hosts + dedicated worker threads in corresponding processes that handle the execution + of the callback function. This ensures that the callbacks occur in context of + the appropriate process thread, in result maintaining privilege boundaries. + + Prerequisites for use of this framework are: + 1. Driver is a QDI driver and client communicates with drivers using QDI + invocations. + 2. Appropriate callback configuration is specified in cust_config.xml for + the user process that intends to use this framework. + + qurt_cb_data_t is the public data structure that allows client to store all + the required information about the callback, including the callback function + and the arguments to pass to this function when it executes. + The client uses QDI interface to register this structure with root driver. + + Callback framework provides following APIs that a root driver can use to invoke callback. + These functions are described in qurt_qdi_driver.h header file. + + qurt_qdi_cb_invoke_async() triggers an asynchronous callback wherein the + invoking thread does not wait for the callback to finish executing. + + qurt_qdi_cb_invoke_sync() triggers a synchronous callback. Upon invocation + the invoking thread gets suspended till the callback function finishes execution. + + qurt_qdi_cb_invoke_sync_with_data() invokes a synchronous callback similar to + qurt_qdi_cb_invoke_sync(). It allows user to pass large data along with + the callback invocation to be utlized during the callback execution. 
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+#include "qurt_qdi.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef int qurt_cb_result_t;
+
+/* Callback framework error codes.
+   The callback framework returns a nonzero value if a callback invocation is unsuccessful.
+   The following macros highlight the cause of failure in more detail.
+*/
+#define QURT_CB_ERROR            -1 /* Callback registration failed.\n*/
+#define QURT_CB_OK                0 /* Success.\n*/
+#define QURT_CB_MALLOC_FAILED    -2 /* QuRTOS malloc failure.\n*/
+#define QURT_CB_WAIT_CANCEL      -3 /* Process exit cancelled wait operation.\n*/
+#define QURT_CB_CONFIG_NOT_FOUND -4 /* Callback configuration for process was not found.\n*/
+#define QURT_CB_QUEUE_FULL       -5 /* Callback queue is serving at maximum capacity.*/
+/** @addtogroup cb_types
+@{ */
+/** Callback registration data structure.
+    This data structure is used by a client attempting to register a callback with a QDI driver.
+    It holds the address of the callback function and the argument supplied to the callback
+    function when it executes.
+*/
+typedef struct {
+    /** @cond */
+    void* cb_func;   /*< Pointer to the callback function. */
+    unsigned cb_arg; /*< Not interpreted by the framework.*/
+    /** @endcond */
+} qurt_cb_data_t;
+
+/** @cond */
+/* Defines used as default if cust_config does not specify them. */
+#define CALLBACK_WORKER_STACK_SIZE 0x2000
+/** @endcond */
+/** @} */ /* end_addtogroup cb_types */
+/**@ingroup func_qurt_cb_data_init
+  Initializes the callback data structure.
+  An entity registering a callback with the root process driver must call this function
+  to initialize the callback registration data structure to the default values.
+
+  @datatypes
+  #qurt_cb_data_t
+
+  @param[in] cb_data Pointer to the callback data structure.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+static inline void qurt_cb_data_init (qurt_cb_data_t* cb_data){
+    cb_data->cb_func = NULL;
+    cb_data->cb_arg = 0;
+}
+
+/**@ingroup func_qurt_cb_data_set_cbfunc
+  Sets up the callback function in the callback registration data structure.
+
+  @datatypes
+  #qurt_cb_data_t
+
+  @param[in] cb_data Pointer to the callback data structure.
+  @param[in] cb_func Pointer to the callback function.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+static inline void qurt_cb_data_set_cbfunc (qurt_cb_data_t* cb_data, void* cb_func){
+    cb_data->cb_func = cb_func;
+}
+
+/**@ingroup func_qurt_cb_data_set_cbarg
+  Sets up the callback argument.
+  This function sets up the argument passed to the callback function when it executes.
+
+  @datatypes
+  #qurt_cb_data_t
+
+  @param[in] cb_data Pointer to the callback data structure.
+  @param[in] cb_arg  Argument for the callback function.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+static inline void qurt_cb_data_set_cbarg (qurt_cb_data_t* cb_data, unsigned cb_arg){
+    cb_data->cb_arg = cb_arg;
+}
+
+/** @cond */
+/**@ingroup driver_support_functions
+  Invokes an asynchronous callback for a specified process.
+  A driver that resides in the root process calls this API to launch a callback in
+  a process described by the client_handle.
+  After the callback is invoked, the framework queues the callback as per its
+  priority and subsequently executes it.
+  The caller of this function is not suspended during the callback execution period.
+  The API returns immediately with a success/failure error code.
+
+  @note1hang This function is only accessible to drivers in the root process.
+  User process invocations shall fail with a negative error code return value.
+
+  @param client_handle Obtained from the current invocation function (Section 4.3.1).
+  @param cb_data       Pointer to the callback data structure (refer to qurt_callback.h).
+  @param prio          Priority at which the callback should execute.
+                       This parameter is optional. If -1 is passed, the callback framework
+                       executes the callback at the priority of the API caller.
+  @return
+  QURT_EOK -- Callback was successfully communicated to the framework.
+  Negative error code -- Callback cannot be communicated to the framework.
+ */
+qurt_cb_result_t qurt_qdi_cb_invoke_async(int client_handle,
+                                          qurt_cb_data_t* cb_data,
+                                          int prio);
+
+
+/**@ingroup driver_support_functions
+  Invokes a synchronous callback for a specified process.
+  A driver that resides in a root process calls this API to launch a sync callback in
+  a process described by the client_handle.
+  After the callback is invoked, the framework queues the callback as per its
+  priority and subsequently executes it.
+  The caller of this function is suspended during the callback execution period.
+  If the process in which to execute the callback exits or terminates, the caller is
+  woken up with error code #QURT_CB_WAIT_CANCEL (refer to qurt_callback.h).
+
+  @note1hang This function is only accessible to drivers in the root process.
+  User process invocations shall fail with a negative error code return value.
+
+  @param client_handle Obtained from the current invocation function (Section 4.3.1).
+  @param cb_data       Pointer to the callback data structure (refer to qurt_callback.h).
+  @param prio          Priority at which the callback should execute.
+                       This parameter is optional. If -1 is passed, the callback framework
+                       executes the callback at the priority of the API caller.
+  @return
+  QURT_EOK -- Callback was successfully communicated to the framework.
+  Negative error code -- Callback cannot be communicated to the framework.
+ */
+qurt_cb_result_t qurt_qdi_cb_invoke_sync(int client_handle,
+                                         qurt_cb_data_t* cb_data,
+                                         int prio);
+
+/**@ingroup driver_support_functions
+  Invokes a synchronous callback for a specified process, passing driver data to the user PD.
+  This function is similar to qurt_qdi_cb_invoke_sync() and allows the driver to pass arbitrary data to
+  the user process as part of the callback invocation.
+
+  @param client_handle Obtained from the current invocation function (Section 4.3.1).
+  @param cb_data       Pointer to the callback data structure (refer to qurt_callback.h).
+  @param prio          Priority at which the callback should execute.
+                       This parameter is optional. If -1 is passed, the callback framework
+                       executes the callback at the priority of the API caller.
+  @param data          Arbitrary driver data to pass to the user process. Memory pointed to by data
+                       must be accessible to the user PD. The root driver can allocate such memory by
+                       using qurt_mem_mmap().
+  @param data_len      Length of the arbitrary driver data.
+
+  @return
+  QURT_EOK -- Callback was successfully communicated to the framework.
+  Negative error code -- Callback cannot be communicated to the framework.
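+
+  @par Example
+  Informative sketch of a root driver passing a buffer to a user-process callback;
+  client_handle and cb_data are assumed to come from the driver's QDI invocation
+  context, and buf/buf_len are hypothetical names for the shared data.
+
+  @code
+  // Informative sketch only; "buf" is assumed to be memory accessible to the
+  // user PD (for example, shared via qurt_mem_mmap()).
+  qurt_cb_result_t rc;
+  rc = qurt_qdi_cb_invoke_sync_with_data (client_handle,
+                                          &cb_data,
+                                          -1,        // Run at caller priority.
+                                          buf,
+                                          buf_len);
+  if (rc != QURT_CB_OK) {
+      // Callback could not be delivered; release buf if needed.
+  }
+  @endcode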
+ */ +qurt_cb_result_t qurt_qdi_cb_invoke_sync_with_data( int client_handle, + qurt_cb_data_t* cb_data, + int prio, + void *data, + unsigned data_len + ); +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_clade.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_clade.h new file mode 100755 index 0000000000000..d7442cf98dd94 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_clade.h @@ -0,0 +1,62 @@ +#ifndef QURT_CLADE_H +#define QURT_CLADE_H +/** + @file qurt_clade.h + @brief Prototypes of Cache Line Accelerated Decompression Engine (CLADE) API. + CLADE is a cache line level memory compression system that is used to + decrease DRAM usage. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2019-2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_clade2_get + Reads the value of the clade2 register. + + @param[in] offset Offset from the clade2 cfg base. + @param[out] *value Pointer to the register value read from the offset. + + @return + #QURT_EOK - Successfully read the value from the register at offset \n + #QURT_EINVALID - Offset passed is incorrect + + @dependencies + None. + */ +int qurt_clade2_get(unsigned short offset, unsigned int *value); + +/**@ingroup func_qurt_clade2_set + Sets the PMU register; only PMU_SEL register can be set. + + @param[in] offset Offset from the QURTK_clade2_cfg_base. + @param[in] value Value to set at offset. + + @return + #QURT_EOK -- Successfully set the value at offset. \n + #QURT_ENOTALLOWED -- Set operation performed at an offset other than CLADE2_PMU_SELECTION_REG. + + @dependencies + None. + */ +int qurt_clade2_set(unsigned short offset, unsigned int value); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_CLADE_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_cond.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_cond.h new file mode 100755 index 0000000000000..6e65ed82a8393 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_cond.h @@ -0,0 +1,219 @@ +#ifndef QURT_COND_H +#define QURT_COND_H +/** + @file qurt_cond.h + @brief Prototypes of kernel condition variable object API functions. + + EXTERNALIZED FUNCTIONS + None + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021 Qualcomm Technologies, Inc. + All rights reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup condition_variables_types +@{ */ +/*===================================================================== + Typedefs + ======================================================================*/ + +/** QuRT condition variable type. 
+ */
+typedef union {
+    /** @cond */
+    unsigned long long raw;
+    struct {
+        unsigned int count;
+        unsigned int n_waiting;
+        unsigned int queue;
+        unsigned int reserved;
+    }X;
+    /** @endcond */
+} qurt_cond_t;
+
+/** @} */ /* end_addtogroup condition_variables_types */
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+/*======================================================================*/
+/**@ingroup func_qurt_cond_init
+  Initializes a condition variable object.
+
+  @datatypes
+  #qurt_cond_t
+
+  @param[out] cond Pointer to the initialized condition variable object.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+/* ======================================================================*/
+void qurt_cond_init(qurt_cond_t *cond);
+
+/*======================================================================*/
+/**@ingroup func_qurt_cond_destroy
+  Destroys the specified condition variable.
+
+  @note1hang Condition variables must be destroyed when they are no longer in use. Failure to do
+  this causes resource leaks in the QuRT kernel.\n
+  @note1cont Condition variables must not be destroyed while they are still in use. If this occurs,
+  the behavior of QuRT is undefined.
+
+  @datatypes
+  #qurt_cond_t
+
+  @param[in] cond Pointer to the condition variable object to destroy.
+
+  @return
+  None.
+
+ */
+/* ======================================================================*/
+void qurt_cond_destroy(qurt_cond_t *cond);
+
+/*======================================================================*/
+/**@ingroup func_qurt_cond_signal
+  Signals a waiting thread that the specified condition is true. \n
+
+  When a thread wishes to signal that a condition is true on a shared data item, it must
+  perform the following procedure: \n
+  -# Lock the mutex that controls access to the data item. \n
+  -# Perform the signal condition operation. \n
+  -# Unlock the mutex.
+
+  @note1hang Failure to properly lock and unlock a mutex of a condition variable can cause
+  the threads to never be suspended (or suspended but never awakened).
+
+  @note1cont Use condition variables only with regular mutexes -- attempting to use
+  recursive mutexes or priority inheritance mutexes results in undefined behavior.
+
+  @datatypes
+  #qurt_cond_t
+
+  @param[in] cond Pointer to the condition variable object to signal.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+/* ======================================================================*/
+void qurt_cond_signal(qurt_cond_t *cond);
+
+/*======================================================================*/
+/**@ingroup func_qurt_cond_broadcast
+  Signals multiple waiting threads that the specified condition is true.\n
+  When a thread wishes to broadcast that a condition is true on a shared data item, it must
+  perform the following procedure: \n
+  -# Lock the mutex that controls access to the data item. \n
+  -# Perform the broadcast condition operation. \n
+  -# Unlock the mutex.\n
+
+  @note1hang Failure to properly lock and unlock the mutex of a condition variable can cause
+  the threads to never be suspended (or suspended but never awakened).
+
+  @note1cont Use condition variables only with regular mutexes -- attempting to use
+  recursive mutexes or priority inheritance mutexes results in undefined behavior.
+
+  @datatypes
+  #qurt_cond_t
+
+  @param[in] cond Pointer to the condition variable object to signal.
+
+  @return
+  None.
+
+  @dependencies
+  None.
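+
+  @par Example
+  Informative sketch of the lock/update/broadcast procedure described above;
+  my_mutex, my_cond, and the shared_ready flag are illustrative names, and the
+  mutex is a regular qurt_mutex_t as the notes above require.
+
+  @code
+  // Informative sketch only.
+  qurt_mutex_lock (&my_mutex);      // 1. Lock the controlling mutex.
+  shared_ready = 1;                 //    Update the shared data item.
+  qurt_cond_broadcast (&my_cond);   // 2. Wake all threads waiting on the condition.
+  qurt_mutex_unlock (&my_mutex);    // 3. Unlock the mutex.
+  @endcode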
+ */ +/* ======================================================================*/ +void qurt_cond_broadcast(qurt_cond_t *cond); + +/*======================================================================*/ +/**@ingroup func_qurt_cond_wait + Suspends the current thread until the specified condition is true. + When a thread wishes to wait for a specific condition on a shared data item, it must + perform the following procedure: \n + -# Lock the mutex that controls access to the data item. \n + -# If the condition is not satisfied, perform the wait condition operation on the + condition variable (suspends the thread and unlocks the mutex). + + @note1hang Failure to properly lock and unlock the mutex of a condition variable can cause + the threads to never be suspended (or suspended but never awakened). + + @note1cont Use condition variables only with regular mutexes -- attempting to use + recursive mutexes or priority inheritance mutexes results in undefined behavior. + + @datatypes + #qurt_cond_t \n + #qurt_mutex_t + + @param[in] cond Pointer to the condition variable object to wait on. + @param[in] mutex Pointer to the mutex associated with condition variable to wait on. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_cond_wait(qurt_cond_t *cond, qurt_mutex_t *mutex); + +/*======================================================================*/ +/**@ingroup func_qurt_cond_wait2 + Suspends the current thread until the specified condition is true. + When a thread wishes to wait for a specific condition on a shared data item, it must + perform the following procedure: \n + -# Lock the mutex that controls access to the data item. \n + -# If the condition is not satisfied, perform the wait condition operation on the + condition variable, which suspends the thread and unlocks the mutex. + + @note1hang Failure to properly lock and unlock the mutex of a condition variable can cause + the threads to never be suspended (or suspended but never awakened). + + @note1cont Use condition variables only with regular mutexes -- attempting to use + recursive mutexes or priority inheritance mutexes results in undefined behavior. + + @note1cont This is the same API as qurt_cond_wait(), use this version + when using mutexes of type #qurt_rmutex2_t. + + @datatypes + #qurt_cond_t \n + #qurt_rmutex2_t + + @param[in] cond Pointer to the condition variable object to wait on. + @param[in] mutex Pointer to the mutex associated with the condition variable to wait on. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_cond_wait2(qurt_cond_t *cond, qurt_rmutex2_t *mutex); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_COND_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_consts.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_consts.h new file mode 100755 index 0000000000000..b1e35998e73b6 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_consts.h @@ -0,0 +1,315 @@ +#ifndef QURT_CONSTS_H +#define QURT_CONSTS_H + +/** + @file qurt_consts.h + @brief QuRT constants and definitions + + EXTERNAL FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None + + Copyright (c) 2013-2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. 
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=====================================================================
+ Constants and macros
+ ======================================================================*/
+
+/* Definitions of system events. System events suspend
+   a thread and put it into suspending_list.
+   The system event number is saved in the CONTEXT::error::cause field
+   of the suspended thread. An event handler thread such as the
+   page fault handler or system error handler can wake up the suspended
+   thread.
+ */
+#define QURT_EVENT_PAGEFAULT    0x1 /* Page fault event. */
+#define QURT_EVENT_SYSTEM_ERR   0x2 /* System error event. */
+#define QURT_EVENT_SUSPEND      0x3
+#define QURT_EVENT_PROCESS_EXIT 0x4 /* Process termination event.*/
+
+#define QURT_SYSENV_MAX_THREADS_TYPE         1  /* Maximum threads object. */
+#define QURT_SYSENV_PROCNAME_TYPE            2  /* Process name object. */
+#define QURT_SYSENV_MAX_PI_PRIO_TYPE         3  /* Maximum pi priority object. */
+#define QURT_SYSENV_ARCH_REV_TYPE            4  /* Architecture version object. */
+#define QURT_SYSENV_APP_HEAP_TYPE            5  /* Application heap object. */
+#define QURT_SYSENV_REGION_ATTR_DEFAULT      7  /* Default region attributes. */
+#define QURT_SYSENV_STACK_PROFILE_COUNT_TYPE 8  /* Stack profile count type. */
+#define QURT_SYSENV_ISLAND_CONFIG_TYPE       9  /* Island configuration check. */
+#define QURT_SYSENV_HTHREADS_TYPE            10 /* Active threads object. */
+#define QURT_SYSENV_CONFIG_IMAGE_START_LO    11 /* Config image start address for DTB parsing. */
+#define QURT_SYSENV_CONFIG_IMAGE_START_HI    12 /* Config image start address for DTB parsing. */
+#define QURT_SYSENV_CHIPPARAMS_LO            13 /* ChipParams for DTB parsing. */
+#define QURT_SYSENV_CHIPPARAMS_HI            14 /* ChipParams for DTB parsing. */
+#define QURT_SYSENV_PLATPARAMS               15 /* PlatformParams for DTB parsing. */
+#define QURT_SYSENV_CONFIG_IMAGE_SIZE        16 /* Config image size for DTB parsing. */
+#define QURT_SYSENV_L2_CACHE_LINE_SIZE       17 /* L2 cache line size. */
+
+/* Get Q6 registers. */
+#define QURT_GET_SSR     1
+#define QURT_GET_CCR     2
+#define QURT_GET_CFGBASE 3
+#define QURT_GET_SYSCFG  4
+#define QURT_GET_REV     5
+
+
+/** @cond rest_reg_dist */
+/** @addtogroup performance_monitor_macros
+@{ */
+
+/* PMU */
+#define QURT_PMUCNT0   0 /**< */
+#define QURT_PMUCNT1   1 /**< */
+#define QURT_PMUCNT2   2 /**< */
+#define QURT_PMUCNT3   3 /**< */
+#define QURT_PMUCFG    4 /**< */
+#define QURT_PMUEVTCFG 5 /**< */
+
+/* New since V55. */
+#define QURT_PMUCNT4    6  /**< */
+#define QURT_PMUCNT5    7  /**< */
+#define QURT_PMUCNT6    8  /**< */
+#define QURT_PMUCNT7    9  /**< */
+#define QURT_PMUEVTCFG1 10 /**< */
+
+/* New since V61. */
+#define QURT_PMUSTID0 11 /**< */
+#define QURT_PMUSTID1 12 /**< */
+
+#define QURT_PMUCNTSTID0 13 /**< */
+#define QURT_PMUCNTSTID1 14 /**< */
+#define QURT_PMUCNTSTID2 15 /**< */
+#define QURT_PMUCNTSTID3 16 /**< */
+#define QURT_PMUCNTSTID4 17 /**< */
+#define QURT_PMUCNTSTID5 18 /**< */
+#define QURT_PMUCNTSTID6 19 /**< */
+#define QURT_PMUCNTSTID7 20 /**< */
+
+/** @} */ /* end_addtogroup performance_monitor_macros */
+/** @endcond */
+
+/*
+   Power collapse operations.
+*/
+#define QURT_POWER_SHUTDOWN                 0 /**< */
+#define QURT_TCXO_SHUTDOWN                  1 /**< */
+#define QURT_POWER_CMD_PREPARE              0 /**< */
+#define QURT_POWER_CMD_PERFORM              1 /**< */
+#define QURT_POWER_CMD_EXIT                 2 /**< */
+#define QURT_POWER_CMD_FAIL_EXIT            3 /**< */
+#define QURT_POWER_CMD_PERFORM_L2_RETENTION 4 /**< */
+#define QURT_POWER_CMD_PERFORM_SAVE_TCM     5 /**< */
+#define QURT_POWER_CMD_DEEP_SLEEP           6 /**< */
+
+
+/**
@addtogroup thread_macros +@{ */ +#define QURT_MAX_HTHREAD_LIMIT 8U /**< Limit on the maximum number of hardware threads supported by QuRT for any + Hexagon version. Use this definition to define arrays, and so on, in + target independent code. */ +/** @} */ /* end_addtogroup thread_macros */ + +/** @cond internal_only */ +/** @addtogroup power_management_macros +@{ */ +/** + L2 cache retention mode +*/ +#define QURT_POWER_SHUTDOWN_TYPE_L2NORET QURT_POWER_CMD_PERFORM /**< */ +#define QURT_POWER_SHUTDOWN_TYPE_L2RET QURT_POWER_CMD_PERFORM_L2_RETENTION /**< */ +#define QURT_POWER_SHUTDOWN_TYPE_SAVETCM QURT_POWER_CMD_PERFORM_SAVE_TCM /**< */ +/** @} */ /* end_addtogroup power_management_macros */ +/** @endcond */ + +/* + QURT_system_state + Use for debugging the shutdown/startup process. + + State transition for cold boot: + QURT_BOOT_SETUP_ISDB --> QURT_CBOOT_BSP_INIT --> + QURT_CBOOT_END_CLEAN_INIT --> QURT_CBOOT_END_OS_INIT --> + QURT_CBOOT_KERNEL_INIT_DONE --> QURT_CBOOT_PLAT_CONFIG_DONE --> + QURT_CBOOT_ROOT_TASK_STARTED + + State transition for power collapse: + QURT_PREPARE_SINGLE_MODE --> QURT_PERFORM_IPEND --> + QURT_PERFORM_SAVE_TLB --> QURT_PERFORM_SWITCH_PC --> + cache flush states (dependent on L2 retention config) + + State transition for warm boot: + QURT_BOOT_SETUP_ISDB --> QURT_WBOOT_INIT_TLB --> + QURT_WBOOT_SET_1TO1_MAP --> QURT_WBOOT_REMOVE_1TO1_MAP --> + QURT_CBOOT_END_CLEAN_INIT --> QURT_CBOOT_END_OS_INIT +*/ +#define QURT_PREPARE_SINGLE_MODE 1 /**< */ +#define QURT_PREPARE_END 2 /**< */ +#define QURT_PERFORM_IPEND 3 /**< */ +#define QURT_PERFORM_SAVE_ISDP 4 /**< */ +#define QURT_PERFORM_SAVE_PMU 5 /**< */ +#define QURT_PERFORM_SAVE_TLB 6 /**< */ +#define QURT_PERFORM_SWITCH_PC 7 /**< */ +#define QURT_PERFORM_EXIT 8 /**< */ +#define QURT_FLUSH_L1CACHE 9 /**< */ +#define QURT_FLUSH_L2CACHE 0xA /**< */ +#define QURT_FLUSH_CACHE_DONE 0xB /**< */ +#define QURT_SWITCH_PC_DONE 0xC /**< */ +#define QURT_BOOT_SETUP_ISDB 0xD /**< */ +#define QURT_WBOOT_INIT_TLB 0xE /**< */ +#define QURT_WBOOT_SET_1TO1_MAP 0xF /**< */ +#define QURT_WBOOT_CFG_ADV_SYSCFG 0x10 /**< */ +#define QURT_WBOOT_REMOVE_1TO1_MAP 0x11 /**< */ +#define QURT_CBOOT_BSP_INIT 0x12 /**< */ +#define QURT_CBOOT_END_CLEAN_L1CACHE 0x13 /**< */ +#define QURT_CBOOT_END_CLEAN_INIT 0x14 /**< */ +#define QURT_CBOOT_END_OS_INIT 0x15 /**< */ +#define QURT_CBOOT_TLB_DUMP_LOAD 0x16 /**< */ +#define QURT_CBOOT_TLB_STATIC_LOAD 0x17 /**< */ +#define QURT_CBOOT_KERNEL_INIT_DONE 0x18 /**< */ +#define QURT_CBOOT_PLAT_CONFIG_DONE 0x19 /**< */ +#define QURT_CBOOT_ROOT_TASK_STARTED 0x1A /**< */ +#define QURT_IMPRECISE_EXCEPTION 0x1B /**< */ +#define QURT_WBOOT_DEBUG_L2_START 0x1C /**< */ +#define QURT_WBOOT_DEBUG_L2_END 0x1D /**< */ +#define QURT_NMI_SAVE_L2VIC_COMPLETE 0x1E /**< */ +#define QURT_NMI_HANDLER_COMPLETE 0x1F /**< */ +#define QURT_NMI_AFTER_SAVE_GLOBAL 0x20 /**< */ +#define QURT_WBOOT_START 0x21 /**< */ +#define QURT_ENTER_ISLAND 0x22 /**< */ +#define QURT_EXIT_ISLAND 0x23 /**< */ +#define QURT_LOAD_NOTIFIER_TCB 0x24 /**< */ +#define QURT_ABNORMAL_RESET 0x25 /**< */ +/* + Thread attributes +*/ + +#define QURT_THREAD_ATTR_GP 0x00000002 /*< */ +#define QURT_THREAD_ATTR_UGP 0x00000003 /*< User general pointer (UGP)*/ +#define QURT_THREAD_ATTR_PREFETCH 0x00000004 /*< */ +#define QURT_THREAD_ATTR_TID 0x00000005 /*< */ +#define QURT_THREAD_ATTR_CACHE_PART 0x00000007 /*< */ +#define QURT_THREAD_ATTR_COPROCESSOR 0x00000008 /*< */ +#define QURT_THREAD_ATTR_GET_L2CACHE_PART 0x00000009 /*< */ +#define QURT_THREAD_ATTR_SET_FRML 
0x0000000A /*< */ +#define QURT_THREAD_ATTR_STID_GET 0x0000000B /*< */ +#define QURT_THREAD_ATTR_STID_SET 0x0000000C /*< */ +#define QURT_THREAD_ATTR_AUTOSTACK 0x0000000D /*< */ +#define QURT_THREAD_ATTR_SYSTEM_THREAD 0x0000000E /*< */ +#define QURT_THREAD_ATTR_STID_SET2 0x0000000F /*< */ +#define QURT_THREAD_ATTR_STID_SET2_ACKNOWLEDGE 0x00000010 /*< */ +#define QURT_THREAD_ATTR_STID_GET2 0x00000011 /*< */ + +/** Cache operations*/ +#define QURT_DCCLEAN 0U /* Clean Dcache. */ +#define QURT_DCINV 1U /* Invalidate Dcache. */ +#define QURT_DCCLEANINV 2U /* Clean and invalidate Dcache. */ +#define QURT_ICINV 3U /* Invalidate Icache. */ +#define QURT_DUMP_DCTAGS 4U /* For testing purpose. */ +#define QURT_FLUSH_ALL 5U /* Flush entire L1 and L2 cache. */ +#define QURT_TABLE_FLUSH 6U /* Flush based on table of physical pages */ +#define QURT_CLEAN_INVALIDATE_ALL 7U /* Flush and invalidate entire L1 and L2 cache. */ +#define QURT_L2CACHE_LOCK_LINES 8U /* l2 cache lock lines */ +#define QURT_L2CACHE_UNLOCK_LINES 9U /* l2 cache unlock lines */ +#define QURT_CLEAN 10U /* Flush L1 and L2 cache */ +#define QURT_CLEAN_INVALIDATE 11U /* Flush and invalidate L1 and L2 cache. */ +#define QURT_CLEAN_INVALIDATE_L2 12U /* Flush and invalidate entire L2 cache. */ + +/**@ingroup chapter_prefined_symbols */ +/**@xreflabel{hdr:QURT_API_VERSION}*/ + + +/* Process state. */ +#define QURT_UPDATE_PROCESS_STATE 0 /**< */ +#define QURT_MP_INIT 1 /*< */ +#define QURT_MP_RUNNING 2 /*< */ +#define QURT_MP_STOPPED 3 /*< */ + +/* QuRT reset reason. */ +#define QURT_NORMAL_BOOT 0 /* Normal boot. */ +#define QURT_WARM_BOOT 1 /* Power collapse warm boot. */ +#define QURT_WARM_BOOT_L2_RETENTION 2 /* Power collapse with L2 retention warm boot. */ +#define QURT_WARM_BOOT_SAVE_TCM 3 /* Power collapse with saving TCM. */ +#define QURT_QUICK_BOOT 4 /* Deep sleep. */ + +/* QuRT Wait for Idle command */ +#define QURT_WAIT_FOR_IDLE_DISABLE 0 /*< */ +#define QURT_WAIT_FOR_IDLE_ENABLE 1 /*< */ +#define QURT_WAIT_FOR_IDLE 2 /*< */ +#define QURT_WAIT_FOR_IDLE_CANCEL 3 /*< */ + +/*QuRT island exit stages */ +#define QURT_ISLAND_EXIT_STAGE1 1 /*< */ +#define QURT_ISLAND_EXIT_STAGE2 2 /*< */ + +#define QURT_MAX_NAME_LEN 64 /*< */ + +#define MAX_POOL_RANGES 16 /*< */ + +/* key definitions for debug thread info */ +//#define MAX_TCB_KEY 40 //whatever is a good number or makes debug thread structure be 1K +#define KEY_SCHDULER_STATE 1 /*< */ +#define KEY_PRIORITY 2 /*< */ +#define KEY_PRIORITY_ORIG 3 /*< */ +#define KEY_STACK_BOTTOM 4 // Currently not populated +#define KEY_STACK_TOP 5 // Currently not populated +#define KEY_HVX_STATE 6 /*< */ +#define KEY_FUTEX_OBJECT 7 /*< */ +#define KEY_THREAD_ID 8 /*< */ +#define KEY_PROFILE_CYCLE_LO 9 // Currently not populated +#define KEY_PROFILE_CYCLE_HI 10 // Currently not populated +#define KEY_ERROR_ADDRESS 11 // This holds the BADVA +#define KEY_ERROR_CAUSE 12 // This is the same as QURT_error_info.cause +#define KEY_ERROR_CAUSE2 13 // This is the same as QURT_error_info.cause2 +#define KEY_ERROR_SSR 14 /*< Holds the SSR value */ +#define QURT_RESERVED -1 + +/* VTLB method IDs. 
+ */
+#define QURT_VTLB_ENTRY_CREATE          0U
+#define QURT_VTLB_ENTRY_DELETE          1U
+#define QURT_VTLB_ENTRY_READ            2U
+#define QURT_VTLB_ENTRY_WRITE           3U
+#define QURT_VTLB_ENTRY_PROBE           4U
+#define QURT_VTLB_ENTRY_SPLIT           5U
+#define QURT_VTLB_ENTRY_MERGE           6U
+#define QURT_VTLB_ENTRY_STATISTICS      7U
+#define QURT_VTLB_ENTRY_SET_SPECIAL     8U
+#define QURT_VTLB_QUEUE_PPAGE           9U
+#define QURT_VTLB_RECLAIM_STACK_PAGES   10U
+#define QURT_VTLB_ASID_SET_STATE_FAST   11U
+#define QURT_VTLB_ASID_SET_STATE        12U
+#define QURT_VTLB_ENTRY_SET_EXTENSION   13U
+#define QURT_VTLB_ENTRY_CLEAR_EXTENSION 14U
+
+/* VTCM window access control HWIO programming. */
+#define QURT_VTCM_WINDOW_ENABLE            1U
+#define QURT_VTCM_WINDOW_DISABLE           0U
+#define QURT_VTCM_WINDOW_HI_OFFSET_DEFAULT 0xFFFU
+#define QURT_VTCM_WINDOW_LO_OFFSET_DEFAULT 0U
+
+/** @cond */
+/* ETM source - PC or data access. */
+#define QURT_ETM_SOURCE_PC   0U /**< Memory source of SAC* is PC. */
+#define QURT_ETM_SOURCE_DATA 1U /**< Memory source of SAC* is data. */
+
+/* ETM PID status flags. */
+#define QURT_ETM_NO_PID 0xFFFFFFFF /**< No PID is selected. */
+/** @endcond */
+
+/* Execution context. */
+#define QURT_CTX_USER  1
+#define QURT_CTX_GUEST 2
+
+/* Profiling STID. */
+#define QURT_STID_DEFAULT 0U
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_CONSTS_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_cycles.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_cycles.h
new file mode 100755
index 0000000000000..b599493f5d563
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_cycles.h
@@ -0,0 +1,301 @@
+
+#ifndef QURT_CYCLES_H
+#define QURT_CYCLES_H 1
+/**
+  @file qurt_cycles.h
+  Prototypes of kernel pcycle API functions.
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ /*=====================================================================
+ Functions
+ ======================================================================*/
+
+/*======================================================================*/
+
+/**@ingroup func_qurt_profile_reset_idle_pcycles
+  @xreflabel{hdr:qurt_profile_reset_idle_pcycles}
+  Sets the per-hardware-thread idle cycle counts to zero.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+void qurt_profile_reset_idle_pcycles (void);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_get_thread_pcycles
+  @xreflabel{hdr:qurt_profile_get_thread_pcycles}
+  Gets the count of the running processor cycles for the current thread.\n
+  Returns the current running processor cycle count for the current QuRT thread.
+
+  @note1hang Profiling must be enabled first to start cycle counting.
+             Cycles accumulate once profiling is enabled, and reset on
+             #qurt_profile_reset_threadid_pcycles.
+
+  @return
+  Integer -- Running processor cycle count for the current thread.
+
+  @dependencies
+  None.
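+
+  @par Example
+  Informative sketch of measuring a code section from the current thread;
+  do_work() is an illustrative placeholder.
+
+  @code
+  // Informative sketch only.
+  qurt_profile_enable (1);                                  // Start counting.
+  unsigned long long t0 = qurt_profile_get_thread_pcycles ();
+  do_work ();                                               // Section to measure.
+  unsigned long long cycles = qurt_profile_get_thread_pcycles () - t0;
+  qurt_profile_enable (0);                                  // Stop counting.
+  @endcode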
+*/
+/* ======================================================================*/
+unsigned long long int qurt_profile_get_thread_pcycles(void);
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_get_core_pcycles
+  @xreflabel{hdr:qurt_get_core_pcycles}
+  Gets the count of core processor cycles executed.\n
+  Returns the current number of running processor cycles executed since the Hexagon
+  processor was last reset.
+
+  This value is based on the hardware core clock, which varies in speed according to the
+  processor clock frequency.
+
+  @note1hang Because the hardware core clock stops running when the processor shuts
+  down (due to all of the hardware threads being idle), treat the cycle values returned
+  by this operation as relative rather than absolute.
+
+  @note1cont Thread cycle counts are valid only in the V4 Hexagon processor version.
+
+  @return
+  Integer -- Current count of core processor cycles.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+unsigned long long int qurt_get_core_pcycles(void);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_get_idle_pcycles
+
+  @deprecated Use #qurt_profile_get_idle_pcycles2 instead.
+
+  Gets the current idle processor cycle counts for a maximum of 6 hardware threads. Use
+  #qurt_profile_get_idle_pcycles2 to read pcycles without a limit on the number of hardware threads.
+
+  This operation accepts a pointer to a user-defined array, and writes to the array the current
+  idle cycle count for each hardware thread.
+
+  Each count value represents the number of processor cycles that have elapsed on the
+  corresponding hardware thread while that thread has been in Wait mode.\n
+
+
+  @note1hang This operation does not return the idle cycles that occur when the Hexagon
+             processor shuts down (due to all of the hardware threads being idle).
+             Idle cycle counts accumulate irrespective of whether profiling is enabled,
+             and reset on #qurt_profile_reset_idle_pcycles.
+
+  @param[out] pcycles User array where the function stores the current idle cycle count values.
+                      The array size should be at least the number of hardware threads intended.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+void qurt_profile_get_idle_pcycles (unsigned long long *pcycles);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_get_idle_pcycles2
+  Gets the current idle processor cycle counts for the maximum available hardware threads.
+
+  This operation accepts a pointer to a user-defined array with its length in bytes, and writes
+  to the array the current idle cycle count for each hardware thread.
+
+  Each count value represents the number of processor cycles that have elapsed on the
+  corresponding hardware thread while that thread has been in Wait mode.\n
+
+  @note1hang This operation does not return the idle cycles that occur when the Hexagon
+             processor shuts down (due to all of the hardware threads being idle).
+             Idle cycle counts accumulate irrespective of the profiling enable status,
+             and reset on #qurt_profile_reset_idle_pcycles.
+
+  @param[out] pcycles User array where the function stores the current idle cycle count values.
+                      The array size should be equivalent to the number of hardware threads intended.
+                      Call #qurt_sysenv_get_max_hw_threads to determine the array size required.
+
+  @param[in] length_in_bytes Length of the pcycles array in bytes. If the array is smaller
+                             than required for the maximum available hardware threads,
+                             an error code is returned.
+
+  @return
+  #QURT_EOK -- Successful operation. All data was stored to the destination array. \n
+  #QURT_EFAILED -- Operation failed because the #pcycles array is too small.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+int qurt_profile_get_idle_pcycles2 (unsigned long long *pcycles, unsigned int length_in_bytes);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_get_threadid_pcycles
+
+  @deprecated Use #qurt_profile_get_threadid_pcycles2 instead.
+
+  Gets the current per-hardware-thread running cycle counts for the specified QuRT
+  thread for a maximum of 6 hardware threads.
+
+  Each count value represents the number of processor cycles that have elapsed on the
+  corresponding hardware thread while that thread has been scheduled for the specified
+  QuRT thread.
+
+  @note1hang Profiling must be enabled first to start cycle counting.
+             Cycles accumulate once profiling is enabled, and reset on
+             #qurt_profile_reset_threadid_pcycles.
+
+  @param[in]  thread_id Valid thread identifier.
+  @param[out] pcycles   Pointer to a user array where the function stores the current running
+                        cycle count values. The array size should be at least the number of
+                        hardware threads intended.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+void qurt_profile_get_threadid_pcycles (int thread_id, unsigned long long *pcycles);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_get_threadid_pcycles2
+
+  Gets the current per-hardware-thread running cycle counts for the specified QuRT
+  thread for the maximum available hardware threads.
+
+  Each count value represents the number of processor cycles that have elapsed on the
+  corresponding hardware thread while that thread has been scheduled for the specified
+  QuRT thread.
+
+  @note1hang Profiling must be enabled first to start cycle counting.
+             Cycles accumulate once profiling is enabled, and reset on
+             #qurt_profile_reset_threadid_pcycles.
+
+  @param[in]  thread_id       Thread identifier.
+  @param[out] pcycles         Pointer to a user array where the function stores the current running
+                              cycle count values. The array size should be equivalent to the number of
+                              hardware threads intended.
+                              Call #qurt_sysenv_get_max_hw_threads to determine the array size required.
+  @param[in]  length_in_bytes Length of the pcycles array in bytes. If the array is smaller
+                              than required for the maximum available hardware threads,
+                              an error code is returned.
+
+  @return
+  #QURT_EOK -- Successful operation. All data was stored to the destination array. \n
+  #QURT_EFAILED -- Operation failed because the #pcycles array is too small. \n
+  #QURT_ENOTHREAD -- Operation failed due to an invalid #thread_id.
+
+  @dependencies
+  None.
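+
+  @par Example
+  Informative sketch using a worst-case array sized with #QURT_MAX_HTHREAD_LIMIT;
+  tid is an illustrative, already-existing thread identifier.
+
+  @code
+  // Informative sketch only.
+  unsigned long long pcycles[QURT_MAX_HTHREAD_LIMIT] = {0};
+  int rc = qurt_profile_get_threadid_pcycles2 (tid, pcycles,
+                                               sizeof (pcycles));
+  if (rc == QURT_EOK) {
+      // pcycles[i] holds the cycles thread tid ran on hardware thread i.
+  }
+  @endcode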
+*/
+/* ======================================================================*/
+int qurt_profile_get_threadid_pcycles2 (int thread_id, unsigned long long *pcycles, unsigned int length_in_bytes);
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_reset_threadid_pcycles
+  @xreflabel{hdr:qurt_profile_reset_threadid_pcycles}
+  Sets the per-hardware-thread running cycle counts to zero for the specified QuRT thread.
+
+  @param[in] thread_id Thread identifier.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+void qurt_profile_reset_threadid_pcycles (int thread_id);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_enable
+  @xreflabel{hdr:qurt_profile_enable}
+  Enables profiling.\n
+  Enables or disables cycle counting of the running and idle processor cycles.
+  Profiling is disabled by default. \n
+
+  @note1hang Enabling profiling does not automatically reset the cycle counts -- this must be
+             done explicitly by calling the reset operations before starting cycle counting.
+             Cycle counting starts from the instant profiling is enabled using this API, and
+             halts when profiling is disabled.
+
+  @param[in] enable Profiling. Values: \n
+                    - 0 -- Disable profiling \n
+                    - 1 -- Enable profiling @tablebulletend
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+void qurt_profile_enable (int enable);
+
+/*======================================================================*/
+/**@ingroup func_qurt_get_hthread_pcycles
+  @xreflabel{hdr:qurt_get_hthread_pcycles}
+  Reads the GCYCLE_nT register to allow performance measurement when n threads are in run mode.\n
+
+  @note1hang Returns 0 when the architecture is earlier than v67 or for an invalid hardware thread ID.
+
+  @param[in] n Threads in run mode. Valid values are 1 through the number of hardware threads.
+
+  @return
+  Value read from the GCYCLE_nT register. This value indicates the total number of pcycles executed
+  from reset to the current point of execution when n threads are in run mode.
+
+  @dependencies
+  PMU must be enabled.
+*/
+/* ======================================================================*/
+unsigned int qurt_get_hthread_pcycles(int n);
+
+/*======================================================================*/
+/**@ingroup func_qurt_get_hthread_commits
+  @xreflabel{hdr:qurt_get_hthread_commits}
+  Reads the GCOMMIT_nT register to allow performance measurement when n threads are in run mode.\n
+
+  @note1hang Returns 0 when the architecture is earlier than v67 or for an invalid hardware thread ID.
+
+  @param[in] n Threads in run mode. Valid values: 1 through the number of hardware threads.
+
+  @return
+  Value read from the GCOMMIT_nT register. This value indicates the total number of packets
+  committed from reset to the current point of execution when n threads are in run mode.
+
+  @dependencies
+  PMU must be enabled.
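+
+  @par Example
+  Informative sketch combining this function with qurt_get_hthread_pcycles()
+  to estimate packets committed per cycle while n threads are in run mode;
+  the choice n = 2 is arbitrary.
+
+  @code
+  // Informative sketch only; assumes the PMU is enabled.
+  unsigned int pcyc = qurt_get_hthread_pcycles (2);
+  unsigned int pkts = qurt_get_hthread_commits (2);
+  double ppc = (pcyc != 0U) ? ((double) pkts / (double) pcyc) : 0.0;
+  @endcode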
+*/
+/* ======================================================================*/
+unsigned int qurt_get_hthread_commits(int n);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_devtree.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_devtree.h
new file mode 100755
index 0000000000000..4adee45bb44a2
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_devtree.h
@@ -0,0 +1,161 @@
+#ifndef QURT_DEVTREE_H
+#define QURT_DEVTREE_H
+/**
+  @file qurt_devtree.h
+  @brief Prototypes and structures for device tree aware QuRT library functions.
+
+Copyright (c) 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+*/
+/* qurt_callback is included by qurt_qdi_driver.h and depends on NULL being defined.
+   The callback is not used here, so define NULL here to avoid including the world. */
+#ifndef NULL
+#define NULL ((void *) 0)
+#endif
+
+#include "libfdt.h"
+#include "DTBExtnLib.h"
+#include "qurt_qdi_ext.h"
+#include "qurt_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define INVALID_BLOB_ID (-1)
+#define DEFAULT_BLOB_ID 0
+
+/** QuRT device tree mapping macros. */
+#define QURT_DT_MAPPING_FAILED (-1)
+#define QURT_DT_FLAG_ISLAND    0x1
+#define QURT_DT_FLAG_PHYSADDR  0x2
+
+/** Device tree type for the root PD device tree.
+    The root PD device tree typically describes the hardware in the subsystem.
+    This is the /soc portion of the device tree. */
+#define QURT_DT_BLOB_TYPE_ROOT 0
+
+/** Device tree type for the local device tree.
+    The local device tree typically contains the software settings.
+    This is the /sw portion of the device tree. */
+#define QURT_DT_BLOB_TYPE_LOCAL 1
+
+int qurt_devtree_init(void);
+
+/**@ingroup func_qurt_dt_mapping_create
+  Creates a memory mapping from the specified property of the specified device
+  tree node. Returns virtual addresses and sizes.
+
+  @param[in]  devtreeNode Device tree node handle.
+  @param[in]  flags       Flags to configure memory. Overloaded as the property
+                          index if regionName is NULL.
+  @param[in]  regionName  Identifies the property to use for mapping, should
+                          resemble a region.
+  @param[in]  regionIdx   Index of the range to use within the property.
+  @param[out] vaddr       Return pointer for the virtual region address.
+  @param[out] size        Return pointer for the virtual region size.
+
+  @return
+  Result code indicating success or failure. \n
+*/
+int qurt_dt_mapping_create(fdt_node_handle *devtreeNode, int flags, char *regionName, int regionIdx,
+                           unsigned long long *vaddr, unsigned long long *size);
+
+/**@ingroup func_qurt_dt_mapping_create2
+
+  Creates a memory mapping from the specified property of the specified device
+  tree node.
+
+  Returns virtual addresses and sizes according to the architecture (that is, either 32-bit or 64-bit).
+
+  @param[in] devtreeNode  Device tree node.
+
+  @param[in] dt_map_flags Flags to configure the memory mapping; reserved for future use.
+                          (0) - Default value; assumes the details from the DT node are a
+                          physical address and size.
+                          QURT_DT_FLAG_ISLAND
+
+                          NOTE: The PA needs to be added to the corresponding island spec
+                          to create an island mapping.
+
+  @param[in] regionName   NULL, or the name of the index in the range to return; should
+                          resemble a region. For example, reg-names = "base", "rx", "tx";
+
+  @param[in] regionIdx    Index of the range to return. For example, reg = <0x1000 0x20>, <0x10000 0x100>, <0x18000 0x100 >;
+
+                          NOTE: If the client specifies both regionName and regionIdx,
+                          regionName takes precedence and the region index is ignored.
+
+  @param[in] dt_map_perm  Mapping access permissions (R/W):
+                          QURT_PERM_READ
+                          QURT_PERM_WRITE
+
+  @param[in] cache_attr   QuRT cache mode types:
+                          QURT_MEM_CACHE_DEVICE
+                          QURT_MEM_CACHE_WRITEBACK
+                          Other required cache type enums in qurt_types.h can also be passed.
+
+                          NOTE: There is no default value for the cache and permission attributes.
+                          The client must always pass one of the defined flags.
+
+  @param[out] vaddr Return pointer to the variable that holds the virtual address.
+  @param[out] size  Return pointer for the virtual region size.
+
+  @return
+  #QURT_EOK -- Success; the mapping was created properly.
+  #QURT_DT_MAPPING_FAILED -- Failed to create the mapping.
+  #QURT_EINVALID -- Mismatch in the architecture.
+
+  Otherwise, an FdtLib or third-party error code.
+
+*/
+int qurt_dt_mapping_create2(fdt_node_handle *devtreeNode, unsigned int dt_map_flags,
+                            char *regionName, int regionIdx, unsigned int dt_map_perm, int cache_attr, void **vaddr, size_t *size);
+
+/**@ingroup func_qurt_dt_isr_register
+  Device tree aware registration of an interrupt service routine (ISR) to an ISR thread.
+  The interrupt defined in the specified device tree node is enabled when this function returns success.
+
+  @datatypes
+  #qurt_thread_t \n
+  #fdt_node_handle
+
+  @param[in] dt_node       Device tree node that specifies the interrupt property.
+  @param[in] dt_int_index  Index of the specific interrupt to use within the device tree node structure.
+                           Specify either this or dt_int_name; use -1 if the name is used.
+  @param[in] dt_int_name   Name of the specific interrupt to use within the device tree node structure.
+                           Specify either this or dt_int_index; use NULL if the index is used.
+  @param[in] isr_thread_id ISR thread ID, returned from qurt_isr_create(), defined by qurt_isr_register2().
+  @param[in] prio          Priority of the ISR, defined by qurt_isr_register2().
+  @param[in] flags         Defines the ACK type. Values: \n
+                           #QURT_INT_NON_DELAYED_ACK - ISR is acknowledged by the interrupt handle routine
+                           in the kernel.
+                           #QURT_INT_DELAYED_ACK - Client chooses to acknowledge.
+                           Defined by qurt_isr_register2().
+  @param[in] isr           ISR with prototype void isr (void *arg, int int_num), defined by qurt_isr_register2().
+  @param[in] arg           First argument of the ISR when it is called to service the interrupt, defined by qurt_isr_register2().
+
+  @return
+  #QURT_EOK -- Successfully registered the ISR for the interrupt \n
+  #QURT_EINT -- Interrupt not configured \n
+  #QURT_EINVALID -- Invalid thread ID \n
+  #QURT_EDISABLED -- The feature is disabled \n
+  #QURT_EDUPLICATE -- Interrupt is already registered
+
+  @dependencies
+  Create the ISR thread with qurt_isr_create().
+  Complete the ISR registration with qurt_isr_register2().
+ */
+int qurt_dt_isr_register(fdt_node_handle *dt_node, int dt_int_index, char * dt_int_name, qurt_thread_t isr_thread_id,
+                         unsigned short prio, unsigned short flags, void (*isr) (void *, int), void *arg);
+
+/**@ingroup func_qurt_dt_blob_id_get
+  Returns the blob ID for the blob type passed.
+  The value returned from this API can be passed as the blob ID parameter to DTBExtnLib APIs.
+
+  @param[in] blob_type Blob type to look up.
+  @return Blob ID for the passed blob type.
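+
+  @par Example
+  Informative sketch of looking up the local (software) blob; what is then done
+  with the ID depends on the DTBExtnLib APIs and is only hinted at here.
+
+  @code
+  // Informative sketch only.
+  int blob_id = qurt_dt_blob_id_get (QURT_DT_BLOB_TYPE_LOCAL);
+  if (blob_id != INVALID_BLOB_ID) {
+      // Pass blob_id as the blob ID parameter to DTBExtnLib lookups.
+  }
+  @endcode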
+*/ +int qurt_dt_blob_id_get(unsigned int blob_type); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_ecc.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_ecc.h new file mode 100755 index 0000000000000..09312684e99af --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_ecc.h @@ -0,0 +1,168 @@ +#ifndef QURT_ECC_H +#define QURT_ECC_H + + +/*===================================================================== + + @file qurt_ecc.h + @brief Prototypes of QuRT memory ECC API functions + + Copyright (c) 2018, 2020-2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + TYPEDEFS +=============================================================================*/ +/** @addtogroup exception_handling_types +@{ */ +// ECC memory definition +typedef enum { + QURT_ECC_MEM_L1_ICACHE = 0, /**< ECC memory L1 ICache. */ + QURT_ECC_MEM_L1_DCACHE = 1, /**< ECC memory L1 DCache.*/ + QURT_ECC_MEM_L2_CACHE = 2, /**< ECC memory L2 Cache.*/ + QURT_ECC_MEM_VTCM = 3 /**< ECC memory VTCM.*/ +} qurt_ecc_memory_t; +/** @} */ /* end_addtogroup exception_handling_types */ + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/** @addtogroup exception_handling_macros +@{ */ + +#define QURT_ECC_ERR_DETECTED_STATUS 0 /**< ECC error detected. */ +#define QURT_ECC_ERR_TYPE 1 /**< ECC error type.*/ +// ECC status type + +#define QURT_ECC_CORRECTABLE_COUNT (1<<0) /**< ECC correctable count.*/ +#define QURT_ECC_UNCORRECTABLE_COUNT (1<<1) /**< ECC uncorrectable count.*/ +#define QURT_ECC_REGION_LOGGING (1<<2) /**< ECC region logging.*/ +// ECC enable/disable definition + +#define QURT_ECC_PROTECTION_DISABLE (0<<0) /**< Bit 0. */ +#define QURT_ECC_PROTECTION_ENABLE (1<<0) /**< Bit 0. */ +/** @} */ /* end_addtogroup exception_handling_macros */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ + + +/**@ingroup func_qurt_ecc_enable + Enables or disables ECC protection on a specified memory. + + @datatypes + #qurt_ecc_memory_t + + @param[in] memory Set to one of the following values: + - #QURT_ECC_MEM_L1_ICACHE + - #QURT_ECC_MEM_L1_DCACHE + - #QURT_ECC_MEM_L2_CACHE + - #QURT_ECC_MEM_VTCM @tablebulletend + + @param[in] enable Set to one of the following values: + - #QURT_ECC_PROTECTION_ENABLE + - #QURT_ECC_PROTECTION_DISABLE @tablebulletend + + @return + - #QURT_EOK -- ECC enabling or disabling setup is performed successfully + - Others -- Failure + + @dependencies + None. + */ +int qurt_ecc_enable( qurt_ecc_memory_t memory, unsigned int enable ); + + +/**@ingroup func_qurt_ecc_get_error_status + Gets ECC error status for a specified memory. 
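+
+  @par Example
+  Informative sketch of enabling VTCM ECC protection and then polling the
+  detection status with this function.
+
+  @code
+  // Informative sketch only.
+  if (qurt_ecc_enable (QURT_ECC_MEM_VTCM, QURT_ECC_PROTECTION_ENABLE)
+          == QURT_EOK) {
+      int detected = qurt_ecc_get_error_status (QURT_ECC_MEM_VTCM,
+                                                QURT_ECC_ERR_DETECTED_STATUS);
+      // detected: 0 = no error, 1 = at least one error detected.
+  }
+  @endcode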
+
+  @datatypes
+  #qurt_ecc_memory_t
+
+  @param[in] memory Set to one of the following:
+                    - #QURT_ECC_MEM_L1_ICACHE
+                    - #QURT_ECC_MEM_L1_DCACHE
+                    - #QURT_ECC_MEM_L2_CACHE
+                    - #QURT_ECC_MEM_VTCM @tablebulletend
+
+  @param[in] type Set to one of the following:
+                  - #QURT_ECC_ERR_DETECTED_STATUS
+                  - #QURT_ECC_ERR_TYPE @tablebulletend
+
+  @return
+  Returns the following when the type is #QURT_ECC_ERR_DETECTED_STATUS:
+  - 0 -- No error detected \n
+  - 1 -- At least one error detected \n
+  Returns the following when the type is #QURT_ECC_ERR_TYPE: \n
+  - 0 through 1 -- Correctable error \n
+  - 2 -- Uncorrectable error
+
+  @dependencies
+  None.
+ */
+int qurt_ecc_get_error_status( qurt_ecc_memory_t memory, unsigned int type );
+
+
+/**@ingroup func_qurt_ecc_get_error_count
+  Gets the ECC error count for a specified memory.
+
+  @datatypes
+  #qurt_ecc_memory_t
+
+  @param[in] memory Set to one of the following values:\n
+                    - #QURT_ECC_MEM_L1_ICACHE \n
+                    - #QURT_ECC_MEM_L1_DCACHE \n
+                    - #QURT_ECC_MEM_L2_CACHE \n
+                    - #QURT_ECC_MEM_VTCM @tablebulletend
+
+  @param[in] type Set to one of the following values: \n
+                  - #QURT_ECC_CORRECTABLE_COUNT \n
+                  - #QURT_ECC_UNCORRECTABLE_COUNT @tablebulletend
+
+  @return
+  Error count for the specified error type.
+
+  @dependencies
+  None.
+ */
+int qurt_ecc_get_error_count( qurt_ecc_memory_t memory, unsigned int type );
+
+
+/**@ingroup func_qurt_ecc_clear_error_count
+  Clears the ECC error count or region logging for a specified memory.
+
+  @datatypes
+  #qurt_ecc_memory_t
+
+  @param[in] memory Set to one of the following values: \n
+                    - #QURT_ECC_MEM_L1_ICACHE \n
+                    - #QURT_ECC_MEM_L1_DCACHE \n
+                    - #QURT_ECC_MEM_L2_CACHE \n
+                    - #QURT_ECC_MEM_VTCM @tablebulletend
+
+  @param[in] type Set to one of the following values, or multiple values OR'ed together: \n
+                  - #QURT_ECC_CORRECTABLE_COUNT \n
+                  - #QURT_ECC_UNCORRECTABLE_COUNT \n
+                  - #QURT_ECC_REGION_LOGGING @tablebulletend
+
+  @return
+  #QURT_EOK -- Error count successfully cleared \n
+  Others -- Failure to clear the error count
+
+  @dependencies
+  None.
+ */
+int qurt_ecc_clear_error_count( qurt_ecc_memory_t memory, unsigned int type );
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_ECC_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_error.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_error.h
new file mode 100755
index 0000000000000..f4666b396c378
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_error.h
@@ -0,0 +1,149 @@
+#ifndef QURT_ERROR_H
+#define QURT_ERROR_H
+
+/**
+  @file qurt_error.h
+  Error results -- QuRT defines a set of standard symbols for error result values. This file lists the
+  symbols and their corresponding values.
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2021-2022, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+#include "qurt_except.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup chapter_error
+@{ */
+
+/*=====================================================================
+Constants and macros
+======================================================================*/
+#define QURT_EOK  0 /**< Operation successfully performed. */
+#define QURT_EVAL 1 /**< Wrong values for the parameters. The specified page does not exist.
*/ +#define QURT_EMEM 2 /**< Not enough memory to perform the operation.*/ + +#define QURT_EINVALID 4 /**< Invalid argument value; invalid key. */ +/** @cond */ +#define QURT_EUNKNOWN 6 /**< Defined but never used in QuRT. */ +#define QURT_ENOMSGS 7 /**< Message queue is empty. */ +#define QURT_EBADF 9 /**< Bad message queue descriptor. */ +/** @endcond */ +#define QURT_EFAILED 12 /**< Operation failed. */ + +#define QURT_ENOTALLOWED 13 /**< Operation not allowed. */ + +/** @cond */ +#define QURT_EDUPCLSID 14 /*< Duplicate class ID. */ +/** @endcond */ +/** @cond rest_reg_dist */ +#define QURT_ENOREGISTERED 20 /**< No registered interrupts.*/ +/** @endcond */ + + +/** @cond */ +#define QURT_EISDB 21 /*< Power collapse failed due to ISDB being enabled. */ +#define QURT_ESTM 22 /*< Power collapse failed in a Single-threaded mode check. */ +/** @endcond */ + + +/** @cond rest_reg_dist */ +#define QURT_ETLSAVAIL 23 /**< No free TLS key is available. */ +#define QURT_ETLSENTRY 24 /**< TLS key is not already free. */ +/** @endcond */ + +#define QURT_EINT 26 /**< Invalid interrupt number (not registered). */ +/** @cond rest_reg_dist */ +#define QURT_ESIG 27 /**< Invalid signal bitmask (cannot set more than one signal at a time). */ +/** @endcond */ + +/** @cond */ +#define QURT_EHEAP 28 /**< No heap space is available. */ +#define QURT_ENOSPC 28 /**< No space to create another queue in the system. */ +#define QURT_EMEMMAP 29 /**< Physical address layout is not supported by the kernel. */ +/** @endcond */ +/** @cond rest_reg_dist */ +#define QURT_ENOTHREAD 30 /**< Thread no longer exists. */ +/** @endcond */ +/** @cond */ +#define QURT_EL2CACHE 31 /**< L2cachable is not supported in kernel invalidate/cleaninv. */ +/** @endcond */ +/** @cond rest_reg_dist */ +#define QURT_EALIGN 32 /**< Not aligned. */ +#define QURT_EDEREGISTERED 33 /**< Interrupt is already deregistered.*/ +/** @endcond */ + +/** @cond internal_only */ + +#define QURT_ETLBCREATESIZE 34 /**< TLB create error -- Incorrect size.*/ +#define QURT_ETLBCREATEUNALIGNED 35 /**< TLB create error -- Unaligned address.*/ +/** @endcond */ +/** @cond rest_reg_dist*/ +#define QURT_EEXISTS 35 /**< File or message queue already exists. */ +#define QURT_ENAMETOOLONG 36 /**< Name too long for message queue creation. */ +#define QURT_EPRIVILEGE 36 /**< Caller does not have privilege for this operation.*/ + +#define QURT_ECANCEL 37 /**< A cancellable request was canceled because the associated process was asked to exit.*/ +/** @endcond */ + +/** @cond */ +#define QURT_EISLANDTRAP 38 /*< Unsupported TRAP is called in Island mode.*/ + +#define QURT_ERMUTEXUNLOCKNONHOLDER 39 /*< Rmutex unlock by a non-holder.*/ +#define QURT_ERMUTEXUNLOCKFATAL 40 /*< Rmutex unlock error, all except the non-holder error.*/ +#define QURT_EMUTEXUNLOCKNONHOLDER 41 /*< Mutex unlock by a non-holder.*/ +#define QURT_EMUTEXUNLOCKFATAL 42 /*< Mutex unlock error, all except the non-holder error.*/ +#define QURT_EINVALIDPOWERCOLLAPSE 43 /*< Invalid power collapse mode requested. */ +/** @endcond */ +#define QURT_EISLANDUSEREXIT 44 /**< User call has resulted in island exit.*/ +#define QURT_ENOISLANDENTRY 45 /**< Island mode had not yet been entered.*/ +#define QURT_EISLANDINVALIDINT 46 /**< Exited Island mode due to an invalid island interrupt.*/ +/** @cond rest_reg_dist */ +#define QURT_ETIMEDOUT 47 /**< Operation timed-out. */ +#define QURT_EALREADY 48 /**< Operation already in progress. */ +/** @endcond */ + +#define QURT_ERETRY 49 /*< Retry the operation. 
*/ +#define QURT_EDISABLED 50 /*< Resource disabled. */ +#define QURT_EDUPLICATE 51 /*< Duplicate resource. */ +#define QURT_EBADR 53 /*< Invalid request descriptor. */ +#define QURT_ETLB 54 /*< Exceeded maximum allowed TLBs. */ +#define QURT_ENOTSUPPORTED 55 /*< Operation not supported. */ +/** @cond rest_reg_dist */ +#define QURT_ENORESOURCE 56 /**< No resource. */ +/** @endcond */ + +#define QURT_EDTINIT 57 /**< Problem with device tree intialization. */ +#define QURT_EBUFLOCK 58 /*< Buffer lock failed because it was already locked many times. */ +#define QURT_ELOCKED 59 /**< Current operation failed as the buffer is locked. */ +#define QURT_EMSGSIZE 90 /*< Message queue msg_len is greater than mq_msgsize attribute of the message queue. */ + + +#define QURT_ENOTCONFIGURED 91 /*< Interrupt is NOT configured. */ + +#define QURT_EBANDWIDTHLIMIT 92 /*< Message queue send exceed the bandwidth limit. */ + +#define QURT_ECFIVIOLATION 93 /*< CFI violation detected. */ + +#define QURT_EDESTROY 94 /**< A destroy request was made to waiting threads.*/ + +#define QURT_EHMXNOTAVAIL 95 /**< HMX is not available to target thread.*/ +#define QURT_EHMXNOTDETACHABLE 96 /**< HMX is not detachable from target thread.*/ + +#define QURT_EFATAL -1 /**< Fatal error. */ + +/** @} */ /* end_addtogroup chapter_error */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ERROR_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_event.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_event.h new file mode 100755 index 0000000000000..987f0fe79f227 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_event.h @@ -0,0 +1,452 @@ +#ifndef QURT_EVENT_H +#define QURT_EVENT_H +/** + @file qurt_event.h + @brief Prototypes of kernel event API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2018-2021, 2023 Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#include "qurt_consts.h" +#include "qurt_thread.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * System environment object type. + */ +/**@addtogroup sys_env_types +@{ */ +/** QuRT swap pool information type. */ +typedef struct qurt_sysenv_swap_pools { + /** @cond */ + unsigned int spoolsize; /* Swap pool size.*/ + unsigned int spooladdr; /* Swap pool start address.*/ + /** @endcond */ +}qurt_sysenv_swap_pools_t; + +/**QuRT application heap information type. */ +typedef struct qurt_sysenv_app_heap { + /** @cond */ + unsigned int heap_base; /* Heap base address.*/ + unsigned int heap_limit; /* Heap end address.*/ + /** @endcond */ +} qurt_sysenv_app_heap_t ; + +/** QuRT architecture version information type. */ +typedef struct qurt_sysenv_arch_version { + /** @cond */ + unsigned int arch_version; /*Architecture version.*/ + /** @endcond */ +}qurt_arch_version_t; + +/** QuRT maximum hardware threads information type. */ +typedef struct qurt_sysenv_max_hthreads { + /** @cond */ + unsigned int max_hthreads; /*Maximum number of hardware threads.*/ + /** @endcond */ +}qurt_sysenv_max_hthreads_t; + +/** QuRT active hardware threads information type. 
*/ +typedef struct qurt_sysenv_hthreads { + /** @cond */ + unsigned int hthreads; /*Maximum number of hardware threads.*/ + /** @endcond */ +}qurt_sysenv_hthreads_t; + +/** QuRT maximum pi priority information type. */ +typedef struct qurt_sysenv_max_pi_prio { + /** @cond */ + unsigned int max_pi_prio; /*Maximum pi priority.*/ + /** @endcond */ +}qurt_sysenv_max_pi_prio_t; + +/** QuRT process name information type. */ +typedef struct qurt_sysenv_procname { + /** @cond */ + union { + unsigned int asid; /*Address space ID.*/ + unsigned int pid; /*Process ID.*/ + }; + char name[QURT_MAX_NAME_LEN]; /* Process name.*/ + /** @endcond */ +}qurt_sysenv_procname_t; + +/** QuRT stack profile count information type. */ +typedef struct qurt_sysenv_stack_profile_count { + /** @cond */ + unsigned int count; /*Stack profile count for usage.*/ + unsigned int count_watermark; /*Stack profile count for watermark.*/ + /** @endcond */ +}qurt_sysenv_stack_profile_count_t; + +/** + QuRT system error event type. + */ +typedef struct _qurt_sysevent_error_t +{ + unsigned int thread_id; /**< Thread ID. */ + unsigned int fault_pc; /**< Fault PC. */ + unsigned int sp; /**< Stack pointer. */ + unsigned int badva; /**< Virtual data address where the exception occurred. */ + unsigned int cause; /**< QuRT error result. */ + unsigned int ssr; /**< Supervisor status register. */ + unsigned int fp; /**< Frame pointer. */ + unsigned int lr; /**< Link register. */ + unsigned int pid; /**< PID of the process to which this thread belongs.*/ + } qurt_sysevent_error_t ; + +typedef struct _qurt_sysevent_error_1_t +{ + unsigned int thread_id; /**< Thread ID. */ + unsigned int fault_pc; /**< Fault PC. */ + unsigned int sp; /**< Stack pointer. */ + unsigned int badva; /**< Virtual data address where the exception occurred. */ + unsigned int cause; /**< QuRT error result. */ + unsigned int ssr; /**< Supervisor status register. */ + unsigned int fp; /**< Frame pointer. */ + unsigned int lr; /**< Link register. */ + unsigned int pid; /**< PID of the process to which this thread belongs.*/ + unsigned int fkey; /**< Framekey.*/ + unsigned int reserved1; /**< Reserved.*/ + unsigned int reserved2; /**< Reserved.*/ + unsigned int reserved3; /**< Reserved.*/ + } qurt_sysevent_error_1_t ; + +/** QuRT page fault error event information type. */ +typedef struct qurt_sysevent_pagefault { + qurt_thread_t thread_id; /**< Thread ID of the page fault thread. */ + unsigned int fault_addr; /**< Accessed address that caused the page fault. */ + unsigned int ssr_cause; /**< SSR cause code for the page fault. */ +} qurt_sysevent_pagefault_t ; +/** @} */ /* @endaddtogroup sys_env_types */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/*======================================================================*/ +/** + Gets the environment swap pool 0 information from the kernel. + + @datatypes + #qurt_sysenv_swap_pools_t + + @param[out] pools Pointer to the pools information. + + @return + #QURT_EOK -- Success. + + @dependencies + None. +*/ +int qurt_sysenv_get_swap_spool0 (qurt_sysenv_swap_pools_t *pools ); + +/* + Gets the environment swap pool 1 information from the kernel. + + @datatypes + #qurt_sysenv_swap_pools_t + + @param[out] pools Pointer to the pools information. + + @return + #QURT_EOK -- Success. + + @dependencies + None. 
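+
+   A minimal calling sketch (illustrative only -- the error-handling
+   pattern is an assumption, not mandated by this header):
+
+   @code
+   qurt_sysenv_swap_pools_t pools;
+   if (qurt_sysenv_get_swap_spool1(&pools) == QURT_EOK) {
+       // pools.spooladdr and pools.spoolsize describe swap pool 1.
+   }
+   @endcode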
+*/ +int qurt_sysenv_get_swap_spool1(qurt_sysenv_swap_pools_t *pools ); + +/**@ingroup func_qurt_sysenv_get_app_heap + Gets information on the program heap from the kernel. + + @datatypes + #qurt_sysenv_app_heap_t + + @param[out] aheap Pointer to information on the program heap. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_app_heap(qurt_sysenv_app_heap_t *aheap ); + +/**@ingroup func_qurt_sysenv_get_arch_version + Gets the Hexagon processor architecture version from the kernel. + + @datatypes + #qurt_arch_version_t + + @param[out] vers Pointer to the Hexagon processor architecture version. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter + + @dependencies + None. +*/ +int qurt_sysenv_get_arch_version(qurt_arch_version_t *vers); + +/**@ingroup func_qurt_sysenv_get_max_hw_threads + Gets the maximum number of hardware threads supported in the Hexagon processor. + The API includes the disabled hardware threads to reflect the maximum + hardware thread count. + For example, if the image is configured for four hardware threads and hthread_mask is set to 0x5 in + cust_config.xml, only HW0 and HW2 are initialized by QuRT. + HW1 and HW3 are not used at all. Under such a scenario, + qurt_sysenv_get_max_hw_threads() still returns four. + + @datatypes + #qurt_sysenv_max_hthreads_t + + @param[out] mhwt Pointer to the maximum number of hardware threads supported in the Hexagon processor. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_max_hw_threads(qurt_sysenv_max_hthreads_t *mhwt ); + +/**@ingroup func_qurt_sysenv_get_hw_threads + Gets the number of hardware threads initialized by QuRT in Hexagon processor. + For example, if the image is configured for four hardware threads and hthread_mask is set to 0x5 in + cust_config.xml, QuRT only initializes HW0 and HW2. + HW1 and HW3 are not used. In this scenario, qurt_sysenv_get_hw_threads() returns 2. + + @datatypes + #qurt_sysenv_hthreads_t + + @param[out] mhwt Pointer to the number of hardware threads active in the Hexagon processor. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_hw_threads(qurt_sysenv_hthreads_t *mhwt ); + +/**@ingroup func_qurt_sysenv_get_max_pi_prio + Gets the maximum priority inheritance mutex priority from the kernel. + + @datatypes + #qurt_sysenv_max_pi_prio_t + + @param[out] mpip Pointer to the maximum priority inheritance mutex priority. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_max_pi_prio(qurt_sysenv_max_pi_prio_t *mpip ); + +/**@ingroup func_qurt_sysenv_get_process_name2 + Gets information on the system environment process names based on the client_handle argument. + + @datatypes + #qurt_sysenv_procname_t + + @param[in] client_handle Obtained from the current invocation function (Section 3.4.1). + @param[out] pname Pointer to information on the process names in the system. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_process_name2(int client_handle, qurt_sysenv_procname_t *pname ); + +/**@ingroup func_qurt_sysenv_get_process_name + Gets information on the system environment process names from the kernel. 
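+
+   A hedged usage sketch (the logging call is illustrative and assumes a
+   printf-style facility is available):
+
+   @code
+   qurt_sysenv_procname_t pn;
+   if (qurt_sysenv_get_process_name(&pn) == QURT_EOK) {
+       printf("process %s (pid %u)\n", pn.name, pn.pid);
+   }
+   @endcode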
+ + @datatypes + #qurt_sysenv_procname_t + + @param[out] pname Pointer to information on the process names in the system. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_process_name(qurt_sysenv_procname_t *pname ); + +/**@ingroup func_qurt_sysenv_get_stack_profile_count + Gets information on the stack profile count from the kernel. + + @datatypes + #qurt_sysenv_stack_profile_count_t + + @param[out] count Pointer to information on the stack profile count. + + @return + #QURT_EOK -- Success. + + @dependencies + None. +*/ +int qurt_sysenv_get_stack_profile_count(qurt_sysenv_stack_profile_count_t *count ); + +/**@ingroup func_qurt_exception_wait + Registers the program exception handler. + This function assigns the current thread as the QuRT program exception handler and suspends the + thread until a program exception occurs. + + When a program exception occurs, the thread is awakened with error information + assigned to the parameters of this operation. + + @note1hang If no program exception handler is registered, or if the registered handler + calls exit, QuRT raises a kernel exception. + If a thread runs in Supervisor mode, any errors are treated as kernel + exceptions. + + @param[out] ip Pointer to the instruction memory address where the exception occurred. + @param[out] sp Stack pointer. + @param[out] badva Pointer to the virtual data address where the exception occurred. + @param[out] cause Pointer to the QuRT error result code. + + @return + Registry status: \n + Thread identifier -- Handler successfully registered. \n + #QURT_EFATAL -- Registration failed. + + @dependencies + None. +*/ +unsigned int qurt_exception_wait (unsigned int *ip, unsigned int *sp, + unsigned int *badva, unsigned int *cause); + +unsigned int qurt_exception_wait_ext (qurt_sysevent_error_t * sys_err); + +/**@ingroup func_qurt_exception_wait3 + Registers the current thread as the QuRT program exception handler, and suspends the thread until a + program exception occurs. + When a program exception occurs, the thread is awakened with error information assigned to the specified + error event record. + If a program exception is raised when no handler is registered (or when a handler is registered, but it calls + exit), the exception is treated as fatal.\n + @note1hang If a thread runs in Monitor mode, all exceptions are treated as kernel exceptions.\n + @note1cont This function differs from qurt_exception_wait() by returning the error information in a data + structure rather than as individual variables. It also returns additional information (for example, SSR, FP, and LR). + + @param[out] sys_err Pointer to the qurt_sysevent_error_1_t type structure. + @param[in] sys_err_size Size of the qurt_sysevent_error_1_t structure. + + @return + Registry status: \n + - #QURT_EFATAL -- Failure. \n + - Thread ID -- Success. + + @dependencies + None. +*/ + +unsigned int qurt_exception_wait3(void * sys_err, unsigned int sys_err_size); + +/**@ingroup func_qurt_exception_raise_nonfatal + Raises a nonfatal program exception in the QuRT program system. + + For more information on program exceptions, see Section @xref{dox:exception_handling}. + + This operation never returns -- the program exception handler is assumed to perform all + exception handling before terminating or reloading the QuRT program system. + + @note1hang The C library function abort() calls this operation to indicate software + errors. 
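+
+   A minimal sketch (software_fault_detected() is a hypothetical helper;
+   the error value is an arbitrary illustration):
+
+   @code
+   if (software_fault_detected()) {
+       qurt_exception_raise_nonfatal(QURT_EFAILED);  // never returns
+   }
+   @endcode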
+ + @param[in] error QuRT error result code (Section @xref{dox:error_results}). + + @return + Integer -- Unused. + + @dependencies + None. +*/ +int qurt_exception_raise_nonfatal (int error) __attribute__((noreturn)); + + +/**@ingroup func_qurt_exception_raise_fatal + Raises a fatal program exception in the QuRT system. + + Fatal program exceptions terminate the execution of the QuRT system without invoking + the program exception handler. + + For more information on fatal program exceptions, see Section @xref{dox:exception_handling}. + + This operation always returns, so the calling program can perform the necessary shutdown + operations (data logging, on so on). + + @note1hang Context switches do not work after this operation has been called. + + @return + None. + + @dependencies + None. +*/ +void qurt_exception_raise_fatal (void); + +unsigned int qurt_enable_floating_point_exception(unsigned int mask); + +/**@ingroup func_qurt_exception_enable_fp_exceptions + Enables the specified floating point exceptions as QuRT program exceptions. + + The exceptions are enabled by setting the corresponding bits in the Hexagon + control user status register (USR). + + The mask argument specifies a mask value identifying the individual floating + point exceptions to set. The exceptions are represented as defined symbols + that map into bits 0 through 31 of the 32-bit flag value. + Multiple floating point exceptions are specified by OR'ing together the individual + exception symbols.\n + @note1hang This function must be called before performing any floating point operations. + + @param[in] mask Floating point exception types. Values: \n + - #QURT_FP_EXCEPTION_ALL \n + - #QURT_FP_EXCEPTION_INEXACT \n + - #QURT_FP_EXCEPTION_UNDERFLOW \n + - #QURT_FP_EXCEPTION_OVERFLOW \n + - #QURT_FP_EXCEPTION_DIVIDE0 \n + - #QURT_FP_EXCEPTION_INVALID @tablebulletend + + @return + Updated contents of the USR. + + @dependencies + None. +*/ + +static inline unsigned int qurt_exception_enable_fp_exceptions(unsigned int mask) +{ + return qurt_enable_floating_point_exception(mask); +} + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_EVENT_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_except.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_except.h new file mode 100755 index 0000000000000..e1684c80e3d50 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_except.h @@ -0,0 +1,185 @@ +#ifndef QURT_EXCEPT_H +#define QURT_EXCEPT_H + +/** + @file qurt_except.h + @brief Defines Cause and Cause2 codes for error-handling. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2021-2022 by Qualcomm Technologies, Inc. All Rights Reserved. + + Confidential and Proprietary - Qualcomm Technologies, Inc.. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + QuRT supports error handling to handle CPU detected exceptions and software errors. + QuRT treats all errors as either fatal errors or nonfatal errors. + + @section sec1 Fatal errors + All supervisor mode exceptions are treated as fatal errors. + If a registered exception handler calls qurt_exit(), it is treated as a fatal error. + Fatal errors result in saving the context of primary hardware thread to QURT_error_info and the rest of the thread contexts to the corresponding TCBs. 
+ All hardware threads are eventually stopped and the cache is flushed.
+ The NMI exception is treated a little differently from other fatal errors: QuRT saves the contexts of all the hardware threads into QURT_error_info.\n
+
+ @subsection subsection1 Debugging fatal errors
+ - QURT_error_info.status.status -- Indicates that an error occurred.
+ - QURT_error_info.status.cause -- Cause code for the fatal error; Cause and Cause2 details are listed below.
+ - QURT_error_info.status.cause2 -- Cause2 code for the fatal error; Cause and Cause2 details are listed below.
+ - QURT_error_info.status.fatal -- Indicates whether a fatal error occurred. A user error can result in a fatal error if no exception handler is registered.
+ - QURT_error_info.status.hw_tnum -- Indicates the index of QURT_error_info.locregs[], where the context is saved when the error is a fatal error.
+ - QURT_error_info.global_regs -- Contains the values of the global registers of Q6.
+ - QURT_error_info.local_regs[QURT_error_info.status.hw_tnum] -- Provides the CPU context when the error is a supervisor error.
+
+ @subsection subsection2 Debugging nonfatal errors
+ - QURT_error_info.user_errors -- All user errors are logged here.
+ - QURT_error_info.user_errors.counter -- Index of the last logged error.
+ - QURT_error_info.user_errors.entry[0...counter] -- Structure for a logged error.
+ - QURT_error_info.user_errors.entry[0...counter].error_tcb -- TCB for the user error.
+ - QURT_error_info.user_errors.entry[0...counter].error_tcb.error -- Information about the error: Cause, Cause2, Badva, and hardware thread ID.
+ - QURT_error_info.user_errors.entry[0...counter].error_code -- ((cause2 << 8) OR (cause)); Cause and Cause2 details are listed below.
+ - QURT_error_info.user_errors.entry[0...counter].hw_thread -- Hardware thread ID for the error.
+ - QURT_error_info.user_errors.entry[0...counter].pcycle -- Pcycle count for the error.
+
+@note
+ Important usage note:
+ Cause and Cause2 are error codes that distinguish multiple errors.
+ SSR and BADVA are inconclusive without the vector number.
+ Cause and Cause2 values can each range from 1 to 255, so every cause can have up to 255 error codes,
+ and the system can have up to 255 * 255 unique error codes.
+ The combination is represented as ((cause2 << 8) OR (cause)).
+ Some Cause2 codes are statically defined, whereas others are obtained from the SSR[7:0] cause codes, depending on the cause code.
+ SSR cause codes are defined in the Hexagon reference manual.
+ All possible combinations are listed below.
+*/
+/** @addtogroup chapter_error
+@{ */
+/* cause - error type - 8 bits */
+#define QURT_EXCEPT_PRECISE             0x01U /**< Precise exception occurred. For this cause code, Cause2 is SSR[7:0].*/
+#define QURT_EXCEPT_NMI                 0x02U /**< NMI occurred; Cause2 is not defined. */
+#define QURT_EXCEPT_TLBMISS             0x03U /**< TLBMISS RW occurred; for this cause code, Cause2 is SSR[7:0]. */
+#define QURT_EXCEPT_RSVD_VECTOR         0x04U /**< Interrupt raised on a reserved vector, which must never occur. Cause2 is not defined. */
+#define QURT_EXCEPT_ASSERT              0x05U /**< Kernel assert. The Cause2 QURT_ABORT_* values are listed below. */
+#define QURT_EXCEPT_BADTRAP             0x06U /**< trap0(num) called with an unsupported num. Cause2 is 0. */
+#define QURT_EXCEPT_UNDEF_TRAP1         0x07U /**< Trap1 is not supported. Using Trap1 causes this error. Cause2 is not defined. */
+#define QURT_EXCEPT_EXIT                0x08U /**< Application called qurt_exit() or qurt_exception_raise_nonfatal(). Can be called from the C library.
Cause2 is "[Argument passed to qurt_exception_raise_nonfatal() & 0xFF]". */ +#define QURT_EXCEPT_TLBMISS_X 0x0AU /**< TLBMISS X (execution) occurred. Cause2 is not defined. */ +#define QURT_EXCEPT_STOPPED 0x0BU /**< Running thread stopped due to fatal error on other hardware thread. Cause2 is not defined. */ +#define QURT_EXCEPT_FATAL_EXIT 0x0CU /**< Application called qurt_fatal_exit(). Cause2 is not defined. */ +#define QURT_EXCEPT_INVALID_INT 0x0DU /**< Kernel received an invalid L1 interrupt. Cause2 is not defined. */ +#define QURT_EXCEPT_FLOATING_POINT 0x0EU /**< Kernel received an floating point error. Cause2 is not defined. */ +#define QURT_EXCEPT_DBG_SINGLE_STEP 0x0FU /**< Cause2 is not defined. */ +#define QURT_EXCEPT_TLBMISS_RW_ISLAND 0x10U /**< Read write miss in Island mode. Cause2 QURT_TLB_MISS_RW_MEM* are listed below. */ +#define QURT_EXCEPT_TLBMISS_X_ISLAND 0x11U /**< Execute miss in Island mode. For this cause code, Cause2 is SSR[7:0]. */ +#define QURT_EXCEPT_SYNTHETIC_FAULT 0x12U /**< Synthetic fault with user request that kernel detected. Cause2 QURT_SYNTH_* are listed below. */ +#define QURT_EXCEPT_INVALID_ISLAND_TRAP 0x13U /**< Invalid trap in Island mode. Cause2 is trap number. */ +#define QURT_EXCEPT_UNDEF_TRAP0 0x14U /**< trap0(num) was called with unsupported num. Cause2 is trap number. */ +#define QURT_EXCEPT_PRECISE_DMA_ERROR 0x28U /**< Precise DMA error. Cause2 is DM4[15:8]. Badva is DM5 register. */ + +#define QURT_ECODE_UPPER_LIBC (0U << 16) /**< Upper 16 bits is 0 for libc. */ +#define QURT_ECODE_UPPER_QURT (0U << 16) /**< Upper 16 bits is 0 for QuRT. */ +#define QURT_ECODE_UPPER_ERR_SERVICES (2U << 16) /**< Upper 16 bits is 2 for error service. */ +/** @cond */ +#define QURT_ECODE_ISLAND_INVALID_QDI 3U /**< Passing invalid QDI method in island. */ +/** @endcond */ + +/* Cause2 for QURT_EXCEPT_SYNTHETIC_FAULT cause- 8bits */ +#define QURT_SYNTH_ERR 0x01U /**< */ +#define QURT_SYNTH_INVALID_OP 0x02U /**< */ +#define QURT_SYNTH_DATA_ALIGNMENT_FAULT 0x03U /**< */ +#define QURT_SYNTH_FUTEX_INUSE 0x04U /**< */ +#define QURT_SYNTH_FUTEX_BOGUS 0x05U /**< */ +#define QURT_SYNTH_FUTEX_ISLAND 0x06U /**< */ +#define QURT_SYNTH_FUTEX_DESTROYED 0x07U /**< */ +#define QURT_SYNTH_PRIVILEGE_ERR 0x08U /**< */ + +/* Cause2 - Abort cause reason - 8 bits */ +/* ERR_ASSERT cause */ +#define QURT_ABORT_FUTEX_WAKE_MULTIPLE 0x01U /**< Abort cause - futex wake multiple. */ +#define QURT_ABORT_WAIT_WAKEUP_SINGLE_MODE 0x02U /**< Abort cause - thread waiting to wake up in Single Threaded mode. */ +#define QURT_ABORT_TCXO_SHUTDOWN_NOEXIT 0x03U /**< Abort cause - call TCXO shutdown without exit. */ +#define QURT_ABORT_FUTEX_ALLOC_QUEUE_FAIL 0x04U /**< Abort cause - futex allocation queue failure - QURTK_futexhash_lifo empty. */ +#define QURT_ABORT_INVALID_CALL_QURTK_WARM_INIT 0x05U /**< Abort cause - invalid call QURTK_warm_init() in NONE CONFIG_POWER_MGMT mode. */ +#define QURT_ABORT_THREAD_SCHEDULE_SANITY 0x06U /**< Abort cause - sanity schedule thread is not supposed to run on the current hardware thread. */ +#define QURT_ABORT_REMAP 0x07U /**< Remap in the page table; the correct behavior must remove mapping if necessary. */ +#define QURT_ABORT_NOMAP 0x08U /**< No mapping in page table when removing a user mapping. */ +#define QURT_ABORT_OUT_OF_SPACES 0x09U +#define QURT_ABORT_INVALID_MEM_MAPPING_TYPE 0x0AU /**< Invalid memory mapping type when creating qmemory. */ +#define QURT_ABORT_NOPOOL 0x0BU /**< No pool available to attach. 
*/ +#define QURT_ABORT_LIFO_REMOVE_NON_EXIST_ITEM 0x0CU /**< Cannot allocate more futex waiting queue. */ +#define QURT_ABORT_ARG_ERROR 0x0DU +#define QURT_ABORT_ASSERT 0x0EU /**< Assert abort. */ +#define QURT_ABORT_FATAL 0x0FU /**< Fatal error; must never occur. */ +#define QURT_ABORT_FUTEX_RESUME_INVALID_QUEUE 0x10U /**< Abort cause - invalid queue ID in futex resume. */ +#define QURT_ABORT_FUTEX_WAIT_INVALID_QUEUE 0x11U /**< Abort cause - invalid queue ID in futex wait. */ +#define QURT_ABORT_FUTEX_RESUME_INVALID_FUTEX 0x12U /**< Abort cause - invalid futex object in hashtable. */ +#define QURT_ABORT_NO_ERHNDLR 0x13U /**< No registered error handler. */ +#define QURT_ABORT_ERR_REAPER 0x14U /**< Exception in the reaper thread. */ +#define QURT_ABORT_FREEZE_UNKNOWN_CAUSE 0x15U /**< Abort in thread freeze operation. */ +#define QURT_ABORT_FUTEX_WAIT_WRITE_FAILURE 0x16U /**< During futex wait processing, could not perform a necessary write operation to userland data; most likely due to a DLPager eviction. */ +#define QURT_ABORT_ERR_ISLAND_EXP_HANDLER 0x17U /**< Exception in Island exception handler task. */ +#define QURT_ABORT_L2_TAG_DATA_CHECK_FAIL 0x18U /**< Detected error in L2 tag/data during warm boot. The L2 tag/data check is done when CONFIG_DEBUG_L2_POWER_COLLAPSE is enabled. */ +#define QURT_ABORT_ERR_SECURE_PROCESS 0x19U /**< Abort error in secure process. */ +#define QURT_ABORT_ERR_EXP_HANDLER 0x20U /**< No exception handler, or the handler caused an exception. */ +#define QURT_ABORT_ERR_NO_PCB 0x21U /**< PCB of the thread context failed initialization, PCB was NULL. */ +#define QURT_ABORT_NO_PHYS_ADDR 0x22U /**< Unable to find the physical address for the virtual address. */ +#define QURT_ABORT_OUT_OF_FASTINT_CONTEXTS 0x23U /**< Fast interrupt contexts exhausted. */ +#define QURT_ABORT_CLADE_ERR 0x24U /**< Fatal error seen with CLADE interrupt. */ +#define QURT_ABORT_ETM_ERR 0x25U /**< Fatal error seen with ETM interrupt. */ +#define QURT_ABORT_ECC_DED_ASSERT 0x26U /**< ECC two-bit DED error. */ +#define QURT_ABORT_VTLB_ERR 0x27U /**< Fatal error in the VTLB layer. */ +#define QURT_ABORT_TLB_ENCODE_DECODE_FAILURE 0x28U /**< Failure during the TLB encode or decode operation. */ +#define QURT_ABORT_VTLB_WALKOBJS_BOUND_FAILURE 0x29U /**< Failure to lookup entry in the page table. */ +#define QURT_ABORT_PHY_MEMORY_OWNERSHIP_FAILURE 0x30U /**< Failure to claim phy memory ownership. */ +#define QURT_ABORT_JTLB_SIZE_CHECK_FAIL 0x31U /**< JTLB size configured is more than actual size in hardware */ +#define QURT_ABORT_AUTOSTACK_ASSERT 0x32U /**< Error while handling stack flimit exception. */ + +/* Cause2 - TLB-miss_X - 8bits */ +#define QURT_TLB_MISS_X_FETCH_PC_PAGE 0x60U /**< */ +#define QURT_TLB_MISS_X_2ND_PAGE 0x61U /**< */ +#define QURT_TLB_MISS_X_ICINVA 0x62U /**< */ + +/* Cause2 - TLB-miss_RW - 8bits */ +#define QURT_TLB_MISS_RW_MEM_READ 0x70U /**< */ +#define QURT_TLB_MISS_RW_MEM_WRITE 0x71U /**< */ + +/** @cond rest_reg_dist */ +/* Cause2 - Floating point exception - 8 bits */ +#define QURT_FLOATING_POINT_EXEC_ERR 0xBFU /**< Execute floating-point. 
*/ +/** @endcond */ + +/** Cause2 - autostackv2 - 8 bits */ +#define QURT_AUTOSTACKV2_CANARY_NOT_MATCH 0xC1U +#define QURT_AUTOSTACKV2_POOL_IDX_OFF_RANGE 0xC2U + +/** Cause2 - CFI violation - 8 bits */ +#define QURT_CFI_VIOLATION 0xC3U + +/** @cond rest_reg_dist*/ +/* Enable floating point exceptions */ +#define QURT_FP_EXCEPTION_ALL 0x1FU << 25 /**< */ +#define QURT_FP_EXCEPTION_INEXACT 0x1U << 29 /**< */ +#define QURT_FP_EXCEPTION_UNDERFLOW 0x1U << 28 /**< */ +#define QURT_FP_EXCEPTION_OVERFLOW 0x1U << 27 /**< */ +#define QURT_FP_EXCEPTION_DIVIDE0 0x1U << 26 /**< */ +#define QURT_FP_EXCEPTION_INVALID 0x1U << 25 /**< */ + +/** @endcond */ +/** @} */ /* end_addtogroup chapter_error */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_EXCEPT_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_fastint.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_fastint.h new file mode 100755 index 0000000000000..ea65dc0917fc0 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_fastint.h @@ -0,0 +1,71 @@ +#ifndef QURT_FASTINT_H +#define QURT_FASTINT_H + +/** + @file qurt_fastint.h + @brief QuRT fast interrupt functions + + Copyright (c) 2013-2021 by Qualcomm Technologies, Inc. All Rights Reserved. + + ======================================================================*/ + +/*======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup func_qurt_fastint_register + Register fast interrupt callback function + + Fast interrupt callback should be designed to perform the minimal necessary + actions for the interrupt, and/or perform some operations, such as signaling + another regular software thread to start any additional processing. + The callback should be a fast and short function. When a fast interrupt callback + is running, the corresponding interrupt cannot be re-enabled until the callback + returns. + + The fast interrupt callback must not use any system blocking calls, such as + mutex lock or signal wait. Otherwise, it results in errors. + + The fast interrupt callback function has a single integer argument and the + function ends with no return. The argument value passed in is the interrupt + number, and therefore a single callback function can handle + multiple fast interrupts. + + @param[in] intno Interrupt number to register. + @param[in] fn Interrupt callback function. + + @return + #QURT_EOK -- Fast interrupt registration is successful. \n + #QURT_EINVALID -- Interrupt is already registered. \n + #QURT_EINT -- Invalid interrupt number. +*/ +/* ======================================================================*/ +unsigned int qurt_fastint_register(int intno, void (*fn)(int)); + + +/*======================================================================*/ +/**@ingroup func_qurt_fastint_deregister + Deregisters the fast interrupt callback function. + + @param[in] intno Level-one interrupt number to deregister. Valid range is 1 and 10 through 31 + (simulator only). + + @return + #QURT_EOK -- Interrupt deregistration is successful. \n + #QURT_EINT -- Invalid interrupt number (not registered). \n + #QURT_EINVALID -- Invalid interrupt number (already deregistered). + + @dependencies + None. 
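+
+   A minimal register/deregister sketch (the interrupt number 23 and the
+   empty callback body are assumptions for illustration):
+
+   @code
+   static void my_fastint_cb(int intno)
+   {
+       // Keep this short, e.g., signal a regular IST here.
+       (void)intno;
+   }
+
+   // At init time:
+   qurt_fastint_register(23, my_fastint_cb);
+   // At teardown:
+   qurt_fastint_deregister(23);
+   @endcode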
+*/
+/* ======================================================================*/
+unsigned int qurt_fastint_deregister(int intno);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_FASTINT_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_fs_hub.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_fs_hub.h
new file mode 100755
index 0000000000000..aaa050a6c838b
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_fs_hub.h
@@ -0,0 +1,58 @@
+#ifndef QURT_FS_HUB_H
+#define QURT_FS_HUB_H
+
+/**
+  @file qurt_fs_hub.h
+  @brief Definitions, macros, and prototypes used when writing a
+  QDI driver that provides file-system functionality.
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+  This structure tracks a file designator for an FS-hub QDI driver.
+  A file system's QDI interface should use this object to encapsulate
+  the true file descriptor and return a QDI handle. The file-system hub
+  then uses this QDI handle as the file descriptor.
+ */
+typedef struct qurt_qdi_fs_obj
+{
+    qurt_qdi_obj_t qdi_obj;
+    int client_handle;
+    int fd;
+} qurt_qdi_fs_obj_t;
+
+
+/**@ingroup fs_hub_support_functions
+  Allows a file system to register its QDI interface with the file-system hub.
+  Once registered, all file open operations for any filename containing the
+  mount point are forwarded to the QDI interface.
+
+  The mount point string must be enclosed in forward slashes, for example, "/mountpoint/".
+
+  @param mtpoint  Mount point for the file system being registered.
+  @param opener   Opener structure for the QDI driver interface.
+
+  @return
+  #QURT_EOK -- Successfully registered the QDI driver with the file-system hub. \n
+  Negative error code -- Failed to register with the file-system hub.
+ */
+int qurt_fs_hub_mtpoint_register(const char *mtpoint, qurt_qdi_obj_t *opener);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_FS_HUB_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_futex.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_futex.h
new file mode 100755
index 0000000000000..1fdcc79a43f01
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_futex.h
@@ -0,0 +1,82 @@
+#ifndef QURT_FUTEX_H
+#define QURT_FUTEX_H
+/**
+  @file qurt_futex.h
+
+  @brief Prototypes of QuRT futex API functions
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2013, 2020-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+
+/**@ingroup func_qurt_futex_wait
+  Moves the caller thread into the waiting state when a memory object
+  address contains a value equal to the specified value.
+
+  @param[in]  lock  Pointer to the object memory.
+  @param[in]  val   Value to check against the object content.
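+
+  A hedged wait/wake sketch (the flag protocol is an assumption; real
+  synchronization primitives add atomic updates and retry loops):
+
+  @code
+  static int flag = 0;
+
+  // Waiter: blocks while flag still reads 0.
+  qurt_futex_wait(&flag, 0);
+
+  // Waker (another thread): publish the change, then wake one waiter.
+  flag = 1;
+  qurt_futex_wake(&flag, 1);
+  @endcode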
+
+  @return
+  #QURT_EOK -- Success \n
+  Other values -- Failure
+
+  @dependencies
+  None.
+ */
+int qurt_futex_wait(void *lock, int val);
+
+
+/**@ingroup func_qurt_futex_wait_cancellable
+  If a memory object address contains a value that is the same as the
+  specified value, moves the caller thread into the waiting state.
+  The kernel can cancel the waiting state when necessary.
+
+  @param[in]  lock  Pointer to the object memory.
+  @param[in]  val   Value to check against the object content.
+
+  @return
+  #QURT_EOK -- Success \n
+  Other values -- Failure
+
+  @dependencies
+  None.
+ */
+int qurt_futex_wait_cancellable(void *lock, int val);
+
+
+/**@ingroup func_qurt_futex_wake
+  Wakes up a specified number of threads that have been waiting
+  for the object to change via qurt_futex_wait().
+
+  @param[in]  lock        Pointer to the object memory.
+  @param[in]  n_to_wake   Maximum number of threads to wake up.
+
+  @return
+  Number of threads woken up by this function.
+
+  @dependencies
+  None.
+ */
+int qurt_futex_wake(void *lock, int n_to_wake);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_FUTEX_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_hmx.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_hmx.h
new file mode 100755
index 0000000000000..e4037dbeae514
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_hmx.h
@@ -0,0 +1,226 @@
+#ifndef QURT_HMX_H
+#define QURT_HMX_H
+/**
+  @file qurt_hmx.h
+  @brief Prototypes of the QuRT HMX API.
+
+Copyright (c) 2019-2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+ TYPEDEFS
+=============================================================================*/
+
+
+/** @addtogroup hmx_types
+@{ */
+/* HMX locking type */
+#define QURT_HMX_NON_SHARED_LOCK    0U  /**< HMX locking type.*/
+#define QURT_HMX_SHARED_LOCK        1U  /**< HMX locking type.*/
+
+/* HMX unlocking type */
+#define QURT_HMX_NON_SHARED_UNLOCK  0U  /**< HMX unlocking type.*/
+#define QURT_HMX_SHARED_UNLOCK      1U  /**< HMX unlocking type.*/
+
+/* HMX hardware context */
+#define QURT_HMX_UNIT_0             0U  /**< HMX hardware context #0 */
+#define QURT_HMX_UNIT_1             1U  /**< HMX hardware context #1 */
+/** @} */ /* end_addtogroup hmx_types */
+
+
+/*=============================================================================
+ FUNCTIONS
+=============================================================================*/
+
+
+/**@ingroup func_qurt_hmx_lock2
+  Locks a HMX unit with the specified locking type.
+
+  #QURT_HMX_NON_SHARED_LOCK:
+  - If a HMX unit is available, lock the unit and return success of #QURT_EOK.
+  - If the HMX unit is already locked by another thread, the caller thread is suspended
+    until the HMX is available and gets locked by this function.
+  - If no HMX hardware is supported, return #QURT_EVAL.
+
+  #QURT_HMX_SHARED_LOCK:
+  - If a HMX unit is available, enable HMX access for the caller thread, and return
+    success of #QURT_EOK.
+  - If the HMX is already enabled on the caller thread, return #QURT_EFAILED.
+ - If the HMX is locked by another thread in the same user process of the caller + thread with locking type of #QURT_HMX_SHARED_LOCK, enable HMX access for the caller + thread, and return success of #QURT_EOK. + - If the HMX is locked by another thread in the same user process of the caller + thread with locking type of #QURT_HMX_NON_SHARED_LOCK, return #QURT_EFAILED. + - If the HMX is locked by a thread from another user process different from the + user process of the caller thread, return #QURT_EFAILED. + - If there is no HMX hardware supported, return #QURT_EVAL. + + @param[in] type Locking type. + + @return + #QURT_EOK -- HMX lock successful.\n + #QURT_EFAILED -- Failure due to wrong locking condition.\n + #QURT_EVAL -- Failure because no HMX hardware is supported. + + @dependencies + None. + + */ +int qurt_hmx_lock2(unsigned int type); + + +/**@ingroup func_qurt_hmx_unlock2 + Unlocks a HMX unit with the unlocking type. + + #QURT_HMX_NON_SHARED_UNLOCK: + - If there is a HMX unit locked by the caller thread, unlock the HMX unit and clear the + HMX accumulators (assuming a fixed point type). + - If there is no HMX unit locked by the caller thread, return #QURT_EFAILED. + - If there is no HMX hardware supported, return #QURT_EVAL. + + #QURT_HMX_SHARED_UNLOCK: + - If the caller thread has locked HMX with type #QURT_HMX_SHARED_LOCK, disable the + HMX access on the caller thread, and return success of #QURT_EOK. + Note: If the caller thread is the last thread that unlocks for #QURT_HMX_SHARED_LOCK + in its user process, the unlock function clears the HMX accumulators. + - If the caller thread has locked HMX with type #QURT_HMX_NON_SHARED_LOCK, return + failure of #QURT_EFAILED. + - If the caller thread has not locked HMX, return failure of #QURT_EFAILED. + - If there is no HMX hardware supported, returns #QURT_EVAL. + + @param[in] type Locking type. + + @return + #QURT_EOK -- HMX is unlocked successful. \n + #QURT_EFAILED -- Failure due to wrong unlocking condition. \n + #QURT_EVAL -- Failure because no HMX hardware is supported. + + @dependencies + None. + + */ +int qurt_hmx_unlock2(unsigned int type); + + +/**@ingroup func_qurt_hmx_lock + Locks a HMX unit. + If a HMX unit is available, this function locks the unit and returns right away. + If there is no HMX unit available, the caller is blocked until a HMX is available + and is locked by the function. + + @return + #QURT_EOK -- HMX lock successful. \n + #QURT_EFAILED -- Failure due to wrong locking condition. \n + #QURT_EVAL -- Failure because no HMX hardware is supported. + + @dependencies + None. + */ +int qurt_hmx_lock(void); + + +/**@ingroup func_qurt_hmx_unlock + Unlocks a HMX unit. + If a HMX unit is locked by the caller thread, unlock the HMX unit and clear its + accumulators(assuming fixed point type). + If there is no HMX unit locked by the caller thread, return failure. + + @return + #QURT_EOK -- HMX unlock successful. \n + #QURT_EFAILED -- Failure due to wrong unlocking condition. \n + #QURT_EVAL -- Failure because no HMX hardware is supported. + + @dependencies + None. + */ +int qurt_hmx_unlock(void); + + +/**@ingroup func_qurt_hmx_try_lock + Tries to lock a HMX unit. + If a HMX unit is available, this function locks the unit and returns right away; + if there is no HMX unit available, the function returns failure without blocking the caller. + + @return + #QURT_EOK -- HMX lock successful \n + #QURT_EFAILED -- Failure due to wrong locking condition.\n + #QURT_EVAL -- Failure because no HMX hardware is supported. 
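+
+  A minimal non-blocking sketch (the fallback path is an assumption):
+
+  @code
+  if (qurt_hmx_try_lock() == QURT_EOK) {
+      // ... issue HMX work here ...
+      qurt_hmx_unlock();
+  } else {
+      // HMX busy or absent; fall back to a non-HMX path.
+  }
+  @endcode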
+ + @dependencies + None. + */ +int qurt_hmx_try_lock(void); + + +/**@ingroup func_qurt_hmx_assign + Assign a HMX unit to a target thread specified by its thread identifier. + The HMX unit (HMX hardware context) is specified by hmx_unit. + The caller of this function is limited to the SRM process. + If the requested hmx_unit is already assigned to another thread with QURT_HMX_NON_SHARED_LOCK, + kernel will detach it from the thread, and re-assign it to the target thread. + If the target thread has HVX enabled, it cannot have HMX enabled. + + Locking type + #QURT_HMX_NON_SHARED_LOCK: + - If the HMX unit is available, lock the HMX unit and return success of #QURT_EOK. + - If the HMX unit is already enabled on the target thread, return #QURT_EOK. + - If the HMX unit is already locked by another thread, detach the HMX from the thread. + Re-assign the HMX unit to the target thread, and return #QURT_EOK. + + @param[in] thread_id Thread identifier + @param[in] type Locking type + #QURT_HMX_NON_SHARED_LOCK -- non-shared lock + @param[in] hmx_unit HMX hardware context number + #QURT_HMX_UNIT_0 + #QURT_HMX_UNIT_1 + + @return + #QURT_EOK -- The HMX is assigned successfully. This includes the case that \n + the target thread already has HMX assigned. \n + #QURT_EFAILED -- Failure due to wrong assigning conditions. \n + #QURT_EINVALID -- Failure because no HMX hardware is supported. + + @dependencies + None. + */ +int qurt_hmx_assign ( unsigned int thread_id, unsigned int type, unsigned int hmx_unit ); + + +/**@ingroup func_qurt_hmx_release + Release a HMX unit from a target thread specified by its thread identifier. + The HMX unit (HMX hardware context) is specified by hmx_unit. + The caller of this function is limited to the SRM process. + + Qurt detaches the specified HMX unit from the target thread, and return success of + #QURT_EOK. If the HMX unit is already released from the target thread, return #QURT_EOK. + + @param[in] thread_id Thread identifier + @param[in] hmx_unit HMX hardware context number + #QURT_HMX_UNIT_0 + #QURT_HMX_UNIT_1 + + @return + #QURT_EOK -- The HMX is released successfully. This includes the case that \n + the target thread already has the HMX released. \n + #QURT_EFAILED -- Failure due to wrong assigning condition. \n + #QURT_EINVALID -- Failure because no HMX hardware is supported. + + @dependencies + None. + */ +int qurt_hmx_release ( unsigned int thread_id, unsigned int hmx_unit ); + + + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_HMX_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_hvx.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_hvx.h new file mode 100755 index 0000000000000..13c213d49ac84 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_hvx.h @@ -0,0 +1,421 @@ +#ifndef QURT_HVX_H +#define QURT_HVX_H +/** + @file qurt_hvx.h + @brief Prototypes of QuRT HVX API. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021-2022 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. 
+ +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + TYPEDEFS +=============================================================================*/ +/** @cond */ + +typedef enum { + QURT_HVX_MODE_64B = 0, /**< HVX mode of 64 bytes */ + QURT_HVX_MODE_128B = 1 /**< HVX mode of 128 bytes */ +} qurt_hvx_mode_t; +/** @endcond */ +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/** @cond internal_only*/ +/** @addtogroup hvx_macros +@{ */ +#define QURT_HVX_HW_UNITS_2X128B_4X64B 0x00000204 /**< Bits 15 through 8 are for the number of 128B units. */ + /**< Bits 7 through 0 are for the number of 64B units. */ +#define QURT_HVX_HW_UNITS_4X128B_0X64B 0x00000400 +#define QURT_HVX_HW_UNITS_6X128B_0X64B 0x00000600 + +/* HVX locking status */ + +#define QURT_HVX_UNLOCKED (0) /* Has not locked HVX unit */ +#define QURT_HVX_LOCKED (1) /* Has locked HVX unit */ +#define QURT_HVX_ERROR (-1) /* Error, no HVX support */ + +/* Input value for HVX reservation */ + +#define QURT_HVX_RESERVE_ALL (4) /* All the HVX units in terms of 64B_MODE are requested to be reserved */ +#define QURT_HVX_RESERVE_ALL_AVAILABLE (0xff) /* All remaining unlocked HVX units in terms of 64B_MODE are requested to be reserved */ + +/* Return values for HVX reservation */ + +#define QURT_HVX_RESERVE_NOT_SUPPORTED (-1) /* There is no HVX hardware, or less units in the hardware than requested */ +#define QURT_HVX_RESERVE_NOT_SUCCESSFUL (-2) /* Some HVX units are already locked/reserved by other PD, thus not enough units left for the reservation. */ +#define QURT_HVX_RESERVE_ALREADY_MADE (-3) /* There is already a HVX reservation made. */ +#define QURT_HVX_RESERVE_CANCEL_ERR (-4) /* The action of cancling the reservation fails because this protection domain has no reservation made before. */ + +// HVX set requests + +#define QURT_HVX_64B 0 /**< */ +#define QURT_HVX_128B 1 /**< */ +#define QURT_HVX_NO_USE 2 /**< */ +#define QURT_HVX_RELEASE_CONTEXT 3 /**< */ +#define QURT_HVX_IMMEDIATE_USE 4 /**< */ + +// HVX set masks + +#define QURT_HVX_64B_PREFERRED (1<<(QURT_HVX_64B + 8))/**< */ +#define QURT_HVX_128B_PREFERRED (1<<(QURT_HVX_128B + 8))/**< */ +#define QURT_HVX_64B_ACCEPTABLE (1<<(QURT_HVX_64B + 12))/**< */ +#define QURT_HVX_128B_ACCEPTABLE (1<<(QURT_HVX_128B + 12))/**< */ + +// HVX set return "result" + +#define QURT_EOK 0 /**< */ +#define QURT_HVX_SET_ERROR 0xFF /**< */ + +// hvx_mode_assigned for QURT_HVX_IMMEDIATE_USE +#define QURT_HVX_64B_ASSIGNED (1<<(QURT_HVX_64B + 8)) /**< */ +#define QURT_HVX_128B_ASSIGNED (1<<(QURT_HVX_128B + 8)) /**< */ + +// Sizes of HVX dump buffer + +#define QURT_HVX_V65_64B_VSIZE 2084U /**< 64 x 32 + 8 x 4 + 4 (version). */ +#define QURT_HVX_V65_128B_VSIZE 4164U /**< 128 x 32 + 16 x 4 + 4 (version). */ +#define QURT_HVX_V66_128B_VSIZE 4420U /**< 128 x (32 +2) + 16 x 4 + 4 (version). */ +#define QURT_HVX_V68_128B_VSIZE 4164U /**< 128 x 32 + 16 x 4 + 4 (version). */ +#define QURT_HVX_V79_128B_VSIZE 4740U /**< 128 x (32+4+1) + 4 (version). 
*/ +#define QURT_HVX_VREG_BUF_SIZE QURT_HVX_V79_128B_VSIZE /**< */ + +// HVX dump versions + +#define QURT_HVX_DUMP_V65_64B 1U /**< */ +#define QURT_HVX_DUMP_V65_128B 2U /**< */ +#define QURT_HVX_DUMP_V66_128B 3U /**< */ +#define QURT_HVX_DUMP_V68_128B 4U /**< */ +#define QURT_HVX_DUMP_V79_128B 5U /**< */ +/** @} */ /* end_addtogroup hvx_macros */ +/** @endcond */ +/** @cond */ +// Qurt data struct for hvx_set input +typedef struct qurt_hvx_set_struct_ { + unsigned char set_req; // LSB + struct { + unsigned char preferred_mask:4; + unsigned char acceptable_mask:4; + }; + unsigned short resvd; // MSB +} qurt_hvx_set_struct_t; // 4 bytes + + +// Qurt data struct for hvx_set return +typedef struct qurt_hvx_set_return_str_ { + unsigned char result; // LSB + unsigned char hvx_mode_assigned; + unsigned short resvd; // MSB +} qurt_hvx_set_return_struct_t; // 4 bytes +/** @endcond */ + + +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_hvx_lock + Locks one HVX unit specified by the HVX mode. + + @note1hang Input variable can be 128B_MODE or 64B_MODE. If an HVX unit in this mode + is available, this function locks the unit and returns right away. + If the current HVX mode is different from the requested mode, the current + thread is blocked. When all HVX units become idle, QuRT changes + the mode, locks the HVX unit, and returns. + + Starting from Q6v65 with HVX context switch support, qurt_hvx_lock() is + mapped as qurt_hvx_set(64_BYTE or 128_BYTE). + + @datatypes + #qurt_mode_t + + @param[in] lock_mode #QURT_HVX_MODE_64B or #QURT_HVX_MODE_128B. + + @return + #QURT_EOK -- Success \n + Other value -- Failure + + @dependencies + None. + + */ +int qurt_hvx_lock(qurt_hvx_mode_t lock_mode); + +/**@ingroup func_qurt_hvx_unlock + Unlocks the HVX unit held by this software thread. + + @note1hang Starting from Q6v65 with HVX context switch support, qurt_hvx_unlock() + maps as qurt_hvx_set(QURT_HVX_RELEASE_CONTEXT). + + @return + #QURT_EOK -- Successful return \n + Other values -- Failure + + @dependencies + None. + + */ +int qurt_hvx_unlock(void); + +/**@ingroup func_qurt_hvx_try_lock + Tries to lock one HVX unit specified by the HVX mode. + + @note1hang Input variable can be 128B_MODE or 64B_MODE. If an HVX unit in this mode + is available, this function locks the unit and returns #QURT_EOK; Otherwise, + the function returns a failure, but does not block the current software + thread to wait for the HVX unit. + Starting from Q6v65 with HVX context switch support, qurt_hvx_try_lock() + maps to qurt_hvx_set(FOR_IMMEDIATE_USE| preferred_mask | acceptable_mask); + + @datatypes + #qurt_mode_t + + @return + #QURT_EOK -- Successful return \n + Other values -- Failure + + @dependencies + None. + + */ +int qurt_hvx_try_lock(qurt_hvx_mode_t lock_mode); + +/**@ingroup func_qurt_hvx_get_mode + Gets the current HVX mode configured by QuRT. + + @note1hang Returns #QURT_HVX_MODE_128B or #QURT_HVX_MODE_64B, based on + the current HVX configuration. + + @param[out] + None. + + @return + #QURT_HVX_MODE_128B \n + #QURT_HVX_MODE_64B \n + -1 -- Not available. + + @dependencies + None. + */ +int qurt_hvx_get_mode(void); + + +/**@ingroup func_qurt_hvx_get_units + Gets the HVX hardware configuration that the chipset supports. + + @note1hang The function returns the HVX hardware configuration supported by the chipset. 
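+
+  A hedged decoding sketch, assuming the bit layout noted at
+  #QURT_HVX_HW_UNITS_2X128B_4X64B (bits 15:8 are 128-byte units,
+  bits 7:0 are 64-byte units):
+
+  @code
+  int units = qurt_hvx_get_units();
+  unsigned num_128b = ((unsigned)units >> 8) & 0xFFU;
+  unsigned num_64b  = (unsigned)units & 0xFFU;
+  @endcode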
+ + @return + Bitmask of the units: 1X64, 2X64, 4X64, 1X128, 2X128, and so on.\n + - QURT_HVX_HW_UNITS_2X126B_4X64B -- V60, V62, or V65 HVX \n + - QURT_HVX_HW_UNITS_4X128B_0X64B -- V66 CDSP or newer \n + - 0 -- not available + + @dependencies + None. + + */ +int qurt_hvx_get_units(void); + + +/**@ingroup func_qurt_hvx_reserve + Reserves HVX units in terms of 64-byte mode for the protection domain (PD) of the caller. + + @note1hang Only one HVX reservation in the system is supported. + If one HVX unit is already locked by the application in the same PD, the unit is + added to the returned count as one reserved unit for the PD. + Starting from Q6v65 with HVX context switch support, qurt_hvx_reserve() + only does basic sanity checks on HVX units. + + @datatypes + None. + + @param[in] num_units Number of HVX units in terms of 64B_MODE to reserve for the PD. + QURT_HVX_RESERVE_ALL to reserve all the HVX units. + QURT_HVX_RESERVE_ALL_AVAILABLE to reserve the remaining unlocked units. + + @return + Number of units successfully reserved, including the units already locked in the same PD. \n + #QURT_HVX_RESERVE_NOT_SUPPORTED \n + #QURT_HVX_RESERVE_NOT_SUCCESSFUL \n + #QURT_HVX_RESERVE_ALREADY_MADE + + + @dependencies + None. + + */ +int qurt_hvx_reserve(int num_units); + + +/**@ingroup func_qurt_hvx_cancel_reserve + Cancels the HVX reservation in the protection domain (PD) of the caller. + + @note1hang Only one HVX reservation in the system is supported. + + @return + 0 -- Success \n + #QURT_HVX_RESERVE_CANCEL_ERR -- Failure + + @dependencies + None. + + */ +int qurt_hvx_cancel_reserve(void); + + +/**@ingroup func_qurt_hvx_get_lock_val + Gets the HVX locking status value of the thread of the caller. + + @note1hang Returns the status of whether the thread of the caller already locks a HVX unit or not. + + @datatypes + None. + + @return + #QURT_HVX_UNLOCKED \n + #QURT_HVX_LOCKED \n + #QURT_HVX_ERROR + + @dependencies + None. + */ +int qurt_hvx_get_lock_val(void); + +/** @cond internal_only*/ +/**@ingroup func_qurt_hvx_set + Sets the HVX configuration for the software thread of the caller. + + @datatypes + None. + + @param[in] input_arg Composed of set_request | hvx_preferred_mode_mask + | hvx_acceptable_mode_mask where set_request can be set to: \n + - #QURT_HVX_64B \n + - #QURT_HVX_128B \n + - #QURT_HVX_NO_USE \n + - #QURT_HVX_RELEASE_CONTEXT \n + - #QURT_HVX_IMMEDIATE_USE \n + When set_request is QURT_HVX_IMMEDIATE_USE, + hvx_preferred_mode_mask can be set to: \n + - #QURT_HVX_64B_PREFERRED \n + - #QURT_HVX_128B_PREFERRED + When set_request is QURT_HVX_IMMEDIATE_USE, + hvx_acceptable_mode_mask can be set to: \n + - #QURT_HVX_64B_ACCEPTABLE \n + - #QURT_HVX_128B_ACCEPTABLE @tablebulletend + + @return + Result of the HVX setting in the least significant 8 bits of the returned data. \n + #QURT_EOK -- 0 \n + #QURT_HVX_SET_ERROR -- 0xFF \n + When #QURT_HVX_IMMEDIATE_USE has a result of #QURT_EOK, + bit 8 to bit 15 of the returned data contain hvx_mode_assigned:\n + - #QURT_HVX_64B_ASSIGNED \n + - #QURT_HVX_128B_ASSIGNED + + @dependencies + None. + */ +unsigned int qurt_hvx_set(unsigned int input_arg); + + +/**@ingroup func_qurt_system_hvx_regs_get_maxsize + Returns the maximum buffer size for saving HVX registers. + + @datatypes + None. + + @return + 0 -- No HVX supported in the target. \n + #QURT_HVX_VREG_BUF_SIZE -- Maximum buffer size for saving HVX registers. + + @dependencies + None. 
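+
+  A sizing sketch (the early-out is an assumption; see
+  qurt_system_hvx_regs_get() below for the buffer alignment requirement):
+
+  @code
+  unsigned int max_size = qurt_system_hvx_regs_get_maxsize();
+  if (max_size == 0) {
+      // No HVX on this target; skip the register dump.
+  }
+  @endcode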
+ */
+unsigned int qurt_system_hvx_regs_get_maxsize(void);
+
+
+/**@ingroup func_qurt_system_hvx_regs_get_size
+  Returns the buffer size for saving HVX registers for a specified thread.
+
+  @param[in] thread_id  Thread ID of the target thread.
+
+  @return
+  0 -- No HVX assigned to the thread. \n
+  size -- Size of the buffer in bytes for saving HVX registers for the specified thread: \n
+  - #QURT_HVX_V65_64B_VSIZE -- 64 x 32 + 8 x 4 + 4 (version) \n
+  - #QURT_HVX_V65_128B_VSIZE -- 128 x 32 + 16 x 4 + 4 (version) \n
+  - #QURT_HVX_V66_128B_VSIZE -- 128 x (32 + 2) + 16 x 4 + 4 (version) \n
+  - #QURT_HVX_V68_128B_VSIZE -- 128 x 32 + 16 x 4 + 4 (version) \n
+  - #QURT_HVX_V79_128B_VSIZE -- 128 x (32 + 4 + 1) + 4 (version)
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_system_hvx_regs_get_size(unsigned int thread_id);
+
+
+/**@ingroup func_qurt_system_hvx_regs_get
+  Saves the HVX registers into the specified buffer and
+  returns the size of the data saved into the buffer.
+  After this function is called for the first time on a specified thread_id, the QuRT kernel
+  removes the internal HVX saving buffer from the specified thread; calling the function on
+  the same thread_id a second time returns 0.
+
+  @param[in] thread_id  Thread ID of the target thread.
+  @param[in] pBuf   Pointer to the buffer for HVX register saving.
+                    The first four bytes of the buffer are for saving the HVX version. HVX registers are saved from
+                    the fifth byte of the buffer. The address of the fifth byte should be 256-byte aligned.
+                    For example, a buffer can first be declared as: \n
+                    unsigned char vbuf[QURT_HVX_VREG_BUF_SIZE+256]; \n
+                    unsigned char *pBuf; \n
+                    and the buffer pointer then aligned with: \n
+                    pBuf = vbuf; \n
+                    pBuf += (256 - 4 - (unsigned)pBuf%256);
+  @param[in] size   Size of the buffer provided, pointed to by pBuf. The buffer size must not be smaller than the size
+                    returned from qurt_system_hvx_regs_get_size(), and pBuf must be aligned as described above.
+  @param[out] pBuf  Buffer returned with the saved HVX registers (unsigned char hvx_regs[];), which are saved from the fifth
+                    byte of the buffer, and the HVX version (unsigned int hvx_version;), whose first four bytes
+                    contain one of the HVX dump versions: \n
+                    - #QURT_HVX_DUMP_V65_64B \n
+                    - #QURT_HVX_DUMP_V65_128B \n
+                    - #QURT_HVX_DUMP_V66_128B \n
+                    - #QURT_HVX_DUMP_V68_128B \n
+                    - #QURT_HVX_DUMP_V79_128B \n
+                    @tablebulletend
+
+  @return
+  Total bytes of the data saved in the provided buffer. \n
+  0 -- No HVX assigned to the thread \n
+  #QURT_HVX_V65_64B_VSIZE -- 64 x 32 + 8 x 4 + 4 (version) \n
+  #QURT_HVX_V65_128B_VSIZE -- 128 x 32 + 16 x 4 + 4 (version) \n
+  #QURT_HVX_V66_128B_VSIZE -- 128 x (32 + 2) + 16 x 4 + 4 (version) \n
+  #QURT_HVX_V68_128B_VSIZE -- 128 x 32 + 16 x 4 + 4 (version) \n
+  #QURT_HVX_V79_128B_VSIZE -- 128 x (32 + 4 + 1) + 4 (version)
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_system_hvx_regs_get(unsigned int thread_id, void *pBuf, size_t size);
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_HVX_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_int.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_int.h
new file mode 100755
index 0000000000000..386aeda1051eb
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_int.h
@@ -0,0 +1,509 @@
+#ifndef QURT_INT_H
+#define QURT_INT_H
+/**
+  @file qurt_int.h
+  @brief QuRT interrupt functions.
+ + + + Copyright (c) 2013-2021, 2023 Qualcomm Technologies, Inc. + All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ + + +/** @cond rest_reg_dist */ +/** @addtogroup interrupts_constants +@{ */ +#define SIG_INT_ABORT 0x80000000 /**< */ +#define QURT_INT_NON_DELAYED_ACK 0 +#define QURT_INT_DELAYED_ACK 1 +#define QURT_INT_ACK_DEFAULT QURT_INT_NON_DELAYED_ACK +#define QURT_INT_DRV_DEFAULT 0 +#define QURT_INT_PRIORITY_DEFAULT 0xFF + +/** QuRT interrupt property. */ +#define QURT_INT_CONFIGID_POLARITY 0x1U /**< */ +#define QURT_INT_CONFIGID_LOCK 0x2U /**< */ + +/** QuRT interrupt lock.*/ +#define QURT_INT_LOCK_DEFAULT 0x0 /**< Default. */ +#define QURT_INT_LOCK_DISABLE 0x0 /**< Interrupt can be enabled or disabled or deregistered. */ +#define QURT_INT_LOCK_ENABLE 0x1 /**< Interrupt is locked and cannot be enabled, disabled, or deregistered.*/ +/** @} */ /* end_addtogroup interrupts_constants */ + +/** @addtogroup Qurt_interrupt_type +@{ */ +/** Trigger type bit fields for a PDC interrupt:\n + @verbatim + Polarity Edge Output\n + 0 00 Level sensitive active low + 0 01 Rising edge sensitive + 0 10 Falling edge sensitive + 0 11 Dual edge sensitive + 1 00 Level sensitive active high + 1 01 Falling edge sensitive + 1 10 Rising edge sensitive + 1 11 Dual edge sensitive + @endverbatim +*/ +#define QURT_INT_TRIGGER_TYPE_SET(pol, edge) ((((pol) & 0x01U) << 2) | ((edge) & 0x03U)) /**< */ + +#define QURT_INT_TRIGGER_LEVEL_LOW QURT_INT_TRIGGER_TYPE_SET(0U, 0x00U) /**< */ +#define QURT_INT_TRIGGER_LEVEL_HIGH QURT_INT_TRIGGER_TYPE_SET(1U, 0x00U) /**< */ +#define QURT_INT_TRIGGER_RISING_EDGE QURT_INT_TRIGGER_TYPE_SET(1U, 0x02U) /**< */ +#define QURT_INT_TRIGGER_FALLING_EDGE QURT_INT_TRIGGER_TYPE_SET(0U, 0x02U) /**< */ +#define QURT_INT_TRIGGER_DUAL_EDGE QURT_INT_TRIGGER_TYPE_SET(0U, 0x03U) /**< */ +#define QURT_INT_TRIGGER_USE_DEFAULT 0xffU /**< */ +/** @} */ /* end_addtogroup Qurt_interrupt_type */ + +/*===================================================================== + Functions +======================================================================*/ + +/**@ingroup func_qurt_interrupt_register + @xreflabel{sec:interrupt_register} + Registers the interrupt.\n + Enables the specified interrupt and associates it with the specified QuRT signal object and + signal mask. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be waited on, and 0 indicates not to wait. + + When the interrupt occurs, the signal specified in the signal mask is set in the signal + object. An IST conventionally waits on that signal to + handle the interrupt. The thread that registers the interrupt is set as the IST. + + Up to 31 separate interrupts can be registered to a single signal object, as determined by + the number of individual signals the object can store. QuRT reserves signal 31. Thus a + single IST can handle several different interrupts. + + QuRT reserves some interrupts for internal use -- the remainder are available for use by + applications, and thus are valid interrupt numbers. If the specified interrupt number is + outside the valid range, the register operation returns the status value QURT_EINT. 
+
+   Only one thread can be registered at a time to a specific interrupt. Attempting to register
+   an already-registered interrupt returns the status value QURT_EVAL.
+
+   Only one signal bit in a signal object can be registered at a time to a specific interrupt.
+   Attempting to register multiple signal bits to an interrupt returns the status value
+   QURT_ESIG.
+
+   When a signal object is registered to an interrupt, QuRT can only set its signal bits
+   when receiving the interrupt. The QuRT signal API from another
+   software thread cannot set the signal even for unused signal bits.
+
+   @note1hang The valid range for an interrupt number can differ on target execution
+              environments other than the simulator. For more information, see the
+              appropriate hardware document.
+
+   @datatypes
+   #qurt_anysignal_t
+
+   @param[in] int_num      L2VIC interrupt to register; valid range is 0 to 1023.
+   @param[in] int_signal   Any-signal object to wait on (Section @xref{dox:any_signals}).
+   @param[in] signal_mask  Signal mask value indicating signal to receive the interrupt.
+
+   @return
+   #QURT_EOK -- Interrupt successfully registered.\n
+   #QURT_EINT -- Invalid interrupt number. \n
+   #QURT_ESIG -- Invalid signal bitmask (cannot set more than one
+                 signal at a time). \n
+   #QURT_EVAL -- Interrupt already registered.
+
+   @dependencies
+   None.
+*/
+ unsigned int qurt_interrupt_register(int int_num, qurt_anysignal_t *int_signal, int signal_mask);
+
+/**@ingroup func_qurt_interrupt_register2
+   @xreflabel{sec:interrupt_register2}
+   Registers the interrupt.\n
+   Enables the specified interrupt, associates it with the specified QuRT signal object and
+   signal mask, and sets interrupt flags.
+
+   Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1
+   indicates that a signal must be waited on, and 0 indicates not to wait.
+
+   When the interrupt occurs, the signal specified in the signal mask is set in the signal
+   object. An IST conventionally waits on that signal to
+   handle the interrupt. The thread that registers the interrupt is set as the IST.
+
+   Up to 31 separate interrupts can be registered to a single signal object, as determined by
+   the number of individual signals that the object can store. QuRT reserves signal 31. Thus a
+   single IST can handle several different interrupts.
+
+   QuRT reserves some interrupts for internal use -- the remainder are available for use by
+   applications, and thus are valid interrupt numbers. If the specified interrupt number is
+   outside the valid range, the register operation returns the status value #QURT_EINT.
+
+   Only one thread can be registered at a time to a specific interrupt. Attempting to register
+   an already-registered interrupt returns the status value #QURT_EVAL.
+
+   Only one signal bit in a signal object can be registered at a time to a specific interrupt.
+   Attempting to register multiple signal bits to an interrupt returns the status value
+   #QURT_ESIG.
+
+   When a signal object is registered to an interrupt, QuRT can only set its signal bits
+   when receiving the interrupt. The QuRT signal API from another
+   software thread cannot set the signal even for unused signal bits.
+
+   @note1hang The valid range for an interrupt number can differ on target execution
+              environments other than the simulator. For more information, see the
+              appropriate hardware document.
+
+   @datatypes
+   #qurt_anysignal_t
+
+   @param[in] int_num      L2VIC interrupt to register; valid range is 0 to 1023.
+   @param[in] int_signal   Any-signal object to wait on (Section @xref{dox:any_signals}).
+   @param[in] signal_mask  Signal mask value indicating signal to receive the interrupt.
+   @param[in] flags        Defines the interrupt property; the supported property is interrupt lock enable/disable.
+                           Possible values for flags: \n
+                           - #QURT_INT_LOCK_ENABLE
+                           - #QURT_INT_LOCK_DISABLE @tablebulletend
+
+   @return
+   #QURT_EOK -- Interrupt successfully registered.\n
+   #QURT_EINT -- Invalid interrupt number. \n
+   #QURT_ESIG -- Invalid signal bitmask (cannot set more than one
+                 signal at a time). \n
+   #QURT_EVAL -- Interrupt already registered.
+
+   @dependencies
+   None.
+*/
+ unsigned int qurt_interrupt_register2(int int_num, qurt_anysignal_t *int_signal, int signal_mask, unsigned int flags);
+/*
+ * Waits for registered interrupt signal
+
+ * Suspends the current thread until one of its registered interrupts occurs. The second input mask
+ * contains the interrupt signals the IST expects to receive. The interrupt signals are registered
+ * with interrupts via the qurt_interrupt_register() API.
+ *
+ * The signals returned in the signal variable indicate which interrupts occurred. Use function
+ * qurt_anysignal_get to read the signals. The IST must locally maintain a table that maps a signal to
+ * a specific interrupt. The IST also checks whether signal #SIG_INT_ABORT is received. If so, the IST
+ * must quit the interrupt-receiving loop.
+ *
+ * For detailed information on this API, see QuRT User Manual Section 4.2.5
+ *
+ * Prototype
+ *
+ * unsigned int qurt_anysignal_wait(qurt_anysignal_t *int_signal, unsigned int mask)
+ */
+
+/**@ingroup func_qurt_interrupt_acknowledge
+   Acknowledges an interrupt after it has been processed.\n
+   Re-enables an interrupt and clears its pending status. This is done after an interrupt is
+   processed by an IST.
+
+   Interrupts are automatically disabled after they occur. To re-enable an interrupt, an IST
+   performs the acknowledge operation after it has finished processing the interrupt and
+   just before suspending itself (such as by waiting on the interrupt signal).
+
+   @note1hang To prevent losing or reprocessing subsequent occurrences of the interrupt,
+              an IST must clear the interrupt signal (Section @xref{sec:anysignal_clear}) before
+              acknowledging the interrupt.
+
+   @param[in] int_num  Interrupt that is being re-enabled.
+
+   @return
+   #QURT_EOK -- Interrupt acknowledge was successful. \n
+   #QURT_EDEREGISTERED -- Interrupt is already de-registered.
+
+   @dependencies
+   None.
+*/
+int qurt_interrupt_acknowledge(int int_num);
+
+/**@ingroup func_qurt_interrupt_deregister
+   Disables the specified interrupt and disassociates it from a QuRT signal object.
+   If the specified interrupt was never registered (Section @xref{sec:interrupt_register}), the deregister operation
+   returns the status value #QURT_EINT.
+
+   @note1hang If an interrupt is deregistered while an IST waits
+              to receive it, the IST might wait indefinitely for the interrupt to occur. To avoid
+              this problem, the QuRT kernel sends the signal #SIG_INT_ABORT to awaken an
+              IST after determining that it has no interrupts registered.
+
+   @param[in] int_num  L2VIC interrupt to deregister; valid range is 0 to 1023.
+
+   @return
+   #QURT_EOK -- Success.\n
+   #QURT_EINT -- Invalid interrupt number (not registered).
+
+   @dependencies
+   None.
+
+*/
+unsigned int qurt_interrupt_deregister(int int_num);
+/** @endcond */
+
+/**@ingroup func_qurt_interrupt_disable
+   Disables an interrupt with its interrupt number.\n
+   The interrupt must be registered prior to calling this function.
+   After qurt_interrupt_disable() returns, the Hexagon subsystem
+   can no longer send the corresponding interrupt to the Hexagon
+   core, until qurt_interrupt_enable() is called
+   for the same interrupt.
+
+   Avoid calling qurt_interrupt_disable() and qurt_interrupt_enable() frequently within
+   a short period of time.\n
+   - A pending interrupt can already be in the Hexagon core when qurt_interrupt_disable()
+     is called. Therefore, some time later, the pending interrupt is received on a Hexagon
+     hardware thread.\n
+   - After the Hexagon subsystem sends an interrupt to the Hexagon core, the Hexagon
+     hardware automatically disables the interrupt until kernel software re-enables the interrupt
+     at the interrupt acknowledgement stage. If qurt_interrupt_enable() is called from a certain
+     thread at an earlier time, the interrupt is re-enabled earlier and can trigger
+     sending a new interrupt to the Hexagon core while kernel software is still processing
+     the previous interrupt.
+
+   @param[in] int_num  Interrupt number.
+
+   @return
+   #QURT_EOK -- Interrupt successfully disabled.\n
+   #QURT_EINT -- Invalid interrupt number.\n
+   #QURT_ENOTALLOWED -- Interrupt is locked. \n
+   #QURT_EVAL -- Interrupt is not registered.
+
+   @dependencies
+   None.
+*/
+ unsigned int qurt_interrupt_disable(int int_num);
+
+
+/**@ingroup func_qurt_interrupt_enable
+   Enables an interrupt with its interrupt number.\n
+   The interrupt must be registered prior to calling this function.
+
+   @param[in] int_num  Interrupt number.
+
+   @return
+   #QURT_EOK -- Interrupt successfully enabled.\n
+   #QURT_EINT -- Invalid interrupt number.\n
+   #QURT_ENOTALLOWED -- Interrupt is locked. \n
+   #QURT_EVAL -- Interrupt is not registered.
+
+   @dependencies
+   None.
+
+*/
+ unsigned int qurt_interrupt_enable(int int_num);
+
+
+/**@ingroup func_qurt_interrupt_status
+   Returns a value that indicates the pending status of the specified interrupt.
+
+   @param[in] int_num  Interrupt number that is being checked.
+   @param[out] status  Interrupt status; 1 indicates that an interrupt is
+                       pending, 0 indicates that an interrupt is not pending.
+
+   @return
+   #QURT_EOK -- Success. \n
+   #QURT_EINT -- Failure; invalid interrupt number.
+
+   @dependencies
+   None.
+ */
+unsigned int qurt_interrupt_status(int int_num, int *status);
+
+
+/**@ingroup func_qurt_interrupt_get_status
+   Gets the status of the specified interrupt in L2VIC.
+
+   @param[in] int_num      Interrupt number that is being checked.
+   @param[in] status_type  0 -- interrupt pending status \n
+                           1 -- interrupt enabling status
+   @param[out] status      0 -- OFF \n
+                           1 -- ON
+
+   @return
+   #QURT_EOK -- Success. \n
+   #QURT_EINT -- Failure; invalid interrupt number.
+
+   @dependencies
+   None.
+ */
+unsigned int qurt_interrupt_get_status(int int_num, int status_type, int *status);
+
+/** @cond rest_reg_dist */
+/**@ingroup func_qurt_interrupt_clear
+   Clears the pending status of the specified interrupt.
+
+   @note1hang This operation is intended for system-level use, and must be used with care.
+
+   @param[in] int_num  Interrupt whose pending status is cleared.
+
+   @return
+   #QURT_EOK -- Success.\n
+   #QURT_EINT -- Invalid interrupt number.
+
+   @dependencies
+   None.
+ */
+unsigned int qurt_interrupt_clear(int int_num);
+
+
+/**@ingroup func_qurt_interrupt_get_config
+   Gets the L2VIC interrupt configuration. \n
+   This function returns the type and polarity of the specified L2VIC interrupt.
+
+   @param[in] int_num    L2VIC interrupt that is being queried.
+   @param[out] int_type  Pointer to an interrupt type.
\n
+                         0 -- Level-triggered interrupt \n
+                         1 -- Edge-triggered interrupt
+   @param[out] int_polarity  Pointer to interrupt polarity.\n
+                             0 -- Active-high interrupt \n
+                             1 -- Active-low interrupt.
+
+   @return
+   #QURT_EOK -- Configuration successfully returned.\n
+   #QURT_EINT -- Invalid interrupt number.
+
+   @dependencies
+   None.
+ */
+unsigned int qurt_interrupt_get_config(unsigned int int_num, unsigned int *int_type, unsigned int *int_polarity);
+
+/**@ingroup func_qurt_interrupt_set_config
+   Sets the type and polarity of the specified L2VIC interrupt.
+
+   @note1hang Deregister L2VIC interrupts before reconfiguring them.
+
+   @param[in] int_num       L2VIC interrupt that is being configured.
+   @param[in] int_type      Interrupt type. \n
+                            0 -- Level-triggered interrupt\n
+                            1 -- Edge-triggered interrupt
+   @param[in] int_polarity  Interrupt polarity. \n
+                            0 -- Active-high interrupt \n
+                            1 -- Active-low interrupt
+
+   @return
+   #QURT_EOK -- Success. \n
+   #QURT_ENOTALLOWED -- Not allowed; the interrupt is being registered.\n
+   #QURT_EINT -- Invalid interrupt number.
+
+   @dependencies
+   None.
+ */
+unsigned int qurt_interrupt_set_config(unsigned int int_num, unsigned int int_type, unsigned int int_polarity);
+
+/**@ingroup func_qurt_interrupt_set_config2
+   Sets the type and polarity of the specified L2VIC interrupt.
+
+   @note1hang L2VIC interrupts must be deregistered before they can be reconfigured.
+
+   @param[in] int_num   L2VIC interrupt that is being configured.
+   @param[in] int_type  Notified to the hardware configuration callback function and used to
+                        modify the L2VIC type. Possible values: \n
+                        - #QURT_INT_TRIGGER_USE_DEFAULT \n
+                        - #QURT_INT_TRIGGER_LEVEL_HIGH \n
+                        - #QURT_INT_TRIGGER_LEVEL_LOW \n
+                        - #QURT_INT_TRIGGER_RISING_EDGE \n
+                        - #QURT_INT_TRIGGER_FALLING_EDGE \n
+                        - #QURT_INT_TRIGGER_DUAL_EDGE @tablebulletend
+
+   @return
+   #QURT_EOK -- Success. \n
+   #QURT_ENOTALLOWED -- Not allowed; the interrupt is being registered.\n
+   #QURT_EINT -- Invalid interrupt number.
+
+   @dependencies
+   None.
+ */
+unsigned int qurt_interrupt_set_config2(unsigned int int_num, unsigned int int_type);
+
+/**@ingroup func_qurt_interrupt_set_config3
+   Sets the specified configuration value for the specified property of the specified L2VIC interrupt.
+
+   @note1hang L2VIC interrupts must be deregistered before they can be reconfigured for polarity.
+
+   @param[in] int_num     L2VIC interrupt to configure.
+   @param[in] config_id   Property to configure: \n
+                          - #QURT_INT_CONFIGID_POLARITY \n
+                          - #QURT_INT_CONFIGID_LOCK @tablebulletend
+   @param[in] config_val  Dependent on the second argument config_id, specifies the value to set. \n
+                          Values for #QURT_INT_CONFIGID_POLARITY: \n
+                          - #QURT_INT_TRIGGER_USE_DEFAULT \n
+                          - #QURT_INT_TRIGGER_LEVEL_HIGH \n
+                          - #QURT_INT_TRIGGER_LEVEL_LOW \n
+                          - #QURT_INT_TRIGGER_RISING_EDGE \n
+                          - #QURT_INT_TRIGGER_FALLING_EDGE \n
+                          - #QURT_INT_TRIGGER_DUAL_EDGE \n
+
+                          Values for #QURT_INT_CONFIGID_LOCK: \n
+                          - #QURT_INT_LOCK_ENABLE\n
+                          - #QURT_INT_LOCK_DISABLE @tablebulletend
+
+   @return
+   #QURT_EOK -- Success. \n
+   #QURT_ENOTALLOWED -- Not allowed; the interrupt is being registered or is locked for enable/disable.\n
+   #QURT_EINT -- Invalid interrupt number.
+
+   @dependencies
+   None.
+*/
+unsigned int qurt_interrupt_set_config3(unsigned int int_num, unsigned int config_id, unsigned int config_val);
+
+
+/**@ingroup func_qurt_interrupt_raise
+   Raises the interrupt. \n
+   This function triggers a level-triggered L2VIC
+   interrupt, and accepts interrupt numbers in the range of 0 to 1023.
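+
+   An illustrative IST sketch follows (not normative; the umbrella include
+   "qurt.h", the interrupt number 100, and the signal bit are assumptions):
+   @code
+   #include "qurt.h"    // assumed umbrella header for the QuRT API
+
+   #define MY_INT  100        // arbitrary L2VIC interrupt number
+   #define MY_SIG  (1u << 0)  // one signal bit per registered interrupt
+
+   static qurt_anysignal_t int_sig;
+
+   static void ist_loop(void)
+   {
+       qurt_anysignal_init(&int_sig);
+       if (qurt_interrupt_register(MY_INT, &int_sig, MY_SIG) != QURT_EOK) {
+           return;
+       }
+       for (;;) {
+           // Wait for the interrupt signal or the kernel abort signal.
+           unsigned int sigs = qurt_anysignal_wait(&int_sig, MY_SIG | SIG_INT_ABORT);
+           if (sigs & SIG_INT_ABORT) {
+               break;  // interrupt was deregistered; leave the loop
+           }
+           // ... service the interrupt ...
+           qurt_anysignal_clear(&int_sig, MY_SIG);   // clear before acknowledging
+           qurt_interrupt_acknowledge(MY_INT);
+       }
+   }
+   @endcode
+   Another thread can then call qurt_interrupt_raise(MY_INT) to trigger the
+   loop, for example when testing on the simulator.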
+
+   @param[in] interrupt_num  Interrupt number.
+
+   @return
+   #QURT_EOK -- Success \n
+   -1 -- Failure; the interrupt is not supported.
+
+   @dependencies
+   None.
+ */
+int qurt_interrupt_raise(unsigned int interrupt_num);
+
+/**@ingroup func_qurt_interrupt_raise2
+   Raises the interrupt and returns the current pcycle value.
+
+   @param[in] interrupt_num  Interrupt number.
+
+   @return
+   0xFFFFFFFFFFFFFFFF -- Failure; the interrupt is not supported.\n
+   Other value -- pcycle count at the time the interrupt is raised.
+
+   @dependencies
+   None.
+ */
+unsigned long long qurt_interrupt_raise2(unsigned int interrupt_num);
+/** @endcond */
+
+/** @cond internal_only */
+/**@ingroup func_qurt_isr_subcall
+   Indicates whether the current function is called from a callback procedure (either short or long).
+
+   @return
+   #QURT_EOK -- TRUE \n
+   #QURT_EVAL -- FALSE.
+
+   @dependencies
+   None.
+ */
+int qurt_isr_subcall(void);
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_INT_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_island.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_island.h
new file mode 100755
index 0000000000000..f0c8ee27cf8b0
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_island.h
@@ -0,0 +1,122 @@
+#ifndef QURT_ISLAND_H
+#define QURT_ISLAND_H
+
+/**
+  @file qurt_island.h
+  @brief Prototypes of the power API.
+  The APIs allow entering and exiting island mode, where memory
+  accesses are limited to local memory.
+
+  EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2018-2021,2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+
+=============================================================================*/
+
+#include
+#include
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup func_qurt_island_get_status
+   Gets Island mode status.
+
+   Returns a value that indicates whether the QuRT system executes in Island mode.
+
+   @return
+   0 - Normal mode. \n
+   1 - Island mode.
+
+   @dependencies
+   None.
+*/
+unsigned int qurt_island_get_status (void);
+
+/**@ingroup func_qurt_island_get_status2
+   Gets Island mode status, in particular status that differentiates between island partial exit and complete exit.
+
+   Returns a value that indicates the current state.
+
+   @note1hang Transition from NORMAL mode to ISLAND mode happens in single-threaded
+              mode, whereas transitions from ISLAND mode to other modes
+              happen in multi-threaded mode. So, a thread that gets island mode
+              status as NORMAL can assume the same status while it continues to
+              run. A thread that gets island mode status as ISLAND should
+              assume that the status may change to EXITING or NORMAL while it
+              runs. A thread that gets island mode status as EXITING should
+              assume that the status may change to NORMAL while it runs. If
+              the thread goes into a wait state after reading the status, it should get
+              the island mode state again and not assume the previous state.
+   @note2hang This API returns more intrinsic states than qurt_island_get_status();
+              when qurt_island_get_status() returns 0, this API can return
+              QURT_ISLAND_MODE_EXITING or QURT_ISLAND_MODE_ISLAND.
+
+   @param[in,out] data  Reserved for future use. If a NULL pointer is passed,
+                        the field is ignored. If a valid pointer is passed,
+                        QuRT returns a bitmask that can be interpreted as follows:
+                        data[31] - Valid bit.
Set to 1 to indicate data[30:0] are valid.
+                        Otherwise set to 0.
+                        data[30:0] – Reserved for future definition.
+
+   @return
+   QURT_ISLAND_MODE_NORMAL  - Normal mode \n
+   QURT_ISLAND_MODE_ISLAND  - Island mode \n
+   QURT_ISLAND_MODE_EXITING - Exiting Island mode \n
+
+   @dependencies
+   None.
+*/
+unsigned int qurt_island_get_status2 (unsigned int *data);
+
+
+
+/**@ingroup func_qurt_island_get_exit_status
+   Gets the reason for the last Island mode exit status.
+
+   @param[out] cause_code  Pointer that returns the cause code of the last
+                           island exit reason. \n
+                           - #QURT_EISLANDUSEREXIT -- Island exit due to user call for island exit.\n
+                           - #QURT_ENOISLANDENTRY -- API called before exiting island. \n
+                           - #QURT_EISLANDINVALIDINT -- Island exit due to an invalid interrupt in Island mode. @tablebulletend
+
+   @param[out] int_num     Pointer that holds the invalid interrupt number that caused
+                           island exit when the cause code is #QURT_EISLANDINVALIDINT.
+                           For other cases, it is -1.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+void qurt_island_get_exit_status(unsigned int *cause_code, int *int_num);
+
+/**@ingroup func_qurt_island_get_enter_timestamp
+   Gets the recent timestamp when the system exits STM during island enter.
+
+   @param[out] island_enter_timestamp  Returns a pointer to the recent timestamp
+               recorded after the system exits STM during island enter. If the system never
+               attempts to enter island, the island_enter_timestamp return pointer holds a value
+               of zero.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+void qurt_island_get_enter_timestamp(unsigned long long *island_enter_timestamp);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_ISLAND_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_isr.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_isr.h
new file mode 100755
index 0000000000000..db29ea2f265d7
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_isr.h
@@ -0,0 +1,177 @@
+#ifndef QURT_ISR_H
+#define QURT_ISR_H
+
+/*=====================================================================
+
+ @file  qurt_isr.h
+
+ @brief  Prototypes of QuRT ISR API functions
+
+ EXTERNALIZED FUNCTIONS
+ none
+
+ INITIALIZATION AND SEQUENCING REQUIREMENTS
+ none
+
+ Copyright (c) 2017, 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+ Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+        Functions
+=============================================================================*/
+
+
+/**@ingroup func_qurt_isr_set_hw_config_callback
+   Sets the callback function for the configuration related to interrupt hardware.
+   In a process, the callback function can only be set once.
+
+   @param[in] cb_addr  Address of the callback function.
+
+   @return
+   #QURT_EOK -- The callback function is set successfully. \n
+   #QURT_EFAILED -- Failure. The callback function has been set before.
+
+   @dependencies
+   None.
+ */
+int qurt_isr_set_hw_config_callback(unsigned int cb_addr);
+
+
+/**@ingroup func_qurt_isr_set_hw_enable_callback
+   Sets the callback function for enabling the configuration related to interrupt hardware.
+   In a process, the callback function can only be set once.
+
+   @param[in] cb_addr  Address of the callback function.
+
+   @return
+   #QURT_EOK -- The callback function is set successfully. \n
+   #QURT_EFAILED -- Failure. The callback function has been set before.
+
+   @dependencies
+   None.
+ */
+int qurt_isr_set_hw_enable_callback(unsigned int cb_addr);
+
+
+/**@ingroup func_qurt_isr_set_hw_disable_callback
+   Sets the callback function for disabling the configuration related to interrupt hardware.
+   In a process, the callback function can only be set once.
+
+   @param[in] cb_addr  Address of the callback function.
+
+   @return
+   #QURT_EOK -- The callback function is set successfully. \n
+   #QURT_EFAILED -- Failure. The callback function has been set before.
+
+   @dependencies
+   None.
+ */
+int qurt_isr_set_hw_disable_callback(unsigned int cb_addr);
+
+
+/**@ingroup func_qurt_isr_create
+   Creates an ISR thread with the specified attributes, and makes it executable.
+
+   @datatypes
+   #qurt_thread_t \n
+   #qurt_thread_attr_t
+
+   @param[out] thread_id  Returns a pointer to the thread identifier if the thread was
+                          successfully created.
+   @param[in]  pAttr      Pointer to the initialized thread attribute structure that specifies
+                          the attributes of the created thread.
+
+   @return
+   #QURT_EVAL -- Invalid arguments. \n
+   #QURT_EOK -- Thread created. \n
+   #QURT_EFAILED -- Thread not created.
+
+   @dependencies
+   None.
+ */
+int qurt_isr_create (qurt_thread_t *thread_id, qurt_thread_attr_t *pAttr);
+
+/**@ingroup func_qurt_isr_register2
+   Registers an interrupt service routine (ISR) callback to an ISR thread with the specified attributes.
+   The interrupt is enabled when this function returns success.
+
+   @datatypes
+   qurt_thread_t
+
+   @param[in] isr_thread_id  ISR thread ID, returned from qurt_isr_create().
+   @param[in] int_num        The interrupt number.
+   @param[in] prio           Priority of the ISR.
+   @param[in] flags          Defines the ACK type. Values: \n
+                             QURT_INT_NON_DELAYED_ACK - ISR is acknowledged by the interrupt handle routine
+                             in the Kernel.
+                             QURT_INT_DELAYED_ACK - Client chooses to acknowledge.
+   @param[in] int_type       Interrupt trigger type; notified to the registered hardware
+                             configuration callback. Values: \n
+                             - QURT_INT_TRIGGER_USE_DEFAULT
+                             - QURT_INT_TRIGGER_LEVEL_HIGH
+                             - QURT_INT_TRIGGER_LEVEL_LOW
+                             - QURT_INT_TRIGGER_RISING_EDGE
+                             - QURT_INT_TRIGGER_FALLING_EDGE
+                             - QURT_INT_TRIGGER_DUAL_EDGE
+   @param[in] isr            Interrupt service routine with prototype void isr (void *arg, int int_num).
+   @param[in] arg            First argument of the ISR when it is called to service the interrupt.
+
+   @return
+   QURT_EOK -- Successfully registered the ISR for the interrupt \n
+   QURT_EINT -- Interrupt not configured \n
+   QURT_EINVALID -- Invalid thread ID \n
+   QURT_EDISABLED -- The feature is disabled \n
+   QURT_EDUPLICATE -- Interrupt is already registered
+
+   @dependencies
+   The thread ID should be created using qurt_isr_create().
+ */
+int qurt_isr_register2 (qurt_thread_t isr_thread_id, int int_num, unsigned short prio, unsigned short flags, unsigned int int_type, void (*isr) (void *, int), void *arg);
+
+/**@ingroup func_qurt_isr_deregister2
+   Deregisters the ISR for the specified interrupt.
+   The interrupt is disabled when this function returns success.
+
+   @param[in] int_num  The interrupt number.
+
+   @return
+   QURT_EOK -- ISR deregistered successfully \n
+   QURT_ENOREGISTERED -- Interrupt with int_num is not registered
+
+   @dependencies
+   None.
+ */
+int qurt_isr_deregister2 (int int_num);
+
+/**@ingroup func_qurt_isr_delete
+   Causes the ISR thread to exit and releases its kernel resources.
+
+   @note1hang The ISR thread shouldn't be actively processing interrupts,
+              otherwise the call will fail and return an error.
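+
+   An illustrative lifecycle sketch follows (not normative; the umbrella
+   include "qurt.h", the interrupt number, priority, and stack size are
+   assumptions):
+   @code
+   #include "qurt.h"   // assumed umbrella header for the QuRT API
+
+   #define MY_INT 100  // arbitrary L2VIC interrupt number
+
+   static void my_isr(void *arg, int int_num)
+   {
+       // ... service interrupt int_num ...
+   }
+
+   static char isr_stack[4096];  // assumed stack size
+
+   static void isr_lifecycle(void)
+   {
+       qurt_thread_t tid;
+       qurt_thread_attr_t attr;
+
+       qurt_thread_attr_init(&attr);
+       qurt_thread_attr_set_name(&attr, "my_ist");
+       qurt_thread_attr_set_stack_addr(&attr, isr_stack);
+       qurt_thread_attr_set_stack_size(&attr, sizeof(isr_stack));
+
+       if (qurt_isr_create(&tid, &attr) != QURT_EOK) { return; }
+       if (qurt_isr_register2(tid, MY_INT, 100 /* prio */, QURT_INT_NON_DELAYED_ACK,
+                              QURT_INT_TRIGGER_USE_DEFAULT, my_isr, NULL) != QURT_EOK) {
+           (void)qurt_isr_delete(tid);
+           return;
+       }
+       // ... interrupts are serviced by my_isr ...
+       (void)qurt_isr_deregister2(MY_INT);  // disable and detach the ISR first
+       (void)qurt_isr_delete(tid);          // then retire the ISR thread
+   }
+   @endcode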
+
+   @param[in] isr_tid  Thread ID of the ISR thread to delete.
+
+   @return
+   QURT_ENOTALLOWED -- ISR thread is processing an interrupt \n
+   QURT_EINVALID -- Invalid ISR thread ID \n
+   QURT_EOK -- Success
+
+   @dependencies
+   The thread ID should be created using qurt_isr_create().
+ */
+int qurt_isr_delete (qurt_thread_t isr_tid);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_ISR_H */
+
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_l2cfg.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_l2cfg.h
new file mode 100755
index 0000000000000..7e26b30a580d9
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_l2cfg.h
@@ -0,0 +1,98 @@
+#ifndef QURT_L2CFG_H
+#define QURT_L2CFG_H
+/**
+  @file qurt_l2cfg.h
+  @brief QuRT APIs for L2 configuration and system configuration
+
+EXTERNAL FUNCTIONS
+   qurt_l2cfg_set
+   qurt_l2cfg_get
+   qurt_system_config_get
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2019-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+        CONSTANTS AND MACROS
+=============================================================================*/
+
+/* Definition for system configuration */
+/** @addtogroup l2cfg_macros
+@{ */
+#define QURT_CORE_CFG_HMX_INT8_SPATIAL  0x78  /**< HMX fixed-point spatial size */
+#define QURT_CORE_CFG_HMX_INT8_DEPTH    0x7C  /**< HMX fixed-point output depth */
+/** @} */ /* end_addtogroup l2cfg_macros */
+/*=============================================================================
+        FUNCTIONS
+=============================================================================*/
+/**@ingroup func_qurt_l2cfg_set
+   Sets the value of an L2 configuration register. A register can be set *IFF* its
+   initial value is configured.
+
+   @param[in] offset  Offset of the L2 configuration register; must be a multiple of 4.
+   @param[in] value   Value to set the register to.
+
+   @return
+   #QURT_EOK -- Success. \n
+   #QURT_EFAILED -- Internal mapping that covers L2CFG register file absent; likely
+                    a configuration problem. \n
+   #QURT_EINVALID -- Argument error. \n
+   #QURT_ENOTALLOWED -- Setting this register is prohibited.
+
+   @dependencies
+   None.
+ */
+int qurt_l2cfg_set (unsigned short offset, unsigned int value);
+
+/**@ingroup func_qurt_l2cfg_get
+   Gets the value of an L2 configuration register.
+
+   @param[in]  offset  Offset of the L2 configuration register; must be a multiple of 4.
+   @param[out] value   Pointer to the value of the register.
+
+   @return
+   #QURT_EOK -- Success. \n
+   #QURT_EFAILED -- Internal mapping that covers L2CFG register file absent;
+                    likely a configuration problem. \n
+   #QURT_EINVALID -- Argument error.
+
+   @dependencies
+   None.
+
+ */
+int qurt_l2cfg_get (unsigned short offset, unsigned int * value);
+
+
+/**@ingroup func_qurt_system_config_get
+   Gets the system configuration information.
+
+   @param[in]  index  Index to the system configuration. Values:\n
+                      - #QURT_CORE_CFG_HMX_INT8_SPATIAL \n
+                      - #QURT_CORE_CFG_HMX_INT8_DEPTH @tablebulletend
+
+   @param[out] data   Pointer to a word for the returned data.
+
+   @return
+   #QURT_EOK -- Configuration data successfully returned. \n
+   Other values -- Failure (no such configuration available).
+
+   @dependencies
+   None.
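+
+   An illustrative usage sketch follows (not normative; the umbrella include
+   "qurt.h" is an assumption):
+   @code
+   #include "qurt.h"   // assumed umbrella header for the QuRT API
+
+   static void query_hmx_geometry(void)
+   {
+       unsigned int spatial = 0, depth = 0;
+
+       // Read both HMX fixed-point configuration words; each call can fail
+       // independently if the configuration is not available on this target.
+       if (qurt_system_config_get(QURT_CORE_CFG_HMX_INT8_SPATIAL, &spatial) == QURT_EOK &&
+           qurt_system_config_get(QURT_CORE_CFG_HMX_INT8_DEPTH, &depth) == QURT_EOK) {
+           // spatial and depth now hold the HMX geometry
+       }
+   }
+   @endcode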
+
+ */
+int qurt_system_config_get(int index, unsigned int *data);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_L2CFG_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_lifo.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_lifo.h
new file mode 100755
index 0000000000000..dc399fccc5f0f
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_lifo.h
@@ -0,0 +1,71 @@
+#ifndef QURT_LIFO_H
+#define QURT_LIFO_H
+/**
+  @file qurt_lifo.h
+
+  @brief
+  Provides a lock-free last-in first-out (LIFO) algorithm, which can be used in a
+  variety of situations to allocate and free fixed-size buffers.
+  This implementation touches the first word of your FREED buffer. Even
+  though it does not matter how you use the buffer while it is allocated, be
+  careful not to put your MAGIC number in the first field, because it will not
+  hold the magic value once the buffer is freed.
+
+  EXTERNALIZED FUNCTIONS
+  None
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None
+
+  Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ /*=====================================================================
+  Functions
+ ======================================================================*/
+
+/*======================================================================*/
+/**
+  Pops an element out of the LIFO.
+
+  @param[in] freelist  Pointer to the head of your list.
+
+  @return
+  Top object from the list.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+void * qurt_lifo_pop(void *freelist);
+
+
+/*======================================================================*/
+/**
+  Pushes an element into the LIFO.
+
+  @param[in] freelist  Pointer to the head of your list.
+  @param[in] buf       Pointer to your buffer to push into the list.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+void qurt_lifo_push(void *freelist, void *buf);
+
+void qurt_lifo_remove(void *freelist, void *buf);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_LIFO_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_mailbox.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_mailbox.h
new file mode 100755
index 0000000000000..a6cd91c611782
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_mailbox.h
@@ -0,0 +1,176 @@
+#ifndef QURT_MAILBOX_H
+#define QURT_MAILBOX_H
+
+/**
+  @file qurt_mailbox.h
+  @brief Definitions, macros, and prototypes used for QuRT mailbox
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2015, 2021-2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/* Definitions on typedef and return values */ + +#define QURT_MAILBOX_ID_NULL 0 +#define QURT_MAILBOX_ERROR -1 +#define QURT_MAILBOX_ID_ERROR -2 +#define QURT_MAILBOX_NON_VALID_DATA -3 +#define QURT_MAILBOX_FULL -4 +#define QURT_MAILBOX_DELETED -5 +#define QURT_MAILBOX_RECEIVE_HALTED -6 +#define QURT_MAILBOX_BANDWIDTH_LIMIT -7 + + +/*============================================================================= + FORWARD DECLARATIONS & TYPEDEFS +=============================================================================*/ + +#define QURT_MAILBOX_AT_QURTOS 0U // Receiver is QurtOS +#define QURT_MAILBOX_AT_ROOTPD 1U // Receiver is RootPD (ASID=0) +#define QURT_MAILBOX_AT_USERPD 2U // Receiver is User PD (ASID!=0) +#define QURT_MAILBOX_AT_SECUREPD 3U // Receiver is Secure PD + +typedef unsigned char qurt_mailbox_receiver_cfg_t; + +#define QURT_MAILBOX_SEND_OVERWRITE 0U // When there is already valid content, overwrite it +#define QURT_MAILBOX_SEND_NON_OVERWRITE 1U // When there is already valid content, return failure + +typedef unsigned char qurt_mailbox_send_option_t; + + +#define QURT_MAILBOX_RECV_WAITING 0U // When there is no valid content, wait for it +#define QURT_MAILBOX_RECV_NON_WAITING 1U // When there is no valid content, return failure immediately +#define QURT_MAILBOX_RECV_PEEK_NON_WAITING 2U // Read the content, but doesn't remove it from the mailbox. No waiting. + +typedef unsigned char qurt_mailbox_recv_option_t; + + +/*============================================================================= + EXTERNS & FUNCTIONS +=============================================================================*/ +/* Function prototype */ + +/**@ingroup qurt_mailbox_create + Creates a QuRT mailbox. + + @param name Mailbox name up to 8 characters. + @param recv_opt Configuration on the receiver process. + + @return + Mailbox ID -- Mailbox Identifier \n + #QURT_MAILBOX_ID_NULL -- NULL, failure at creating mailbox + + @dependencies + None. +*/ +unsigned long long qurt_mailbox_create(char *name, qurt_mailbox_receiver_cfg_t recv_opt); + + +/**@ingroup qurt_mailbox_get_id + Gets a QuRT mailbox identifier. + + @param name Mailbox name up to 8 characters. + + @return + Mailbox ID -- Mailbox identifier \n + #QURT_MAILBOX_ID_NULL -- NULL, failure at getting mailbox ID + + @dependencies + None. +*/ +unsigned long long qurt_mailbox_get_id(char *name); + + +/**@ingroup qurt_mailbox_send + Sends data to a QuRT mailbox. + + @param mailbox_id Mailbox identifier. + @param send_opt Option for mailbox send. + @param data Data to send. + + + @return + #QURT_EOK Success \n + #QURT_MAILBOX_ID_ERROR Mailbox ID error.\n + #QURT_MAILBOX_ERROR Other errors.\n + #QURT_MAILBOX_FULL Valid data already exists, non-overwriting.\n + #QURT_MAILBOX_BANDWIDTH_LIMIT Reached the bandwidth limitation. + + @dependencies + None. 
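+
+   An illustrative usage sketch follows (not normative; the umbrella include
+   "qurt.h" and the mailbox name "statmbx" are assumptions):
+   @code
+   #include "qurt.h"   // assumed umbrella header for the QuRT API
+
+   static int publish_status(unsigned long long value)
+   {
+       // Look up a mailbox created elsewhere with qurt_mailbox_create().
+       unsigned long long id = qurt_mailbox_get_id("statmbx");
+       if (id == QURT_MAILBOX_ID_NULL) {
+           return QURT_MAILBOX_ID_ERROR;
+       }
+       // Overwrite any stale content rather than failing when full.
+       return qurt_mailbox_send(id, QURT_MAILBOX_SEND_OVERWRITE, value);
+   }
+   @endcode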
+*/
+int qurt_mailbox_send(unsigned long long mailbox_id, qurt_mailbox_send_option_t send_opt, unsigned long long data);
+
+
+/**@ingroup qurt_mailbox_receive
+   Receives data from a QuRT mailbox.
+
+   @param mailbox_id  Mailbox identifier.
+   @param recv_opt    Option for mailbox receiving.
+   @param data        Pointer to the data buffer for receiving.
+
+   @return
+   #QURT_EOK                     Success \n
+   #QURT_MAILBOX_ID_ERROR        Mailbox ID error. \n
+   #QURT_MAILBOX_ERROR           Other errors. \n
+   #QURT_MAILBOX_NON_VALID_DATA  No currently valid data; the previous content is placed in the buffer. \n
+   #QURT_MAILBOX_RECEIVE_HALTED  Receiving halted; the waiting thread is woken up. \n
+   #QURT_MAILBOX_DELETED         Mailbox is deleted, and the waiting thread is woken up.
+
+   @dependencies
+   None.
+*/
+int qurt_mailbox_receive(unsigned long long mailbox_id, qurt_mailbox_recv_option_t recv_opt, unsigned long long *data);
+
+
+/**@ingroup qurt_mailbox_delete
+   Deletes a QuRT mailbox.
+
+   A mailbox can only be deleted from the process that created the mailbox.
+
+   @param mailbox_id  Mailbox identifier.
+
+   @return
+   #QURT_EOK               Success. \n
+   #QURT_MAILBOX_ID_ERROR  Mailbox ID error. \n
+   #QURT_MAILBOX_ERROR     Other errors.
+
+   @dependencies
+   None.
+*/
+int qurt_mailbox_delete(unsigned long long mailbox_id);
+
+
+/**@ingroup qurt_mailbox_receive_halt
+   Halts receiving on a QuRT mailbox and wakes up waiting threads.
+
+   @param mailbox_id  Mailbox identifier.
+
+   @return
+   #QURT_EOK               Success. \n
+   #QURT_MAILBOX_ID_ERROR  Mailbox ID error.\n
+   #QURT_MAILBOX_ERROR     Other errors.
+
+   @dependencies
+   None.
+*/
+int qurt_mailbox_receive_halt(unsigned long long mailbox_id);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif // QURT_MAILBOX_H
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_memory.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_memory.h
new file mode 100755
index 0000000000000..90ce2586fec50
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_memory.h
@@ -0,0 +1,1487 @@
+#ifndef QURT_MEMORY_H
+#define QURT_MEMORY_H
+/**
+  @file qurt_memory.h
+  @brief Prototypes of kernel memory API functions.
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) Qualcomm Technologies, Inc.
+  All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+
+#include
+#include
+//#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup memory_management_macros
+@{ */
+#define QURT_SYSTEM_ALLOC_VIRTUAL 1   /**< Allocates available virtual memory in the address space of all
+                                           processes.*/
+/** @} */ /* end_addtogroup memory_management_macros */
+/**@cond rest_reg_dist */
+/** @addtogroup memory_management_types
+@{ */
+/** @xreflabel{hdr:qurt_mem_default_pool} */
+extern qurt_mem_pool_t qurt_mem_default_pool __attribute__((section(".data"))); /**< Memory pool object.*/
+/** @} */ /* end_addtogroup memory_management_types */
+
+/** @cond rest_reg_dist */
+/** Mapping attribute information*/
+typedef struct{
+    qurt_paddr_64_t paddr;
+    qurt_size_t size ;
+    qurt_mem_cache_mode_t cache_mode;
+    qurt_perm_t perms ;
+}qurt_mapping_attr_t;
+/** @endcond */
+/** @} */ /* end_addtogroup mapping_attribute_types*/
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+/**@ingroup func_qurt_mem_cache_clean
+   Performs a cache clean operation on the data stored in the specified memory area.
+   Performs a syncht on all the data cache operations when the Hexagon processor version is V60 or greater.
+
+   @note1hang Perform the flush all operation only on the data cache.
+
+   @note1cont This operation flushes and invalidates the contents of all cache lines from the start address
+              to the end address (start address + size). The contents of the adjoining buffer can be
+              flushed and invalidated if they fall in any of the cache lines.
+
+   @datatypes
+   #qurt_addr_t \n
+   #qurt_size_t \n
+   #qurt_mem_cache_op_t \n
+   #qurt_mem_cache_type_t
+
+   @param[in] addr    Address of data to flush.
+   @param[in] size    Size (in bytes) of data to flush.
+   @param[in] opcode  Type of cache clean operation. Values:
+                      - #QURT_MEM_CACHE_FLUSH
+                      - #QURT_MEM_CACHE_INVALIDATE
+                      - #QURT_MEM_CACHE_FLUSH_INVALIDATE
+                      - #QURT_MEM_CACHE_FLUSH_ALL\n
+                      @note1 #QURT_MEM_CACHE_FLUSH_ALL is valid only when the type is #QURT_MEM_DCACHE @tablebulletend
+   @param[in] type    Cache type. Values:
+                      - #QURT_MEM_ICACHE
+                      - #QURT_MEM_DCACHE @tablebulletend
+
+   @return
+   #QURT_EOK -- Cache operation performed successfully.\n
+   #QURT_EVAL -- Invalid cache type.\n
+
+   @dependencies
+   None.
+*/
+int qurt_mem_cache_clean(qurt_addr_t addr, qurt_size_t size, qurt_mem_cache_op_t opcode, qurt_mem_cache_type_t type);
+
+/**@ingroup func_qurt_mem_cache_clean2
+   Performs a data cache clean operation on the data stored in the specified memory area.
+
+   This API only performs the following data cache operations:\n
+   - #QURT_MEM_CACHE_FLUSH\n
+   - #QURT_MEM_CACHE_INVALIDATE\n
+   - #QURT_MEM_CACHE_FLUSH_INVALIDATE -- flushes/invalidates the contents of all cache lines from the start address
+     to the end address (start address + size). The contents of the adjoining buffer can be
+     flushed/invalidated if they fall in any of the cache lines.
+
+   @datatypes
+   #qurt_addr_t \n
+   #qurt_size_t \n
+   #qurt_mem_cache_op_t \n
+   #qurt_mem_cache_type_t
+
+   @param[in] addr    Address of data to flush.
+   @param[in] size    Size (in bytes) of data to flush.
+   @param[in] opcode  Type of cache clean operation. Values:\n #QURT_MEM_CACHE_FLUSH\n #QURT_MEM_CACHE_INVALIDATE\n
+                      #QURT_MEM_CACHE_FLUSH_INVALIDATE
+   @param[in] type    Cache type. Values: \n #QURT_MEM_DCACHE
+
+   @return
+   #QURT_EOK -- Cache operation performed successfully.\n
+   #QURT_EVAL -- Invalid cache type.
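+
+   An illustrative usage sketch follows (not normative; the umbrella include
+   "qurt.h" is an assumption, and the pointer cast assumes 32-bit addresses):
+   @code
+   #include "qurt.h"   // assumed umbrella header for the QuRT API
+
+   static void flush_for_dma(void *buf, unsigned int len)
+   {
+       // Write back dirty lines so a device sees up-to-date memory.
+       // qurt_addr_t is a 32-bit address type on Hexagon targets.
+       (void)qurt_mem_cache_clean2((qurt_addr_t)buf, (qurt_size_t)len,
+                                   QURT_MEM_CACHE_FLUSH, QURT_MEM_DCACHE);
+   }
+   @endcode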
+
+   @dependencies
+   None.
+*/
+int qurt_mem_cache_clean2(qurt_addr_t addr, qurt_size_t size, qurt_mem_cache_op_t opcode, qurt_mem_cache_type_t type);
+
+/**@ingroup func_qurt_mem_cache_phys_clean
+   Performs a cache clean operation on the data stored in the specified memory area based on an address match and mask.
+   Operates on a cache line when (LINE.PhysicalPageNumber & mask) == addrmatch.
+
+   @note1hang The addrmatch value should be the upper 24-bit physical address to match against.
+
+   @datatypes
+   #qurt_mem_cache_op_t \n
+
+   @param[in] mask       24-bit address mask.
+   @param[in] addrmatch  Physical page number (24 bits) of memory to use as an address match.
+   @param[in] opcode     Type of cache clean operation. Values:
+                         - #QURT_MEM_CACHE_FLUSH
+                         - #QURT_MEM_CACHE_INVALIDATE @tablebulletend
+
+   @return
+   #QURT_EOK -- Cache operation performed successfully.\n
+   #QURT_EVAL -- Invalid operation
+
+   @dependencies
+   None.
+*/
+
+int qurt_mem_cache_phys_clean(unsigned int mask, unsigned int addrmatch, qurt_mem_cache_op_t opcode);
+
+/**@ingroup func_qurt_mem_l2cache_line_lock
+   Performs an L2 cache line locking operation. This function locks selective lines in the L2 cache memory.
+
+   @note1hang Perform the line lock operation only on the 32-byte aligned size and address.
+
+   @datatypes
+   #qurt_addr_t \n
+   #qurt_size_t
+
+   @param[in] addr  Address of the L2 cache memory line to lock; the address must be 32-byte aligned.
+   @param[in] size  Size (in bytes) of L2 cache memory to line lock; size must be a multiple of 32 bytes.
+
+   @return
+   #QURT_EOK -- Success.\n
+   #QURT_EALIGN -- Data alignment or address failure. \n
+   #QURT_EINVALID -- Improper addr and size passed (for example, integer overflow due to addr + size). \n
+   #QURT_EFAILED -- Failed to lock the cache line because all the ways were locked for the corresponding set of an address
+                    in the range of addr and addr+size, or the address range is not L2 cacheable.
+
+   @dependencies
+   None.
+*/
+int qurt_mem_l2cache_line_lock(qurt_addr_t addr, qurt_size_t size);
+
+/**@ingroup func_qurt_mem_l2cache_line_unlock
+   Performs an L2 cache line unlocking operation. This function unlocks selective lines in the L2 cache memory.
+
+   @note1hang Perform the line unlock operation only on a 32-byte aligned size and address.
+
+   @datatypes
+   #qurt_addr_t \n
+   #qurt_size_t
+
+   @param[in] addr  Address of the L2 cache memory line to unlock; the address must be 32-byte aligned.
+   @param[in] size  Size (in bytes) of the L2 cache memory line to unlock; size must be a multiple of 32 bytes.
+
+   @return
+   #QURT_EOK -- Success. \n
+   #QURT_EALIGN -- Aligning data or address failure. \n
+   #QURT_EFAILED -- Operation failed, cannot find the matching tag.
+
+   @dependencies
+   None.
+*/
+int qurt_mem_l2cache_line_unlock(qurt_addr_t addr, qurt_size_t size);
+
+/**@ingroup func_qurt_mem_region_attr_init
+   @xreflabel{sec:qurt_mem_region_attr_init}
+   Initializes the specified memory region attribute structure with default attribute values: \n
+   - Mapping -- #QURT_MEM_MAPPING_VIRTUAL \n
+   - Cache mode -- #QURT_MEM_CACHE_WRITEBACK \n
+   - Physical address -- -1 \n
+   - Virtual address -- -1 \n
+   - Memory type -- #QURT_MEM_REGION_LOCAL \n
+   - Size -- -1
+
+   @note1hang The memory physical address attribute must be explicitly set by calling the
+              qurt_mem_region_attr_set_physaddr() function. The size and pool attributes are set directly
+              as parameters in the memory region create operation.
+
+   @datatypes
+   #qurt_mem_region_attr_t
+
+   @param[in,out] attr  Pointer to the destination structure for the memory region attributes.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+void qurt_mem_region_attr_init(qurt_mem_region_attr_t *attr);
+
+/**@ingroup func_qurt_mem_pool_attach
+   Initializes a memory pool object to attach to a pool predefined in the system
+   configuration file.
+
+   Memory pool objects assign memory regions to physical memory in different
+   Hexagon memory units. They are specified in memory region create operations
+   (Section @xref{sec:mem_region_create}).
+
+   @note1hang QuRT predefines the memory pool object #qurt_mem_default_pool
+              (Section @xref{dox:mem_management}) for allocating memory regions in SMI memory. The pool attach
+              operation is necessary only when allocating memory regions in nonstandard
+              memory units such as TCM.
+
+   @datatypes
+   #qurt_mem_pool_t
+
+   @param[in]  name  Pointer to the memory pool name.
+   @param[out] pool  Pointer to the memory pool object.
+
+   @return
+   #QURT_EOK -- Attach operation successful.
+
+   @dependencies
+   None.
+*/
+int qurt_mem_pool_attach(char *name, qurt_mem_pool_t *pool);
+
+/**@ingroup func_qurt_mem_pool_attach2
+   Gets the identifier that corresponds to a pool object created specifically for a client, for example, HLOS_PHYSPOOL.
+   The client_handle is used to look up the client-specific pool.
+
+   Memory pool objects assign memory regions to physical memory in different
+   Hexagon memory units. Memory pool objects are specified during mapping creation operations
+   (qurt_mem_mmap() and qurt_mem_region_create()).
+
+   @note1hang QuRT predefines the memory pool object #qurt_mem_default_pool
+              (Section @xref{dox:mem_management}) for allocating memory regions in SMI memory. The pool_attach2
+              operation is necessary only when allocating memory regions in memory units specific to the client.
+
+   @datatypes
+   #qurt_mem_pool_t
+
+   @param[in]  client_handle  Client identifier that the OS uses to look up the identifier
+                              for the client-specific pool.
+   @param[in]  name           Pointer to the memory pool name.
+   @param[out] pool           Pointer to the memory pool object.
+
+   @return
+   #QURT_EOK -- Attach operation successful.
+
+   @dependencies
+   None.
+*/
+int qurt_mem_pool_attach2(int client_handle, char *name, qurt_mem_pool_t *pool);
+
+/**@ingroup func_qurt_mem_pool_create
+   @xreflabel{hdr:qurt_mem_pool_create}
+   Dynamically creates a memory pool object from a physical address range.
+
+   The pool is assigned a single memory region with the specified base address and size.
+
+   The base address and size values passed to this function must be aligned to 4K byte
+   boundaries, and must be expressed as the actual base address and size values divided by 4K.
+
+   For example, the function call:
+   @code
+   qurt_mem_pool_create ("TCM_PHYSPOOL", 0xd8020, 0x20, &pool)
+   @endcode
+   ... is equivalent to the following static pool definition in the QuRT system configuration file:
+   @code
+
+
+
+   @endcode
+
+   @cond rest_dist For more information on the system configuration file, see @xhyperref{80VB41979,80-VB419-79}. @endcond
+
+   @note1hang Dynamically created pools are not identical to static pools. In particular,
+              qurt_mem_pool_attr_get() is not valid with dynamically created pools.
+
+   @note1cont Dynamic pool creation permanently consumes system resources, and cannot be undone.
+
+   @datatypes
+   #qurt_mem_pool_t
+
+   @param[in]  name  Pointer to the memory pool name.
+   @param[in]  base  Base address of the memory region (divided by 4K).
+   @param[in]  size  Size (in bytes) of the memory region (divided by 4K).
+   @param[out] pool  Pointer to the memory pool object.
+
+   @return
+   #QURT_EOK -- Success.
+ + @dependencies + None. +*/ +int qurt_mem_pool_create(char *name, unsigned base, unsigned size, qurt_mem_pool_t *pool); + +/**@ingroup func_qurt_mem_pool_add_pages + Adds a physical address range to the specified memory pool object.\n + + @note1hang Call this operation only with root privileges (guest OS mode). + + @datatypes + #qurt_mem_pool_t + + @param[in] pool Memory pool object. + @param[in] first_pageno First page number of the physical address range (equivalent to address >> 12) + @param[in] size_in_pages Number of pages in the physical address range (equivalent to size >> 12) + + @return + #QURT_EOK -- Pages successfully added. + + @dependencies + None. +*/ +int qurt_mem_pool_add_pages(qurt_mem_pool_t pool, + unsigned first_pageno, + unsigned size_in_pages); + +/**@ingroup func_qurt_mem_pool_remove_pages + Removes a physical address range from the specified memory pool object. + + If any part of the address range is in use, this operation returns an + error without changing the state. + + @note1hang Call this operation only with root privileges (guest-OS mode). + + @note1cont In the future, this operation will support (via the flags parameter) the + removal of a physical address range when part of the range is in use. + + @datatypes + #qurt_mem_pool_t + + @param[in] pool Memory pool object. + @param[in] first_pageno First page number of the physical address range (equivalent to address >> 12) + @param[in] size_in_pages Number of pages in the physical address range (equivalent to size >> 12) + @param[in] flags Remove options. Values: \n + - 0 -- Skip holes in the range that are not part of the pool (default) \n + - #QURT_POOL_REMOVE_ALL_OR_NONE -- Pages are removed only if the specified + physical address range is entirely contained (with no holes) in the + pool free space. @tablebulletend + @param[in] callback Callback procedure called when pages were successfully removed. + Not called if the operation failed. Passing 0 as the parameter + value causes the callback to not be called. + @param[in] arg Value passed as an argument to the callback procedure. + + @return + #QURT_EOK -- Pages successfully removed. + + @dependencies + None. +*/ +int qurt_mem_pool_remove_pages(qurt_mem_pool_t pool, + unsigned first_pageno, + unsigned size_in_pages, + unsigned flags, + void (*callback)(void *), + void *arg); +/**@ingroup memory_management_types*/ +#define QURT_POOL_REMOVE_ALL_OR_NONE 1 /**< */ + +/**@ingroup func_qurt_mem_pool_attr_get + Gets the memory pool attributes. \n + Retrieves pool configurations based on the pool handle, and fills in + the attribute structure with configuration values. + + @datatypes + #qurt_mem_pool_t \n + #qurt_mem_pool_attr_t + + @param[in] pool Pool handle obtained from qurt_mem_pool_attach(). + @param[out] attr Pointer to the memory region attribute structure. + + @return + 0 -- Success. \n + #QURT_EINVALID -- Corrupt handle; pool handle is invalid. +*/ +int qurt_mem_pool_attr_get (qurt_mem_pool_t pool, qurt_mem_pool_attr_t *attr); + +/**@ingroup func_qurt_mem_pool_attr_get_size + Gets the size of the specified memory pool range. + + @datatypes + #qurt_mem_pool_attr_t \n + #qurt_size_t + + @param[in] attr Pointer to the memory pool attribute structure. + @param[in] range_id Memory pool range key. + @param[out] size Pointer to the destination variable for the range size. + + @return + 0 -- Success. \n + #QURT_EINVALID -- Range is invalid. + + @dependencies + None. 
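+
+   An illustrative usage sketch follows (not normative; the umbrella include
+   "qurt.h" is an assumption):
+   @code
+   #include "qurt.h"   // assumed umbrella header for the QuRT API
+
+   static void dump_pool_ranges(qurt_mem_pool_t pool)
+   {
+       qurt_mem_pool_attr_t attr;
+       int i;
+
+       if (qurt_mem_pool_attr_get(pool, &attr) != 0) {
+           return;  // invalid pool handle
+       }
+       for (i = 0; i < MAX_POOL_RANGES; i++) {
+           qurt_size_t size = 0;
+           // Empty or out-of-range entries report a size of zero.
+           if (qurt_mem_pool_attr_get_size(&attr, i, &size) == QURT_EOK && size != 0) {
+               // range i spans size bytes
+           }
+       }
+   }
+   @endcode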
+*/
+static inline int qurt_mem_pool_attr_get_size (qurt_mem_pool_attr_t *attr, int range_id, qurt_size_t *size){
+    if ((range_id >= MAX_POOL_RANGES) || (range_id < 0)){
+        (*size) = 0;
+        return QURT_EINVALID;
+    }
+    else {
+        (*size) = attr->ranges[range_id].size;
+    }
+    return QURT_EOK;
+}
+
+/**@ingroup func_qurt_mem_pool_attr_get_addr
+   Gets the start address of the specified memory pool range.
+
+   @datatypes
+   #qurt_mem_pool_attr_t \n
+   #qurt_addr_t
+
+   @param[in]  attr      Pointer to the memory pool attribute structure.
+   @param[in]  range_id  Memory pool range key.
+   @param[out] addr      Pointer to the destination variable for the range start address.
+
+   @return
+   0 -- Success. \n
+   #QURT_EINVALID -- Range is invalid.
+
+   @dependencies
+   None.
+*/
+static inline int qurt_mem_pool_attr_get_addr (qurt_mem_pool_attr_t *attr, int range_id, qurt_addr_t *addr){
+    if ((range_id >= MAX_POOL_RANGES) || (range_id < 0)){
+        (*addr) = 0;
+        return QURT_EINVALID;
+    }
+    else {
+        (*addr) = (attr->ranges[range_id].start)<<12;
+    }
+    return QURT_EOK;
+}
+
+/**@ingroup func_qurt_mem_pool_attr_get_addr_64
+   Gets the 64-bit start address of the specified memory pool range.
+
+   @datatypes
+   #qurt_mem_pool_attr_t \n
+   #qurt_addr_64_t
+
+   @param[in]  attr      Pointer to the memory pool attribute structure.
+   @param[in]  range_id  Memory pool range key.
+   @param[out] addr      Pointer to the destination variable for the range start address.
+
+   @return
+   0 -- Success. \n
+   #QURT_EINVALID -- Range is invalid.
+
+   @dependencies
+   None.
+*/
+static inline int qurt_mem_pool_attr_get_addr_64 (qurt_mem_pool_attr_t *attr, int range_id, qurt_addr_64_t *addr){
+    if ((range_id >= MAX_POOL_RANGES) || (range_id < 0)){
+        (*addr) = 0;
+        return QURT_EINVALID;
+    }
+    else {
+        (*addr) = ((qurt_addr_64_t)attr->ranges[range_id].start)<<12;
+    }
+    return QURT_EOK;
+}
+
+
+/**@ingroup func_qurt_mem_pool_status_get
+   Gets the memory pool status. \n
+   Based on the pool handle, retrieves the largest contiguous free memory,
+   total free memory, and total memory declared for the pool in bytes. Fills in
+   the memory status structure with the values.
+
+   @datatypes
+   #qurt_mem_pool_t \n
+   #qurt_mem_pool_status_t
+
+   @param[in]  pool    Pool handle.
+   @param[out] status  Pointer to the memory pool status structure.
+
+   @return
+   #QURT_EOK -- Success. \n
+   #QURT_EINVALID -- Corrupt handle; pool handle is invalid.
+*/
+int qurt_mem_pool_status_get (qurt_mem_pool_t pool, qurt_mem_pool_status_t *status);
+
+
+/**@ingroup func_qurt_mem_pool_is_available
+   Checks whether the number of pages that the page_count argument indicates
+   can be allocated from the specified pool.
+
+   @datatypes
+   #qurt_mem_pool_t \n
+   #qurt_mem_mapping_t \n
+
+   @param[in] pool          Pool handle obtained from qurt_mem_pool_attach().
+   @param[in] page_count    Number of 4K pages.
+   @param[in] mapping_type  Variable of type qurt_mem_mapping_t.
+
+   @return
+   0 -- Success. \n
+   #QURT_EINVALID -- Mapping_type is invalid. \n
+   #QURT_EMEM -- Specified pages cannot be allocated from the pool.
+
+   @dependencies
+   None.
+*/
+int qurt_mem_pool_is_available(qurt_mem_pool_t pool, int page_count, qurt_mem_mapping_t mapping_type);
+
+
+/**@ingroup func_qurt_mem_region_create
+   @xreflabel{sec:mem_region_create}
+   Creates a memory region with the specified attributes.
+
+   The application initializes the memory region attribute structure with
+   qurt_mem_region_attr_init() and qurt_mem_region_attr_set_bus_attr().
+
+   If the virtual address attribute is set to its default value
+   (Section @xref{sec:qurt_mem_region_attr_init}), the virtual address of the memory region is
+   automatically assigned any available virtual address value.
+
+   If the memory mapping attribute is set to virtual mapping, the physical address of the memory region
+   is also automatically assigned.\n
+
+   @note1hang The physical address attribute is explicitly set in the attribute structure only
+              for memory regions with physical-contiguous-mapped mapping.
+
+   Memory regions are always assigned to memory pools. The pool value specifies the memory pool
+   that the memory region is assigned to.
+
+   @note1hang If attr is specified as NULL, the memory region is created with default
+              attribute values (Section @xref{sec:qurt_mem_region_attr_init}).
+              QuRT predefines the memory pool object #qurt_mem_default_pool
+              (Section @xref{dox:mem_management}), which allocates memory regions in SMI memory.
+
+   @datatypes
+   #qurt_mem_region_t \n
+   #qurt_size_t \n
+   #qurt_mem_pool_t \n
+   #qurt_mem_region_attr_t
+
+   @param[out] region Pointer to the memory region object.
+   @param[in] size Memory region size (in bytes). If size is not an integral multiple of 4K,
+                   it is rounded up to a 4K boundary.
+   @param[in] pool Memory pool of the region.
+   @param[in] attr Pointer to the memory region attribute structure.
+
+   @return
+   #QURT_EOK -- Memory region successfully created.\n
+   #QURT_EMEM -- Not enough memory to create region. \n
+   #QURT_EINVALID -- Invalid cache attributes / permissions provided in attribute.
+
+   @dependencies
+   None.
+*/
+int qurt_mem_region_create(qurt_mem_region_t *region, qurt_size_t size, qurt_mem_pool_t pool, qurt_mem_region_attr_t *attr);
+
+/**@ingroup func_qurt_mem_region_delete
+   Deletes the specified memory region.
+
+   If the caller application created the memory region, it is removed and the system reclaims its
+   assigned memory.
+
+   If a different application created the memory region (and it is shared with the caller
+   application), only the local memory mapping to the region is removed; the system does
+   not reclaim the memory.
+
+   @datatypes
+   #qurt_mem_region_t
+
+   @param[in] region Memory region object.
+
+   @return
+   #QURT_EOK -- Region successfully deleted. \n
+   #QURT_ELOCKED -- Buffer is locked. Mapping delete failed.
+
+   @dependencies
+   None.
+*/
+int qurt_mem_region_delete(qurt_mem_region_t region);
+
+
+/**@ingroup func_qurt_mem_region_attr_get
+   @xreflabel{sec:mem_region_attr_get}
+   Gets the memory attributes of the specified memory region.
+   After a memory region is created, its attributes cannot be changed.
+
+   @datatypes
+   #qurt_mem_region_t \n
+   #qurt_mem_region_attr_t
+
+   @param[in] region Memory region object.
+   @param[out] attr Pointer to the destination structure for memory region attributes.
+
+   @return
+   #QURT_EOK -- Operation successfully performed. \n
+   Error code -- Failure.
+
+   @dependencies
+   None.
+*/
+int qurt_mem_region_attr_get(qurt_mem_region_t region, qurt_mem_region_attr_t *attr);
+
+
+/**@ingroup func_qurt_mem_region_attr_set_type
+   Sets the memory type in the specified memory region attribute structure.
+
+   The type indicates whether the memory region is local to an application or shared between
+   applications.
+   @cond rest_dist For more information, see @xhyperref{80VB41992,80-VB419-92}. @endcond
+
+   @datatypes
+   #qurt_mem_region_attr_t \n
+   #qurt_mem_region_type_t
+
+   @param[in,out] attr Pointer to memory region attribute structure.
+   @param[in] type Memory type.
Values: \n + - #QURT_MEM_REGION_LOCAL \n + - #QURT_MEM_REGION_SHARED @tablebulletend + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_type(qurt_mem_region_attr_t *attr, qurt_mem_region_type_t type){ + attr->type = type; +} + +/**@ingroup func_qurt_mem_region_attr_get_size + Gets the memory region size from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_size_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] size Pointer to the destination variable for memory region size. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_size(qurt_mem_region_attr_t *attr, qurt_size_t *size){ + (*size) = attr->size; +} + +/**@ingroup func_qurt_mem_region_attr_get_type + Gets the memory type from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_region_type_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] type Pointer to the destination variable for the memory type. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_type(qurt_mem_region_attr_t *attr, qurt_mem_region_type_t *type){ + (*type) = attr->type; +} + +/**@ingroup func_qurt_mem_region_attr_set_physaddr + Sets the memory region 32-bit physical address in the specified memory attribute structure. + + @note1hang The physical address attribute is explicitly set only for memory regions with + physical contiguous mapping. Otherwise QuRT automatically sets it + when the memory region is created. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_paddr_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] addr Memory region physical address. + + @return + None. + */ +static inline void qurt_mem_region_attr_set_physaddr(qurt_mem_region_attr_t *attr, qurt_paddr_t addr){ + attr->ppn = (unsigned)(((unsigned)(addr))>>12); +} + +/**@ingroup func_qurt_mem_region_attr_get_physaddr + Gets the memory region physical address from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] addr Pointer to the destination variable for memory region physical address. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_physaddr(qurt_mem_region_attr_t *attr, unsigned int *addr){ + (*addr) = (unsigned)(((unsigned) (attr->ppn))<<12); +} + +/**@ingroup func_qurt_mem_region_attr_set_virtaddr + Sets the memory region virtual address in the specified memory attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_addr_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] addr Memory region virtual address. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_virtaddr(qurt_mem_region_attr_t *attr, qurt_addr_t addr){ + attr->virtaddr = addr; +} + +/**@ingroup func_qurt_mem_region_attr_get_virtaddr + Gets the memory region virtual address from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] addr Pointer to the destination variable for the memory region virtual address. + + @return + None. + + @dependencies + None. 
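+
+   @par Example
+   Illustrative sketch (not part of the original header; region is assumed
+   to have been created earlier with qurt_mem_region_create()):
+   @code
+   qurt_mem_region_attr_t attr;
+   unsigned int vaddr;
+
+   // Read back the region attributes, then extract the assigned
+   // virtual address.
+   if (qurt_mem_region_attr_get(region, &attr) == QURT_EOK) {
+       qurt_mem_region_attr_get_virtaddr(&attr, &vaddr);
+   }
+   @endcode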
+ */ +static inline void qurt_mem_region_attr_get_virtaddr(qurt_mem_region_attr_t *attr, unsigned int *addr){ + (*addr) = (unsigned int)(attr->virtaddr); +} + +/**@ingroup func_qurt_mem_region_attr_set_mapping + Sets the memory mapping in the specified memory region attribute structure. + + The mapping value indicates how the memory region is mapped in virtual memory. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_mapping_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] mapping Mapping. Values: + - #QURT_MEM_MAPPING_VIRTUAL + - #QURT_MEM_MAPPING_PHYS_CONTIGUOUS + - #QURT_MEM_MAPPING_IDEMPOTENT + - #QURT_MEM_MAPPING_VIRTUAL_FIXED + - #QURT_MEM_MAPPING_NONE + - #QURT_MEM_MAPPING_VIRTUAL_RANDOM + - #QURT_MEM_MAPPING_INVALID @tablebulletend + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_mapping(qurt_mem_region_attr_t *attr, qurt_mem_mapping_t mapping){ + attr->mapping_type = mapping; +} + +/**@ingroup func_qurt_mem_region_attr_get_mapping + Gets the memory mapping from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_mapping_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] mapping Pointer to the destination variable for memory mapping. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_mapping(qurt_mem_region_attr_t *attr, qurt_mem_mapping_t *mapping){ + (*mapping) = attr->mapping_type; +} + +/**@ingroup func_qurt_mem_region_attr_set_cache_mode + Sets the cache operation mode in the specified memory region attribute structure. + + @cond rest_dist For more information on the cache, see @xhyperref{80VB41992,80-VB419-92}.@endcond + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_cache_mode_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] mode Cache mode. Values: \n + - #QURT_MEM_CACHE_WRITEBACK \n + - #QURT_MEM_CACHE_WRITETHROUGH\n + - #QURT_MEM_CACHE_WRITEBACK_NONL2CACHEABLE\n + - #QURT_MEM_CACHE_WRITETHROUGH_NONL2CACHEABLE\n + - #QURT_MEM_CACHE_WRITEBACK_L2CACHEABLE\n + - #QURT_MEM_CACHE_WRITETHROUGH_L2CACHEABLE\n + - #QURT_MEM_CACHE_NONE @tablebulletend + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_cache_mode(qurt_mem_region_attr_t *attr, qurt_mem_cache_mode_t mode){ + QURT_PGATTR_C_SET(attr->pga, (unsigned)mode); +} + +/**@ingroup func_qurt_mem_region_attr_get_cache_mode + Gets the cache operation mode from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_cache_mode_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] mode Pointer to the destination variable for cache mode. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_cache_mode(qurt_mem_region_attr_t *attr, qurt_mem_cache_mode_t *mode){ + unsigned int mode_temp = QURT_PGATTR_C_GET(attr->pga); + (*mode) = (qurt_mem_cache_mode_t)mode_temp; +} + +/**@ingroup func_qurt_mem_region_attr_set_bus_attr + Sets the (A1, A0) bus attribute bits in the specified memory region attribute structure. + + @cond rest_dist For more information on the bus attribute bits, see the @xhyperref{80VB41992,80-VB419-92}. @endcond + + @datatypes + #qurt_mem_region_attr_t + + @param[in,out] attr Pointer to the memory region attribute structure. 
+ @param[in] abits The (A1, A0) bits to use with the memory region, expressed as a 2-bit binary number. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_bus_attr(qurt_mem_region_attr_t *attr, unsigned abits){ + QURT_PGATTR_A_SET(attr->pga, abits); +} + +/**@ingroup func_qurt_mem_region_attr_get_bus_attr + Gets the (A1, A0) bus attribute bits from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] pbits Pointer to an unsigned integer that is filled in with + the (A1, A0) bits from the memory region attribute structure, expressed as a 2-bit binary number. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_bus_attr(qurt_mem_region_attr_t *attr, unsigned *pbits){ + (*pbits) = QURT_PGATTR_A_GET(attr->pga); +} + +void qurt_mem_region_attr_set_owner(qurt_mem_region_attr_t *attr, int handle); +void qurt_mem_region_attr_get_owner(qurt_mem_region_attr_t *attr, int *p_handle); +void qurt_mem_region_attr_set_perms(qurt_mem_region_attr_t *attr, unsigned perms); +void qurt_mem_region_attr_get_perms(qurt_mem_region_attr_t *attr, unsigned *p_perms); + +/**@ingroup func_qurt_mem_map_static_query + Determines whether a memory page is statically mapped. + Pages are specified by the following attributes: physical address, page size, cache mode, + and memory permissions. \n + - If the specified page is statically mapped, vaddr returns the virtual + address of the page. \n + - If the page is not statically mapped (or if it does not exist as specified), vaddr + returns -1 as the virtual address value.\n + The system configuration file defines QuRT memory maps. + + @datatypes + #qurt_addr_t \n + #qurt_mem_cache_mode_t \n + #qurt_perm_t + + @param[out] vaddr Virtual address corresponding to paddr. + @param[in] paddr Physical address. + @param[in] page_size Size of the mapped memory page. + @param[in] cache_attribs Cache mode (writeback, and so on). + @param[in] perm Access permissions. + + @return + #QURT_EOK -- Specified page is statically mapped, vaddr returns the virtual address. \n + #QURT_EMEM -- Specified page is not statically mapped, vaddr returns -1. \n + #QURT_EVAL -- Specified page does not exist. + + @dependencies + None. + */ +int qurt_mem_map_static_query(qurt_addr_t *vaddr, qurt_addr_t paddr, unsigned int page_size, qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perm); + + +/**@ingroup func_qurt_mem_region_query + Queries a memory region. \n + This function determines whether a dynamically-created memory region (Section @xref{sec:mem_region_create}) exists for the + specified virtual or physical address. + When a memory region has been determined to exist, its attributes are + accessible (Section @xref{sec:mem_region_attr_get}). + + @note1hang This function returns #QURT_EFATAL if #QURT_EINVALID is passed to both + vaddr and paddr (or to neither). + + @datatypes + #qurt_mem_region_t \n + #qurt_paddr_t + + @param[out] region_handle Pointer to the memory region object (if it exists). + @param[in] vaddr Virtual address to query; if vaddr is specified, paddr must be set to + the value #QURT_EINVALID. + @param[in] paddr Physical address to query; if paddr is specified, vaddr must be set to + the value #QURT_EINVALID. + + @return + #QURT_EOK -- Query successfully performed. \n + #QURT_EMEM -- Region not found for the specified address. \n + #QURT_EFATAL -- Invalid input parameters. 
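+
+   @par Example
+   An illustrative lookup by virtual address (not part of the original header;
+   vaddr is assumed to be a mapped address in the caller's process):
+   @code
+   qurt_mem_region_t region;
+
+   // paddr must be passed as QURT_EINVALID when querying by vaddr.
+   if (qurt_mem_region_query(&region, vaddr, QURT_EINVALID) == QURT_EOK) {
+       // Region exists; read its attributes with qurt_mem_region_attr_get().
+   }
+   @endcode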
+ + @dependencies + None. + */ +int qurt_mem_region_query(qurt_mem_region_t *region_handle, qurt_addr_t vaddr, qurt_paddr_t paddr); + + +/**@ingroup func_qurt_mapping_create + @xreflabel{hdr:qurt_mapping_create} + Creates a memory mapping in the page table. + Not supported if called from a user process, always returns QURT_EMEM. + + @datatypes + #qurt_addr_t \n + #qurt_size_t \n + #qurt_mem_cache_mode_t \n + #qurt_perm_t + + @param[in] vaddr Virtual address. + @param[in] paddr Physical address. + @param[in] size Size (4K-aligned) of the mapped memory page. + @param[in] cache_attribs Cache mode (writeback, and so on). + @param[in] perm Access permissions. + + @return + #QURT_EOK -- Mapping created. \n + #QURT_EMEM -- Failed to create mapping. + #QURT_EINVALID -- Invalid cache attributes / permissions provided. + + @dependencies + None. +*/ +int qurt_mapping_create(qurt_addr_t vaddr, qurt_addr_t paddr, qurt_size_t size, + qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perm); + +/**@ingroup func_qurt_mapping_remove + @xreflabel{hdr:qurt_mapping_remove} + Deletes the specified memory mapping from the page table. + + @datatypes + #qurt_addr_t \n + #qurt_size_t + + @param[in] vaddr Virtual address. + @param[in] paddr Physical address. + @param[in] size Size of the mapped memory page (4K-aligned). + + @return + #QURT_EOK -- Mapping created. + #QURT_ELOCKED -- Buffer is locked. Mapping delete failed. + + @dependencies + None. + + */ +int qurt_mapping_remove(qurt_addr_t vaddr, qurt_addr_t paddr, qurt_size_t size); + +/**@ingroup func_qurt_lookup_physaddr + Translates a virtual memory address to the physical memory address to which it maps. \n + The lookup occurs in the process of the caller. Use qurt_lookup_physaddr2() to lookup the + physical address of another process. + + + @datatypes + #qurt_addr_t \n + #qurt_paddr_t + + @param[in] vaddr Virtual address. + + @return + Nonzero -- Physical address to which the virtual address is mapped.\n + 0 -- Virtual address not mapped. + + @dependencies + None. +*/ +qurt_paddr_t qurt_lookup_physaddr (qurt_addr_t vaddr); + +/**@ingroup func_qurt_mem_region_attr_set_physaddr_64 + Sets the memory region 64-bit physical address in the specified memory attribute structure. + + @note1hang The physical address attribute is explicitly set only for memory regions with + physical contiguous mapping. Otherwise it is automatically set by + QuRT when the memory region is created. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_paddr_64_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] addr_64 Memory region 64-bit physical address. + + @return + None. + */ +static inline void qurt_mem_region_attr_set_physaddr_64(qurt_mem_region_attr_t *attr, qurt_paddr_64_t addr_64){ + attr->ppn = (unsigned)(((unsigned long long)(addr_64))>>12); +} + +/**@ingroup func_qurt_mem_region_attr_get_physaddr_64 + Gets the memory region 64-bit physical address from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_paddr_64_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] addr_64 Pointer to the destination variable for the memory region 64-bit physical address. + + @return + None. + + @dependencies + None. 
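+
+   @par Example
+   Illustrative sketch (not part of the original header; region is assumed
+   to be a region created with physical-contiguous mapping):
+   @code
+   qurt_mem_region_attr_t attr;
+   qurt_paddr_64_t paddr;
+
+   // Read back the attributes, then the 64-bit physical address.
+   if (qurt_mem_region_attr_get(region, &attr) == QURT_EOK) {
+       qurt_mem_region_attr_get_physaddr_64(&attr, &paddr);
+   }
+   @endcode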
+ */ +static inline void qurt_mem_region_attr_get_physaddr_64(qurt_mem_region_attr_t *attr, qurt_paddr_64_t *addr_64){ + (*addr_64) = (unsigned long long)(((unsigned long long)(attr->ppn))<<12); +} + +/**@ingroup func_qurt_mem_map_static_query_64 + Determines if a memory page is statically mapped. + The following attributes specify pages: 64-bit physical address, page size, cache mode, + and memory permissions. \n + If the specified page is statically mapped, vaddr returns the virtual + address of the page. + If the page is not statically mapped (or if it does not exist as specified), vaddr + returns -1 as the virtual address value.\n + QuRT memory maps are defined in the system configuration file. + + @datatypes + #qurt_addr_t \n + #qurt_paddr_64_t \n + #qurt_mem_cache_mode_t \n + #qurt_perm_t + + @param[out] vaddr Virtual address corresponding to paddr. + @param[in] paddr_64 64-bit physical address. + @param[in] page_size Size of the mapped memory page. + @param[in] cache_attribs Cache mode (writeback, and so on). + @param[in] perm Access permissions. + + @return + #QURT_EOK -- Specified page is statically mapped; a virtual address is returned in vaddr. \n + #QURT_EMEM -- Specified page is not statically mapped; -1 is returned in vaddr. \n + #QURT_EVAL -- Specified page does not exist. + + @dependencies + None. + */ +int qurt_mem_map_static_query_64(qurt_addr_t *vaddr, qurt_paddr_64_t paddr_64, unsigned int page_size, qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perm); + +/**@ingroup func_qurt_mem_region_query_64 + Determines whether a dynamically created memory region (Section @xref{sec:mem_region_create}) exists for the + specified virtual or physical address. When a memory region has been determined to exist, its attributes are + accessible (Section @xref{sec:mem_region_attr_get}). + + @note1hang This function returns QURT_EFATAL if #QURT_EINVALID is passed to both + vaddr and paddr (or to neither). + + @datatypes + #qurt_mem_region_t \n + #qurt_addr_t \n + #qurt_paddr_64_t + + @param[out] region_handle Pointer to the memory region object (if it exists). + @param[in] vaddr Virtual address to query; if vaddr is specified, paddr must be set to + the value #QURT_EINVALID. + @param[in] paddr_64 64-bit physical address to query; if paddr is specified, vaddr must be set to + the value #QURT_EINVALID. + + @return + #QURT_EOK -- Success. \n + #QURT_EMEM -- Region not found for the specified address. \n + #QURT_EFATAL -- Invalid input parameters. + + @dependencies + None. + */ +int qurt_mem_region_query_64(qurt_mem_region_t *region_handle, qurt_addr_t vaddr, qurt_paddr_64_t paddr_64); + +/**@ingroup func_qurt_mapping_create_64 + @xreflabel{hdr:qurt_mapping_create_64} + Creates a memory mapping in the page table. + Not supported if called from a user process, always returns QURT_EMEM. + + @datatypes + #qurt_addr_t \n + #qurt_paddr_64_t \n + #qurt_size_t \n + #qurt_mem_cache_mode_t \n + #qurt_perm_t + + @param[in] vaddr Virtual address. + @param[in] paddr_64 64-bit physical address. + @param[in] size Size (4K-aligned) of the mapped memory page. + @param[in] cache_attribs Cache mode (writeback, and so on). + @param[in] perm Access permissions. + + @return + #QURT_EOK -- Success. \n + #QURT_EMEM -- Failure. + #QURT_EINVALID -- Invalid cache attributes / permissions provided. + + @dependencies + None. 
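+
+   @par Example
+   A minimal create/remove sketch (illustrative only; assumes 4K-aligned
+   vaddr and paddr_64 values and a caller with sufficient privilege, since
+   this call is not supported from a user process):
+   @code
+   if (qurt_mapping_create_64(vaddr, paddr_64, 0x1000,
+                              QURT_MEM_CACHE_WRITEBACK,
+                              QURT_PERM_READ | QURT_PERM_WRITE) == QURT_EOK) {
+       // ... use the mapping ...
+       qurt_mapping_remove_64(vaddr, paddr_64, 0x1000);
+   }
+   @endcode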
+*/ +int qurt_mapping_create_64(qurt_addr_t vaddr, qurt_paddr_64_t paddr_64, qurt_size_t size, + qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perm); + +/**@ingroup func_qurt_mapping_remove_64 + @xreflabel{hdr:qurt_mapping_remove_64} + Deletes the specified memory mapping from the page table. + + @datatypes + #qurt_addr_t \n + #qurt_paddr_64_t \n + #qurt_size_t + + @param[in] vaddr Virtual address. + @param[in] paddr_64 64-bit physical address. + @param[in] size Size of the mapped memory page (4K-aligned). + + @return + #QURT_EOK -- Success. + #QURT_ELOCKED -- Buffer is locked. Mapping delete failed. + + @dependencies + None. + + */ +int qurt_mapping_remove_64(qurt_addr_t vaddr, qurt_paddr_64_t paddr_64, qurt_size_t size); + +/**@ingroup func_qurt_lookup_physaddr_64 + Translates a virtual memory address to the 64-bit physical memory address it is mapped to. \n + The lookup occurs in the process of the caller. Use qurt_lookup_physaddr2() to lookup the physical + address of another process. + + @datatypes + #qurt_paddr_64_t \n + #qurt_addr_t + + @param[in] vaddr Virtual address. + + @return + Nonzero -- 64-bit physical address to which the virtual address is mapped. \n + 0 -- Virtual address has not been mapped. + + @dependencies + None. +*/ +qurt_paddr_64_t qurt_lookup_physaddr_64 (qurt_addr_t vaddr); +/** @endcond */ + +/** @cond internal_only */ +/**@ingroup func_qurt_mapping_reclaim + Deallocates all QuRT resources associated with the specified virtual + memory area, making it available for user memory management:\n + - The associated physical memory areas are freed and added to the + specified physical pool.\n + - The associated TLB entries are deleted and made available for TLB + management.\n + - The virtual memory area is not freed -- it is left in + place as allocated, but unmapped virtual memory. Access to this + memory area generates an exception.\n + + The virtual memory area must be statically allocated. + If no pool is specified, the freed physical memory is not added to any pool. + + @note1hang The virtual memory area is restricted to being filled with locked + TLB entries that are contiguous within the memory area, and contained by it. + + @datatypes + #qurt_addr_t \n + #qurt_size_t \n + #qurt_mem_pool_t + + @param[in] vaddr Virtual address of the memory area to free. + @param[in] vsize Size (in bytes) of the memory area to free. + @param[in] pool Handle to the physical pool where freed physical memory is added. + If set to 0, freed physical memory is not added to any pool. + + @return + 0 -- Success. \n + Nonzero -- Failure that indicates a partial success, or that the request was malformed. \n @note1hang The expected behavior is that + QuRT logs messages related to the failure, and callers are free to ignore the return value. + + @dependencies + None. +*/ +int qurt_mapping_reclaim(qurt_addr_t vaddr, qurt_size_t vsize, qurt_mem_pool_t pool); +/** @endcond */ +/** @cond rest_reg_dist */ +/**@ingroup func_qurt_mem_configure_cache_partition + Configures the Hexagon cache partition at the system level. + + A partition size value of #SEVEN_EIGHTHS_SIZE is applicable only to the L2 cache. + + The L1 cache partition is not supported in Hexagon processor version V60 or greater. + + @note1hang Call this operation only with QuRT OS privilege. + + @datatypes + #qurt_cache_type_t \n + #qurt_cache_partition_size_t + + @param[in] cache_type Cache type for partition configuration. 
Values: \n + - #HEXAGON_L1_I_CACHE \n + - #HEXAGON_L1_D_CACHE \n + - #HEXAGON_L2_CACHE @tablebulletend + + @param[in] partition_size Cache partition size. Values: \n + - #FULL_SIZE \n + - #HALF_SIZE \n + - #THREE_QUARTER_SIZE \n + - #SEVEN_EIGHTHS_SIZE @tablebulletend + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Error. + + @dependencies + None. + */ +int qurt_mem_configure_cache_partition(qurt_cache_type_t cache_type, qurt_cache_partition_size_t partition_size); + + +/**@ingroup func_qurt_mem_syncht + @xreflabel{hdr:qurt_mem_syncht} + Performs heavy-weight synchronization of memory transactions. + + This operation does not return until all previous memory transactions (cached and uncached load/store, + mem_locked, and so on) that originated from the current thread are complete and globally observable. + + @note1hang This operation is implemented as a wrapper for the Hexagon syncht instruction. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_syncht(void){ + #ifdef __HEXAGON_ARCH__ + __asm__ __volatile__ (" SYNCHT \n"); + #endif +} + +/**@ingroup func_qurt_mem_barrier + @xreflabel{hdr:qurt_mem_barrier} + Creates a barrier for memory transactions. + + This operation ensures that all previous memory transactions are globally observable before any + future memory transactions are globally observable. + + @note1hang This operation is implemented as a wrapper for the Hexagon barrier instruction. + @return + None + + @dependencies + None. + */ +static inline void qurt_mem_barrier(void){ + #ifdef __HEXAGON_ARCH__ + __asm__ __volatile__ (" BARRIER \n"); + #endif +} +/** @endcond */ + +/** @cond internal_only */ +/**@ingroup func_qurt_system_mem_alloc + Requests that the kernel allocates memory from the kernel-owned pool. + + @param[in] size Size in bytes (aligned to 4K) to allocate. + @param[in] align Any alignment that must be considered for the allocation. + @param[in] flags Supports the #QURT_SYSTEM_ALLOC_VIRTUAL flag; allocates + available virtual memory in the address space of all processes. + + @return + #QURT_EFATAL -- Allocation failed \n + Start address of the successful allocation. + + @dependencies + None. +*/ +unsigned qurt_system_mem_alloc(unsigned size, unsigned align, unsigned flags); +/** @endcond */ +/** @cond rest_reg_dist*/ +/**@ingroup func_qurt_lookup_physaddr2 + Translates the virtual memory address of the specified process to the 64-bit + physical memory address to which it is mapped. + + @datatypes + #qurt_addr_t \n + #qurt_paddr_64_t + + @param[in] vaddr Virtual address. + @param[in] pid PID. + + @return + Nonzero -- 64-bit physical address to which the virtual address is mapped. \n + 0 -- Virtual address is not mapped. + + @dependencies + None. +*/ +qurt_paddr_64_t qurt_lookup_physaddr2(qurt_addr_t vaddr, unsigned int pid); +/** @endcond */ + +/**@ingroup func_qurt_mapping_attr_get + Gets the mapping attributes for a given virtual address and PID + + @datatypes + #qurt_addr_t \n + #qurt_mapping_attr_t + + @param[in] vaddr virtual address for which the attributes are required. + @param[in] pid process id for the target process + @param[out] attr Pointer to the mapping attribute structure. + + @return + 0 -- Success. \n + #QURT_EINVALID -- Incorrect virtual address or pid +*/ +int qurt_mapping_attr_get(qurt_addr_t vaddr, unsigned int pid, qurt_mapping_attr_t *attr); + + +/**@ingroup func_qurt_mapping_attr_get_cache_mode + Gets the cache operation mode in the specified memory mapping attribute structure. 
+
+
+  @datatypes
+  #qurt_mapping_attr_t \n
+  #qurt_mem_cache_mode_t
+
+  @param[in] attr Pointer to the memory mapping attribute structure.
+  @param[out] cache_mode Pointer to the destination variable for cache mode.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+static inline void qurt_mapping_attr_get_cache_mode(qurt_mapping_attr_t *attr, qurt_mem_cache_mode_t *cache_mode)
+{
+    (*cache_mode) = attr->cache_mode;
+}
+
+/**@ingroup func_qurt_mapping_attr_get_physaddr
+  Gets the physical memory address in the specified memory mapping attribute structure.
+
+
+  @datatypes
+  #qurt_mapping_attr_t \n
+  #qurt_paddr_64_t
+
+  @param[in] attr Pointer to the memory mapping attribute structure.
+  @param[out] physaddr Pointer to the destination variable for physical address.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+static inline void qurt_mapping_attr_get_physaddr(qurt_mapping_attr_t *attr, qurt_paddr_64_t *physaddr)
+{
+    (*physaddr) = attr->paddr;
+}
+
+/**@ingroup func_qurt_mapping_attr_get_perms
+  Gets the permissions in the specified memory mapping attribute structure.
+
+
+  @datatypes
+  #qurt_mapping_attr_t \n
+  #qurt_perm_t
+
+  @param[in] attr Pointer to the memory mapping attribute structure.
+  @param[out] perms Pointer to the destination variable for permissions.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+static inline void qurt_mapping_attr_get_perms(qurt_mapping_attr_t *attr, qurt_perm_t *perms)
+{
+    (*perms) = attr->perms;
+}
+
+/**@ingroup func_qurt_mapping_attr_get_size
+  Gets the size in the specified memory mapping attribute structure. This represents the size of the
+  TLB entry that covers the virtual address.
+
+
+  @datatypes
+  #qurt_mapping_attr_t \n
+  #unsigned int
+
+  @param[in] attr Pointer to the memory mapping attribute structure.
+  @param[out] size Pointer to the destination variable for size.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_mapping_attr_get_size(qurt_mapping_attr_t *attr, unsigned int *size)
+{
+    (*size) = attr->size;
+}
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_MEMORY_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_mmap.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_mmap.h
new file mode 100755
index 0000000000000..c3bd875910af7
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_mmap.h
@@ -0,0 +1,359 @@
+#ifndef QURT_MMAP_H
+#define QURT_MMAP_H
+/**
+  @file qurt_mmap.h
+  @brief Prototypes of memory mapping/unmapping APIs.
+  The APIs allow the user to map, un-map, and change permissions
+  on memory regions.
+
+  EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2018-2021, 2022, 2023 Qualcomm Technologies, Inc.
+All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup func_qurt_mem_mmap
+  Creates a memory mapping with the specified attributes.
+  This API allows a root process caller to create a mapping on behalf of a user
+  process. If the client_handle belongs to a valid user process, the resulting
+  mapping is created for that process.
+  If -1 is passed in place of client_handle, the API creates the mapping
+  for the underlying process of the caller.
+
+  @note1hang If the specified attributes are not valid, an error result is returned.
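+
+  @par Example
+  A minimal sketch of an anonymous allocation from the default pool
+  (illustrative only; whether 0 selects the default pool is an assumption):
+  @code
+  void *va = qurt_mem_mmap(-1,        /* map for the caller's own process */
+                           0,         /* default pool (assumed)           */
+                           NULL,      /* pRegion is unused                */
+                           NULL,      /* let QuRT choose the VA           */
+                           0x1000,
+                           QURT_PROT_READ | QURT_PROT_WRITE,
+                           QURT_MAP_ANON,
+                           -1, 0);
+  if (va != QURT_MAP_FAILED) {
+      /* ... use the memory ... */
+      qurt_mem_munmap(va, 0x1000);
+  }
+  @endcode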
+ + @param[out] client_handle Client handle to use for this mapping (optional). + @param[in] pool Optional argument that specifies a pool handle + if the user wants to allocate memory from a specific pool. + The default value for this argument is NULL. + @param[in] pRegion Map region. This argument is unused, and the default value is NULL. + @param[in] addr Virtual memory address. + @param[in] length Size of mapping in bytes. + @param[in] prot Mapping access permissions (R/W/X). + @param[in] flags Mapping modes.\n + - #QURT_MAP_NAMED_MEMSECTION + - #QURT_MAP_FIXED \n + - #QURT_MAP_NONPROCESS_VPOOL \n + - #QURT_MAP_TRYFIXED \n + - #QURT_MAP_ANON \n + - #QURT_MAP_PHYSADDR \n + - #QURT_MAP_VA_ONLY @tablebulletend + @param[in] fd File designator. + @param[in] offset Offset in file. + + @return + Valid virtual address -- Success.\n + #QURT_MAP_FAILED -- Mapping creation failed. + */ +void *qurt_mem_mmap(int client_handle, + qurt_mem_pool_t pool, + qurt_mem_region_t *pRegion, + void *addr, + size_t length, + int prot, + int flags, + int fd, + unsigned long long offset); + +/**@ingroup func_qurt_mem_mmap2 + Creates a memory mapping with the specified attributes. Returns a more descriptive + error code in case of failure. + This API allows the root process caller to create mapping on behalf of a user + process. If the client_handle belongs to a valid user process, the resulting + mapping is created for the process. + If -1 is passed in place of client_handle, the API creates mapping + for the underlying process of the caller. + + @note1hang If the specified attributes are not valid, an error result is returned. + + @param[out] client_handle Client handle to use for this mapping (optional). + @param[in] pool Optional argument that allows the user to specify a pool handle + when the user wants to allocate memory from a specific pool. + Default value for this argument is NULL. + @param[in] pRegion Map region (unused argument); default value is NULL. + @param[in] addr Virtual memory address. + @param[in] length Size of mapping in bytes. + @param[in] prot Mapping access permissions (R/W/X). + Cache attributes, bus attributes, User mode. + @param[in] flags Mapping modes; + Shared, Private, or Anonymous. + @param[in] fd File designator. + @param[in] offset Offset in file. + + @return + Valid virtual address -- Success.\n + #QURT_EMEM -- Physical address is not available. \n + #QURT_EFAILED -- VA is not available or mapping failed.\n + #QURT_EINVALID -- Invalid argument was passed (for example, an unaligned VA/PA). + */ +void *qurt_mem_mmap2(int client_handle, + qurt_mem_pool_t pool, + qurt_mem_region_t *pRegion, + void *addr, + size_t length, + int prot, + int flags, + int fd, + unsigned long long offset); + +/**@ingroup func_qurt_mem_mmap_by_name + Creates a memory mapping for a named-memsection using the specified attributes. + The named memsection should be specified in cust_config.xml. + + @note1hang If the specified attributes are not valid or the named memsection is not found, + an error result is returned. + + @param[in] name Name of the memsection in cust_config.xml that specifies + this mapping. Should be less than 25 characters. + @param[in] addr Virtual memory address. + @param[in] length Size of mapping in bytes. + @param[in] prot Mapping access permissions (R/W/X). + Cache attributes, bus attributes, User mode + @param[in] flags Mapping modes, such as + Shared, Private, or Anonymous. + @param[in] offset Offset relative to the physical address range specified in memsection. 
+ If offset + length exceeds size of memsection, failure is + returned. + @return + Valid virtual address -- Success.\n + #QURT_MAP_FAILED -- Mapping creation failed. + */ +void *qurt_mem_mmap_by_name(const char* name, + void *addr, + size_t length, + int prot, + int flags, + unsigned long long offset); + +/**@ingroup func_qurt_mem_mprotect2 + Changes access permissions and attributes on an existing mapping based on the client_handle argument. + + @note1hang If the specified virtual address is not found or invalid attributes are passed, + an error code is returned. + + @note2 When error is returned, it is possible that attributes/permissions are changed for some part of the + mapping, while for the remaining it is unchanged. Clients should not use these mappings further. + + @param[in] client_handle Obtained from the current invocation function (Section 3.4.1). + @param[in] addr Virtual memory address. + @param[in] length Size of mapping in bytes. + @param[in] prot Mapping access permissions (R/W/X). + Cache attributes, Bus attributes, User mode. + @return + #QURT_EOK -- Successfully changes permissions on the mapping.\n + #QURT_EFATAL -- Failed to change permissions on the mapping. \n + #QURT_EINVALID -- Attributes / permissions requested are invalid. + */ +int qurt_mem_mprotect2(int client_handle, const void *addr, + size_t length, + int prot); + +/**@ingroup func_qurt_mem_mprotect + Changes access permissions and attributes on an existing mapping. + + @note1hang If the specified virtual address is not found or invalid attributes are passed, + an error code is returned.\n + + @note2 When error is returned, it is possible that attributes/permissions are changed for some part of the + mapping, while for the remaining it is unchanged. Clients should not use these mappings further. + + @param[in] addr Virtual memory address. + @param[in] length Size of mapping in bytes. + @param[in] prot Mapping access permissions (R/W/X). + Cache attributes, Bus attributes, User mode. + @return + #QURT_EOK -- Successfully changes permissions on the mapping. \n + #QURT_EFATAL -- Failed to change permissions on the mapping. \n + #QURT_EINVALID -- Attributes / permissions requested are invalid. + */ +int qurt_mem_mprotect(const void *addr, + size_t length, + int prot); + +/**@ingroup func_qurt_mem_munmap + Removes an existing mapping. + + @note1hang If the specified mapping is not found in the context of the caller process + or invalid attributes are passed, an error code is returned. + + @param[in] addr Virtual memory address. + @param[in] length Size of mapping in bytes. + + @return + #QURT_EOK -- Successfully changes permissions on the mapping. \n + #QURT_EFATAL -- Failed to change permissions on the mapping. + #QURT_ELOCKED - Buffer is locked. Mapping delete failed. + */ +int qurt_mem_munmap(void *addr, + size_t length); + +/**@ingroup func_qurt_mem_munmap2 + Removes an existing mapping for a specified process. + + @note1hang This API allows a root process entity, such as a driver, to remove mapping + that was created for a user process. If the specified mapping is not found in the context + of client handle or invalid attributes are passed, an error code is returned. + + @param[out] client_handle Client handle of the user process that owns this mapping. + @param[in] addr Virtual memory address. + @param[in] length Size of mapping in bytes. + + @return + #QURT_EOK -- Successfully changes permissions on the mapping. \n + #QURT_EFATAL -- Failed to change permissions on the mapping. 
+ #QURT_ELOCKED - Buffer is locked. Mapping delete failed. + */ +int qurt_mem_munmap2(int client_handle, + void *addr, + size_t length); + +/**@ingroup func_qurt_mem_munmap3 + Removes an existing mapping or reservation for a specified process. + + @param[in] client_handle Client handle of the user process that owns this mapping. + @param[in] addr Pointer to a virtual memory address. + @param[in] length Size of mapping in bytes. + @param[in] flags Specifies the flag. + + @return + #QURT_EOK -- Successfully changes permissions on the mapping. \n + #QURT_EFATAL -- Failed to change permissions on the mapping. + #QURT_ELOCKED - Buffer is locked. Mapping delete failed. + */ +int qurt_mem_munmap3(int client_handle, + void *addr, + size_t length, + int flags); + +/* +|| The macros here follow the style of the standard mmap() macros, but with +|| QURT_ prepended to avoid name conflicts, and to avoid having a dependency +|| on sys/mman.h. +|| +|| Wherever possible, any values here that are also present in sys/mman.h +|| should have the same value in both places so that we can accept "mmap" +|| calls without having to remap parameters to new values. +|| +|| In the future, it would be desirable to have a regression test that +|| checks, for instance, that these macros match. Example: +|| +|| assert(QURT_MAP_FAILED == MAP_FAILED); +|| ... repeat as needed ... +*/ + +/** @addtogroup memory_mapping_macros +@{ */ +/** @cond */ +#define QURT_PROT_NONE 0x00U /**< */ +#define QURT_PROT_READ 0x01U /**< */ +#define QURT_PROT_WRITE 0x02U /**< */ +#define QURT_PROT_EXEC 0x04U /**< */ +#define QURT_PROT_NODUMP 0x08U /**< Skip dumping the mapping. During PD dump, must skip + some mappings on host memory to avoid a race condition + where the memory is removed from the host and the DSP process + crashes before the mapping is removed.*/ +#define QURT_PROT_ISLAND 0x10U /**< Island mapping. */ + +#define QURT_MAP_SHARED 0x0001U /**< Shared. */ +#define QURT_MAP_PRIVATE 0x0002U /**< Private. */ +/** @endcond */ +#define QURT_MAP_NAMED_MEMSECTION 0x0004U /**< Named memsection. */ +#define QURT_MAP_FIXED 0x0010U /**< Fixed virtual address. */ +#define QURT_MAP_RENAME 0x0020U /**< Rename. */ +#define QURT_MAP_NORESERVE 0x0040U /**< No reserve. */ +#define QURT_MAP_INHERIT 0x0080U /**< Inherit. */ +#define QURT_MAP_NONPROCESS_VPOOL 0x0100U /**< Use a virtual address outside of the default range of the + processes. This option is only supported in the root process + and only when virtual memory split is enabled in the XML. + The root process can use this flag to create mapping for a + user process, for example, if the virtual address is configured + for a 3G/1G split, the root process can use this flag to create + mapping in the top 1 GB area for the user process or the + lower 3 GB area for the root process. This is useful for + shared buffer use cases. */ +#define QURT_MAP_HASSEMAPHORE 0x0200U /**< Has semaphore. */ +#define QURT_MAP_TRYFIXED 0x0400U /**< Try to create a mapping for a virtual address that was passed. + If the passed virtual address fails, use a random virtual address. */ +#define QURT_MAP_WIRED 0x0800U /**< Wired. */ +#define QURT_MAP_FILE 0x0000U /**< File. */ +#define QURT_MAP_ANON 0x1000U /**< Allocate physical memory from the pool that was passed. + By default, memory is allocated from the default physpool. */ +#define QURT_MAP_VA_ONLY 0X2000U /**< Reserve a virtual address without + mapping it. 
*/ + +/** @cond */ +#define QURT_MAP_ALIGNED(n) ((n) << QURT_MAP_ALIGNMENT_SHIFT) +#define QURT_MAP_ALIGNMENT_SHIFT 24 + + +#define QURT_MAP_ALIGNMENT_MASK QURT_MAP_ALIGNED(0xff) /**< */ +#define QURT_MAP_ALIGNMENT_64KB QURT_MAP_ALIGNED(16) /**< */ +#define QURT_MAP_ALIGNMENT_16MB QURT_MAP_ALIGNED(24) /**< */ +#define QURT_MAP_ALIGNMENT_4GB QURT_MAP_ALIGNED(32) /**< */ +#define QURT_MAP_ALIGNMENT_1TB QURT_MAP_ALIGNED(40) /**< */ +#define QURT_MAP_ALIGNMENT_256TB QURT_MAP_ALIGNED(48) /**< */ +#define QURT_MAP_ALIGNMENT_64PB QURT_MAP_ALIGNED(56) /**< */ +/** @endcond */ +#define QURT_MAP_FAILED ((void *) -1) /**< Mapping creation failed. */ + +/* +|| The macros below are extensions beyond the standard mmap flags, but follow +|| the style of the mmap flags. +*/ +/** @cond */ +// Describe bitfields in (prot) +#define QURT_PROT_CACHE_BOUNDS 16U,19U,7U /**< Bits 16 through 19 are cache attribute, default is 0. */ +#define QURT_PROT_BUS_BOUNDS 20U,21U,0U /**< Bits 20 through 21 are bus attributes, default is 0. */ +#define QURT_PROT_USER_BOUNDS 22U,23U,3U /**< Bits 22 through 23 are user mode, default is 3; + default of 3 means to derive user mode setting from the + default mode of the client. */ + +// Describe bitfields in (flags) +#define QURT_MAP_PHYSADDR_BOUNDS 15U,15U,0U /**< Bits 15 through 15 are physaddr, default is 0. */ +#define QURT_MAP_TYPE_BOUNDS 16U,19U,0U /**< Bits 16 through 19 are mapping type, default is 0. */ +#define QURT_MAP_REGION_BOUNDS 20U,23U,0U /**< Bits 20 through 23 are region type, default is 0. */ +/** @endcond */ + +// These macros get OR'ed into (prot) +#define QURT_PROT_CACHE_MODE(n) QURT_MMAP_BUILD(QURT_PROT_CACHE_BOUNDS,(n)) /**< */ +#define QURT_PROT_BUS_ATTR(n) QURT_MMAP_BUILD(QURT_PROT_BUS_BOUNDS,(n)) /**< */ +#define QURT_PROT_USER_MODE(n) QURT_MMAP_BUILD(QURT_PROT_USER_BOUNDS,(n)) /**< */ +// These macros get OR'ed into (flags) + +#define QURT_MAP_PHYSADDR QURT_MMAP_BUILD(QURT_MAP_PHYSADDR_BOUNDS,1U) /**< Use the physical address that was passed in offset field. + This is allowed only for root process. */ +#define QURT_MAP_TYPE(n) QURT_MMAP_BUILD(QURT_MAP_TYPE_BOUNDS,(n)) /**< */ +#define QURT_MAP_REGION(n) QURT_MMAP_BUILD(QURT_MAP_REGION_BOUNDS,(n)) /**< */ +/** @} */ /* end_addtogroup memory_mapping_macros */ +/** @cond */ +// These macros extract fields from (prot) +#define QURT_PROT_GET_CACHE_MODE(n) QURT_MMAP_EXTRACT(QURT_PROT_CACHE_BOUNDS,(n)) /**< */ +#define QURT_PROT_GET_BUS_ATTR(n) QURT_MMAP_EXTRACT(QURT_PROT_BUS_BOUNDS,(n)) /**< */ +#define QURT_PROT_GET_USER_MODE(n) QURT_MMAP_EXTRACT(QURT_PROT_USER_BOUNDS,(n)) /**< */ + +// These macros extract fields from (flags) +#define QURT_MAP_GET_TYPE(n) QURT_MMAP_EXTRACT(QURT_MAP_TYPE_BOUNDS,(n)) /**< */ +#define QURT_MAP_GET_REGION(n) QURT_MMAP_EXTRACT(QURT_MAP_REGION_BOUNDS,(n)) /**< */ + +// Macros for bitfield insertion and extraction +#define QURT_MMAP_MASK(lo,hi) (~((~0u) << ((hi)-(lo)+1U))) /**< Mask of same size as [lo..hi]. 
*/
+#define QURT_MMAP_BUILD_(lo,hi,def,n) ((((n)^(def))&QURT_MMAP_MASK((lo),(hi)))<<(lo)) /**< */
+#define QURT_MMAP_EXTRACT_(lo,hi,def,n) ((((n)>>(lo))&QURT_MMAP_MASK((lo),(hi)))^(def)) /**< */
+#define QURT_MMAP_BUILD(a,b) QURT_MMAP_BUILD_(a,b) /**< */
+#define QURT_MMAP_EXTRACT(a,b) QURT_MMAP_EXTRACT_(a,b) /**< */
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_mq.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_mq.h
new file mode 100755
index 0000000000000..580c83d3de41a
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_mq.h
@@ -0,0 +1,458 @@
+#ifndef QURT_MQ_H
+#define QURT_MQ_H
+/**
+  @file qurt_mq.h
+
+  @brief Prototypes of secure message queues API functions.
+
+  EXTERNALIZED FUNCTIONS
+  None
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None
+
+  Copyright (c) 2019-2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+======================================================================*/
+#include
+#include
+
+#ifdef __cplusplus
extern "C" {
+#endif
+
+/*=============================================================================
+ CONSTANTS AND MACROS
+=============================================================================*/
+#define QURT_MQ_NAME_MAXLEN 16U /**< Maximum name length. */
+
+
+/*=============================================================================
+ FORWARD DECLARATIONS & TYPEDEFS
+=============================================================================*/
+/* This enum must be generated in accordance with the process class numbers.
+   For now it is made to match the generated version; do not change this unless
+   there is a corresponding change in process_class.py. Indices start from 0,
+   basically: QURT_MQ_SECURITY_SCOPE_ = (1 << QURTK_process_class_index_)
+*/
+typedef enum {
+    QURT_MQ_SECURITY_SCOPE_KERNEL = ( 1U << 0 ),
+    QURT_MQ_SECURITY_SCOPE_SRM = ( 1U << 1 ),
+    QURT_MQ_SECURITY_SCOPE_SECURE = ( 1U << 2 ),
+    QURT_MQ_SECURITY_SCOPE_CPZ = ( 1U << 3 ),
+    QURT_MQ_SECURITY_SCOPE_ROOT = ( 1U << 4 ),
+    QURT_MQ_SECURITY_SCOPE_SIGNED = ( 1U << 5 ),
+    QURT_MQ_SECURITY_SCOPE_UNSIGNED = ( 1U << 6 ),
+    QURT_MQ_SECURITY_SCOPE_SECURE_ROOT = ( 1U << 7 )
+} qurt_mq_security_scope_t;
+
+typedef enum {
+    QURT_MQ_CARDINALITY_PTP = (1U << 0),
+    QURT_MQ_CARDINALITY_MTO = (1U << 1)
+} qurt_mq_cardinality_t;
+
+typedef unsigned int qurt_mqd_t;
+
+typedef union {
+    struct {
+        unsigned int perms:2;
+        unsigned int cardinality:1;
+        unsigned int blocking:1;
+
+        qurt_mq_security_scope_t creator_scope: 8;
+        qurt_mq_security_scope_t allowed_scope: 8; // can be a bitmask in case of MTO
+        unsigned int queue_closed: 1;
+        unsigned int reserved: 11;
+    }; // anonymous struct
+    unsigned int raw;
+} qurt_mq_flags_t;
+
+
+/* Permissions are from qurt_types.h, block X though */
+#if 0
+/** Memory access permission. */
+typedef enum {
+    QURT_PERM_READ=0x1U, /**< */
+    QURT_PERM_WRITE=0x2U, /**< */
+    QURT_PERM_EXECUTE=0x4U, /**< */
+    QURT_PERM_FULL=QURT_PERM_READ|QURT_PERM_WRITE|QURT_PERM_EXECUTE, /**< */
+} qurt_perm_t;
+#endif
+
+struct qurt_mq_attr {
+    unsigned flags; /**< Configured flags. Only meaningful with get_attr(), only used for qurt_mq_flags_t.perms. */
+    unsigned mq_maxmsg; /**< Maximum number of messages. Used with create() and get_attr.
*/
+    unsigned short mq_send_msgsize; /**< Maximum size (bytes) of a message in the receiver-facing queue,
+                                         from sender to receiver. */
+    unsigned short mq_recv_msgsize; /**< Maximum size (bytes) of a message in the sender-facing queue,
+                                         from receiver to sender. */
+    unsigned client_pid; /**< Process ID of the client that is allowed to open the message queue
+                              that was created using qurt_mq_create(). */
+    qurt_mq_cardinality_t cardinality; /**< Cardinality of the message queue connection, see below. */
+    qurt_mq_security_scope_t scope; /**< Security scope of the senders to the queue. */
+};
+
+
+/*=============================================================================
+ EXTERNS & FUNCTIONS
+=============================================================================*/
+/**@ingroup func_qurt_mq_attr_init
+  Initializes attributes to the default values used for creating a queue.
+
+  The initialize operation sets the following default attribute values: \n
+  - flag - QURT_PERM_READ | QURT_PERM_WRITE \n
+  - maxmsg - 1 \n
+  - mq_send_msgsize - 8 \n
+  - mq_recv_msgsize - 8 \n
+  - client_pid - -1 \n
+  - cardinality - QURT_MQ_CARDINALITY_PTP \n
+  - scope - QURT_MQ_SECURITY_SCOPE_SIGNED \n
+
+  @datatypes
+  #qurt_mq_attr
+
+  @param[in,out] attr Pointer to the initialized message queue object.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_mq_attr_init(struct qurt_mq_attr * attr);
+
+/**@ingroup qurt_mq_attr_set_send_msgsize
+  Sets the message size in bytes that the sender can send.
+  The maximum message length is configurable using the XML configuration; however, it is
+  limited to a maximum value of 62 bytes.
+
+  @datatypes
+  #qurt_mq_attr
+
+  @param[in,out] attr Pointer to the message queue object.
+  @param[in] len Length of message in bytes.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_mq_attr_set_send_msgsize (struct qurt_mq_attr *attr, size_t len);
+
+/**@ingroup qurt_mq_attr_set_recv_msgsize
+  Sets the message size in bytes that the receiver can read.
+  The maximum message length is configurable using the XML configuration; however, it is
+  limited to a maximum value of 62 bytes.
+
+  @datatypes
+  #qurt_mq_attr
+
+  @param[in,out] attr Pointer to the message queue object.
+  @param[in] len Length of message in bytes.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_mq_attr_set_recv_msgsize (struct qurt_mq_attr *attr, size_t len);
+
+/**@ingroup qurt_mq_attr_set_maxmsg
+  Sets the maximum number of messages that can be queued in the message queue.
+  The message depth is configurable using the XML configuration.
+
+  @datatypes
+  #qurt_mq_attr
+
+  @param[in,out] attr Pointer to the message queue object.
+  @param[in] depth Maximum number of messages that can be queued.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_mq_attr_set_maxmsg (struct qurt_mq_attr *attr, unsigned int depth);
+
+/**@ingroup qurt_mq_attr_set_scope
+  Sets the scope of the message queue. A message queue created with a security
+  scope allows only a process class of that scope to open the message queue.
+
+  @datatypes
+  #qurt_mq_attr \n
+  #qurt_mq_security_scope_t
+
+  @param[in,out] attr Pointer to the message queue object.
+  @param[in] scope Scope of the message queue: \n
+             #QURT_MQ_SECURITY_SCOPE_KERNEL \n
+             #QURT_MQ_SECURITY_SCOPE_SRM \n
+             #QURT_MQ_SECURITY_SCOPE_SECURE \n
+             #QURT_MQ_SECURITY_SCOPE_CPZ \n
+             #QURT_MQ_SECURITY_SCOPE_ROOT \n
+             #QURT_MQ_SECURITY_SCOPE_SIGNED \n
+             #QURT_MQ_SECURITY_SCOPE_UNSIGNED
+
+  @return
+  None.
+
+  @dependencies
+  None.
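+
+  @par Example
+  Illustrative setup sketch (not part of the original header; values are
+  arbitrary):
+  @code
+  struct qurt_mq_attr attr;
+  qurt_mqd_t mqd;
+
+  qurt_mq_attr_init(&attr);
+  qurt_mq_attr_set_maxmsg(&attr, 4);            // queue depth
+  qurt_mq_attr_set_send_msgsize(&attr, 32);     // bytes, max 62
+  qurt_mq_attr_set_recv_msgsize(&attr, 32);
+  qurt_mq_attr_set_scope(&attr, QURT_MQ_SECURITY_SCOPE_SIGNED);
+
+  // The name must fit in QURT_MQ_NAME_MAXLEN characters (incl. terminator).
+  if (qurt_mq_create(&mqd, "my_mq", &attr) == QURT_EOK) {
+      // Queue ready; a client can now qurt_mq_open() it.
+  }
+  @endcode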
+*/
+void qurt_mq_attr_set_scope (struct qurt_mq_attr *attr, qurt_mq_security_scope_t scope);
+
+
+/**@ingroup qurt_mq_attr_set_client_pid
+  Sets the client_pid that can open this message queue.
+  If client_pid is set, the allowed scope is not considered when opening the message queue.
+
+  @datatypes
+  #qurt_mq_attr
+
+  @param[in,out] attr Pointer to the message queue object.
+  @param[in] client_pid Valid PID for the client process.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_mq_attr_set_client_pid (struct qurt_mq_attr *attr, unsigned client_pid);
+
+/**@ingroup qurt_mq_attr_set_flags
+  Sets the properties of the message queue.
+  The current implementation only uses the flag attribute to set the permissions of the message queue.
+  The default is #QURT_PERM_READ | #QURT_PERM_WRITE; explicit permission is not implemented.
+
+  @datatypes
+  #qurt_mq_attr
+
+  @param[in,out] attr Pointer to the message queue object.
+  @param[in] flags Permission for message queue.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_mq_attr_set_flags (struct qurt_mq_attr *attr, unsigned int flags);
+
+/**@ingroup qurt_mq_create
+  Creates a message queue with the provided name and attributes.
+  The calling process becomes the owner of the queue.
+  The name of the message queue is limited to 16 characters, including the NULL terminator.
+
+  @datatypes
+  #qurt_mq_attr \n
+  #qurt_mqd_t
+
+  @param[out] mqd Returns a pointer to the message queue identifier if
+                  the message queue was successfully created.
+  @param[in] name String identifier of the message queue.
+  @param[in] attr Pointer to the initialized message queue attribute
+                  structure that specifies the attributes of the created message queue.
+
+  @return
+  #QURT_EOK -- Message queue created. \n
+  #QURT_EINVALID -- Invalid arguments. \n
+  #QURT_ENOSPC -- Maximum number of queues in the system is exceeded.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_create(qurt_mqd_t *mqd, const char *name, struct qurt_mq_attr * attr);
+
+/**@ingroup qurt_mq_open
+  Opens a message queue connection between a process and a created message queue.
+
+  @datatypes
+  #qurt_mq_attr \n
+  #qurt_mqd_t
+
+  @param[out] mqd Returns a pointer to the message queue
+                  identifier if the message queue was successfully opened.
+  @param[in] name String identifier of the message queue.
+  @param[in] flags Flag that contains the properties that define the behavior of the message queue connection.
+             Permissions:\n
+             #QURT_PERM_READ \n
+             #QURT_PERM_WRITE \n
+             #QURT_PERM_READ | QURT_PERM_WRITE @tablebulletend
+             Default is QURT_PERM_READ | QURT_PERM_WRITE; explicit permission is not implemented. \n
+             Cardinality: \n
+             #QURT_MQ_CARDINALITY_PTP (default) \n
+             #QURT_MQ_CARDINALITY_MTO (not implemented) \n
+             Blocking: suspend the thread until the message queue with the specified name is created. \n
+             Scope: security boundary to which the message queue and its users are constrained.
+             It is coupled with the process privilege level/scope.\n
+             #QURT_MQ_SECURITY_SCOPE_KERNEL \n
+             #QURT_MQ_SECURITY_SCOPE_SRM \n
+             #QURT_MQ_SECURITY_SCOPE_SECURE \n
+             #QURT_MQ_SECURITY_SCOPE_CPZ \n
+             #QURT_MQ_SECURITY_SCOPE_ROOT \n
+             #QURT_MQ_SECURITY_SCOPE_SIGNED \n
+             #QURT_MQ_SECURITY_SCOPE_UNSIGNED @tablebulletend
+
+  @return
+  #QURT_EOK -- Message queue connection successfully opened. \n
+  #QURT_EFAILED -- Message queue connection failed, if non-blocking message queue. \n
+  #QURT_ENOTALLOWED -- Open failed due to security scope mismatch.
+
+  @dependencies
+  None.
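+
+  @par Example
+  Illustrative client-side sketch (not part of the original header; the
+  flag composition via the qurt_mq_flags_t bitfields is an assumption):
+  @code
+  qurt_mqd_t mqd;
+  qurt_mq_flags_t flags;
+
+  flags.raw = 0;                                   // clear all fields
+  flags.perms = QURT_PERM_READ | QURT_PERM_WRITE;  // default permissions
+
+  if (qurt_mq_open(&mqd, "my_mq", flags) == QURT_EOK) {
+      const char msg[] = "ping";
+      qurt_mq_send(mqd, msg, sizeof(msg));
+      qurt_mq_close(mqd);
+  }
+  @endcode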
+*/
+int qurt_mq_open (qurt_mqd_t *mqd, const char *name, qurt_mq_flags_t flags);
+
+/**@ingroup qurt_mq_send
+  Sends a message over the message queue.\n
+  - If the message queue is full, the calling thread shall be
+    suspended until space becomes available to enqueue the message. \n
+  - If there exists a thread suspended on an empty queue
+    to receive a message, qurt_mq_send shall resume that thread.
+
+  @datatypes
+  #qurt_mqd_t
+
+  @param[in] mqd Message queue identifier.
+  @param[in] msg_ptr Pointer to the message buffer.
+  @param[in] msg_len Length of the message buffer in bytes.
+
+  @return
+  #QURT_EOK -- Message queue send was successful.\n
+  #QURT_EMSGSIZE -- Message size in msg_len field is greater than max_message_len specified during queue creation.\n
+  #QURT_ENOTALLOWED -- Send failed due to security scope mismatch.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_send(qurt_mqd_t mqd, const char *msg_ptr, size_t msg_len);
+
+/**@ingroup qurt_mq_send_timed
+  Sends a message over the message queue.\n
+  - If the message queue is full, the calling thread shall be
+    suspended until space becomes available to enqueue the message or until the timeout is reached. \n
+  - If there exists a thread suspended on an empty queue
+    to receive a message, qurt_mq_send_timed shall resume that thread.\n
+  - If the timeout is reached, qurt_mq_send_timed shall return #QURT_ETIMEDOUT.
+
+  @datatypes
+  #qurt_mqd_t
+
+  @param[in] mqd Message queue identifier.
+  @param[in] msg_ptr Pointer to the message buffer.
+  @param[in] duration Interval (in microseconds); the duration value must be
+                      between #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+  @param[in] msg_len Length of the message buffer in bytes.
+
+  @return
+  #QURT_EOK -- Message queue send was successful. \n
+  #QURT_EMSGSIZE -- Message size in msg_len field is greater than max_message_len specified during queue creation.\n
+  #QURT_ENOTALLOWED -- Send failed due to security scope mismatch. \n
+  #QURT_ETIMEDOUT -- Timeout.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_send_timed(qurt_mqd_t mqd, const char *msg_ptr, unsigned long long int duration, size_t msg_len);
+
+/**@ingroup qurt_mq_recv
+  Receives a message from the message queue. \n
+  - If the message queue is empty, the calling thread shall be
+    suspended until a message is enqueued in the message queue. \n
+  - If there exists a thread suspended on a full queue to
+    send a message, qurt_mq_recv shall resume that thread.
+
+  @datatypes
+  #qurt_mqd_t
+
+  @param[in] mqd Message queue identifier.
+  @param[in] msg_ptr Pointer to the message buffer.
+  @param[in,out] msg_len Pointer to the length of the message buffer.
+
+  @return
+  #QURT_EOK -- Message successfully received.\n
+  #QURT_EINVALID -- Message pointer or msg_len pointer is NULL. \n
+  #QURT_EBADR -- Message queue descriptor (mqd) is invalid. \n
+  #QURT_EBADF -- Sender closed the message queue.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_recv(qurt_mqd_t mqd, unsigned char *msg_ptr, size_t *msg_len);
+
+/**@ingroup qurt_mq_recv_timed
+  Receives a message from the message queue. \n
+  - If the message queue is empty, the calling thread shall be
+    suspended until a message is enqueued in the message queue or until the timeout is reached.\n
+  - If there exists a thread suspended on a full queue to
+    send a message, qurt_mq_recv_timed shall resume that thread.\n
+  - If the timeout is reached, qurt_mq_recv_timed shall return #QURT_ETIMEDOUT.
+
+  @datatypes
+  #qurt_mqd_t
+
+  @param[in] mqd Message queue identifier.
+  @param[in] msg_ptr      Pointer to the message buffer.
+  @param[in] duration     Duration (in microseconds) to wait. The duration value must be
+                          between #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+  @param[in,out] msg_len  Pointer to the length of the message buffer.
+
+  @return
+  #QURT_EOK -- Message received successfully.\n
+  #QURT_EINVALID -- Message pointer or msg_len pointer is NULL. \n
+  #QURT_EBADR -- Message queue descriptor (mqd) is invalid.\n
+  #QURT_EBADF -- Sender closed the message queue. \n
+  #QURT_ETIMEDOUT -- Timeout.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_recv_timed(qurt_mqd_t mqd, unsigned char *msg_ptr, unsigned long long int duration, size_t *msg_len);
+
+ /**@ingroup qurt_mq_close
+  Closes the message queue and disassociates the calling process (client) from the message queue
+  under this descriptor. Marks the queue as closed for the receiver.
+  This function is expected to be called from the client side. If called
+  from the server side, the function reduces to a no-op and returns success.
+
+  @datatypes
+  #qurt_mqd_t
+
+  @param[in] mqd  Message queue identifier.
+
+  @return
+  #QURT_EOK -- Message queue closed successfully.\n
+  #QURT_EBADR -- Invalid descriptor.\n
+  #QURT_ENOTALLOWED -- Message queue close was not called from the client side.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_close(qurt_mqd_t mqd);
+
+ /**@ingroup qurt_mq_destroy
+  Destroys the message queue. This function must be
+  called from the process that called qurt_mq_create().
+
+  @datatypes
+  #qurt_mqd_t
+
+  @param[in] mqd  Message queue identifier.
+
+  @return
+  #QURT_EOK -- Message queue destroyed successfully.\n
+  #QURT_EBADR -- Invalid descriptor.\n
+  #QURT_ENOTALLOWED -- Message queue destroy was not called from the process that created the queue.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_destroy(qurt_mqd_t mqd);
+
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+#endif //QURT_MQ_H
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_mutex.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_mutex.h
new file mode 100755
index 0000000000000..4ad6b270cdde6
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_mutex.h
@@ -0,0 +1,211 @@
+#ifndef QURT_MUTEX_H
+#define QURT_MUTEX_H
+/**
+  @file qurt_mutex.h
+  @brief Prototypes of mutex API.
+  This is mostly a user space mutex, but it calls the
+  kernel to block if the mutex is taken.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup mutex_types
+@{ */
+/*=============================================================================
+  TYPEDEFS
+=============================================================================*/
+
+/** QuRT mutex type.
+
+  Both non-recursive mutex lock and unlock, and recursive
+  mutex lock and unlock can be applied to this type.
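+
+  A minimal usage sketch (illustrative only; the initializer macro
+  #QURT_MUTEX_INIT is defined below):
+  @code
+  static qurt_mutex_t demo_lock = QURT_MUTEX_INIT;
+
+  void demo_update(void)
+  {
+      qurt_mutex_lock(&demo_lock);    // suspends if another thread holds it
+      // ... touch the shared state protected by demo_lock ...
+      qurt_mutex_unlock(&demo_lock);
+  }
+  @endcode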
+ */ +typedef union qurt_mutex_aligned8{ + /** @cond */ + struct { + unsigned int holder; + unsigned int count; + unsigned int queue; + unsigned int wait_count; + }; + unsigned long long int raw; + /** @endcond */ +} qurt_mutex_t; +/** @} */ /* end_addtogroup mutex_types */ +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/* @addtogroup mutex_const_macros +@{ */ +#define MUTEX_MAGIC 0xfe /**< */ +#define QURTK_FUTEX_FREE_MAGIC 0x1F // 11111 /**< */ +#define QURT_MUTEX_INIT {{MUTEX_MAGIC, 0, QURTK_FUTEX_FREE_MAGIC,0}} /**< Suitable as an initializer for a + variable of type qurt_mutex_t. */ +/* @} */ /* end_addtogroup mutex_const_macros */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ +/**@ingroup func_qurt_mutex_init + Initializes a mutex object. + The mutex is initially unlocked. + + @note1hang Each mutex-based object has one or more kernel resources associated with it; + to prevent resource leaks, call qurt_mutex_destroy() + when this object is not used anymore + @datatypes + #qurt_mutex_t + + @param[out] lock Pointer to the mutex object. Returns the initialized object. + + @return + None. + + @dependencies + None. + + */ +void qurt_mutex_init(qurt_mutex_t *lock); + +/**@ingroup func_qurt_mutex_destroy + Destroys the specified mutex. + + @note1hang Mutexes must be destroyed when they are no longer in use. Failure to do this + causes resource leaks in the QuRT kernel.\n + @note1cont Mutexes must not be destroyed while they are still in use. If this occurs, the + behavior of QuRT is undefined. + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the mutex object to destroy. + + @return + None. + + @dependencies + None. + + */ +void qurt_mutex_destroy(qurt_mutex_t *lock); + +/**@ingroup func_qurt_mutex_lock + Locks the specified mutex. + If a thread performs a lock operation on a mutex that is not in use, the thread gains + access to the shared resource that is protected by the mutex, and continues executing. + + If a thread performs a lock operation on a mutex that is already in use by another + thread, the thread is suspended. When the mutex becomes available again (because the + other thread has unlocked it), the thread is awakened and given access to the shared + resource. + + @note1hang A thread is suspended indefinitely if it locks a mutex that it has already + locked. Avoid this by using recursive mutexes (Section @xref{dox:recursive_mutexes}). + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the mutex object. Specifies the mutex to lock. + + @return + None. + + @dependencies + None. + */ +void qurt_mutex_lock(qurt_mutex_t *lock); /* blocking */ + +/**@ingroup func_qurt_mutex_lock_timed + Locks the specified mutex. + When a thread performs a lock operation on a mutex that is not in use, the thread gains + access to the shared resource that is protected by the mutex, and continues executing. + + When a thread performs a lock operation on a mutex that is already in use by another + thread, the thread is suspended. When the mutex becomes available again (because the + other thread has unlocked it), the thread is awakened and given access to the shared + resource. If the duration of suspension exceeds the timeout duration, wait is + terminated and no access to mutex is granted. 
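+
+  A timed-lock sketch (illustrative only; the 1000-microsecond duration is a
+  hypothetical value within the documented range):
+  @code
+  if (qurt_mutex_lock_timed(&demo_lock, 1000uLL) == QURT_EOK) {
+      // ... the lock was acquired within the timeout ...
+      qurt_mutex_unlock(&demo_lock);
+  } // else QURT_ETIMEDOUT: the wait expired without acquiring the lock
+  @endcode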
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[in] lock      Pointer to the mutex object; specifies the mutex to lock.
+  @param[in] duration  Duration (in microseconds) to wait. The duration value must be between
+                       #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+
+  @return
+  #QURT_EOK -- Success \n
+  #QURT_ETIMEDOUT -- Timeout
+
+  @dependencies
+  None.
+ */
+int qurt_mutex_lock_timed (qurt_mutex_t * lock, unsigned long long int duration);
+
+/**@ingroup func_qurt_mutex_unlock
+  Unlocks the specified mutex. \n
+  More than one thread can be suspended on a mutex. When the mutex is unlocked, only the
+  highest-priority thread waiting on the mutex is awakened. If the awakened thread has
+  higher priority than the current thread, a context switch occurs.
+
+  @note1hang The behavior of QuRT is undefined if a thread unlocks a mutex it did not first
+             lock.
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[in] lock  Pointer to the mutex object. Specifies the mutex to unlock.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_mutex_unlock(qurt_mutex_t *lock); /* unlock */
+
+/**@ingroup func_qurt_mutex_try_lock
+  @xreflabel{hdr:qurt_mutex_try_lock}
+  Attempts to lock the specified mutex.
+  If a thread performs a try_lock operation on a mutex that is not in use, the thread gains
+  access to the shared resource that is protected by the mutex, and continues executing.
+
+  @note1hang If a thread performs a try_lock operation on a mutex that it has already locked
+             or that is in use by another thread, qurt_mutex_try_lock immediately returns with a
+             nonzero result value.
+
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[in] lock  Pointer to the mutex object. Specifies the mutex to lock.
+
+  @return
+  0 -- Success. \n
+  Nonzero -- Failure.
+
+  @dependencies
+  None.
+ */
+int qurt_mutex_try_lock(qurt_mutex_t *lock);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_MUTEX_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_os_services.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_os_services.h
new file mode 100755
index 0000000000000..cbc4c239e9620
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_os_services.h
@@ -0,0 +1,24 @@
+/*=============================================================================
+
+                 qurt_os_services.h
+
+GENERAL DESCRIPTION
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+  Copyright (c) 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+=============================================================================*/
+
+#define QURT_OS_SERVICE_THREAD      "/os/thread"      /**< Thread service */
+#define QURT_OS_SERVICE_FS_HUB      "/os/fs_hub"      /**< File-system hub */
+#define QURT_OS_SERVICE_CALLBACK    "/os/callback"    /**< QDI callback service */
+#define QURT_OS_SERVICE_INTERRUPTS  "/os/interrupt"   /**< Interrupt service */
+#define QURT_OS_SERVICE_PROXY       "/os/proxy"       /**< QDI proxy service */
+#define QURT_OS_SERVICE_MEMORY      "/os/memory"      /**< Memory management service */
+#define QURT_OS_SERVICE_MEMPOOL     "/os/mempool"     /**< Pool management service */
+#define QURT_OS_SERVICE_PROCESS     "/os/process"     /**< Process management service */
+#define QURT_OS_SERVICE_MMAP        "/os/mem_mapper"  /**< mmapper service */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_pimutex.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_pimutex.h
new file mode 100755
index 0000000000000..61aee5cba7ce8
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_pimutex.h
@@ -0,0 +1,200 @@
+#ifndef QURT_PIMUTEX_H
+#define QURT_PIMUTEX_H 1
+/**
+  @file qurt_pimutex.h
+  @brief Prototypes of qurt_pimutex API.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+  FUNCTIONS
+=============================================================================*/
+/**@ingroup func_qurt_pimutex_init
+  Initializes a priority inheritance mutex object.
+  The priority inheritance mutex is initially unlocked.
+
+  This function works the same as qurt_mutex_init().
+
+  @note1hang Each pimutex-based object has one or more kernel resources associated with it;
+             to prevent resource leaks, call qurt_pimutex_destroy()
+             when this object is no longer used.
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[out] lock  Pointer to the priority inheritance mutex object.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_pimutex_init(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_pimutex_destroy
+  Destroys the specified priority inheritance mutex.
+
+  @note1hang Priority inheritance mutexes must be destroyed when they are no longer in
+             use. Failure to do this causes resource leaks in the QuRT kernel.\n
+  @note1cont Priority inheritance mutexes must not be destroyed while they are still in use.
+             If this occurs, the behavior of QuRT is undefined.
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[in] lock  Pointer to the priority inheritance mutex object to destroy.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_pimutex_destroy(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_pimutex_lock
+  Requests access to a shared resource. If a thread performs a lock operation on a mutex
+  that is not in use, the thread gains access to the shared resource that the mutex protects,
+  and continues executing.
+
+  If a thread performs a lock operation on a mutex that is already in use by another
+  thread, the thread is suspended. When the mutex becomes available again (because the
+  other thread has unlocked it), the thread is awakened and given access to the shared resource.
+
+  If a thread is suspended on a priority inheritance mutex, and the priority of the suspended
+  thread is higher than the priority of the thread that has locked the mutex, the thread
+  with the mutex acquires the higher priority of the suspended thread. The suspended thread
+  blocks until the lock is available.
+
+  @note1hang A thread is not suspended if it locks a priority inheritance mutex that it has
+             already locked. However, the mutex does not become available to other
+             threads until the thread performs a balanced number of unlocks on the mutex.\n
+  @note1cont When multiple threads compete for a mutex, the lock operation for a priority
+             inheritance mutex is slower than it is for a recursive mutex.
+             In particular, it is about 10 times slower when the mutex is available for locking,
+             and slower (with greatly varying times) when the mutex is already locked.
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[in] lock  Pointer to the priority inheritance mutex object to lock.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_pimutex_lock(qurt_mutex_t *lock);
+
+
+/**@ingroup func_qurt_pimutex_lock_timed
+  Locks a priority inheritance mutex with a timeout.
+
+  A thread can lock a priority inheritance mutex multiple times. The mutex is not
+  available to other threads until the thread performs the same number of mutex unlock
+  operations.
+
+  If a thread performs a lock operation on a mutex that is already locked by another thread,
+  the thread is moved to the waiting state. When the mutex becomes available again (because the
+  other thread has unlocked the mutex), the thread is awakened and tries to lock the mutex.
+
+  If a thread is waiting on a priority inheritance mutex, and the priority of the waiting thread
+  is higher than the priority of the thread that has locked the mutex, the priority of the thread
+  that has locked the mutex is raised to the same priority as the waiting thread.
+
+  If the duration of waiting exceeds the timeout duration, the waiting is terminated, and
+  the function returns #QURT_ETIMEDOUT as a failure of the mutex lock.
+
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[in] lock      Pointer to the mutex object to lock.
+  @param[in] duration  Duration (in microseconds) to wait. The duration value must be between
+                       #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+
+  @return
+  #QURT_EOK -- Success \n
+  #QURT_ETIMEDOUT -- Timeout \n
+  #QURT_EINVALID -- Duration is out of range
+
+  @dependencies
+  None.
+
+ */
+int qurt_pimutex_lock_timed(qurt_mutex_t *lock, unsigned long long int duration);
+
+
+/**@ingroup func_qurt_pimutex_unlock
+  Releases access to a shared resource; unlocks the specified priority inheritance mutex. \n
+  More than one thread can be suspended on a priority inheritance mutex. When the mutex
+  is unlocked, only the highest-priority thread waiting on the mutex is awakened. If the
+  awakened thread has higher priority than the current thread, a context switch occurs.
+
+  When a thread unlocks a priority inheritance mutex, its thread priority is restored to its
+  original value from any higher priority value that it acquired from another thread
+  suspended on the mutex.
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[in] lock  Pointer to the priority inheritance mutex object to unlock.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_pimutex_unlock(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_pimutex_try_lock
+  Requests access to a shared resource (without suspend).
Attempts to lock the specified priority inheritance mutex.\n
+  If a thread performs a try_lock operation on a priority inheritance mutex that is not in
+  use, the thread gains access to the shared resource that is protected by the mutex, and
+  continues executing.
+  If a thread performs a try_lock operation on a priority inheritance mutex that is already
+  in use by another thread, qurt_pimutex_try_lock immediately returns with a
+  nonzero result value.
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[in] lock  Pointer to the priority inheritance mutex object to lock.
+
+  @return
+  0 -- Success. \n
+  Nonzero -- Failure.
+
+  @dependencies
+  None.
+ */
+int qurt_pimutex_try_lock(qurt_mutex_t *lock);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PIMUTEX_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_pimutex2.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_pimutex2.h
new file mode 100755
index 0000000000000..b809f163cbfd2
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_pimutex2.h
@@ -0,0 +1,162 @@
+#ifndef QURT_PIMUTEX2_H
+#define QURT_PIMUTEX2_H
+/**
+  @file qurt_pimutex2.h
+  @brief Prototypes of pimutex2 API
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2013, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+#include
+#include
+#include
+
+/*=============================================================================
+  FUNCTIONS
+=============================================================================*/
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup func_qurt_pimutex2_init
+  Initializes a recursive mutex object.
+
+  @deprecated Use #qurt_pimutex_init instead.
+
+  The recursive mutex is initially unlocked.
+
+  Objects of type pimutex2 solve a potential race condition between
+  unlock() and destroy() operations.
+
+  @datatypes
+  #qurt_rmutex2_t
+
+  @param[out] lock  Pointer to the recursive mutex object.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_pimutex2_init(qurt_rmutex2_t *lock);
+
+/**@ingroup func_qurt_pimutex2_destroy
+
+  @deprecated Use #qurt_pimutex_destroy instead.
+
+  Destroys the specified recursive mutex. \n
+  @note1cont Recursive mutexes must not be destroyed while they are still in use. If this
+             occurs, the behavior of QuRT is undefined.
+  @note1cont In general, application code should destroy a pimutex2 object prior to
+             deallocating it; calling qurt_pimutex2_destroy() before deallocating it ensures
+             that all qurt_pimutex2_unlock() calls complete.
+
+  @datatypes
+  #qurt_rmutex2_t
+
+  @param[in] lock  Pointer to the recursive mutex object to destroy.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_pimutex2_destroy(qurt_rmutex2_t *lock);
+
+/**@ingroup func_qurt_pimutex2_lock
+
+  @deprecated Use #qurt_pimutex_lock instead.
+
+  Locks the specified recursive mutex. \n
+
+  If a thread performs a lock operation on a recursive mutex that is not being used, the
+  thread gains access to the shared resource that is protected by the mutex, and continues
+  executing.
+
+  If a thread performs a lock operation on a recursive mutex that is already being used by
+  another thread, the thread is suspended.
When the mutex becomes available again
+  (because the other thread has unlocked it), the thread is awakened and given access to the
+  shared resource.
+
+  @note1hang A thread is not suspended if it locks a recursive mutex that it has already
+             locked, but the mutex does not become available until the thread performs a
+             balanced number of unlocks on the mutex.
+
+  @datatypes
+  #qurt_rmutex2_t
+
+  @param[in] lock  Pointer to the recursive mutex object to lock.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_pimutex2_lock(qurt_rmutex2_t *lock);
+
+/**@ingroup func_qurt_pimutex2_unlock
+
+  @deprecated Use #qurt_pimutex_unlock instead.
+
+  Unlocks the specified recursive mutex. \n
+  More than one thread can be suspended on a recursive mutex. When the mutex is
+  unlocked, only the highest-priority thread waiting on the mutex is awakened. If the
+  awakened thread has higher priority than the current thread, a context switch occurs.
+
+  @datatypes
+  #qurt_rmutex2_t
+
+  @param[in] lock  Pointer to the recursive mutex object to unlock.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_pimutex2_unlock(qurt_rmutex2_t *lock);
+
+/**@ingroup func_qurt_pimutex2_try_lock
+
+  @deprecated Use #qurt_pimutex_try_lock instead.
+
+  Attempts to lock the specified recursive mutex.\n
+
+  Non-blocking version of qurt_pimutex2_lock(). If a call to qurt_pimutex2_lock() would
+  succeed immediately, this function behaves similarly, and returns 0 for success.
+  If a call to qurt_pimutex2_lock() would not succeed immediately, this function has
+  no effect and returns nonzero for failure.
+
+  @datatypes
+  #qurt_rmutex2_t
+
+  @param[in] lock  Pointer to the recursive mutex object to lock.
+
+  @return
+  0 -- Success. \n
+  Nonzero -- Failure.
+
+ */
+int qurt_pimutex2_try_lock(qurt_rmutex2_t *lock);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PIMUTEX2_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_pipe.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_pipe.h
new file mode 100755
index 0000000000000..6bdaa044f8640
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_pipe.h
@@ -0,0 +1,479 @@
+#ifndef QURT_PIPE_H
+#define QURT_PIPE_H
+/**
+  @file qurt_pipe.h
+  @brief Prototypes of the pipe interface API.
+  This is a pipe or message queue.
+  It blocks when too full (send) or empty (receive).
+  Unless using a nonblocking option, all datagrams are 64 bits.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+  Copyright (c) 2021,2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#include
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup pipe_types
+@{ */
+/*=============================================================================
+  CONSTANTS AND MACROS
+=============================================================================*/
+#define QURT_PIPE_MAGIC                  0xF1FEF1FE /**< Magic. */
+#define QURT_PIPE_ATTR_MEM_PARTITION_RAM 0          /**< RAM. */
+#define QURT_PIPE_ATTR_MEM_PARTITION_TCM 1          /**< TCM. */
+
+/*=============================================================================
+  TYPEDEFS
+=============================================================================*/
+/** QuRT pipe data values type.
*/ +typedef unsigned long long int qurt_pipe_data_t; + +/** QuRT pipe type.*/ +typedef struct { + /** @cond */ + qurt_mutex_t pipe_lock; + qurt_sem_t senders; + qurt_sem_t receiver; + unsigned int size; + unsigned int sendidx; + unsigned int recvidx; + void (*lock_func)(qurt_mutex_t *); + void (*unlock_func)(qurt_mutex_t *); + int (*try_lock_func)(qurt_mutex_t *); + void (*destroy_lock_func)(qurt_mutex_t *); + unsigned int magic; + qurt_pipe_data_t *data; + /** @endcond */ +} qurt_pipe_t; + +/** QuRT pipe attributes type. */ +typedef struct { + /** @cond */ + qurt_pipe_data_t *buffer; + unsigned int elements; + unsigned char mem_partition; + /** @endcond */ +} qurt_pipe_attr_t; + +/** @} */ /* end_addtogroup pipe_types */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ +/**@ingroup func_qurt_pipe_attr_init + @xreflabel{hdr:qurt_pipe_attr_init} + Initializes the structure that sets the pipe attributes when a pipe is created. + + After an attribute structure is initialized, the individual attributes in the structure are + explicitly set using the pipe attribute operations. + + The attribute structure is assigned the following default values: \n + - buffer -- 0 \n + - elements -- 0 \n + - mem_partition -- #QURT_PIPE_ATTR_MEM_PARTITION_RAM + + @datatypes + #qurt_pipe_attr_t + + @param[in,out] attr Pointer to the pipe attribute structure. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_pipe_attr_init(qurt_pipe_attr_t *attr) +{ + attr->buffer = NULL; + attr->elements = 0; + attr->mem_partition = QURT_PIPE_ATTR_MEM_PARTITION_RAM; +} + +/**@ingroup func_qurt_pipe_attr_set_buffer + @xreflabel{sec:qurt_pipe_attr_set_buffer} + Sets the pipe buffer address attribute.\n + Specifies the base address of the memory area to use for the data buffer of a pipe. + + The base address and size (Section @xref{sec:qurt_pipe_attr_set_elements}) specify the + memory area used as a pipe data buffer. The user is responsible for allocating the + memory area used for the buffer. + + @datatypes + #qurt_pipe_attr_t \n + #qurt_pipe_data_t + + @param[in,out] attr Pointer to the pipe attribute structure. + @param[in] buffer Pointer to the buffer base address. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_pipe_attr_set_buffer(qurt_pipe_attr_t *attr, qurt_pipe_data_t *buffer) +{ + attr->buffer = buffer; +} + +/**@ingroup func_qurt_pipe_attr_set_elements + @xreflabel{sec:qurt_pipe_attr_set_elements} + Specifies the length of the memory area to use for the data buffer of a pipe. + + The length is expressed in terms of the number of 64-bit data elements that + can be stored in the buffer. + + The base address (Section @xref{sec:qurt_pipe_attr_set_buffer}) and size specify + the memory area used as a pipe data buffer. The user is responsible for + allocating the memory area used for the buffer. + + @datatypes + #qurt_pipe_attr_t + + @param[in,out] attr Pointer to the pipe attribute structure. + @param[in] elements Pipe length (64-bit elements). + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_pipe_attr_set_elements(qurt_pipe_attr_t *attr, unsigned int elements) +{ + attr->elements = elements; +} + +/**@ingroup func_qurt_pipe_attr_set_buffer_partition + @xreflabel{sec:qurt_pipe_attr_set_buffer_partition} + Specifies the memory type where a pipe's buffer is allocated. + Allocate pipes in RAM or TCM/LPM. 
+
+  @note1hang If a pipe is specified as allocated in TCM/LPM, it must be created
+             with the qurt_pipe_init() operation. The qurt_pipe_create() operation results in an error.
+
+  @datatypes
+  #qurt_pipe_attr_t
+
+  @param[in,out] attr           Pointer to the pipe attribute structure.
+  @param[in]     mem_partition  Pipe memory partition. Values: \n
+                 - #QURT_PIPE_ATTR_MEM_PARTITION_RAM -- Pipe resides in RAM \n
+                 - #QURT_PIPE_ATTR_MEM_PARTITION_TCM -- Pipe resides in TCM/LPM @tablebulletend
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_pipe_attr_set_buffer_partition(qurt_pipe_attr_t *attr, unsigned char mem_partition)
+{
+  attr->mem_partition = mem_partition;
+}
+
+/**@ingroup func_qurt_pipe_create
+  Creates a pipe.\n
+  Allocates a pipe object and its associated data buffer, and initializes the pipe object.
+
+  @note1hang The buffer address and size stored in the attribute structure specify how the
+             pipe data buffer is allocated.
+
+  @note1cont If a pipe is specified as allocated in TCM/LPM, it must be created
+             using the qurt_pipe_init() operation. The qurt_pipe_create() operation results in an error.
+
+  @datatypes
+  #qurt_pipe_t \n
+  #qurt_pipe_attr_t
+
+  @param[out] pipe  Pointer to the created pipe object.
+  @param[in]  attr  Pointer to the attribute structure used to create the pipe.
+
+  @return
+  #QURT_EOK -- Pipe created. \n
+  #QURT_EFAILED -- Pipe not created. \n
+  #QURT_ENOTALLOWED -- Pipe cannot be created in TCM/LPM.
+
+  @dependencies
+  None.
+ */
+int qurt_pipe_create(qurt_pipe_t **pipe, qurt_pipe_attr_t *attr);
+
+/**@ingroup func_qurt_pipe_init
+  Initializes a pipe object using an existing data buffer.
+
+  @note1hang The buffer address and size stored in the attribute structure must
+             specify a data buffer that the user has already allocated.
+
+  @datatypes
+  #qurt_pipe_t \n
+  #qurt_pipe_attr_t
+
+  @param[out] pipe  Pointer to the pipe object to initialize.
+  @param[in]  attr  Pointer to the pipe attribute structure used to initialize the pipe.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EFAILED -- Failure.
+
+  @dependencies
+  None.
+ */
+int qurt_pipe_init(qurt_pipe_t *pipe, qurt_pipe_attr_t *attr);
+
+/**@ingroup func_qurt_pipe_destroy
+  @xreflabel{sec:qurt_pipe_destroy}
+  Destroys the specified pipe.
+
+  @note1hang Pipes must be destroyed when they are no longer in use. Failure
+             to do this causes resource leaks in the QuRT kernel.
+             Pipes must not be destroyed while they are still in use. If this
+             occurs, the behavior of QuRT is undefined.
+
+  @datatypes
+  #qurt_pipe_t
+
+  @param[in] pipe  Pointer to the pipe object to destroy.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_pipe_destroy(qurt_pipe_t *pipe);
+
+/**@ingroup func_qurt_pipe_delete
+  Deletes the pipe.\n
+  Destroys the specified pipe (Section @xref{sec:qurt_pipe_destroy}) and deallocates the pipe object and its
+  associated data buffer.
+
+  @note1hang Delete pipes only if they were created using qurt_pipe_create
+             (and not qurt_pipe_init). Otherwise the behavior of QuRT is undefined. \n
+  @note1cont Pipes must be deleted when they are no longer in use. Failure to do this
+             causes resource leaks in the QuRT kernel.\n
+  @note1cont Pipes must not be deleted while they are still in use. If this occurs, the
+             behavior of QuRT is undefined.
+
+  @datatypes
+  #qurt_pipe_t
+
+  @param[in] pipe  Pointer to the pipe object to delete.
+
+  @return
+  None.
+
+  @dependencies
+  None.
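+
+  @par Example
+  An illustrative create/use/delete sketch (the element count is hypothetical,
+  and error handling is elided):
+  @code
+  qurt_pipe_t *p;
+  qurt_pipe_attr_t attr;
+  qurt_pipe_attr_init(&attr);
+  qurt_pipe_attr_set_elements(&attr, 8);      // depth: eight 64-bit items
+  if (qurt_pipe_create(&p, &attr) == QURT_EOK) {
+      qurt_pipe_send(p, 0x1234ULL);           // blocks if the pipe is full
+      qurt_pipe_data_t v = qurt_pipe_receive(p);
+      (void)v;
+      qurt_pipe_delete(p);                    // created with qurt_pipe_create()
+  }
+  @endcode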
+ */
+void qurt_pipe_delete(qurt_pipe_t *pipe);
+
+/**@ingroup func_qurt_pipe_send
+  Writes a data item to the specified pipe. \n
+  If a thread writes to a full pipe, it is suspended on the pipe. When another thread reads
+  from the pipe, the suspended thread is awakened and can then write data to the pipe.
+
+  Pipe data items are defined as 64-bit values. Pipe writes are limited to transferring a single
+  64-bit data item per operation.
+
+  @note1hang Transfer data items larger than 64 bits by reading and writing
+             pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+  @datatypes
+  #qurt_pipe_t \n
+  #qurt_pipe_data_t
+
+  @param[in] pipe  Pointer to the pipe object to write to.
+  @param[in] data  Data item to write.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_pipe_send(qurt_pipe_t *pipe, qurt_pipe_data_t data);
+
+/**@ingroup func_qurt_pipe_receive
+  Reads a data item from the specified pipe.
+
+  If a thread reads from an empty pipe, it is suspended on the pipe. When another thread
+  writes to the pipe, the suspended thread is awakened and can then read data from the pipe.
+  Pipe data items are defined as 64-bit values. Pipe reads are limited to transferring a single
+  64-bit data item per operation.
+
+  @note1hang Transfer data items larger than 64 bits by reading and writing
+             pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+  @datatypes
+  #qurt_pipe_t
+
+  @param[in] pipe  Pointer to the pipe object to read from.
+
+  @return
+  Integer containing the 64-bit data item from the pipe.
+
+  @dependencies
+  None.
+*/
+qurt_pipe_data_t qurt_pipe_receive(qurt_pipe_t *pipe);
+
+/**@ingroup func_qurt_pipe_try_send
+  Writes a data item to the specified pipe (without suspending the thread if the pipe is full).\n
+
+  If a thread writes to a full pipe, the operation returns immediately with a result of -1.
+  Otherwise, it returns 0 to indicate a successful write operation.
+
+  Pipe data items are defined as 64-bit values. Pipe writes are limited to transferring a single
+  64-bit data item per operation.
+
+  @note1hang Transfer data items larger than 64 bits by reading and writing
+             pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+  @datatypes
+  #qurt_pipe_t \n
+  #qurt_pipe_data_t
+
+  @param[in] pipe  Pointer to the pipe object to write to.
+  @param[in] data  Data item to write.
+
+  @return
+  0 -- Success. \n
+  -1 -- Failure (pipe full).
+
+  @dependencies
+  None.
+*/
+int qurt_pipe_try_send(qurt_pipe_t *pipe, qurt_pipe_data_t data);
+
+/**@ingroup func_qurt_pipe_try_receive
+  Reads a data item from the specified pipe (without suspending the thread if the pipe is
+  empty).\n
+  If a thread reads from an empty pipe, the operation returns immediately with success set
+  to -1. Otherwise, success is set to 0 to indicate a successful read operation.\n
+
+  Pipe data items are defined as 64-bit values. Pipe reads are limited to transferring a single
+  64-bit data item per operation.
+
+  @note1hang Transfer data items larger than 64 bits by reading and writing
+             pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+  @datatypes
+  #qurt_pipe_t
+
+  @param[in]  pipe     Pointer to the pipe object to read from.
+  @param[out] success  Pointer to the operation status result.
+
+  @return
+  Integer containing a 64-bit data item from the pipe.
+
+  @dependencies
+  None.
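+
+  @par Example
+  An illustrative non-blocking poll (assumes a pipe p created as shown for
+  qurt_pipe_create()):
+  @code
+  int status;
+  qurt_pipe_data_t v = qurt_pipe_try_receive(p, &status);
+  if (status == 0) {
+      // v holds a valid 64-bit item
+  } else {
+      // pipe was empty (status is -1); do other work instead of blocking
+  }
+  @endcode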
+*/
+qurt_pipe_data_t qurt_pipe_try_receive(qurt_pipe_t *pipe, int *success);
+
+/**@ingroup func_qurt_pipe_receive_cancellable
+  Reads a data item from the specified pipe (with suspend), cancellable.
+
+  If a thread reads from an empty pipe, it is suspended on the pipe. When another thread
+  writes to the pipe, the suspended thread is awakened and can then read data from the pipe.
+  The operation is canceled if the user process of the calling thread is killed,
+  or if the calling thread must finish its current QDI invocation and return to user space.
+  A root PD thread can use this API to wait on the pipe for receiving; it is resumed with
+  #QURT_EDESTROY if the pipe is destroyed.
+  Pipe data items are defined as 64-bit values. Pipe reads are limited to transferring a single
+  64-bit data item per operation.
+
+  @note1hang Transfer data items larger than 64 bits by reading and writing
+             pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+  @datatypes
+  #qurt_pipe_t \n
+  #qurt_pipe_data_t
+
+  @param[in]  pipe    Pointer to the pipe object to read from.
+  @param[out] result  Pointer to the integer containing the 64-bit data item from the pipe.
+
+  @return
+  #QURT_EOK -- Receive completed. \n
+  #QURT_ECANCEL -- Receive canceled. \n
+  #QURT_EDESTROY -- Pipe was destroyed. \n
+  #QURT_ENOTALLOWED -- Pipe is not initialized.
+
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+int qurt_pipe_receive_cancellable(qurt_pipe_t *pipe, qurt_pipe_data_t *result);
+
+/**@ingroup func_qurt_pipe_send_cancellable
+  @xreflabel{hdr:qurt_pipe_send_cancellable}
+  Writes a data item to the specified pipe (with suspend), cancellable. \n
+  If a thread writes to a full pipe, it is suspended on the pipe. When another thread reads
+  from the pipe, the suspended thread is awakened and can then write data to the pipe.
+  The operation is canceled if the user process of the calling thread is killed, or if the
+  calling thread must finish its current QDI invocation and return to user space.
+  A root PD thread can use this API to wait on the pipe for sending; it is resumed with
+  #QURT_EDESTROY if the pipe is destroyed.
+
+  Pipe data items are defined as 64-bit values. Pipe writes are limited to transferring a single
+  64-bit data item per operation.
+
+  @note1hang Transfer data items larger than 64 bits by reading and writing
+             pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+  @datatypes
+  #qurt_pipe_t \n
+  #qurt_pipe_data_t
+
+  @param[in] pipe  Pointer to the pipe object to write to.
+  @param[in] data  Data item to write.
+
+  @return
+  #QURT_EOK -- Send completed. \n
+  #QURT_ECANCEL -- Send canceled. \n
+  #QURT_EDESTROY -- Pipe was destroyed. \n
+  #QURT_ENOTALLOWED -- Pipe is not initialized.
+
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+int qurt_pipe_send_cancellable(qurt_pipe_t *pipe, qurt_pipe_data_t data);
+
+/**@ingroup func_qurt_pipe_is_empty
+  Returns a value indicating whether the specified pipe contains any data.
+
+  @datatypes
+  #qurt_pipe_t
+
+  @param[in] pipe  Pointer to the pipe object to check.
+
+  @return
+  1 -- Pipe contains no data. \n
+  0 -- Pipe contains data.
+
+  @dependencies
+  None.
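+
+  @par Example
+  A sketch of draining a pipe without blocking; correct only when this thread
+  is the sole reader, since another reader could empty the pipe between the
+  check and the receive:
+  @code
+  while (!qurt_pipe_is_empty(p)) {
+      qurt_pipe_data_t v = qurt_pipe_receive(p);  // will not block here
+      (void)v;
+  }
+  @endcode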
+*/
+int qurt_pipe_is_empty(qurt_pipe_t *pipe);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PIPE_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_pmem_manager.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_pmem_manager.h
new file mode 100755
index 0000000000000..8c8da985228b9
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_pmem_manager.h
@@ -0,0 +1,82 @@
+#ifndef QURT_PMEM_MANAGER_H
+#define QURT_PMEM_MANAGER_H
+/**
+  @file qurt_pmem_manager.h
+  Prototypes of kernel physical memory manager APIs
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=====================================================================
+ Constants and macros
+ ======================================================================*/
+
+/* Physical memory API return error codes */
+#define QURT_PMEM_SUCCESS             0
+#define QURT_PMEM_NO_PRIV             1
+#define QURT_PMEM_RETRY               2
+#define QURT_PMEM_OVERLAP             3
+#define QURT_PMEM_NOT_EXIST           4
+#define QURT_PMEM_INIT_FAILURE        5
+#define QURT_PMEM_OUTSTANDING_MAPPING 6
+#define QURT_PMEM_GENERIC_FAILURE     7
+#define QURT_PMEM_ENTRY_FOUND         8
+#define QURT_PMEM_REACH_END           9
+#define QURT_PMEM_UNCLAIMED           10
+#define QURT_PMEM_ALREADY_CLAIMED     11
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+/**@ingroup func_qurt_pmem_acquire
+  Acquires the ownership of a specific physical memory region.
+
+  @note1hang The caller becomes the owner.
+
+  @param[in] ppage  Starting physical page number.
+  @param[in] pnum   Number of physical pages.
+
+  @return
+  #QURT_PMEM_NO_PRIV -- Caller has no privilege to claim the ownership. \n
+  #QURT_PMEM_OVERLAP -- All or part of the range is already owned. \n
+  #QURT_PMEM_SUCCESS -- Ownership successfully claimed.
+
+  @dependencies
+  None.
+*/
+int qurt_pmem_acquire(unsigned int ppage, unsigned int pnum);
+
+/**@ingroup func_qurt_pmem_release
+  Releases the ownership of a specific physical memory region.
+
+  @param[in] ppage  Starting physical page number.
+  @param[in] pnum   Number of physical pages.
+
+  @return
+  #QURT_PMEM_NO_PRIV -- Caller has no privilege to release the ownership. \n
+  #QURT_PMEM_NOT_EXIST -- The physical memory range is not usable. \n
+  #QURT_PMEM_OUTSTANDING_MAPPING -- There is an outstanding mapping in this range. \n
+  #QURT_PMEM_SUCCESS -- Ownership successfully released.
+
+  @dependencies
+  None.
+ */
+int qurt_pmem_release(unsigned int ppage, unsigned int pnum);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PMEM_MANAGER_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_pmu.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_pmu.h
new file mode 100755
index 0000000000000..73ea8eba04abf
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_pmu.h
@@ -0,0 +1,121 @@
+#ifndef QURT_PMU_H
+#define QURT_PMU_H
+/**
+  @file qurt_pmu.h
+  Prototypes of the PMU (performance monitoring unit) API.
+
+  EXTERNAL FUNCTIONS
+  None.
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+  Copyright (c) 2021 Qualcomm Technologies, Inc.
+  All rights reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+  FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_pmu_set
+  Sets the value of the specified PMU register.
+
+  @note1hang Setting PMUEVTCFG automatically clears the PMU registers PMUCNT0
+             through PMUCNT3.
+
+  @param[in] reg_id  PMU register. Values:
+                     - #QURT_PMUCNT0
+                     - #QURT_PMUCNT1
+                     - #QURT_PMUCNT2
+                     - #QURT_PMUCNT3
+                     - #QURT_PMUCFG
+                     - #QURT_PMUEVTCFG
+                     - #QURT_PMUCNT4
+                     - #QURT_PMUCNT5
+                     - #QURT_PMUCNT6
+                     - #QURT_PMUCNT7
+                     - #QURT_PMUEVTCFG1 @tablebulletend
+
+  @param[in] reg_value  Register value.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_pmu_set (int reg_id, unsigned int reg_value);
+
+/**@ingroup func_qurt_pmu_get
+  Gets the PMU register.\n
+  Returns the current value of the specified PMU register.
+
+  @param[in] reg_id  PMU register. Values:
+                     - #QURT_PMUCNT0
+                     - #QURT_PMUCNT1
+                     - #QURT_PMUCNT2
+                     - #QURT_PMUCNT3
+                     - #QURT_PMUCFG
+                     - #QURT_PMUEVTCFG
+                     - #QURT_PMUCNT4
+                     - #QURT_PMUCNT5
+                     - #QURT_PMUCNT6
+                     - #QURT_PMUCNT7
+                     - #QURT_PMUEVTCFG1 @tablebulletend
+
+  @return
+  Integer -- Current value of the specified PMU register.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_pmu_get (int reg_id);
+
+/**@ingroup func_qurt_pmu_enable
+  Enables or disables the Hexagon processor PMU.
+  Profiling is disabled by default.
+
+  @note1hang Enabling profiling does not automatically reset the count registers -- this must
+             be done explicitly before starting event counting.
+
+  @param[in] enable  Performance monitor. Values: \n
+                     - 0 -- Disable performance monitor \n
+                     - 1 -- Enable performance monitor @tablebulletend
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_pmu_enable (int enable);
+
+/**@ingroup func_qurt_pmu_get_pmucnt
+  Reads the PMU counters in a single trap.
+
+  @param[out] buf  Pointer to a buffer to save the values read from the PMU counters.
+                   The buffer size should be at least 32 bytes to read all eight PMU counters.
+
+  @return
+  #QURT_EOK -- Successful read.\n
+  #QURT_EFATAL -- Failure.
+
+  @dependencies
+  None.
+ */
+int qurt_pmu_get_pmucnt (void * buf);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PMU_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_power.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_power.h
new file mode 100755
index 0000000000000..2ee4d29a73976
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_power.h
@@ -0,0 +1,140 @@
+#ifndef QURT_POWER_H
+#define QURT_POWER_H
+/**
+  @file qurt_power.h
+  @brief Prototypes of power API
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2018-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+/*=============================================================================
+
+                        EDIT HISTORY FOR MODULE
+
+ This section contains comments describing changes made to the module.
+ Notice that changes are listed in reverse chronological order.
+
+
+when       who     what, where, why
+--------   ---     ------------------------------------------------------------
+03/03/11   op      Add header file
+12/12/12   cm      (Tech Pubs) Edited/added Doxygen comments and markup.
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @cond */
+/**@ingroup func_qurt_power_shutdown_fail_exit
+  Returns from Power Collapse mode when power collapse cannot proceed.
+
+  This function unmasks the global interrupt. This operation is used only when the thread is
+  recovering from a failed power collapse operation (Section @xref{sec:powerShutdownEnter}).
+
+  @return
+  #QURT_EOK -- Operation was successfully performed.
+
+  @dependencies
+  None.
+ */
+#define qurt_power_shutdown_fail_exit qurt_power_exit
+
+/**@ingroup func_qurt_power_shutdown_exit
+  Undoes state changes made while preparing for power collapse.\n
+  This function unmasks the global interrupts.
+
+  @return
+  #QURT_EOK -- Operation was successfully performed.
+
+  @dependencies
+  None.
+ */
+#define qurt_power_shutdown_exit qurt_power_exit
+/**@endcond */
+
+/**@ingroup func_qurt_system_ipend_get
+  Gets the IPEND register.\n
+
+  @note1hang Returns the current value of the Hexagon processor IPEND register. The return value
+             is a mask value that identifies the individual interrupts that are pending. \n
+
+  @note1hang The bit order of the mask value is identical to the order defined for the IPEND register. A
+             mask bit value of 1 indicates that the corresponding interrupt is pending, and 0 indicates that the
+             corresponding interrupt is not pending. \n
+
+  @return
+  Returns the IPEND register value.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_system_ipend_get (void);
+
+
+/**@ingroup func_qurt_system_vid_get
+  Gets the VID register. \n
+
+  @note1hang Returns the current value of the Hexagon processor VID register. The return value is
+             the vector number of a second-level interrupt that has been accepted by the Hexagon
+             processor core.\n
+
+  @return
+  Returns the VID register value, that is, the L2 VIC interrupt number accepted by the processor.
+  Valid range is 0 to 1023.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_system_vid_get(void);
+
+/**@ingroup func_qurt_power_shutdown_get_pcycles
+  Gets the number of power collapses and the processor cycles for entering and exiting the most recent
+  power collapse.
+
+  @note1hang If no power collapse has occurred yet, the processor cycle numbers are zero.
+
+  @param[out] enter_pcycles  Number of processor cycles for entering the most
+                             recent power collapse.
+  @param[out] exit_pcycles   Number of processor cycles for exiting the most
+                             recent power collapse.
+  @return
+  Zero -- No power collapses have occurred. \n
+  Nonzero -- Number of power collapses that have occurred since
+             the processor was reset.
+
+  @dependencies
+  None.
+ */
+int qurt_power_shutdown_get_pcycles( unsigned long long *enter_pcycles, unsigned long long *exit_pcycles );
+
+/**@ingroup func_qurt_system_tcm_set_size
+  Sets the size of TCM to save during full power collapse.
+
+  @note1hang The size aligns to 32 bytes. If the size passed is greater than the maximum size defined in
+             XML, the size is truncated to the size defined in XML.
+ + @param[in] new_size Size of TCM to save. + + @return + Zero -- Size successfully set \n + -1 -- Size of 0 passed + + @dependencies + None. + */ +int qurt_system_tcm_set_size(unsigned int new_size); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_POWER_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_printf.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_printf.h new file mode 100755 index 0000000000000..a775d8a815918 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_printf.h @@ -0,0 +1,44 @@ +#ifndef QURT_PRINTF_H +#define QURT_PRINTF_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + @file qurt_printf.h + Prototypes of printf API. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + + + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/** @addtogroup chapter_function_tracing +@{ */ + +int qurt_printf(const char* format, ...); + +int qurt_vprintf(const char* format, va_list args); + +/** @} */ /* end_addtogroup chapter_function_tracing */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_PRINTF_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_process.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_process.h new file mode 100755 index 0000000000000..0df9ddc2d4a70 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_process.h @@ -0,0 +1,995 @@ +#ifndef QURT_PROCESS_H +#define QURT_PROCESS_H +/** + @file qurt_process.h + @brief Prototypes of QuRT process control APIs. + + EXTERNALIZED FUNCTIONS + None + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None + + Copyright (c) 2009-2013, 2021-2023 Qualcomm Technologies, Inc. + All rights reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ +#include "qurt_callback.h" +#include "qurt_consts.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup process_types +@{ */ +#define QURT_PROCESS_ATTR_NAME_MAXLEN QURT_MAX_NAME_LEN /**< Maximum length of the process name. */ +#define QURT_PROCESS_ATTR_BIN_PATH_MAXLEN 128 /**< Maximum length of the path of binary/ELF for this process. */ +#define QURT_PROCESS_ATTR_CAP_MAXLEN 128 /**< Maximum length for a resource name. */ + +/** QuRT process capability wildcard strings */ +#define QURT_PROCESS_ATTR_CAP_ALLOW_ALL "ALLOW_ALL" /**< Capability wild-card for full access */ +#define QURT_PROCESS_ATTR_CAP_ALLOW_NONE "ALLOW_NONE" /**< Capability wild-card for no access */ + +/** QuRT process capability states */ +#define QURT_PROCESS_ATTR_CAP_ENABLED 0x1 /**< Capability enabled*/ +#define QURT_PROCESS_ATTR_CAP_DISABLED 0x0 /**< Capability disabled*/ + +/* QuRT process thread attributes. */ +#define QURT_PROCESS_DEFAULT_CEILING_PRIO 0 /**< Default ceiling priority of the threads in the new process. */ +#define QURT_PROCESS_DEFAULT_MAX_THREADS -1 /**< Default number of threads in the new process. 
+                                                   -1 indicates that the limit is set to the maximum supported by the system. */
+
+/* QuRT process flags. */
+#define QURT_PROCESS_SUSPEND_ON_STARTUP  (1U)      /**< Suspends the new process just before calling main(). */
+#define QURT_PROCESS_NON_SYSTEM_CRITICAL (1u << 1) /**< Starts the new process as non system-critical. */
+#define QURT_PROCESS_ISLAND_RESIDENT     (1u << 2) /**< Process is island resident. */
+#define QURT_PROCESS_RESTARTABLE         (1u << 3) /**< Indicates that the process is restartable. */
+#define QURT_PROCESS_UNTRUSTED           (1u << 7) /**< Starts the new process as an unsigned process. */
+
+/* QuRT process debugging session status.*/
+#define QURT_DEBUG_NOT_START 0 /**< Debug is not started. */
+#define QURT_DEBUG_START     1 /**< Debug has started. */
+
+/** Process suspend options */
+#define QURT_PROCESS_SUSPEND_DEFAULT 0
+
+/** Process resume options */
+#define QURT_PROCESS_RESUME_DEFAULT 0
+
+
+/* QuRT process types. */
+typedef enum {
+    QURT_PROCESS_TYPE_RESERVED,  /**< Process type is reserved. \n */
+    QURT_PROCESS_TYPE_KERNEL,    /**< Kernel process. \n*/
+    QURT_PROCESS_TYPE_SRM,       /**< SRM process. \n*/
+    QURT_PROCESS_TYPE_SECURE,    /**< Secure process. \n*/
+    QURT_PROCESS_TYPE_ROOT,      /**< Root process. \n*/
+    QURT_PROCESS_TYPE_USER,      /**< User process. */
+}qurt_process_type_t;
+
+/** QuRT process callback types. */
+typedef enum {
+    QURT_PROCESS_DUMP_CB_ROOT,   /**< Register the callback that executes in the
+                                      root process context. \n */
+    QURT_PROCESS_DUMP_CB_ERROR,  /**< Register the user process callback that is
+                                      called after threads in the process are frozen. \n */
+    QURT_PROCESS_DUMP_CB_PRESTM, /**< Register the user process callback that is
+                                      called before threads in the process are frozen. \n*/
+    QURT_PROCESS_DUMP_CB_MAX     /**< Reserved for error checking. */
+}qurt_process_dump_cb_type_t;
+
+/** QuRT process dump attributes. */
+typedef struct _qurt_pd_dump_attr{
+    /** @cond */
+    unsigned int enabled;  /**< Process dump is enabled. */
+    const char *path;      /**< Process dump path. */
+    unsigned int path_len; /**< Length of the process dump path. */
+    /** @endcond */
+}qurt_pd_dump_attr_t;
+
+/** QuRT process capability resource type */
+enum qurt_process_cap_type_t {
+    QURT_PROCESS_CAP_TYPE_NUM_ENTRIES=0, /**< Number of entries in the capability structure*/
+    QURT_PROCESS_CAP_TYPE_DRIVER=1,      /**< Driver resource */
+    QURT_PROCESS_CAP_TYPE_MAX            /**< Maximum identifier */
+};
+
+/** QuRT process capability structure */
+typedef struct _qurt_capability {
+    enum qurt_process_cap_type_t type;       /**< Resource type */
+    char name[QURT_PROCESS_ATTR_CAP_MAXLEN]; /**< Resource name*/
+    unsigned long long cap;                  /**< Capabilities allowed for this resource */
+}qurt_capability_t;
+
+/** QuRT process attributes. */
+typedef struct _qurt_process_attr {
+    /** @cond */
+    char name[QURT_PROCESS_ATTR_NAME_MAXLEN];         /**< Name of the new process. */
+    char path[QURT_PROCESS_ATTR_BIN_PATH_MAXLEN];     /**< Path of the binary for the new process. */
+    char dtb_path[QURT_PROCESS_ATTR_BIN_PATH_MAXLEN]; /**< Path of the DTB ELF for the new process. */
+    int flags;                                        /**< Flags as indicated by QuRT process flags. */
+    unsigned int sw_id;                               /**< Software ID of the process to be loaded. */
+    unsigned sid;                                     /**< Stream ID of the process being spawned. */
+    unsigned max_threads;                             /**< Maximum number of threads that the new process can create. */
+    unsigned short ceiling_prio;                      /**< Maximum priority at which threads can be
+                                                           created by the new process. */
+    qurt_process_type_t type;                         /**< Process type as indicated by
+                                                           #qurt_process_type_t.
*/
+    qurt_pd_dump_attr_t dump_attr;                    /**< Process dump attributes for the new process
+                                                           as indicated by #qurt_pd_dump_attr_t. */
+    qurt_capability_t *capabilities;                  /**< Pointer to an array of structures of type
+                                                           #qurt_capability_t. */
+    /** @endcond */
+} qurt_process_attr_t;
+
+/** @} */ /* end_addtogroup process_types */
+
+/*=============================================================================
+FUNCTIONS
+=============================================================================*/
+ /** @cond rest_reg_dist */
+/**@ingroup func_qurt_process_create
+  Creates a process with the specified attributes, and starts the process.
+
+  The process executes the code in the specified executable ELF file.
+
+  @datatypes
+  #qurt_process_attr_t
+
+  @param[in] attr  Accepts an initialized process attribute structure, which specifies
+                   the attributes of the created process.
+
+  @return
+  A positive return value indicates the process ID.
+  A negative return value indicates one of the following errors: \n
+  #-QURT_EPRIVILEGE -- Caller does not have privilege for this operation \n
+  #-QURT_EMEM -- Not enough memory to perform the operation \n
+  #-QURT_EFAILED -- Operation failed \n
+  #-QURT_ENOTALLOWED -- Operation not allowed \n
+  #-QURT_ENOREGISTERED -- Not registered \n
+  #-QURT_ENORESOURCE -- Resource exhaustion \n
+  #-QURT_EINVALID -- Invalid argument value \n
+  #QURT_EFATAL -- attr is NULL
+
+  @dependencies
+  None.
+*/
+int qurt_process_create (qurt_process_attr_t *attr);
+
+/**@ingroup func_qurt_process_get_id
+  Returns the process identifier for the current thread.
+
+  @return
+  Process identifier for the current thread.
+
+  @dependencies
+  None.
+*/
+int qurt_process_get_id (void);
+/** @endcond */
+
+/** @cond internal_only*/
+/**@ingroup func_qurt_process_get_uid
+  Returns the user identifier for the current thread.
+
+  @return
+  User identifier for the current thread.
+
+  @dependencies
+  None.
+*/
+int qurt_process_get_uid (void);
+/** @endcond */
+/** @cond rest_reg_dist */
+/**@ingroup func_qurt_process_attr_init
+  Initializes the structure that sets the process attributes when a process is created.
+
+  After an attribute structure is initialized, the individual attributes in the structure can
+  be explicitly set using the process attribute operations.
+
+  Table @xref{tbl:processAttrDefaults} lists the default attribute values set by the initialize
+  operation.
+
+  @inputov{table_process_attribute_defaults}
+
+  @datatypes
+  #qurt_process_attr_t
+
+  @param[out] attr  Pointer to the structure to initialize.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_process_attr_init (qurt_process_attr_t *attr)
+{
+    attr->name[0] = '\0';
+    attr->path[0] = '\0';
+    attr->dtb_path[0] = '\0';
+    attr->flags = 0;
+    attr->sw_id = 0;
+    attr->sid = 0;
+    attr->max_threads = (unsigned)QURT_PROCESS_DEFAULT_MAX_THREADS;
+    attr->ceiling_prio = QURT_PROCESS_DEFAULT_CEILING_PRIO;
+    attr->type = QURT_PROCESS_TYPE_RESERVED;
+    attr->dump_attr.enabled = 0;
+    attr->dump_attr.path = NULL;
+    attr->dump_attr.path_len = 0;
+    attr->capabilities = NULL;
+}
+
+/**@ingroup func_qurt_process_attr_set_executable
+  Sets the process name in the specified process attribute structure.
+
+  Process names identify process objects that are already
+  loaded in memory as part of the QuRT system.
+
+  @note1hang Process objects are incorporated into the QuRT system at build time.
+
+  @note1hang The maximum length of the name string is limited to QURT_PROCESS_ATTR_NAME_MAXLEN - 1.
+
+ @datatypes
+ #qurt_process_attr_t
+
+ @param[in] attr Pointer to the process attribute structure.
+ @param[in] name Pointer to the process name.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+void qurt_process_attr_set_executable (qurt_process_attr_t *attr, const char *name);
+
+/**@ingroup func_qurt_process_attr_set_binary_path
+ Sets the binary path for process loading in the specified process attribute structure.
+
+ Path specifies the binary to load for this process.
+
+ @note1hang The maximum length of the path string is limited to QURT_PROCESS_ATTR_BIN_PATH_MAXLEN - 1.
+
+ @datatypes
+ #qurt_process_attr_t
+
+ @param[in] attr Pointer to the process attribute structure.
+ @param[in] path Pointer to the binary path.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+void qurt_process_attr_set_binary_path(qurt_process_attr_t *attr, char *path);
+
+/**@ingroup func_qurt_process_attr_set_dtb_path
+ Sets the DTB binary path for process loading in the specified process attribute structure.
+
+ Path specifies the DTB binary to load for this process.
+
+ @note1hang The maximum length of the path string is limited to QURT_PROCESS_ATTR_BIN_PATH_MAXLEN - 1.
+
+ @datatypes
+ #qurt_process_attr_t
+
+ @param[in] attr Pointer to the process attribute structure.
+ @param[in] path Pointer to the binary path.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+void qurt_process_attr_set_dtb_path(qurt_process_attr_t *attr, char *path);
+
+/**@ingroup func_qurt_process_attr_set_flags
+Sets the process properties in the specified process attribute structure.
+Process properties are represented as defined symbols that map into bits
+0 through 31 of the 32-bit flag value. Multiple properties are specified by OR'ing
+together the individual property symbols.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr  Pointer to the process attribute structure.
+@param[in] flags QURT_PROCESS_NON_SYSTEM_CRITICAL -- Process is considered non system-critical.
+                 Error services use this attribute to decide whether to kill
+                 the user PD or the whole subsystem.
+                 QURT_PROCESS_ISLAND_RESIDENT -- Process is marked as island resident.
+                 QURT_PROCESS_RESTARTABLE -- Process is marked as restartable.
+                 QURT_PROCESS_UNTRUSTED -- Process is marked as an unsigned process.
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_flags (qurt_process_attr_t *attr, int flags)
+{
+    attr->flags = flags;
+}
+/** @endcond */
+/** @cond internal_only*/
+/**@ingroup func_qurt_process_attr_set_sid
+Sets the process stream ID in the specified process attribute structure.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr Pointer to the process attribute structure.
+@param[in] sid  Stream ID to set for this process.
+
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_sid (qurt_process_attr_t *attr, unsigned sid)
+{
+    attr->sid = sid;
+}
+/** @endcond */
+/** @cond rest_reg_dist */
+/**@ingroup func_qurt_process_attr_set_max_threads
+Sets the maximum number of threads allowed in the specified process attribute structure.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr        Pointer to the process attribute structure.
+@param[in] max_threads Maximum number of threads allowed for this process.
+
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_max_threads (qurt_process_attr_t *attr, unsigned max_threads)
+{
+    attr->max_threads = max_threads;
+}
+
+/**@ingroup func_qurt_process_attr_set_sw_id
+Sets the software ID of the process to load in the specified process attribute structure.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr  Pointer to the process attribute structure.
+@param[in] sw_id Software ID of the process, used in authentication.
+
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_sw_id(qurt_process_attr_t *attr, unsigned int sw_id)
+{
+    attr->sw_id = sw_id;
+}
+
+/**@ingroup func_qurt_process_attr_set_ceiling_prio
+Sets the highest thread priority allowed in the specified process attribute structure.
+Refer to qurt_thread.h for priority ranges.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr Pointer to the process attribute structure.
+@param[in] prio Priority.
+
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_ceiling_prio (qurt_process_attr_t *attr, unsigned short prio)
+{
+    attr->ceiling_prio = prio;
+}
+/** @endcond */
+
+/** @cond internal_only*/
+/**@ingroup func_qurt_process_attr_set_dump_status
+Sets the process domain dump-enabled field in the process domain dump attributes.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr    Pointer to the process attribute structure.
+@param[in] enabled 1 -- Process domain dump is collected \n
+                   0 -- Process domain dump is not collected
+
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_dump_status(qurt_process_attr_t *attr, unsigned int enabled)
+{
+    attr->dump_attr.enabled = enabled;
+}
+
+/**@ingroup func_qurt_process_attr_set_dump_path
+Sets the process domain dump path and type.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr     Pointer to the process attribute structure.
+@param[in] path     Path where the process domain dumps must be saved.
+@param[in] path_len Length of the path string.
+
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_dump_path(qurt_process_attr_t *attr, const char *path, int path_len)
+{
+    attr->dump_attr.path = path;
+    attr->dump_attr.path_len = (unsigned int)path_len;
+}
+
+/**@ingroup func_qurt_process_attr_set_capabilities
+Sets the list of capabilities available to this process.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr         Pointer to the process attribute structure.
+@param[in] capabilities Pointer to an array of structures of type #qurt_capability_t defining
+                        resources and capabilities.
+
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_capabilities(qurt_process_attr_t *attr, qurt_capability_t *capabilities)
+{
+    attr->capabilities = capabilities;
+}
+
+/** @endcond */
+/** @cond rest_reg_dist */
+/**@ingroup func_qurt_process_cmdline_get
+Gets the command line string associated with the current process.
+This function retrieves the Hexagon simulator command line arguments, as long
+as the call is made from the process of the QuRT installation and the program
+runs in a simulation environment.
+
+If the function modifies the provided buffer, it zero-terminates
+the string. The function might not modify the provided buffer at all,
+so the caller must set buf[0] to a NUL byte before making the call.
+A truncated command line is returned when the command line is longer
+than the provided buffer.
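+
+A minimal calling sketch (the buffer size here is an arbitrary example value):
+
+@code
+char cmdline[128];
+cmdline[0] = '\0';  // the function might not modify the buffer
+qurt_process_cmdline_get(cmdline, sizeof(cmdline));
+@endcode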
+
+@param[in] buf     Pointer to a character buffer that must be filled in.
+@param[in] buf_siz Size (in bytes) of the buffer pointed to by the buf argument.
+
+@return
+None.
+
+@dependencies
+None.
+*/
+void qurt_process_cmdline_get(char *buf, unsigned buf_siz);
+
+/**@ingroup func_qurt_process_get_thread_count
+Gets the number of threads present in the process indicated by the PID.
+
+@param[in] pid PID of the process for which the information is required.
+
+@return
+Positive value -- Number of threads in the process indicated by the PID. \n
+Negative error code on failure: \n
+ QURT_EFATAL -- Invalid PID \n
+ -QURT_ENOTALLOWED -- Current process does not have access to the target process indicated by the PID
+
+@dependencies
+None.
+*/
+int qurt_process_get_thread_count(unsigned int pid);
+
+/**@ingroup func_qurt_process_get_thread_ids
+Gets the thread IDs for a process indicated by the PID.
+
+@param[in] pid        PID of the process for which the information is required.
+@param[in] ptr        Pointer to a user-passed buffer that must be filled in with thread IDs.
+@param[in] thread_num Number of thread IDs requested.
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EFATAL -- Failed, ptr is NULL
+
+@dependencies
+None.
+ */
+int qurt_process_get_thread_ids(unsigned int pid, unsigned int *ptr, unsigned thread_num);
+/** @endcond */
+/** @cond internal_only*/
+/**@ingroup func_qurt_process_dump_get_mem_mappings_count
+Gets the number of mappings present in the process indicated by the PID.
+
+@param[in] pid PID of the process for which the information is required.
+
+@return
+Number of mappings for the process indicated by the PID.
+
+@dependencies
+None.
+*/
+int qurt_process_dump_get_mem_mappings_count(unsigned int pid);
+
+/**@ingroup func_qurt_process_dump_get_mappings
+Gets the mappings for a specified PID.
+
+@note1hang This API skips device type mappings or mappings created by setting the #QURT_PERM_NODUMP attribute.
+
+@param[in] pid   PID of the process for which the information is required.
+@param[in] ptr   Pointer to a buffer that must be filled in with mappings.
+@param[in] count Count of mappings requested.
+
+@return
+Number of mappings filled in the buffer passed by the user.
+
+@dependencies
+None.
+*/
+int qurt_process_dump_get_mappings(unsigned int pid, unsigned int *ptr, unsigned count);
+/** @endcond */
+/** @cond rest_reg_dist */
+/**@ingroup func_qurt_process_attr_get
+Gets the attributes with which the process indicated by the PID was created.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in]     pid  PID of the process for which the information is required.
+@param[in,out] attr Pointer to the user-allocated attribute structure.
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EINVALID -- Invalid PID \n
+#QURT_EFATAL -- attr is NULL
+
+@dependencies
+None.
+*/
+int qurt_process_attr_get(unsigned int pid, qurt_process_attr_t *attr);
+
+/**@ingroup func_qurt_process_dump_register_cb
+Registers the process domain dump callback.
+
+@datatypes
+#qurt_cb_data_t \n
+#qurt_process_dump_cb_type_t
+
+@param[in] cb_data  Pointer to the callback information.
+@param[in] type     Callback type; these callbacks are called in the context of the user process domain: \n
+                    #QURT_PROCESS_DUMP_CB_PRESTM -- Before threads of the exiting process are frozen. \n
+                    #QURT_PROCESS_DUMP_CB_ERROR -- After threads are frozen and captured. \n
+                    #QURT_PROCESS_DUMP_CB_ROOT -- After threads are frozen and captured, and CB_ERROR type of callbacks
+                    are called.
+@param[in] priority Priority.
+
+@return
+#QURT_EOK -- Success \n
+Other values -- Failure: \n
+ QURT_EFATAL -- cb_data is NULL \n
+ QURT_EINVALID -- Invalid cb_type \n
+ QURT_EFAILED -- Invalid cb_data
+
+@dependencies
+None.
+*/
+int qurt_process_dump_register_cb(qurt_cb_data_t *cb_data, qurt_process_dump_cb_type_t type, unsigned short priority);
+
+/**@ingroup func_qurt_process_dump_deregister_cb
+Deregisters the process domain dump callback.
+
+@datatypes
+#qurt_cb_data_t \n
+#qurt_process_dump_cb_type_t
+
+@param[in] cb_data Pointer to the callback information to deregister.
+@param[in] type    Callback type.
+
+@return
+#QURT_EOK -- Success. \n
+Other values -- Failure: \n
+ QURT_EFATAL -- cb_data is NULL \n
+ QURT_EINVALID -- Invalid cb_type \n
+ QURT_EFAILED -- Invalid cb_data
+
+@dependencies
+None.
+*/
+int qurt_process_dump_deregister_cb(qurt_cb_data_t *cb_data, qurt_process_dump_cb_type_t type);
+
+/** @endcond */
+/** @cond internal_only*/
+/**@ingroup func_qurt_process_set_rtld_debug
+Sets rtld_debug for a process.
+
+@param[in] pid     PID of the process for which rtld_debug must be set.
+@param[in] address rtld_debug address.
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EINVALID -- Invalid PID \n
+#QURT_EFATAL -- Invalid address
+
+@dependencies
+None.
+*/
+int qurt_process_set_rtld_debug(unsigned int pid, unsigned int address);
+
+/**@ingroup func_qurt_process_get_rtld_debug
+Gets rtld_debug for a process.
+
+@param[in]     pid     PID of the process for which rtld_debug must be returned.
+@param[in,out] address Pointer to the user-passed address in which the rtld_debug address must be returned.
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EINVALID -- Invalid PID \n
+#QURT_EFATAL -- Invalid address
+
+@dependencies
+None.
+*/
+int qurt_process_get_rtld_debug(unsigned int pid, unsigned int *address);
+/** @endcond */
+/**@ingroup func_qurt_process_exit
+Exits the current user process with an exit code.
+
+@param[in] exitcode Exit code.
+
+@return
+#QURT_EFATAL -- No client found with the specified PID value \n
+#QURT_EINVALID -- Invalid client \n
+#QURT_ENOTALLOWED -- User does not have permission to perform this operation \n
+#QURT_EOK -- Success
+
+@dependencies
+None.
+*/
+int qurt_process_exit(int exitcode);
+
+/**@ingroup func_qurt_process_kill
+Kills the process represented by the PID with the exit code.
+
+@param[in] pid      PID of the process to kill.
+@param[in] exitcode Exit code.
+
+@return
+#QURT_EFATAL -- No client found with the specified PID value \n
+#QURT_EINVALID -- Invalid client \n
+#QURT_ENOTALLOWED -- User does not have permission to perform this operation \n
+#QURT_EOK -- Success
+
+@dependencies
+None.
+*/
+int qurt_process_kill(int pid, int exitcode);
+
+
+/**@ingroup func_qurt_debugger_register_process
+Registers the process indicated by the PID with the debug monitor.
+
+@param[in] pid PID of the process.
+@param[in] adr Address.
+
+@return
+#QURT_EOK -- Success
+
+@dependencies
+None.
+*/
+int qurt_debugger_register_process(int pid, unsigned int adr);
+
+
+/**@ingroup func_qurt_debugger_deregister_process
+Deregisters the process indicated by the PID with the debug monitor.
+
+@param[in] pid PID of the process.
+
+@return
+#QURT_EOK -- Success
+
+@dependencies
+None.
+*/
+int qurt_debugger_deregister_process(int pid);
+
+/**@ingroup func_qurt_process_exec_callback
+Executes callbacks in the user process as indicated by the client_handle argument.
+
+@param[in] client_handle Client handle obtained from the current invocation function (Section 3.4.1).
+@param[in] callback_fn   Callback function to execute.
+@param[in] stack_base    Stack address to use.
+@param[in] stack_size    Stack size.
+
+@return
+#QURT_EOK -- Success
+
+@dependencies
+None.
+*/
+int qurt_process_exec_callback(int client_handle,
+                               unsigned callback_fn,
+                               unsigned stack_base,
+                               unsigned stack_size);
+
+/**@ingroup func_qurt_process_get_pid
+Gets the process ID of the process that the client_handle argument represents.
+
+@note1hang This API is not supported for unsigned PDs. For unsigned PDs, use qurt_process_get_id().
+
+@param[in] client_handle Client handle obtained from the current invocation function (Section 3.4.1).
+@param[in] pid           Pointer to the address to store the PID.
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EFATAL -- pid pointer passed as NULL
+
+@dependencies
+None.
+*/
+int qurt_process_get_pid(int client_handle, int * pid);
+
+/**@ingroup func_qurt_process_get_dm_status
+Gets the debugging session status on the process represented by the pid argument.
+
+@param[in]     pid    Process ID.
+@param[in,out] status Address to store the status: \n
+                      #QURT_DEBUG_NOT_START \n
+                      #QURT_DEBUG_START
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EINVALID -- Error
+
+@dependencies
+None.
+*/
+int qurt_process_get_dm_status(unsigned int pid, unsigned int *status);
+
+
+/**@ingroup func_qurt_process_suspend_threads
+ Suspends user threads in a user process identified by its process identifier.
+ The target user process can be a signed user process or an unsigned user process.
+ The caller is a thread in the GuestOS/root process.
+ After the user threads in the target user process are suspended, they cannot be scheduled to run by the kernel
+ until they resume later.
+
+ This function has one optional argument with one default option.
+ #QURT_PROCESS_SUSPEND_DEFAULT suspends user threads in the target user process.
+
+ This function call is synchronous; the function returns after the relevant threads are
+ completely suspended.
+
+ If some user threads in the target user process are set as non-suspendable, this function call does
+ not suspend these threads.
+
+ If the target user process is already suspended, this function call returns success as the
+ confirmation that the user process is suspended.
+
+ QuRT debugger monitor threads in the target user process are non-suspendable; this function call does
+ not suspend those threads.
+
+ If the target user process is a secure user process or a CPZ process, this function call returns an error
+ without suspending the target user process.
+
+ If a user thread in the target user process runs in the guest OS/root process via a QDI call, this function call
+ does not suspend the thread in the guest OS, but instead marks the thread as pending-suspend. The thread is suspended
+ when it exits the guest OS, before executing the first instruction in the user process.
+ In this case, the function returns success while the user thread can be running in the GuestOS, and is suspended
+ when exiting the guest OS.
+
+ @param[in] process_id Process identifier.
+ @param[in] option     Default option #QURT_PROCESS_SUSPEND_DEFAULT suspends user threads in the target user process.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EINVALID -- Failure because of invalid process_id input \n
+ #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, on a secure process/CPZ process.
+
+ @dependencies
+ None.
+ */
+int qurt_process_suspend_threads (unsigned int process_id, unsigned int option);
+
+
+/**@ingroup func_qurt_process_resume_threads
+ Resumes a user process identified by its process identifier.
+ The target user process can be a signed user process or an unsigned user process.
+ The caller is a thread in the guest OS/root process.
+ After the user threads in the target user process resume, the kernel scheduler
+ can schedule the user threads to run based on their thread priorities.
+
+ This function has an optional argument, #QURT_PROCESS_RESUME_DEFAULT, which
+ resumes user threads in the target user process.
+
+ This is an asynchronous function; it returns after the kernel moves the user threads from
+ the suspended state to the runnable state. The threads are scheduled to run based on their thread priorities.
+
+ This function call does not resume threads in the target user process that have been set as non-resumable.
+
+ If the target user process has already resumed, this function call confirms that the user process resumed
+ by returning success.
+
+ If the target user process is a secure user process or a CPZ process, this function call returns an error without
+ resuming the target user process.
+
+ If user threads in the target user process run in the guest OS/root process via a QDI call, this function
+ call clears the suspend-pending mark on these threads, so that the threads are not suspended when they exit
+ the guest OS.
+
+ @param[in] process_id Process identifier.
+ @param[in] option     Default option #QURT_PROCESS_RESUME_DEFAULT resumes user threads in the target user process.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EINVALID -- Failure because of invalid process_id input. \n
+ #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, on a secure process/CPZ process.
+
+ @dependencies
+ None.
+ */
+int qurt_process_resume_threads (unsigned int process_id, unsigned int option);
+
+/**@ingroup func_qurt_process_vtcm_window_set
+ Sets a VTCM access window for a process.
+ The caller thread must be in the SRM process.
+
+ This is a synchronous function; it ensures that all running threads of the process have the requested
+ window in effect. The requested view for all non-running threads takes effect when they are
+ scheduled.
+
+ @param[in] pid         Process identifier.
+ @param[in] enable      QURT_VTCM_WINDOW_ENABLE enforces the VTCM access window defined by the high and low offsets.
+                        QURT_VTCM_WINDOW_DISABLE ignores the high and low offsets; VTCM access is fully
+                        disabled for the process.
+ @param[in] high_offset Specifies the high window offset, in 4K increments, from the base address of the VTCM.
+                        QURT_VTCM_WINDOW_HI_OFFSET_DEFAULT restores the high offset to its reset value.
+ @param[in] low_offset  Specifies the low window offset, in 4K increments, from the base address of the VTCM.
+                        QURT_VTCM_WINDOW_LO_OFFSET_DEFAULT restores the low offset to its reset value.
+
+ @note1hang
+ When high_offset is set to QURT_VTCM_WINDOW_HI_OFFSET_DEFAULT and low_offset is set to
+ QURT_VTCM_WINDOW_LO_OFFSET_DEFAULT, the full VTCM range is accessible. Access to VTCM is controlled
+ via the MMU mapping for the process.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EVAL -- Failure because of invalid inputs. \n
+ #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation. \n
+ #QURT_ENOTSUPPORTED -- Failure because the operation is not supported due to limitations in HW capabilities.
+
+ @dependencies
+ None.
+ */
+int qurt_process_vtcm_window_set(int pid, unsigned int enable, unsigned int high_offset, unsigned int low_offset);
+
+/**@ingroup func_qurt_process_vtcm_window_get
+ Gets the VTCM window for a process.
+ The caller thread must be in the SRM process.
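+
+ A hedged usage sketch (the pid value is hypothetical; assumes the caller
+ runs in the SRM process):
+
+ @code
+ unsigned int enable, hi, lo;
+ if (qurt_process_vtcm_window_get(pid, &enable, &hi, &lo) == QURT_EOK) {
+     if (enable == QURT_VTCM_WINDOW_ENABLE) {
+         // hi and lo describe the current window, in 4K increments
+     }
+ }
+ @endcode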
+
+
+ @param[in]  pid         Process identifier.
+ @param[out] enable      Address to store the enable status, if set.
+ @param[out] high_offset Address to return the high window offset, in 4K increments, from the base address of the VTCM.
+ @param[out] low_offset  Address to return the low window offset, in 4K increments, from the base address of the VTCM.
+
+ @note1hang
+ The user must first check the returned value of enable before checking the high and low offsets.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EVAL -- Failure because of invalid inputs. \n
+ #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation. \n
+ #QURT_ENOTSUPPORTED -- Failure because the operation is not supported due to limitations in HW capabilities.
+
+ @dependencies
+ None.
+ */
+int qurt_process_vtcm_window_get(int pid, unsigned int *enable, unsigned int *high_offset, unsigned int *low_offset);
+
+/**@ingroup func_qurt_process_set_group_config
+ Enables thread groups in the process with the ceiling priorities set up.
+
+ @param[in] process_id         Process identifier.
+ @param[in] group_bitmask      64-bit mask of active thread groups.
+ @param[in] ceiling_priorities Array of ceiling priorities for the thread groups.
+
+ @note1hang
+ This API can only be called by the root PD and can only be called once for each process; otherwise it is
+ rejected. Group 0 must be enabled in group_bitmask, otherwise QuRT returns an error. After this API, all
+ existing threads are moved to group 0, and if any thread's priority is higher than the ceiling
+ priority of group 0, it is lowered to the ceiling value.
+ Example 1:
+   group_bitmask = 0xD7; //'b11010111
+   ceiling_priorities[] = {100, 128, 200, 0, 196, 0, 240, 20}; // 0 - does not care
+ Example 2:
+   group_bitmask = 0x5; //'b101
+   ceiling_priorities[] = {240, 0, 20}; // 0 - does not care
+
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EVAL -- Failure because of invalid inputs. \n
+ #QURT_ENOTALLOWED -- The group has been configured already.
+
+ @dependencies
+ None.
+ */
+int qurt_process_set_group_config(unsigned int process_id, unsigned long long group_bitmask,
+                                  unsigned char *ceiling_priorities);
+
+
+/**@ingroup func_qurt_process_stid_set
+ Sets the specified stid for a process or for a thread group within a process.
+
+ @param[in] pid      Process identifier.
+ @param[in] group_id Group identifier.
+ @param[in] stid     stid to set.
+
+ @note1hang
+ The user can pass the default group_id (QURT_THREAD_DEFAULT_GROUP_ID) if the stid must be set at the process level.
+ All threads within a process that have the default stid (QURT_STID_DEFAULT) inherit the stid set for the process.
+ When a non-default group_id is specified, the stid is set only for a thread group.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EFATAL -- Invalid PID \n
+ #QURT_EVAL -- Failure because of invalid inputs. \n
+ #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation.
+
+ @dependencies
+ None.
+ */
+int qurt_process_stid_set(unsigned int pid, unsigned int group_id , unsigned int stid);
+
+/**@ingroup func_qurt_process_stid_get
+ Gets the stid for a process or for a thread group within a process.
+
+ @param[in]  pid      Process identifier.
+ @param[in]  group_id Group identifier.
+ @param[out] stid     Pointer to a variable to return the stid.
+
+ @note1hang
+ The user can pass the default group_id (QURT_THREAD_DEFAULT_GROUP_ID) to return the process-level stid.
+ When a non-default group_id is specified, the stid is returned only for a thread group.
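+
+ A minimal sketch of a process-level query (the pid value is hypothetical):
+
+ @code
+ unsigned int stid;
+ int ret = qurt_process_stid_get(pid, QURT_THREAD_DEFAULT_GROUP_ID, &stid);
+ @endcode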
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EFATAL -- Invalid PID \n
+ #QURT_EVAL -- Failure because of invalid inputs. \n
+ #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation.
+
+ @dependencies
+ None.
+ */
+int qurt_process_stid_get(unsigned int pid, unsigned int group_id , unsigned int *stid);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_profile.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_profile.h
new file mode 100755
index 0000000000000..2a50c461440f6
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_profile.h
@@ -0,0 +1,98 @@
+#ifndef QURT_PROFILE_H
+#define QURT_PROFILE_H
+/**
+  @file qurt_profile.h
+  QuRT profiling support.
+
+EXTERNAL FUNCTIONS
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2018, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+==============================================================================*/
+#include "qurt_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup profiling_macros
+@{ */
+#define QURT_PROFILE_DISABLE 0 /**< Disable profiling. */
+#define QURT_PROFILE_ENABLE  1 /**< Enable profiling. */
+
+typedef unsigned int qurt_profile_param_t;
+
+#define QURT_PROFILE_PARAM_THREAD_READY_TIME 0U /**< Profile thread ready time. */
+
+/** @} */ /* end_addtogroup profiling_macros */
+
+/** @addtogroup profiling_types
+ @{ */
+/** Profiling results. */
+typedef union
+{
+    /** Result associated with #QURT_PROFILE_PARAM_THREAD_READY_TIME. */
+    struct
+    {
+        unsigned int ticks; /**< Cumulative ticks the thread was ready. */
+    } thread_ready_time;
+
+} qurt_profile_result_t;
+/** @} */ /* end_addtogroup profiling_types */
+
+/**@ingroup func_qurt_profile_enable2
+ * Starts profiling of a specific parameter on a specific thread (as applicable).
+ *
+ * @param[in] param     Profiling parameter.
+ * @param[in] thread_id ID of the thread (if applicable) for which the specified
+ *                      parameter must be profiled.
+ * @param[in] enable    #QURT_PROFILE_DISABLE -- disable \n #QURT_PROFILE_ENABLE --
+ *                      enable
+ *
+ * @return
+ * #QURT_EOK -- Success \n
+ * #QURT_EALREADY -- Measurement already in progress or already stopped \n
+ * #QURT_ENOTHREAD -- Thread does not exist \n
+ * #QURT_EINVALID -- Invalid profiling parameter \n
+ *
+ * @dependencies
+ * None.
+ */
+extern int qurt_profile_enable2 (
+    qurt_profile_param_t param,
+    qurt_thread_t        thread_id,
+    int                  enable
+);
+
+/**@ingroup func_qurt_profile_get
+ * Gets the value of the profiling parameter that was previously enabled.
+ *
+ * @param[in]  param     Profiling parameter.
+ * @param[in]  thread_id ID of the thread (if applicable) for which the specified
+ *                       profiling parameter must be retrieved.
+ * @param[out] result    Profiling result associated with the parameter for the specified
+ *                       thread (if applicable).
+ *
+ * @return
+ * #QURT_EOK -- Success \n
+ * #QURT_EFAILED -- Operation failed; profiling was not enabled \n
+ * #QURT_ENOTHREAD -- Thread does not exist \n
+ * #QURT_EINVALID -- Invalid profiling parameter \n
+ *
+ * @dependencies
+ * None.
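+ *
+ * A minimal measurement sketch (illustrative; assumes the calling thread
+ * profiles itself via qurt_thread_get_id() from qurt_thread.h):
+ *
+ * @code
+ * qurt_profile_result_t result;
+ * qurt_thread_t tid = qurt_thread_get_id();
+ * qurt_profile_enable2(QURT_PROFILE_PARAM_THREAD_READY_TIME, tid, QURT_PROFILE_ENABLE);
+ * // ... run the workload to measure ...
+ * qurt_profile_get(QURT_PROFILE_PARAM_THREAD_READY_TIME, tid, &result);
+ * qurt_profile_enable2(QURT_PROFILE_PARAM_THREAD_READY_TIME, tid, QURT_PROFILE_DISABLE);
+ * // result.thread_ready_time.ticks now holds the cumulative ready time
+ * @endcode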
+ */ +extern int qurt_profile_get ( + qurt_profile_param_t param, + qurt_thread_t thread_id, + qurt_profile_result_t * result +); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_ptrace.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_ptrace.h new file mode 100755 index 0000000000000..622304dd92865 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_ptrace.h @@ -0,0 +1,37 @@ +/*============================================================================= + + qurt_ptrace.h + +GENERAL DESCRIPTION + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2013 by Qualcomm Technologies, Inc. All Rights Reserved. +=============================================================================*/ +#ifndef __SYS_PTRACE_H__ +#define __SYS_PTRACE_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +enum __ptrace_request +{ + /** + Indicates that the process making this request is requesting to be traced. + */ + PTRACE_TRACEME = 0, + PTRACE_EXT_IS_DEBUG_PERMITTED = 500 +}; + +long ptrace(enum __ptrace_request request, unsigned int pid, void*addr, void *data); + +#ifdef __cplusplus +} +#endif + +#endif //__SYS_PTRACE_H__ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_qdi.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_qdi.h new file mode 100755 index 0000000000000..705408e5cfc6f --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_qdi.h @@ -0,0 +1,185 @@ +#ifndef QDI_H +#define QDI_H + +/** + @file qurt_qdi.h + @brief Prototypes of QuRT Driver Invocation API functions + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013, 2021, 2023 Qualcomm Technologies, Inc. + All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + + +#include "qurt_qdi_constants.h" +#include "qurt_qdi_imacros.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup func_qurt_qdi_open + Opens the specified driver for subsequent operations. + qurt_qdi_open() is the primary mechanism by which a driver user can + obtain a QDI handle. The user provides the name of the driver to the + qurt_qdi_open call, and gets back a handle referencing + the named driver. \n + @note1hang For reasons related to the Hexagon standard for varargs functions, the + qurt_qdi_open function prototype is not actually defined as a varargs. + + + @param[in] p Driver name. + @param[in] ... Up to nine additional device-specific arguments can be passed as parameters, + and should follow the POSIX open() convention. \n + - flags -- Optional second parameter (POSIX flags), the handle + access requested (read-only, write-only, or read-write, + for instance) and other flags such as whether the call + should create a new device or only open an existing + device. \n + - mode -- Optional third parameter (POSIX mode); permissions to + configure when a new device is created. @tablebulletend + + @return + Negative value -- Error. \n + Non-negative value -- Success, this result value serves as a handle to the + opened driver. + @dependencies + None. + */ +// int qurt_qdi_open(); +#define qurt_qdi_open(p,...) 
\
+   qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC,QDI_OPEN,(p),##__VA_ARGS__)
+
+#define qurt_qdi_open_dt(p,q,...) \
+   qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC,QDI_OPEN_FROM_DT,(p),(q),##__VA_ARGS__)
+
+/**@ingroup func_qurt_qdi_handle_invoke
+ Performs a generic driver operation, which (depending on the specified operation) can
+ be either one of the predefined operations listed in @xhyperref{tbl:functionMapping,QDI function mapping}
+ or a driver-specific operation.
+ The user provides a QDI handle and an integer
+ method number, along with 0 to 8 optional 32-bit arguments.
+ The device driver invocation function is invoked with the
+ same method number and 0 to 8 optional arguments. The
+ return value from the invocation function is passed back to
+ the user as the return value of qurt_qdi_handle_invoke.
+
+ @note1hang For reasons related to the Hexagon standard for varargs functions, the
+            qurt_qdi_handle_invoke() function prototype is not actually defined as a
+            varargs function (and would break if it were defined this way).
+
+ @param[in] h   Driver handle.
+ @param[in] m   Integer number for the operation to perform.
+ @param[in] ... Up to eight optional arguments can be passed to the device driver as operation-specific parameters: \n
+                arg1 -- First parameter \n
+                arg2 -- Second parameter \n
+                arg3 -- Third parameter \n
+                arg4 -- Fourth parameter \n
+                arg5 -- Fifth parameter \n
+                arg6 -- Sixth parameter \n
+                arg7 -- Seventh parameter \n
+                arg8 -- Eighth parameter
+
+ @return
+ Integer value defined by the device driver. \n
+ -1 -- Error.
+
+ @dependencies
+ None.
+ */
+// int qurt_qdi_handle_invoke();
+#define qurt_qdi_handle_invoke(h,m,...) \
+   _QDMPASTE(_QDMHI,_QDMCNT(QDI_HANDLE_LOCAL_CLIENT,h,m,##__VA_ARGS__))(QDI_HANDLE_LOCAL_CLIENT,h,m,##__VA_ARGS__)
+#define _QDMHI3(a,b,c) qurt_qdi_qhi3(0,b,c)
+#define _QDMHI4(a,b,c,d) qurt_qdi_qhi4(0,b,c,(int)(d))
+#define _QDMHI5(a,b,c,d,e) qurt_qdi_qhi5(0,b,c,(int)(d),(int)(e))
+#define _QDMHI6(a,b,c,d,e,f) qurt_qdi_qhi6(0,b,c,(int)(d),(int)(e),(int)(f))
+#define _QDMHI7(a,b,c,d,e,f,g) qurt_qdi_qhi7(8,b,c,(int)(d),(int)(e),(int)(f),(int)(g))
+#define _QDMHI8(a,b,c,d,e,f,g,h) qurt_qdi_qhi8(8,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h))
+#define _QDMHI9(a,b,c,d,e,f,g,h,i) qurt_qdi_qhi9(16,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i))
+#define _QDMHI10(a,b,c,d,e,f,g,h,i,j) qurt_qdi_qhi10(16,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j))
+#define _QDMHI11(a,b,c,d,e,f,g,h,i,j,k) qurt_qdi_qhi11(24,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j),(int)(k))
+#define _QDMHI12(a,b,c,d,e,f,g,h,i,j,k,l) qurt_qdi_qhi12(24,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j),(int)(k),(int)(l))
+int qurt_qdi_qhi3(int,int,int);
+int qurt_qdi_qhi4(int,int,int,int);
+int qurt_qdi_qhi5(int,int,int,int,int);
+int qurt_qdi_qhi6(int,int,int,int,int,int);
+int qurt_qdi_qhi7(int,int,int,int,int,int,int);
+int qurt_qdi_qhi8(int,int,int,int,int,int,int,int);
+int qurt_qdi_qhi9(int,int,int,int,int,int,int,int,int);
+int qurt_qdi_qhi10(int,int,int,int,int,int,int,int,int,int);
+int qurt_qdi_qhi11(int,int,int,int,int,int,int,int,int,int,int);
+int qurt_qdi_qhi12(int,int,int,int,int,int,int,int,int,int,int,int);
+
+/**@ingroup func_qurt_qdi_write
+ Writes data to the specified driver.
+ A predefined invocation routine for drivers that
+ support a POSIX-like write functionality.
+ qurt_qdi_write(handle, buf, len) is equivalent to:
+ qurt_qdi_handle_invoke(handle, QDI_WRITE, handle, buf, len);
+
+ @param[in] handle Driver handle.
+ @param[in] buf    Pointer to the memory address where the data to write is stored.
+ @param[in] len    Number of bytes of data to write.
+
+ @return
+ Non-negative integer -- Number of bytes written. \n
+ Negative error code -- Write could not take place.
+
+ @dependencies
+ None.
+ */
+int qurt_qdi_write(int handle, const void *buf, unsigned len);
+
+/**@ingroup func_qurt_qdi_read
+ User-visible API to read data from a QDI handle.
+ A predefined invocation routine for drivers that
+ support a POSIX-like read functionality.
+ qurt_qdi_read(handle, buf, len) is equivalent to:
+ qurt_qdi_handle_invoke(handle, QDI_READ, handle, buf, len);
+
+ @param[in] handle Driver handle.
+ @param[in] buf    Pointer to the memory address where the data read is stored.
+ @param[in] len    Number of bytes of data to read.
+
+ @return
+ Non-negative integer -- Number of bytes read. \n
+ Negative error code -- Read could not take place.
+
+ @dependencies
+ None.
+ */
+int qurt_qdi_read(int handle, void *buf, unsigned len);
+
+/**@ingroup func_qurt_qdi_close
+ Closes the specified driver, releasing any resources associated with the open driver.
+ User-visible API to close a QDI handle.
+
+ This API should be called when the user is done using a
+ QDI-based handle. When this function is called, the driver can release
+ any resources held and perform other necessary cleanup
+ operations. qurt_qdi_close(handle) is equivalent to:
+ qurt_qdi_handle_invoke(handle, QDI_CLOSE, handle)
+
+ @param[in] handle Driver handle.
+
+ @return
+ 0 -- Success. \n
+ Negative error code -- Failure.
+
+ @dependencies
+ None.
+ */
+int qurt_qdi_close(int handle);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_qdi_constants.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_qdi_constants.h
new file mode 100755
index 0000000000000..4866fada067f0
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_qdi_constants.h
@@ -0,0 +1,193 @@
+#ifndef QDI_CONSTANTS_H
+#define QDI_CONSTANTS_H
+
+/**
+  @file qurt_qdi_constants.h
+  @brief Predefined invocation methods for drivers.
+
+  EXTERNALIZED FUNCTIONS
+  None
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None
+
+  Copyright (c) 2013-2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+|| Method numbers used for QDI.
+||
+|| Intended grouping of method numbers for QDI
+|| including future usage:
+||
+|| Method 0 should always be unused and not responded to by
+|| any driver.
+|| Methods 1 and 2 are reserved for name registration and
+|| name lookup.
+|| Methods 3 through 31 are reserved for POSIX-type operations
+|| on open handles.
+|| Methods 32 through 127 are reserved for the QDI infrastructure
+|| and may be extended in the future to provide standard
+|| driver debug services, management services, and system
+|| notifications.
+|| Methods 128 through 255 are reserved for the use of automatically
+|| generated methods such as might be generated by an IDL (interface
+|| definition language).
The infrastructure may be extended to +|| perform services on these methods based on information provided +|| by the IDL, such as automatic buffer validation, etc. These +|| method numbers should not be used for any "ad hoc" methods. +|| Methods with number >= 256 are "private" method numbers that are +|| outside the scope of the QDI infrastructure. Drivers that want +|| to generate and consume their own "ad hoc" methods are free to +|| use these method numbers as they wish. The infrastructure does +|| not generate these method numbers or respond to them, but +|| passes them on unmolested. +|| +|| All driver implementations *should* return a value of +|| -1 when called with an unsupported method. The standard error +|| return value for POSIX APIs is -1, so we emulate that behavior +|| here. +*/ +/** @cond */ +#define QDI_UNUSED 0 +#define QDI_DEVNAME_REGISTER 1 +#define QDI_OPEN 2 +#define QDI_CLOSE 3 +#define QDI_READ 4 +#define QDI_WRITE 5 +#define QDI_IOCTL 6 +#define QDI_MMAP 7 +#define QDI_OS_FILEOPEN 8 +#define QDI_FLEN 9 +#define QDI_UNLINK 10 +#define QDI_FTELL 22 +#define QDI_SEEK 23 +#define QDI_FSTAT 24 + +#define QDI_FSNAME_REGISTER 150 +#define QDI_FS_OPEN 151 +#define QDI_MMAP2 153 +#define QDI_MPROTECT2 154 +#define QDI_MUNMAP2 155 + +#define QDI_CLIENT_HANDLE_OBJREF_GET 10 + +#define QDI_OS_PROCESS_LOAD 12 +#define QDI_OS_PROCESS_CHOOSE_ASID 13 + +#define QDI_OS_SET_GP 26 +#define QDI_CLIENT_HANDLE_CALLBACK 27 + +#define QDI_CLIENT_HANDLE_ISLAND_HANDLE_CREATE_FROM_OBJ_T 19 //reused +#define QDI_CLIENT_HANDLE_HANDLE_CREATE_FROM_OBJ_T 80 +#define QDI_CLIENT_HANDLE_HANDLE_RELEASE 81 +#define QDI_CLIENT_HANDLE_COPY_FROM_USER 82 +#define QDI_CLIENT_HANDLE_COPY_TO_USER 83 +#define QDI_CLIENT_HANDLE_SIGNAL_GROUP_CREATE 86 +#define QDI_CLIENT_HANDLE_SAFE_CACHE_OPS 87 + +#define QDI_CLIENT_HANDLE_BUFFER_LOCK 41 +#define QDI_CLIENT_HLOSPOOL_INFO_GET 90 +#define QDI_CLIENT_HLOSPOOL2_INFO_GET 96 + +#define QDI_CLIENT_PID 44 +#define QDI_CLIENT_ASID QDI_CLIENT_PID + +#define QDI_OS_CLIENT_INFO_GET 48 + +#define QDI_OS_MEM_LOOKUP_PHYSADDR 57 + +#define QDI_OS_THREAD_ITERATOR_CREATE 68 +#define QDI_OS_THREAD_ITERATOR_NEXT 69 + +#define QDI_OS_SYSENV 78 + +#define QDI_REGION_USERMALLOC_INIT 180 // This method is for generic handle + + +#define QDI_CLIENT_HANDLE_USER_MALLOC 84 +#define QDI_CLIENT_HANDLE_USER_FREE 85 + +#define QDI_SIGNAL_GROUP_SIGNAL_CREATE 96 +#define QDI_SIGNAL_GROUP_WAIT 98 +#define QDI_SIGNAL_GROUP_POLL 99 +#define QDI_SIGNAL_SET 96 +#define QDI_SIGNAL_CLEAR 97 +#define QDI_SIGNAL_WAIT 98 +#define QDI_SIGNAL_POLL 99 + +#define QDI_OS_WAIT_FOR_MAIN_REAPER 104 + +#define QDI_CLIENT_HANDLE_REFPROXY_INSTALL 105 +#define QDI_CLIENT_HANDLE_REFPROXY_ADD 106 +#define QDI_CLIENT_HANDLE_REFPROXY_REMOVE 107 + +#define QDI_CLIENT_HANDLE_DETACH 116 + +#define QDI_OS_RESERVED1 139 + +#define QDI_CLIENT_HANDLE_BUFFER_LOCK2 142 + +#define QDI_DT_REGISTER 158 +#define QDI_OPEN_DEVICE 159 +#define QDI_OPEN_FROM_DT 160 + +#define QDI_PRIVATE 256 /* Method numbers beginning at 256 + are private method numbers, which + are device-specific and available + for use by device implementors. */ +/* +|| Permission bitmasks for use with qurt_qdi_lock_buffer(). +|| +|| Make sure these match with permission values from qurt_perm_t. +*/ +/** @endcond */ + +/** @addtogroup driver_support_constants +@{ */ +#define QDI_PERM_W 2 /**< Write access. */ +#define QDI_PERM_R 1 /**< Read access. */ +#define QDI_PERM_RW (QDI_PERM_R | QDI_PERM_W) /**< Read/write access. 
*/ + +#define QDI_HANDLE_LOCAL_CLIENT 3 /**< Local client. */ +#define QDI_HANDLE_GENERIC 4 /**< Generic. */ + +#define QDI_REFCNT_BASE 0x510000 /**< */ +#define QDI_REFCNT_MAXED 0x51FFFD /**< */ +#define QDI_REFCNT_INIT 0x51FFFE /**< Driver object is temporary and is eventually deleted.*/ +#define QDI_REFCNT_PERM 0x51FFFF /**< Driver object is permanent and is never deleted. */ +/** @} */ /* end_addtogroup driver_support_constants */ + +/** @cond */ +/* +|| Flags used by process loaders. +*/ + +#define QDI_OS_PROCESS_FLAGS_ISLAND_RESIDENT 0x1 /* Set this flag to request the loaded process + to have island residency. */ +#define QDI_OS_PROCESS_FLAGS_ROOT_RESIDENT 0x2 /* Set this flag to request the loaded process + to have root residency, for example, DL Pager. */ +/* +|| Constants used for qurt_event register API, type field. +*/ + +#define QURT_PROCESS_EXIT 1 + +/* +|| Constants used by QDI extensions. +*/ + +#define QURT_QDI_SINGLETON_TYPE_TRUE 0 +#define QURT_QDI_SINGLETON_TYPE_FALSE 1 +#define QURT_QDI_SINGLETON_TYPE_PER_PROCESS 2 +/** @endcond */ +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QDI_CONSTANTS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_qdi_driver.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_qdi_driver.h new file mode 100755 index 0000000000000..e044e25f1bb72 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_qdi_driver.h @@ -0,0 +1,868 @@ +#ifndef QURT_QDI_DRIVER_H +#define QURT_QDI_DRIVER_H + +/** + @file qurt_qdi_driver.h + @brief Definitions, macros, and prototypes used when writing a + QDI driver. + + EXTERNALIZED FUNCTIONS + None + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None + + Copyright (c) 2018, 2019-2021, 2023 Qualcomm Technologies, Inc. + All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#include "stddef.h" +#include "qurt_qdi.h" +#include "qurt_types.h" +#include "qurt_callback.h" +#include "qurt_qdi_constants.h" +#include "qurt_qdi_imacros.h" +#include "qurt_mutex.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* +|| This gives the canonical form for the arguments to a QDI +|| driver invocation function. The arguments are as follows: +|| +|| int client_handle (R0) QDI handle that represents the client +|| that made this QDI request. If the +|| client is remote, this is a +|| variable handle; if the client is local +|| (same thread and process), this is +|| set to QDI_HANDLE_LOCAL_CLIENT. +|| +|| qurt_qdi_obj_t *obj (R1) Points at the qdi_object_t structure +|| on which this QDI request is being made. +|| The qdi_object_t structure is usually +|| the first element of a larger structure +|| that contains state associated with the +|| object; because it is usually the first +|| element, the object pointers can be freely +|| interchanged through casts. +|| +|| int method (R2) Integer QDI method that represents +|| the request type. +|| +|| qurt_qdi_arg_t arg1 (R3) First three general purpose arguments +|| qurt_qdi_arg_t arg2 (R4) to the invocation function are passed in +|| qurt_qdi_arg_t arg3 (R5) these slots. +|| +|| qurt_qdi_arg_t arg4 (SP+0) Arguments beyond the first three are +|| qurt_qdi_arg_t arg5 (SP+4) passed on the stack. 
+|| qurt_qdi_arg_t arg6 (SP+8) +|| qurt_qdi_arg_t arg7 (SP+12) +|| qurt_qdi_arg_t arg8 (SP+16) +|| qurt_qdi_arg_t arg9 (SP+20) +|| +|| The canonical form of the invocation function takes a +|| total of 12 arguments, but not all of them are used. In general, +|| the QDI infrastructure only passes those arguments provided by +|| the caller; if the invocation function accesses additional +|| arguments beyond those provided by the caller, the values are not +|| useful. +*/ +/** @cond */ +#define QDI_INVOKE_ARGS \ + int, struct qdiobj *, int, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t + +#define QDI_EXT_INVOKE_ARGS \ + int, qurt_qdi_man_obj_t*, int, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t + +#define BUFFER_LOCK 1 +#define BUFFER_UNLOCK 0 + +struct qdiobj; +/** @endcond */ +/** @addtogroup driver_support_types +@{ */ +typedef union { + void *ptr; /**< Pointer to the driver handle. */ + int num; /**< Method number. */ +} qurt_qdi_arg_t; +/** @} */ /* end_addtogroup driver_support_types */ +/** @cond */ +/** QuRT QDI driver version */ +typedef union { + int num; + struct { + short major; /** Driver major version number. */ + short minor; /** Driver minor version number. */ + }; +} qurt_qdi_version_t; + +typedef int (*qurt_qdi_pfn_invoke_t)(QDI_INVOKE_ARGS); +typedef void (*qurt_qdi_pfn_release_t)(struct qdiobj *); +/** @endcond */ +/** @addtogroup driver_support_types +@{ */ +typedef struct qdiobj { + qurt_qdi_pfn_invoke_t invoke; /**< Invocation function that implements the driver methods.*/ + int refcnt; /**< Reference count, an integer value maintained by the QDI infrastructure that tracks the number of + references to a driver instance. 
*/
+    qurt_qdi_pfn_release_t release; /**< Release function that performs the cleanup associated with deleting an instance
+                                         of the driver object. */
+} qurt_qdi_obj_t;
+/** @} */ /* end_addtogroup driver_support_types */
+/** @cond */
+/** QuRT QDI managed object */
+typedef struct qurt_qdi_man_obj
+{
+    qurt_qdi_obj_t qdi_obj;
+    union
+    {
+        struct qurt_qdi_ext_driver * opener_obj;
+        struct qurt_qdi_ext_device * device_obj;
+    };
+}qurt_qdi_man_obj_t;
+
+typedef int (*qurt_qdi_ext_pfn_create_t)(int client_id, const char *name, qurt_qdi_version_t version, qurt_qdi_man_obj_t **qdi_obj);
+typedef int (*qurt_qdi_ext_pfn_create_device_t)(int client_id, const char *name, qurt_qdi_version_t version, struct qurt_qdi_ext_device * device, qurt_qdi_man_obj_t **qdi_obj);
+typedef int (*qurt_qdi_ext_pfn_invoke_t)(QDI_EXT_INVOKE_ARGS);
+typedef void (*qurt_qdi_ext_pfn_destroy_t)(qurt_qdi_man_obj_t *qdi_obj);
+typedef int (*qurt_qdi_ext_pfn_probe_t)(void *handle, struct qurt_qdi_ext_device **device);
+
+typedef struct qurt_qdi_ext_obj_info{
+    qurt_qdi_man_obj_t *obj;
+    int qdi_client_id;
+    struct qurt_qdi_ext_obj_info *next;
+}qurt_qdi_ext_obj_info_t;
+typedef struct qurt_qdi_ext_obj_info *qurt_qdi_ext_obj_info_ptr;
+
+/** QuRT QDI device */
+//temporarily add this back while there are still drivers that statically define this structure
+struct qurt_qdi_device {
+    qurt_qdi_obj_t opener_obj;
+    const char* name;
+    char island_resident;
+    unsigned char singleton;
+    qurt_qdi_ext_pfn_create_t create;
+    qurt_qdi_ext_pfn_invoke_t invoke;
+    qurt_qdi_ext_pfn_destroy_t destroy;
+    qurt_mutex_t qurt_qdi_ext_list_lock;
+    qurt_qdi_ext_obj_info_ptr qurt_qdi_ext_obj_info_head;
+};
+typedef struct qurt_qdi_device qurt_qdi_man_device;
+
+struct qurt_qdi_ext_driver {
+    qurt_qdi_obj_t opener_obj;
+    const char* name;
+    char island_resident;
+    unsigned char singleton;
+    qurt_qdi_ext_pfn_create_t create;
+    qurt_qdi_ext_pfn_invoke_t invoke;
+    qurt_qdi_ext_pfn_destroy_t destroy;
+    qurt_mutex_t qurt_qdi_ext_list_lock;
+    qurt_qdi_ext_obj_info_ptr qurt_qdi_ext_obj_info_head;
+    qurt_qdi_ext_pfn_create_device_t create_device;
+    qurt_qdi_version_t version;
+    qurt_qdi_ext_pfn_probe_t probe;
+    const char* compatible;
+    struct qurt_qdi_ext_device * device_list;
+    //qurt_qdi_ext_device_ptr device_list;
+};
+typedef struct qurt_qdi_ext_driver qurt_qdi_ext_driver_t;
+//above replaces qurt_qdi_man_device
+
+extern int qurt_qdi_obj_ref_inc(qurt_qdi_obj_t *);
+extern int qurt_qdi_obj_ref_dec(qurt_qdi_obj_t *);
+
+extern int qurt_qdi_ext_opener (QDI_INVOKE_ARGS);
+/** @endcond */
+/**@ingroup func_qurt_qdi_method_default
+ Processes a method that is unrecognized or unsupported in the driver invocation function.
+ All arguments passed to the current invocation function (Section @xref{sec:invocationFunction}) must be forwarded
+ to this function.
+
+ @note1hang Invocation functions must process all unrecognized or unsupported methods
+            by calling this function.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+extern int qurt_qdi_method_default(QDI_INVOKE_ARGS);
+
+/**@ingroup func_qurt_qdi_handle_create_from_obj_t
+ Allocates a new device handle for use with the specified driver object.
+
+ @param[in]  client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+ @param[out] obj           Pointer to the driver object.
+
+ @return
+ Non-negative integer -- Success; this value is the new handle. \n
+ Negative value -- Error.
+
+ @dependencies
+ None.
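+
+ A hedged sketch of typical use inside a driver invocation function (the
+ object type, its invoke/release functions, and the allocator are hypothetical):
+
+ @code
+ typedef struct { qurt_qdi_obj_t qdi; int my_state; } my_obj_t; // hypothetical
+ my_obj_t *obj = malloc(sizeof(*obj));
+ obj->qdi.invoke  = my_invoke;        // driver invocation function (hypothetical)
+ obj->qdi.refcnt  = QDI_REFCNT_INIT;  // temporary object; deleted on release
+ obj->qdi.release = my_release;       // driver release function (hypothetical)
+ return qurt_qdi_handle_create_from_obj_t(client_handle, &obj->qdi);
+ @endcode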
+*/
+static __inline int qurt_qdi_handle_create_from_obj_t(int client_handle, qurt_qdi_obj_t *obj)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_HANDLE_CREATE_FROM_OBJ_T,
+                                 obj);
+}
+
+/**@ingroup func_qurt_qdi_island_handle_create_from_obj_t
+ Allocates a new island device handle for use with the specified driver object.
+
+ @param[in] client_handle Client handle obtained from the current invocation function (Section 3.4.1).
+ @param[in] obj           Pointer.
+
+ @return
+ Non-negative integer value that is the new handle -- Success. \n
+ Negative return value -- Error.
+
+ @dependencies
+ None.
+*/
+static __inline int qurt_qdi_island_handle_create_from_obj_t(int client_handle, qurt_qdi_obj_t *obj)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_ISLAND_HANDLE_CREATE_FROM_OBJ_T,
+                                 obj);
+}
+
+/**@ingroup func_qurt_qdi_handle_release
+ Deallocates the specified device handle.
+
+ @param[in] client_handle     Obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+ @param[in] handle_to_release Handle to release.
+
+ @return
+ 0 -- Success. \n
+ Negative value -- Error.
+
+ @dependencies
+ None.
+*/
+static __inline int qurt_qdi_handle_release(int client_handle, int handle_to_release)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_HANDLE_RELEASE,
+                                 handle_to_release);
+}
+
+static __inline qurt_qdi_obj_t *
+qurt_qdi_objref_get_from_handle(int client_handle, int object_handle)
+{
+   qurt_qdi_obj_t *ret;
+
+   ret = NULL;
+
+   qurt_qdi_handle_invoke(client_handle,
+                          QDI_CLIENT_HANDLE_OBJREF_GET,
+                          object_handle,
+                          &ret);
+
+   return ret;
+}
+
+/**@ingroup func_qurt_client_add_memory
+ Adds a physical address range to the HLOS physpool of the caller user PD.
+
+ @param[in] client_handle Obtained from the current invocation function (Section 3.4.1).
+ @param[in] phys_addr     Starting address of the physical address range.
+ @param[in] size          Size.
+
+ @return
+ #QURT_EOK -- Pages successfully added.
+
+ @dependencies
+ None.
+*/
+int qurt_client_add_memory(int client_handle, qurt_addr_t phys_addr, qurt_size_t size);
+
+/**@ingroup func_qurt_client_add_memory2
+ Adds a physical address range to the HLOS physpool of the caller user PD.
+
+ @param[in] user_client_handle Obtained from the current invocation function (Section 3.4.1).
+ @param[in] phys_addr          Starting 36-bit address of the physical address range.
+ @param[in] size               Size.
+
+ @return
+ #QURT_EOK -- Pages successfully added.
+
+ @dependencies
+ None.
+*/
+int qurt_client_add_memory2(int user_client_handle, qurt_paddr_64_t phys_addr, qurt_size_t size);
+
+static __inline qurt_qdi_obj_t *
+qurt_qdi_objref_get_from_pointer(qurt_qdi_obj_t *objptr)
+{
+   qurt_qdi_obj_t * ret = NULL;
+
+   if (qurt_qdi_obj_ref_inc(objptr) < 0) {
+      ret = NULL;
+   } else {
+      ret = objptr;
+   }
+
+   return ret;
+}
+
+static __inline void
+qurt_qdi_objref_release(qurt_qdi_obj_t *objptr)
+{
+   if (qurt_qdi_obj_ref_dec(objptr) == 1) {
+      (*objptr->release)(objptr);
+   }
+}
+
+/**@ingroup func_qurt_qdi_copy_from_user
+ Copies the contents of a user memory buffer into the current driver.
+
+ @note1hang User buffer addresses are valid only for the duration of the current driver
+            invocation.
+
+ @param[in] client_handle Obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+ @param[in] dest          Base address of the driver buffer.
+ @param[in] src           Base address of the user buffer.
+ @param[in] len           Number of bytes to copy.
+
+ @return
+ Negative value -- Indicates a privilege or security violation; the copy operation
+                   has crossed a privilege boundary.
+
+ @dependencies
+ None.
+*/
+static __inline int qurt_qdi_copy_from_user(int client_handle, void *dest, const void *src, unsigned len)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_COPY_FROM_USER,
+                                 dest, src, len);
+}
+
+/**@ingroup func_qurt_qdi_copy_string_from_user
+ Copies the contents of a user memory buffer into the current driver.
+
+ @note1hang User buffer addresses are valid only for the duration of the current driver
+            invocation.
+
+ @param client_handle Obtained from the current invocation function (Section 3.4.1).
+ @param dest          Base address of the driver buffer.
+ @param src           Base address of the user buffer.
+ @param len           Number of bytes to copy. NOTE: This is the destination buffer length.
+
+ @return
+ Negative error result -- Privilege or security violation; the copy operation
+                          has crossed a privilege boundary.
+
+ @dependencies
+ None.
+*/
+int qurt_qdi_copy_string_from_user(int client_handle, char *dest, const char *src, unsigned len);
+
+/**@ingroup func_qurt_qdi_copy_to_user
+ Copies the contents of a driver memory buffer to user memory.
+
+ @note1hang User buffer addresses are valid only for the duration of the current driver
+            invocation.
+
+ @param[in] client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+ @param[in] dest          Base address of the user buffer.
+ @param[in] src           Base address of the driver buffer.
+ @param[in] len           Number of bytes to copy.
+
+ @return
+ Negative value -- Indicates a privilege or security violation; the copy operation
+                   has crossed a privilege boundary.
+
+ @dependencies
+ None.
+*/
+static __inline int qurt_qdi_copy_to_user(int client_handle, void *dest, const void *src, unsigned len)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_COPY_TO_USER,
+                                 dest, src, len);
+}
+
+/**@ingroup func_qurt_qdi_safe_cache_ops
+ Performs cache operations on user memory.
+
+ @note1hang User buffer addresses are valid only for the duration of the current driver
+            invocation.
+
+ @param[in] client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+ @param[in] addr          Base address of the user memory.
+ @param[in] size          Size of the user memory.
+ @param[in] opcode        Cache operation (QURT_MEM_CACHE_FLUSH, QURT_MEM_CACHE_INVALIDATE, ...).
+ @param[in] type          Cache type (QURT_MEM_ICACHE, QURT_MEM_DCACHE).
+
+ @return
+ Negative value -- Indicates a privilege or security violation; the operation
+                   has crossed a privilege boundary.
+
+ @dependencies
+ None.
+*/
+static __inline int qurt_qdi_safe_cache_ops(int client_handle, qurt_addr_t addr, qurt_size_t size,
+                                            qurt_mem_cache_op_t opcode, qurt_mem_cache_type_t type)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_SAFE_CACHE_OPS,
+                                 addr, size, opcode, type);
+}
+
+
+/**@ingroup func_qurt_qdi_buffer_lock
+ Prepares for the direct manipulation of a potentially untrusted buffer provided by a QDI
+ client.
+
+ This function is used to permit a trusted driver to safely access memory that is
+ provided by a potentially untrusted client. A driver calls this function to obtain a safe buffer
+ pointer for accessing the memory.
+
+ This function performs the following security checks: \n
+ - Verifies that the entire buffer is accessible to the client.
\n + - Ensures that the pointer remains valid for the remainder of the QDI driver + operation. \n + + @note1hang User buffer addresses are valid only for the duration of the current driver + invocation. + + @param[in] client_handle Obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param[in] buf Pointer to the base address of the client buffer address. + @param[in] len Buffer length (in bytes). + @param[in] perms Bitmask value that specifies the read or write access to perform on the + client buffer: \n + - #QDI_PERM_R -- Read access \n + - #QDI_PERM_W -- Write access \n + - #QDI_PERM_RW -- Read/write access @tablebulletend + @param[out] obuf Pointer to the buffer address that the driver must use to access the buffer. + + @return + Negative value -- Error; the operation crosses a privilege boundary, indicating a privilege or security violation. \n + Nonzero value -- User passed a buffer that does not fulfill the requested read/write access permission. + In this case the QDI driver call must be terminated cleanly, with an appropriate error code + returned to the client. \n + Zero -- Success; when this occurs the QDI driver must use the pointer at *obuf to access memory, and not the + pointer passed in as buf -- even if the user process changes the mapping of memory at buf, + the mapping of memory at *obuf remains valid until the driver invocation completes. + + @dependencies + None. +*/ +static __inline int qurt_qdi_buffer_lock(int client_handle, void *buf, unsigned len, + unsigned perms, void **obuf) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_BUFFER_LOCK, + buf, len, perms, obuf); +} + +/**@ingroup func_qurt_qdi_buffer_lock2 + Prepares for the direct manipulation of a possibly-untrusted buffer provided by a QDI + client. + This API permits a trusted driver to safely access memory + provided by a possibly-untrusted client. A driver calls this function to obtain a safe buffer + pointer for accessing the memory. + This function performs the following security checks: \n + -- Entire buffer is accessible to the client. \n + -- Entire buffer is mapped with permissions passed in perms field \n + -- Entire buffer is physically contiguous \n + In addition to the security checks, the API also locks the client mapping such that the client + cannot remove the mapping while the physical memory is used by the trusted + driver. \n + + @note1 Drivers are responsible for calling qurt_qdi_buffer_unlock() at appropriate time. Not + pairing qurt_qdi_buffer_unlock() with this API leads to resource leakages and + process exit failures. Drivers can keep track of which buffers are locked for + a particular client. If the client exits abruptly, the buffers can be + unlocked on driver release invocation for the exiting client. + + @note2 This API is supported in limited capacity when called from Island mode. Safe buffer + unmapping or user buffer unlock is not supported in Island mode. + + @param client_handle Obtained from the current invocation function (Section 3.4.1). + @param buf Pointer to the base address of the client buffer address. + @param len Buffer length (in bytes). + @param perms Bitmask value that specifies the read or write access to perform on the + client buffer: \n + -- #QDI_PERM_R -- Read access \n + -- #QDI_PERM_W -- Write access \n + -- #QDI_PERM_RW -- Read/write access \n + @param obuf Optional parameter that returns a pointer to the buffer address that + the driver must use to access the buffer. 
If NULL is passed, the API
+             only performs security checks and does not create a mapping to access the user buffer in
+             a safe way.
+
+ @return
+ QURT_EINVALID -- Arguments passed to the API are invalid; the user buffer pointer is NULL or the length of the
+                  buffer is 0. \n
+ QURT_EPRIVILEGE -- One of the security checks on the user buffer failed. \n
+ QURT_EFAILED -- Mapping cannot be created for the trusted driver. \n
+ QURT_EOK -- Lock operation was successful. When this occurs, the QDI driver must use the
+             pointer at *obuf to perform its memory accesses, and not the
+             pointer passed in as buf.
+
+ @dependencies
+ None.
+*/
+static __inline int qurt_qdi_buffer_lock2(int client_handle, void *buf, unsigned len,
+                                          unsigned perms, void **obuf)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_BUFFER_LOCK2,
+                                 BUFFER_LOCK, buf, len, perms, obuf);
+}
+
+/**@ingroup func_qurt_qdi_buffer_unlock
+ This API is paired with qurt_qdi_buffer_lock2(). The temporary overlapping mapping
+ created for the driver is removed, and the client mapping for the user buffer is
+ unlocked.
+
+ @note1 Drivers are responsible for pairing this with qurt_qdi_buffer_lock2(). Not
+        pairing qurt_qdi_buffer_lock2() with this API leads to resource leakages and
+        process exit failures. Drivers can keep track of which buffers are locked for
+        a particular client, and if the client exits abruptly, all the buffers can be
+        unlocked on the driver release invocation for the exiting client.
+
+ @note2 This API is supported in limited capacity when called from Island mode. Actual
+        unmapping of driver accessible memory or unlocking of the buffer is not
+        supported in Island mode.
+
+ @param client_handle Obtained from the current invocation function (Section 3.4.1).
+ @param buf Pointer to the base address of the client buffer.
+ @param len Buffer length (in bytes).
+ @param obuf Safe buffer address that was returned in the obuf field after calling
+             qurt_qdi_buffer_lock2().
+
+ @return
+ QURT_EINVALID -- Arguments passed to the API are invalid; the user buffer pointer is NULL or the length of the
+                  buffer is 0. \n
+ QURT_EOK -- Unlock operation was successful. \n
+ Other results -- Safe buffer unmapping or unlocking of the user buffer failed.
+
+ @dependencies
+ None.
+*/
+static __inline int qurt_qdi_buffer_unlock(int client_handle, void *buf, unsigned len,
+                                           void *obuf)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_BUFFER_LOCK2,
+                                 BUFFER_UNLOCK, buf, len, obuf);
+}
+
+/**@ingroup func_qurt_qdi_user_malloc
+ Allocates a memory area in the QDI heap that is read/write accessible to both the driver and
+ the client. \n
+ @note1hang The QDI heap has a limited amount of memory available, and only the
+            device driver can free the allocated memory.
+
+ @param client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+ @param size Size (in bytes) of the memory area to allocate.
+
+ @return
+ Nonzero -- Success; the returned value points to the allocated memory area. \n
+ Zero -- Error.
+
+ @dependencies
+ None.
+*/
+void *qurt_qdi_user_malloc(int client_handle, unsigned size);
+
+/**@ingroup func_qurt_qdi_user_free
+ Deallocates a memory area in the QDI heap.
+
+ @param client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+ @param ptr Pointer to the memory area to deallocate.
+
+ @dependencies
+ None.
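+
+ As an illustrative sketch (not part of this header), a driver can hand a
+ small result area to its client through the QDI heap and release it later:
+ @code
+ unsigned *res = (unsigned *)qurt_qdi_user_malloc(client_handle, sizeof(*res));
+ if (res != 0) {
+    *res = status_value;                     // hypothetical value, visible to the client
+    qurt_qdi_user_free(client_handle, res);  // only the driver may free it
+ }
+ @endcode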
+*/
+void qurt_qdi_user_free(int client_handle, void *ptr);
+
+/**@ingroup func_qurt_qdi_client_detach
+ Detaches a client (a process), indicating that the client does not
+ participate in the qurt_wait() mechanism. This behavior
+ is opt-in and irrevocable: once a client is detached, it
+ cannot be reattached.
+
+ @param client_handle Handle of the client to detach.
+
+ @return
+ Zero -- Success; detachable clients always return success. \n
+ Nonzero value -- client_handle does not refer to a
+ detachable user client.
+
+ @dependencies
+ None.
+*/
+static __inline int qurt_qdi_client_detach(int client_handle)
+{
+   return qurt_qdi_handle_invoke(client_handle, QDI_CLIENT_HANDLE_DETACH);
+}
+
+/**@ingroup func_qurt_qdi_signal_group_create
+ Creates a new signal group for use in a device driver.
+ A QDI signal group contains up to 32 signals, which can be operated on either
+ individually (using the qurt_qdi_signal_* functions) or as a group (using the
+ qurt_qdi_signal_group_* functions). \n
+ @note1hang The driver implementation is responsible for using the proper signal group
+            handle in any given situation. \n
+ For more information on signals, see the Hexagon QuRT RTOS User Guide (80-VB419-78).
+
+ @param client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+ @param p_signal_group_handle_local Returns a handle intended for use by code that
+        resides in the same context and process as the created signal group
+        (for example, the device driver implementation that allocated the
+        signal group).
+ @param p_signal_group_handle_remote Returns a handle intended for use by code
+        that resides in a different context and process than the created signal group
+        (for example, the user-mode client of an OS driver).
+
+ @return
+ Zero -- Success. \n
+ Negative value -- The signal group could not be created.
+
+ @dependencies
+ None.
+*/
+static __inline int qurt_qdi_signal_group_create(int client_handle,
+                                                 int *p_signal_group_handle_local,
+                                                 int *p_signal_group_handle_remote)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_SIGNAL_GROUP_CREATE,
+                                 p_signal_group_handle_local,
+                                 p_signal_group_handle_remote);
+}
+
+/**@ingroup func_qurt_qdi_signal_group_wait
+ Suspends the current thread until any of the signals are set in the specified signal group.
+
+ If a signal is set in a signal group object, and a thread waits on the signal group object,
+ the thread is awakened. If the awakened thread has higher priority than the current
+ thread, a context switch can occur.
+
+ @param signal_group_handle Handle of the signal group.
+
+ @return
+ If the client is remote:
+ QURT_EOK -- Wait complete. \n
+ QURT_ECANCEL -- Wait cancelled. \n
+ If the client is local, returns a 32-bit word with the current signals.
+
+ @dependencies
+ None.
+*/
+static __inline int qurt_qdi_signal_group_wait(int signal_group_handle)
+{
+   return qurt_qdi_handle_invoke(signal_group_handle,
+                                 QDI_SIGNAL_GROUP_WAIT);
+}
+
+/**@ingroup func_qurt_qdi_signal_group_poll
+ Returns a value that indicates whether any of the signals are set in the specified signal group.
+
+ @param signal_group_handle Handle of the signal group.
+
+ @return
+ 1 -- At least one of the signals is set in the signal group. \n
+ 0 -- None of the signals are set.
+
+ @dependencies
+ None.
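+
+ For illustration, a driver might poll before blocking (a sketch only; the
+ handles are assumed to come from qurt_qdi_signal_group_create()):
+ @code
+ int local_h, remote_h;
+ if (qurt_qdi_signal_group_create(client_handle, &local_h, &remote_h) == 0) {
+    if (qurt_qdi_signal_group_poll(local_h) == 0) {
+       qurt_qdi_signal_group_wait(local_h);  // suspend until any signal is set
+    }
+ }
+ @endcode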
+*/ +static __inline int qurt_qdi_signal_group_poll(int signal_group_handle) +{ + return qurt_qdi_handle_invoke(signal_group_handle, + QDI_SIGNAL_GROUP_POLL); +} + + +/**@ingroup func_qurt_qdi_signal_create + Creates a new signal in the specified signal group. + For more information on signals, see the Hexagon QuRT RTOS User Guide (80-VB419-78). + + @note1hang Driver implementation is responsible for using the proper signal handle in + any given situation. + + @param signal_group_handle Handle of an existing signal group. + @param p_signal_handle_local Returns a handle intended for use by code that resides in + the same context and process as the created signal (for example, + the device driver implementation that allocated the signal). + @param p_signal_handle_remote Returns a handle intended for use by code that resides in + a different context and process than the created signal (for + example, the user-mode client of an OS driver). + + @return + Nonzero value -- No more signals can be created in the specified + signal group. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_create(int signal_group_handle, + int *p_signal_handle_local, + int *p_signal_handle_remote) +{ + return qurt_qdi_handle_invoke(signal_group_handle, + QDI_SIGNAL_GROUP_SIGNAL_CREATE, + p_signal_handle_local, + p_signal_handle_remote); +} + +/**@ingroup func_qurt_qdi_signal_set + Sets the signal in the specified signal object. + + @param signal_handle Handle of the signal. + + @return + Always returns 0. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_set(int signal_handle) +{ + return qurt_qdi_handle_invoke(signal_handle, + QDI_SIGNAL_SET); +} + +/**@ingroup func_qurt_qdi_signal_clear + Clears the signal in the specified signal object. + + @param signal_handle Handle of the signal. + + @return + Always returns 0. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_clear(int signal_handle) +{ + return qurt_qdi_handle_invoke(signal_handle, + QDI_SIGNAL_CLEAR); +} + +/**@ingroup func_qurt_qdi_signal_wait + Suspends the current thread until the specified signal is set. + If a signal is set in a signal object, and a thread waits on the signal object, the + thread is awakened. If the awakened thread has higher priority than the current thread, a + context switch may occur. + + @param signal_handle Handle of the signal. + + @return + If client is remote: + QURT_EOK -- Wait complete. \n + QURT_ECANCEL -- Wait cancelled.\n + If client is local, return a 32-bit word with current signals. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_wait(int signal_handle) +{ + return qurt_qdi_handle_invoke(signal_handle, + QDI_SIGNAL_WAIT); +} + +/**@ingroup func_qurt_qdi_signal_poll + Returns a value that indicates if the specified signal is set. + + @param signal_handle Handle of the signal. + + @return + 1 -- Signal is set. \n + 0 -- Signal is not set. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_poll(int signal_handle) +{ + return qurt_qdi_handle_invoke(signal_handle, + QDI_SIGNAL_POLL); +} + +/**@ingroup func_qurt_qdi_devname_register + Registers a QDI device with the generic QDI object in the + current QDI context. + + This function registers an exact name or a directory prefix with a QDI opener object. + Future invocations of qurt_qdi_open() in the context of the caller invokes the + opener object if a match is detected. + + Directory prefix names are specified by ending the name with a forward slash character. 
+
+ Example of an exact name:
+ @code qurt_qdi_devname_register("/dev/foobar", foobar_opener);@endcode
+
+ Example of a directory prefix:
+ @code qurt_qdi_devname_register("/pipedev/", pipedev_opener);@endcode
+
+ Given the two registrations shown above, the only qurt_qdi_open() requests
+ directed to the foobar_opener object are requests for the exact name
+ "/dev/foobar". Any request beginning with "/pipedev/" is directed to the
+ pipedev_opener object.
+
+ The pipedev invocation function presumably examines the name argument to
+ determine exactly how to handle the request. The name is passed to the invocation
+ function in the a1.ptr argument (Section @xref{sec:invocationFunction}).
+
+ @param name Device name or device name prefix.
+ @param opener Pointer to the opener object for the device.
+
+ @return
+ 0 -- Device was successfully registered. \n
+ Negative error code -- Device was not registered.
+
+ @dependencies
+ None.
+ */
+static __inline int qurt_qdi_devname_register(const char *name,
+                                              qurt_qdi_obj_t *opener)
+{
+   return qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC,
+                                 QDI_DEVNAME_REGISTER,
+                                 name,
+                                 opener);
+}
+
+// Macros for backward compatibility with deprecated APIs
+// (These will go away soon)
+
+#define qurt_qdi_register_devname(name, opener) \
+   qurt_qdi_devname_register((name), (void *)(opener))
+#define qurt_qdi_new_handle_from_obj_t(handle, obj) \
+   qurt_qdi_handle_create_from_obj_t((handle), (obj))
+#define qurt_qdi_release_handle(client_handle, handle) \
+   qurt_qdi_handle_release((client_handle), (handle))
+#define qurt_qdi_lock_buffer(handle, buf, len, perms, obuf) \
+   qurt_qdi_buffer_lock((handle), (buf), (len), (perms), (obuf))
+#define qurt_qdi_usermalloc(handle, size) \
+   qurt_qdi_user_malloc((handle), (size))
+#define qurt_qdi_userfree(handle, ptr) \
+   qurt_qdi_user_free((handle), (ptr))
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_qdi_ext.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_qdi_ext.h
new file mode 100755
index 0000000000000..383e1799a15d6
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_qdi_ext.h
@@ -0,0 +1,58 @@
+#ifndef QURT_QDI_EXT_H
+#define QURT_QDI_EXT_H
+
+/**
+  @file qurt_qdi_ext.h
+  @brief Definitions, macros, and prototypes used when writing a
+  QDI driver
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2018, 2019-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+#include "qurt_qdi_driver.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct qurt_qdi_ext_device {
+   qurt_qdi_ext_obj_info_ptr qurt_qdi_ext_obj_info_head;
+   struct qurt_qdi_ext_device * next;
+   char * instance;
+   fdt_node_handle context;
+};
+typedef struct qurt_qdi_ext_device *qurt_qdi_ext_device_ptr;
+
+/**@ingroup func_qurt_qdi_dt_register
+ Registers a QDI device with the generic QDI object in the current QDI context,
+ if and only if a compatible device node is found in the device tree. This
+ function serves as a device tree aware wrapper for qurt_qdi_devname_register().
+
+ @param name Device name or device name prefix.
+ @param opener Pointer to QDI ext specialized opener object for the driver.
+
+ @return
+ 0 -- Device was successfully registered.
\n + Negative error code -- Device was not registered. +*/ +static __inline int qurt_qdi_dt_register(const char *name, qurt_qdi_obj_t *opener) +{ + return qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC, QDI_DT_REGISTER, name, opener); +} + +static inline void qurt_qdi_ext_deviceobj_set_name (struct qurt_qdi_ext_device * device, char * name) +{ + device->instance = name; +} + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_qdi_imacros.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_qdi_imacros.h new file mode 100755 index 0000000000000..c0a8448ac87f8 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_qdi_imacros.h @@ -0,0 +1,34 @@ +#ifndef QURT_QDI_IMACROS_H +#define QURT_QDI_IMACROS_H + +/** + @file qurt_qdi_imacros.h + @brief Internal macros used for QDI. Mostly consists of tricky (and ugly) + preprocessor hacks that permit us to do varargs function invocations + where we pass optional arguments in registers and where we can do + type casting and checking automatically. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +#define _QDMPASTE(a,b) _QDMPASTE_(a,b) +#define _QDMPASTE_(a,b) a##b +#define _QDMCNT(...) _QDMCNT_(__VA_ARGS__,12,11,10,9,8,7,6,5,4,3,2,1,0) +#define _QDMCNT_(a,b,c,d,e,f,g,h,i,j,k,l,cnt,...) cnt + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_qdi_proxy.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_qdi_proxy.h new file mode 100755 index 0000000000000..f1d8992ea8811 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_qdi_proxy.h @@ -0,0 +1,55 @@ +/*============================================================================= + + qurt_qdi_proxy.h + +GENERAL DESCRIPTION + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved. 
+=============================================================================*/
+#ifndef _QURT_QDI_PROXY_H
+#define _QURT_QDI_PROXY_H
+
+#include "qurt_qdi_driver.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* APIs allowing operation on the proxy object directly */
+int qurt_qdi_proxy_ref_create(void);
+
+/* APIs that operate on a proxy, given a known proxy handle
+ * 1) using the QDI handle of the object
+ * successful return: QURT_EOK, anything else -- failure
+ */
+int qurt_qdi_proxy_ref_add_by_handle(int proxy_handle, int qdi_handle);
+int qurt_qdi_proxy_ref_sub_by_handle(int proxy_handle, int qdi_handle);
+
+/* 2) using an object reference
+ * successful return: QURT_EOK, anything else -- failure
+ */
+int qurt_qdi_proxy_ref_add_by_object(int proxy_handle, qurt_qdi_obj_t *obj_ptr);
+int qurt_qdi_proxy_ref_sub_by_object(int proxy_handle, qurt_qdi_obj_t *obj_ptr);
+
+/* API that associates a proxy object with a particular client, given a client handle
+ * successful return: QURT_EOK, anything else -- failure
+ */
+int qurt_client_proxy_ref_install(int client_handle, int proxy_handle);
+
+/* APIs allowing operation on the proxy object from a user client
+ * successful return: QURT_EOK, anything else -- failure
+ */
+int qurt_client_proxy_ref_add(int qdi_handle);
+int qurt_client_proxy_ref_remove(int qdi_handle);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_QDI_PROXY_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_rmutex.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_rmutex.h
new file mode 100755
index 0000000000000..a013a0bbddb1d
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_rmutex.h
@@ -0,0 +1,200 @@
+#ifndef QURT_RMUTEX_H
+#define QURT_RMUTEX_H
+/**
+  @file qurt_rmutex.h
+  Prototypes of rmutex API.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2013 - 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+#include <qurt_futex.h>
+#include <qurt_mutex.h>
+
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup func_qurt_rmutex_init
+ Initializes a recursive mutex object.
+ The recursive mutex is initialized in the unlocked state.
+
+ @datatypes
+ #qurt_mutex_t
+
+ @param[out] lock Pointer to the recursive mutex object.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+void qurt_rmutex_init(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_rmutex_destroy
+ Destroys the specified recursive mutex. \n
+ @note1hang Recursive mutexes must not be destroyed while they are still in use. If this
+            occurs, the behavior of QuRT is undefined.
+
+ @datatypes
+ #qurt_mutex_t
+
+ @param[in] lock Pointer to the recursive mutex object to destroy.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+
+ */
+void qurt_rmutex_destroy(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_rmutex_lock
+ Locks the specified recursive mutex. \n
+
+ If a thread performs a lock operation on a mutex that is not in use, the thread
+ gains access to the shared resource that the mutex protects, and continues executing.
+
+ If a thread performs a lock operation on a mutex that is already in use by another
+ thread, the thread is suspended.
When the mutex becomes available again (because the + other thread has unlocked it), the thread is awakened and given access to the shared resource. + + @note1hang A thread is not suspended if it locks a recursive mutex that it has already + locked. However, the mutex does not become available to other threads until the + thread performs a balanced number of unlocks on the mutex. + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the recursive mutex object to lock. + + @return + None. + + @dependencies + None. + + */ +void qurt_rmutex_lock(qurt_mutex_t *lock); + +/**@ingroup func_qurt_rmutex_lock_timed + Locks the specified recursive mutex. The wait must be terminated when the specified timeout expires.\n + + If a thread performs a lock operation on a mutex that is not in use, the thread + gains access to the shared resource that the mutex is protecting, and continues executing. + + If a thread performs a lock operation on a mutex that is already in use by another + thread, the thread is suspended. When the mutex becomes available again (because the + other thread has unlocked it), the thread is awakened and given access to the shared resource. + + @note1hang A thread is not suspended if it locks a recursive mutex that it has already + locked by itself. However, the mutex does not become available to other threads until the + thread performs a balanced number of unlocks on the mutex. + If timeout expires, this wait must be terminated and no access to the mutex is granted. + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the recursive mutex object to lock. + @param[in] duration Interval (in microseconds) duration value must be between #QURT_TIMER_MIN_DURATION and + #QURT_TIMER_MAX_DURATION + + @return + #QURT_EOK -- Success \n + #QURT_ETIMEDOUT -- Timeout + + @dependencies + None. + + */ +int qurt_rmutex_lock_timed(qurt_mutex_t *lock, unsigned long long int duration); + +/**@ingroup func_qurt_rmutex_unlock + Unlocks the specified recursive mutex. \n + More than one thread can be suspended on a mutex. When the mutex is + unlocked, the thread waiting on the mutex awakens. If the awakened + thread has higher priority than the current thread, a context switch occurs. + + @note1hang When a thread unlocks a recursive mutex, the mutex is not available until + the balanced number of locks and unlocks has been performed on the mutex. + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the recursive mutex object to unlock. + + @return + None. + + @dependencies + None. + + */ +void qurt_rmutex_unlock(qurt_mutex_t *lock); + +/**@ingroup func_qurt_rmutex_try_lock + Attempts to lock the specified recursive mutex.\n + + If a thread performs a try_lock operation on a recursive mutex that is not in use, the + thread gains access to the shared resource that is protected by the mutex, and continues + executing.\n + If a thread performs a try_lock operation on a recursive mutex that another thread has + already locked, qurt_rmutex_try_lock immediately returns with a nonzero result + value. + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the recursive mutex object to lock. + + @return + 0 -- Success. \n + Nonzero -- Failure. + + */ +int qurt_rmutex_try_lock(qurt_mutex_t *lock); + +/**@ingroup func_qurt_rmutex_try_lock_block_once + Attempts to lock a mutex object recursively. If the mutex is available, + it locks the mutex. If the mutex is held by the current thread, + it increases the internal counter and returns 0. If not, it returns a + nonzero value. 
+ If the mutex is already locked by another thread, the caller thread is
+ suspended. When the mutex becomes available again (because the other
+ thread has unlocked it), the caller thread is awakened and tries to lock
+ the mutex again; if it fails, this function returns failure with a nonzero
+ value, and if it succeeds, it returns success with zero.
+
+ @datatypes
+ #qurt_mutex_t
+
+ @param[in] lock Pointer to the qurt_mutex_t object.
+
+ @return
+ 0 -- Success. \n
+ Nonzero -- Failure.
+
+ @dependencies
+ None.
+ */
+int qurt_rmutex_try_lock_block_once(qurt_mutex_t *lock);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_RMUTEX_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_rmutex2.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_rmutex2.h
new file mode 100755
index 0000000000000..a37e7e4458c4b
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_rmutex2.h
@@ -0,0 +1,183 @@
+#ifndef QURT_RMUTEX2_H
+#define QURT_RMUTEX2_H
+/**
+  @file qurt_rmutex2.h
+  @brief Prototypes of rmutex2 API
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2013, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+#include <qurt_futex.h>
+#include <qurt_mutex.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup mutex_types
+@{ */
+/*=============================================================================
+                        TYPEDEFS
+=============================================================================*/
+
+/** QuRT rmutex2 type.
+    Mutex type used with rmutex2 APIs.
+ */
+typedef struct {
+   /** @cond */
+   unsigned int holder __attribute__((aligned(8))); /* UGP value of the mutex holder. */
+   unsigned short waiters;       /* Number of waiting threads. */
+   unsigned short refs;          /* Number of references to this mutex. */
+   unsigned int queue;           /* Kernel-maintained futex queue value. */
+   unsigned int excess_locks;    /* Number of excess times the holder has locked the mutex. */
+   /** @endcond */
+} qurt_rmutex2_t;
+/** @} */ /* end_addtogroup mutex_types */
+/** @cond internal_only*/
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_rmutex2_init
+
+   @deprecated Use #qurt_rmutex_init instead.
+
+   Initializes a recursive mutex object.
+
+   The recursive mutex is initially unlocked.
+
+   Objects of type rmutex2 solve a potential race condition between
+   unlock() and destroy() operations.
+
+   @datatypes
+   #qurt_rmutex2_t
+
+   @param[out] lock Pointer to the recursive mutex object.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+void qurt_rmutex2_init(qurt_rmutex2_t *lock);
+
+/**@ingroup func_qurt_rmutex2_destroy
+
+   @deprecated Use #qurt_rmutex_destroy instead.
+
+   Destroys the specified recursive mutex. \n
+   @note1hang Recursive mutexes must not be destroyed while they are still in use. If this
+              occurs, the behavior of QuRT is undefined.
+   @note1cont In general, application code must destroy an rmutex2 object prior to
+              deallocating it; calling qurt_rmutex2_destroy() before deallocating it ensures
+              that all qurt_rmutex2_unlock() calls complete.
+
+   @datatypes
+   #qurt_rmutex2_t
+
+   @param[in] lock Pointer to the recursive mutex object to destroy.
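+
+   For illustration, a balanced lock/unlock sequence must complete before the
+   mutex is destroyed (a sketch only, using only functions from this header):
+   @code
+   qurt_rmutex2_t m;
+   qurt_rmutex2_init(&m);
+   qurt_rmutex2_lock(&m);
+   qurt_rmutex2_lock(&m);     // relock by the holder; internal counter increases
+   qurt_rmutex2_unlock(&m);   // still held by this thread
+   qurt_rmutex2_unlock(&m);   // now released
+   qurt_rmutex2_destroy(&m);  // destroy only after all unlocks complete
+   @endcode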
+ + @return + None. + + @dependencies + None. + + */ +void qurt_rmutex2_destroy(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_rmutex2_lock + + @deprecated use #qurt_rmutex_lock instead. + + Locks the specified recursive mutex. \n + + If a thread performs a lock operation on a recursive mutex that is not in use, the + thread gains access to the shared resource that the mutex protects, and continues + to execute. + + If a thread performs a lock operation on a recursive mutex that another thread is using, + the thread is suspended. When the mutex becomes available again + (because the other thread has unlocked it), the thread is awakened and given access to the + shared resource. + + @note1hang A thread is not suspended if it locks a recursive mutex that it has already + locked, but the mutex does not become available until the thread performs a + balanced number of unlocks on the mutex. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to lock. + + @return + None. + + @dependencies + None. + + */ +void qurt_rmutex2_lock(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_rmutex2_unlock + + @deprecated use #qurt_rmutex_unlock instead. + + Unlocks the specified recursive mutex. \n + More than one thread can be suspended on a recursive mutex. When the mutex is + unlocked, only the highest-priority thread waiting on the mutex awakens. If the + awakened thread has higher priority than the current thread, a context switch occurs. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to unlock. + + @return + None. + + @dependencies + None. + + */ +void qurt_rmutex2_unlock(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_rmutex2_try_lock + + @deprecated use #qurt_rmutex_try_lock instead. + + Attempts to lock the specified recursive mutex.\n + + Non-blocking version of qurt_rmutex2_lock(). When a call to qurt_rmutex2_lock() + succeeds immediately, this function behaves similarly, returning 0 for success. + When a call to qurt_rmutex2_lock() does not succeed immediately, this function has + no effect and returns nonzero for failure. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to lock. + + @return + 0 -- Success. \n + Nonzero -- Failure. + + */ +int qurt_rmutex2_try_lock(qurt_rmutex2_t *lock); +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_RMUTEX2_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_sclk.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_sclk.h new file mode 100755 index 0000000000000..a83cf5f1db889 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_sclk.h @@ -0,0 +1,145 @@ +#ifndef QURT_SCLK_H +#define QURT_SCLK_H +/** + @file qurt_sclk.h + @brief Header file describing the APIs supported by QuRT system SCLK + feature. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2018-2021, 2023 Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. 
+ +=============================================================================*/ + + + + +/*============================================================================= + + INCLUDE FILES + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + + +/** + Conversion from microseconds to sleep ticks. + */ +#define QURT_SYSCLOCK_TIMETICK_FROM_US(us) ((us) * 192ULL / 10UL) +#define qurt_sysclock_timetick_from_us(us) QURT_SYSCLOCK_TIMETICK_FROM_US(us) + + +/** + Conversion from timer ticks to microseconds at the nominal frequency. +*/ +#define QURT_SYSCLOCK_TIMETICK_TO_US(ticks) qurt_timer_timetick_to_us(ticks) + +/** + Maximum microseconds value for Qtimer is 1,042,499 hours. +*/ +#define QURT_SYSCLOCK_MAX_DURATION (1042499uLL * 3600uLL * 1000uLL * 1000uLL) +#define qurt_sysclock_max_duration() QURT_SYSCLOCK_MAX_DURATION +/** + Timer clock for Qtimer is 19.2 MHz. +*/ +#define QURT_SYSCLOCK_MAX_DURATION_TICKS (1042499uLL * 3600uLL * 19200000uLL) +#define qurt_sysclock_max_duration_ticks() QURT_SYSCLOCK_MAX_DURATION_TICKS +/** + Sleep timer error margin for Qtimer is 192 ticks ~10 us. +*/ +#define QURT_SYSCLOCK_ERROR_MARGIN 192U //QURT_TIMER_MIN_DURATION*timer_freq; +#define qurt_sysclock_error_margin() QURT_SYSCLOCK_ERROR_MARGIN + +/*============================================================================= + + DATA DECLARATIONS + +=============================================================================*/ + +/**@ingroup func_qurt_sysclock_get_hw_ticks + @xreflabel{sec:qurt_sysclock_get_hw_ticks} + Gets the hardware tick count.\n + Returns the current value of a 64-bit hardware counter. The value wraps around to zero + when it exceeds the maximum value. + + @note1hang This operation must be used with care because of the wrap-around behavior. + + @return + Integer -- Current value of 64-bit hardware counter. + + @dependencies + None. + */ +unsigned long long qurt_sysclock_get_hw_ticks (void); + + +/**@ingroup func_qurt_sysclock_get_hw_ticks_32 + @xreflabel{sec:qurt_sysclock_get_hw_ticks_32} + Gets the hardware tick count in 32 bits.\n + Returns the current value of a 32-bit hardware counter. The value wraps around to zero + when it exceeds the maximum value. + + @note1hang This operation is implemented as an inline C function, and should be called from a C/C++ program. + The returned 32 bits are the lower 32 bits of the Qtimer counter. + + @return + Integer -- Current value of the 32-bit timer counter. + + @dependencies + None. + */ +static inline unsigned long qurt_sysclock_get_hw_ticks_32 (void) +{ + //Beginning with v61 there is a HW register that can be read directly. + unsigned long count; + __asm__ __volatile__ (" %0 = c30 " : "=r"(count)); + return count; +} + + +/**@ingroup func_qurt_sysclock_get_hw_ticks_16 + @xreflabel{sec:qurt_sysclock_get_hw_ticks_16} + Gets the hardware tick count in 16 bits.\n + Returns the current value of a 16-bit timer counter. The value wraps around to zero + when it exceeds the maximum value. + + @note1hang This operation is implemented as an inline C function, and should be called from a C/C++ program. + The returned 16 bits are based on the value of the lower 32 bits in Qtimer + counter, right shifted by 16 bits. + + @return + Integer -- Current value of the 16-bit timer counter, calculated from the lower 32 bits in the + Qtimer counter, right shifted by 16 bits. + + @dependencies + None. 
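+
+ For illustration, elapsed time is typically measured with the 64-bit counter
+ and converted to microseconds (a sketch; do_work() is a hypothetical workload):
+ @code
+ unsigned long long t0 = qurt_sysclock_get_hw_ticks();
+ do_work();
+ unsigned long long elapsed_us =
+     QURT_SYSCLOCK_TIMETICK_TO_US(qurt_sysclock_get_hw_ticks() - t0);
+ @endcode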
+ */ + + +static inline unsigned short qurt_sysclock_get_hw_ticks_16 (void) +{ + unsigned long ticks; + + //Beginning with v61 there is a HW register that can be read directly. + __asm__ __volatile__ (" %0 = c30 " : "=r"(ticks)); + __asm__ __volatile__ ( "%0 = lsr(%0, #16) \n" :"+r"(ticks)); + + return (unsigned short)ticks; +} +unsigned long long qurt_timer_timetick_to_us(unsigned long long ticks); +#define qurt_sysclock_timetick_to_us(ticks) qurt_timer_timetick_to_us(ticks) + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif /* __cplusplus */ + +#endif /* QURT_SCLK_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_secure_proc.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_secure_proc.h new file mode 100755 index 0000000000000..f40c7deb9bca1 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_secure_proc.h @@ -0,0 +1,53 @@ +#ifndef QURT_SECURE_PROC_H +#define QURT_SECURE_PROC_H + +/** + @file qurt_secure_proc.h + @brief Definitions, macros, and prototypes used for handling secure process + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2015, 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup qurt_process_migrate_secure_process + Migrate the user process to Qurt secure process + + @param secure_phy_address Physical starting address of secure memory + @param secure_memory_size Size of secure memory + @param entry Entry function to secure process + + @return + EOK + Negative return value -- Error. + + @dependencies + None. +*/ +int qurt_process_migrate_secure_process(unsigned long long secure_phy_address, unsigned int secure_memory_size, void entry(unsigned)); + +/**@ingroup qurt_process_get_migration_mem_size + get the size of all writable memory regions in a user PD. This is for preparation on secure process migration. + + @return + size of all writable memory regions in a user PD. + + @dependencies + None. +*/ +int qurt_process_get_migration_mem_size(void); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_sem.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_sem.h new file mode 100755 index 0000000000000..ee5ce4b2d94ab --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_sem.h @@ -0,0 +1,252 @@ +#ifndef QURT_SEM_H +#define QURT_SEM_H +/** + @file qurt_sem.h + Prototypes of semaphore API. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + TYPEDEFS +=============================================================================*/ +/** @addtogroup semaphore_types +@{ */ + +/** QuRT semaphore type. 
*/ +typedef union { + /** @cond */ + unsigned int raw[2] __attribute__((aligned(8))); + struct { + unsigned short val; /**< */ + unsigned short n_waiting; /**< */ + unsigned int reserved1; /**< */ + unsigned int queue; /**< */ + unsigned int reserved2; /**< */ + }X; /** @endcond */ +} qurt_sem_t; +/** @} */ /* end_addtogroup semaphore_types */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_sem_add + Releases access to a shared resource (the specified amount increments the semaphore count value).\n + When a thread performs an add operation on a semaphore, the specified value increments the semaphore count. + The result depends on the number of threads waiting + on the semaphore: \n + - When no threads are waiting, the current thread releases access to the shared resource + and continues executing. \n + - When one or more threads are waiting and the semaphore count value is nonzero, + the kernel repeatedly awakens the highest-priority waiting thread and decrements + the semaphore count value until either no waiting threads remain or the + semaphore count value is zero. If any of the awakened threads has higher priority + than the current thread, a context switch can occur. + + @datatypes + #qurt_sem_t + + @param[in] sem Pointer to the semaphore object to access. + @param[in] amt Amount to increment the semaphore count value. + + @return + Unused integer value. + + @dependencies + None. + + */ +int qurt_sem_add(qurt_sem_t *sem, unsigned int amt); + +/**@ingroup func_qurt_sem_up + Releases access to a shared resource. When a thread performs an up operation on a semaphore, + the semaphore count value increments. The result depends on the number of threads waiting + on the semaphore: \n + - When no threads are waiting, the current thread releases access to the shared resource + and continues executing.\n + - When one or more threads are waiting and the semaphore count value is nonzero, + the kernel awakens the highest-priority waiting thread and decrements the + semaphore count value. If the awakened thread has higher priority than the current + thread, a context switch can occur. + + @datatypes + #qurt_sem_t + + @param[in] sem Pointer to the semaphore object to access. + + @return + Unused integer value. + + @dependencies + None. + */ +static inline int qurt_sem_up(qurt_sem_t *sem) { return qurt_sem_add(sem,1); } + +/**@ingroup func_qurt_sem_down + Requests access to a shared resource. When a thread performs a down operation on a + semaphore, the result depends on the semaphore count value: \n + - When the count value is nonzero, it is decremented, and the thread gains access to the + shared resource and continues executing.\n + - When the count value is zero, it is not decremented, and the thread is suspended on the + semaphore. When the count value becomes nonzero (because another thread + released the semaphore) it is decremented, and the suspended thread is awakened + and gains access to the shared resource. + + @datatypes + #qurt_sem_t + + @param[in] sem Pointer to the semaphore object to access. + + @return + Unused integer value. + + @dependencies + None. 
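+
+   Typical usage is a counting semaphore that guards a pool of resources
+   (an illustrative sketch only; use_resource() is hypothetical):
+   @code
+   qurt_sem_t pool_sem;
+   qurt_sem_init_val(&pool_sem, 4);  // four resources available
+
+   qurt_sem_down(&pool_sem);         // acquire one; may suspend when none are left
+   use_resource();
+   qurt_sem_up(&pool_sem);           // release it again
+   @endcode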
+ */ +int qurt_sem_down(qurt_sem_t *sem); + +/**@ingroup func_qurt_sem_down_timed + When a thread performs a down operation on a semaphore, the result depends on the + semaphore count value: \n + - When the count value is nonzero, it is decremented, and the thread gains access to the + shared resource and continues executing.\n + - When the count value is zero, it is not decremented, and the thread is suspended on the + semaphore. When the count value becomes nonzero (because another thread + released the semaphore) it is decremented, and the suspended thread is awakened + and gains access to the shared resource. Terminate the wait when the specified timeout expires. + If timeout expires, terminate this wait and grant no access to the shared resource. + + @datatypes + #qurt_sem_t + + @param[in] sem Pointer to the semaphore object to access. + @param[in] duration Interval (in microseconds) duration value must be between #QURT_TIMER_MIN_DURATION and + #QURT_TIMER_MAX_DURATION + + @return + #QURT_EOK -- Success \n + #QURT_ETIMEDOUT -- Timeout + + @dependencies + None. + */ +int qurt_sem_down_timed(qurt_sem_t *sem, unsigned long long int duration); + +/**@ingroup func_qurt_sem_try_down + @xreflabel{hdr:qurt_sem_try_down} + Requests access to a shared resource (without suspend). When a thread performs a try down + operation on a semaphore, the result depends on the semaphore count value: \n + - The count value is decremented when it is nonzero. The down operation returns 0 as + the function result, and the thread gains access to the shared resource and is free to + continue executing.\n + - The count value is not decremented when it is zero. The down operation returns -1 + as the function result, and the thread does not gain access to the shared resource + and should not continue executing. + + @datatypes + #qurt_sem_t + + @param[in] sem Pointer to the semaphore object to access. + + @return + 0 -- Success. \n + -1 -- Failure. + + @dependencies + None. + + */ +int qurt_sem_try_down(qurt_sem_t *sem); + +/**@ingroup func_qurt_sem_init + Initializes a semaphore object. + The default initial value of the semaphore count value is 1. + + @param[out] sem Pointer to the initialized semaphore object. + + @return + None. + + @dependencies + None. + + */ +void qurt_sem_init(qurt_sem_t *sem); + +/**@ingroup func_qurt_sem_destroy + Destroys the specified semaphore.\n + @note1hang Semaphores must be destroyed when they are no longer in use. Failure to do + this causes resource leaks in the QuRT kernel.\n + @note1cont Semaphores must not be destroyed while they are still in use. If this occur, + the behavior of QuRT is undefined. + + @datatypes + #qurt_sem_t + + @param[in] sem Pointer to the semaphore object to destroy. + + @return + None. + + @dependencies + None. + */ +void qurt_sem_destroy(qurt_sem_t *sem); + +/**@ingroup func_qurt_sem_init_val + Initializes a semaphore object with the specified value. + + @datatypes + #qurt_sem_t + + @param[out] sem Pointer to the initialized semaphore object. + @param[in] val Initial value of the semaphore count value. + + @return + None. + + @dependencies + None. + + */ +void qurt_sem_init_val(qurt_sem_t *sem, unsigned short val); + +/**@ingroup func_qurt_sem_get_val + Gets the semaphore count value.\n + Returns the current count value of the specified semaphore. + + @datatypes + #qurt_sem_t + + @param[in] sem Pointer to the semaphore object to access. + + @return + Integer semaphore count value + + @dependencies + None. 
+ */
+static inline unsigned short qurt_sem_get_val(qurt_sem_t *sem) { return sem->X.val; }
+int qurt_sem_down_cancellable(qurt_sem_t *sem);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_SEM_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_shmem.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_shmem.h
new file mode 100755
index 0000000000000..980557323708a
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_shmem.h
@@ -0,0 +1,89 @@
+#ifndef QURT_SHMEM_H
+#define QURT_SHMEM_H
+
+/**
+  @file qurt_shmem.h
+
+  @brief
+  Prototypes of QuRT inter-process shared memory APIs
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef MODE_T
+#define MODE_T
+typedef unsigned int mode_t;
+#endif //MODE_T
+
+/**
+ * The shm_open() function establishes a connection between a shared memory object and a file descriptor.
+ * The file descriptor is used by other functions such as shm_mmap() to refer to that shared memory object.
+ *
+ * @param name Pointer to a string naming the shared memory object. The name has to start with "/shm/".
+ * @param oflag File status flags and file access modes of the open file description. The following
+ *              flags are defined in <fcntl.h> and supported:
+ *              O_RDONLY: Open for read access only
+ *              O_RDWR: Open for read or write access
+ *              O_CREAT: If the shared memory object does not exist, create one.
+ * @param mode Permission flags (currently ignored)
+ *
+ * @return file descriptor (positive number) if operation successful.
+ *         negative error code if failed
+ *
+*/
+
+int shm_open(const char * name, int oflag, mode_t mode);
+
+/**
+ * The shm_mmap() function creates a shared memory mapping in the virtual address space of
+ * the calling process.
+ *
+ * @param addr The starting address for the new mapping.
+ * @param len Specifies the length of the shared memory region.
+ * @param prot Describes the desired memory protection of the mapping. Same as the one in mmap of POSIX.
+ * @param flags Determines whether updates to the mapping are visible to other processes. Same as
+ *              the one in mmap of POSIX.
+ * @param fd File descriptor of the shared memory object.
+ * @param offset Unused.
+ *
+ * @return The starting address for the new mapping if the operation is successful.
+ *         negative error code if failed
+ *
+*/
+
+void *shm_mmap(void *addr, unsigned int len, int prot, int flags, int fd, unsigned int offset);
+
+/**
+ * The shm_close() function removes a connection between a shared memory object and a file descriptor.
+ * If no file descriptor remains connected to the shared memory object, the shared memory object is
+ * deleted automatically. A shared memory object has the same virtual address in every process; this
+ * is a restriction of the single virtual address space.
+ *
+ * @param fd File descriptor of the shared memory object.
+ *
+ * @return 0 if operation successful.
+ * negative error code if failed + * +*/ + + +int shm_close(int fd); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_signal.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_signal.h new file mode 100755 index 0000000000000..3a89c53394ad5 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_signal.h @@ -0,0 +1,518 @@ +#ifndef QURT_SIGNAL_H +#define QURT_SIGNAL_H + +/** + @file qurt_signal.h + @brief Prototypes of kernel signal API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup signals_types +@{ */ +#define QURT_SIGNAL_ATTR_WAIT_ANY 0x00000000 /**< Wait any. */ +#define QURT_SIGNAL_ATTR_WAIT_ALL 0x00000001 /**< Wait all. */ + +/*===================================================================== + Typedefs + ======================================================================*/ + + +/** QuRT signal type. + */ +typedef union { + /** @cond */ + unsigned long long int raw; + struct { + unsigned int signals; + unsigned int waiting; + unsigned int queue; + unsigned int attribute; + }X; + /** @endcond */ +} qurt_signal_t; + + +/** QuRT 64-bit signal type. + */ +typedef struct { + /** @cond */ + qurt_signal_t signal_sum; + unsigned long long signals; + unsigned long long waiting; + /** @endcond */ +} qurt_signal_64_t; +/** @} */ /* end_addtogroup signals_types */ +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_signal_init + Initializes a signal object. + Signal returns the initialized object. + The signal object is initially cleared. + + @note1hang Each signal-based object has one or more kernel resources associated with it; + to prevent resource leaks, call qurt_signal_destroy() + when this object is not used anymore + @datatypes + #qurt_signal_t + + @param[in] *signal Pointer to the initialized object. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_init(qurt_signal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_destroy + Destroys the specified signal object. + + @note1hang Signal objects must be destroyed when they are no longer in use. Failure + to do this causes resource leaks in the QuRT kernel.\n + @note1cont Signal objects must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + + @datatypes + #qurt_signal_t + + @param[in] *signal Pointer to the signal object to destroy. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_destroy(qurt_signal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_wait + @xreflabel{hdr:qurt_signal_wait} + Suspends the current thread until the specified signals are set. 
+ + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates + waiting on a signal, and 0 indicates not waiting on the signal. + + If a thread is waiting on a signal object for any of the specified set of signals to set, + and one or more of those signals is set in the signal object, the thread is awakened. + + If a thread is waiting on a signal object for all of the specified set of signals to be set, + and all of those signals are set in the signal object, the thread is awakened. + + The specified set of signals can be cleared when the signal is set. + + @note1hang At most, one thread can wait on a signal object at any given time. + + @datatypes + #qurt_signal_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to + wait on. + @param[in] attribute Indicates whether the thread waits to set any of the signals, or to set all of + them. \n + @note1hang The wait-any and wait-all types are mutually exclusive.\n Values:\n + - #QURT_SIGNAL_ATTR_WAIT_ANY \n + - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend + + @return + A 32-bit word with current signals. + + @dependencies + None. +*/ +/* ======================================================================*/ +unsigned int qurt_signal_wait(qurt_signal_t *signal, unsigned int mask, + unsigned int attribute); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_wait_timed + @xreflabel{hdr:qurt_signal_wait} + Suspends the current thread until the specified signals are set or until timeout. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates + waiting on a signal, and 0 indicates not waiting. + + If a thread is waiting on a signal object for any of the specified set of signals to be set, + and one or more of those signals is set in the signal object, the thread is awakened. + + If a thread is waiting on a signal object for all of the specified set of signals to be set, + and all of those signals are set in the signal object, the thread is awakened. + + The specified set of signals can be cleared after the signal is set. + + @note1hang At most, one thread can wait on a signal object at any given time. + + @datatypes + #qurt_signal_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value that identifies the individual signals in the signal object to wait on. + @param[in] attribute Indicates whether the thread must wait until any of the signals are set, or until all of + them are set. \n + @note1hang The wait-any and wait-all types are mutually exclusive.\n Values:\n + - #QURT_SIGNAL_ATTR_WAIT_ANY \n + - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend + @param[out] signals Bitmask of signals that are set + @param[in] duration Duration (microseconds) to wait. Must be in the range + [#QURT_TIMER_MIN_DURATION ... #QURT_TIMER_MAX_DURATION] + + @return + #QURT_EOK -- Success; one or more signals were set \n + #QURT_ETIMEDOUT -- Timed-out \n + #QURT_EINVALID -- Duration out of range + + @dependencies + Timed-waiting support in the kernel. 
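+
+   An illustrative sketch (the signal object sig and the event bit assignments
+   are assumptions, not part of this API):
+   @code
+   #define EVT_RX  (1u << 0)   // hypothetical event bits
+   #define EVT_ERR (1u << 1)
+
+   unsigned int sigs = 0;
+   int rc = qurt_signal_wait_timed(&sig, EVT_RX | EVT_ERR,
+                                   QURT_SIGNAL_ATTR_WAIT_ANY,
+                                   &sigs, 10000ULL);  // wait up to 10 ms
+   if (rc == QURT_EOK && (sigs & EVT_RX) != 0) {
+      qurt_signal_clear(&sig, EVT_RX);  // wait operations do not clear signals
+   }
+   @endcode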
+*/
+/* ======================================================================*/
+int qurt_signal_wait_timed(qurt_signal_t *signal, unsigned int mask,
+                unsigned int attribute, unsigned int *signals, unsigned long long int duration);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_wait_any
+  Suspends the current thread until any of the specified signals are set.
+
+  Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates
+  to wait on a signal, and 0 indicates not to wait on it.
+
+  If a thread is waiting on a signal object for any of the specified set of signals to be set,
+  and one or more of those signals is set in the signal object, the thread is awakened.
+
+  @note1hang At most, one thread can wait on a signal object at any given time.
+
+  @datatypes
+  #qurt_signal_t
+
+  @param[in]  signal     Pointer to the signal object to wait on.
+  @param[in]  mask       Mask value identifying the individual signals in the signal object to
+                         wait on.
+
+  @return
+  32-bit word with current signals.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+static inline unsigned int qurt_signal_wait_any(qurt_signal_t *signal, unsigned int mask)
+{
+    return qurt_signal_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ANY);
+}
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_wait_all
+  Suspends the current thread until all of the specified signals are set.
+
+  Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates
+  to wait on a signal, and 0 indicates not to wait on it.
+
+  If a thread is waiting on a signal object for all of the specified set of signals to be set,
+  and all of those signals are set in the signal object, the thread is awakened.
+
+  @note1hang At most, one thread can wait on a signal object at any given time.
+
+  @datatypes
+  #qurt_signal_t
+
+  @param[in]  signal     Pointer to the signal object to wait on.
+  @param[in]  mask       Mask value identifying the individual signals in the signal object to
+                         wait on.
+
+  @return
+  A 32-bit word with current signals.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+static inline unsigned int qurt_signal_wait_all(qurt_signal_t *signal, unsigned int mask)
+{
+    return qurt_signal_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ALL);
+}
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_set
+  Sets signals in the specified signal object.
+
+  Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates
+  to set the signal, and 0 indicates not to set it.
+
+  @datatypes
+  #qurt_signal_t
+
+  @param[in] signal Pointer to the signal object to modify.
+  @param[in] mask   Mask value identifying the individual signals to set in the signal
+                    object.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+void qurt_signal_set(qurt_signal_t *signal, unsigned int mask);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_get
+  Gets a signal from a signal object.
+
+  Returns the current signal values of the specified signal object.
+
+  @datatypes
+  #qurt_signal_t
+
+  @param[in] *signal Pointer to the signal object to access.
+ + @return + A 32-bit word with current signals + + @dependencies + None. +*/ +/* ======================================================================*/ +unsigned int qurt_signal_get(qurt_signal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_clear + Clear signals in the specified signal object. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be cleared, and 0 indicates not to clear it. + + @note1hang Signals must be explicitly cleared by a thread when it is awakened -- the wait + operations do not automatically clear them. + + @datatypes + #qurt_signal_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to clear in the signal object. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_clear(qurt_signal_t *signal, unsigned int mask); + +/**@ingroup func_qurt_signal_wait_cancellable + @xreflabel{hdr:qurt_signal_wait_cancellable} + Suspends the current thread until either the specified signals are set or the wait operation is cancelled. + The operation is cancelled if the user process of the calling thread is killed, or if the calling thread + must finish its current QDI invocation and return to user space. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates + that a signal must be waited on, and 0 indicates not to wait on it. + + If a thread is waiting on a signal object for any of the specified set of signals to be set, and one or + more of those signals is set in the signal object, the thread is awakened. + + If a thread is waiting on a signal object for all of the specified set of signals to be set, and all of + those signals are set in the signal object, the thread is awakened. + + @note1hang At most, one thread can wait on a signal object at any given time. + + @note1cont When the operation is cancelled, the caller must assume that the signal is never set. + + @datatypes + #qurt_signal_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to + wait on. + @param[in] attribute Indicates whether the thread must wait until any of the signals are set, or until all of + them are set. Values:\n + - #QURT_SIGNAL_ATTR_WAIT_ANY \n + - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend + @param[out] return_mask Pointer to the 32-bit mask value that was originally passed to the function. + + + @return + #QURT_EOK -- Wait completed. \n + #QURT_ECANCEL -- Wait cancelled. + + + @dependencies + None. +*/ +/* ======================================================================*/ +int qurt_signal_wait_cancellable(qurt_signal_t *signal, unsigned int mask, + unsigned int attribute, + unsigned int *return_mask); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_64_init + Initializes a 64-bit signal object.\n + The signal argument returns the initialized object. + The signal object is initially cleared. + + @note1hang Each signal-based object has one or more kernel resources associated with it; + to prevent resource leaks, call qurt_signal_destroy() + when this object is not used anymore. + @datatypes + #qurt_signal_64_t + + @param[in] signal Pointer to the initialized object. 
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+/* ======================================================================*/
+void qurt_signal_64_init(qurt_signal_64_t *signal);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_64_destroy
+  Destroys the specified signal object.
+
+  @note1hang 64-bit signal objects must be destroyed when they are no longer in use. Failure
+             to do this causes resource leaks in the QuRT kernel.\n
+  @note1cont Signal objects must not be destroyed while they are still in use. If this
+             occurs, the behavior of QuRT is undefined.
+
+  @datatypes
+  #qurt_signal_64_t
+
+  @param[in] signal Pointer to the signal object to destroy.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+/* ======================================================================*/
+void qurt_signal_64_destroy(qurt_signal_64_t *signal);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_64_wait
+  Suspends the current thread until all of the specified signals are set.
+
+  Signals are represented as bits 0 through 63 in the 64-bit mask value. A mask bit value of 1 indicates
+  that a signal must be waited on, and 0 indicates not to wait on it.
+
+  If a thread is waiting on a signal object for all of the specified set of signals to be set,
+  and all of those signals are set in the signal object, the thread is awakened.
+
+  @note1hang At most, one thread can wait on a signal object at any given time.
+
+  @datatypes
+  #qurt_signal_64_t
+
+  @param[in] signal    Pointer to the signal object to wait on.
+  @param[in] mask      Mask value, which identifies the individual signals in the signal object to
+                       wait on.
+  @param[in] attribute Indicates whether the thread must wait until any of the signals are set, or until all of
+                       them are set. \n
+                       @note1hang The wait-any and wait-all types are mutually exclusive.\n Values:\n
+                       - #QURT_SIGNAL_ATTR_WAIT_ANY \n
+                       - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend
+
+  @return
+  A 64-bit value with the current signals.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+unsigned long long qurt_signal_64_wait(qurt_signal_64_t *signal, unsigned long long mask,
+                unsigned int attribute);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_64_set
+  Sets signals in the specified signal object.
+
+  Signals are represented as bits 0 through 63 in the 64-bit mask value. A mask bit value of 1 indicates
+  that a signal must be set, and 0 indicates not to set it.
+
+  @datatypes
+  #qurt_signal_64_t
+
+  @param[in] signal Pointer to the signal object to modify.
+  @param[in] mask   Mask value identifying the individual signals to set in the signal
+                    object.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+void qurt_signal_64_set(qurt_signal_64_t *signal, unsigned long long mask);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_64_get
+  Gets a signal from a signal object.
+
+  Returns the current signal values of the specified signal object.
+
+  @datatypes
+  #qurt_signal_64_t
+
+  @param[in] *signal Pointer to the signal object to access.
+
+  @return
+  A 64-bit double word with current signals.
+
+  @dependencies
+  None.
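The 64-bit variant follows the same pattern with a wider mask, which matters once more than 32 distinct events share one object. Below is a short hypothetical sketch under the same assumptions as the 32-bit example earlier; the bit position is arbitrary.

```c
#include "qurt.h"

static qurt_signal_64_t g_evt64;

void wide_event_example(void)
{
    qurt_signal_64_init(&g_evt64);

    qurt_signal_64_set(&g_evt64, 1ULL << 40);   /* a bit above position 31 */

    /* Single-bit mask, so wait-all and wait-any behave identically here. */
    unsigned long long got =
        qurt_signal_64_wait(&g_evt64, 1ULL << 40, QURT_SIGNAL_ATTR_WAIT_ALL);

    qurt_signal_64_clear(&g_evt64, got);        /* still cleared explicitly */
    qurt_signal_64_destroy(&g_evt64);
}
```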
+*/ +/* ======================================================================*/ +unsigned long long qurt_signal_64_get(qurt_signal_64_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_64_clear + Clears signals in the specified signal object. + + Signals are represented as bits 0 through 63 in the 64-bit mask value. A mask bit value of 1 + indicates that a signal must be cleared, and 0 indicates not to clear it. + + @note1hang Signals must be explicitly cleared by a thread when it is awakened -- the wait + operations do not automatically clear them. + + @datatypes + #qurt_signal_64_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to clear in the signal object. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_64_clear(qurt_signal_64_t *signal, unsigned long long mask); + +#ifdef __cplusplus +} +#endif + +#endif /* QURT_SIGNAL_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_signal2.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_signal2.h new file mode 100755 index 0000000000000..43975100cbf75 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_signal2.h @@ -0,0 +1,340 @@ +#ifndef QURT_SIGNAL2_H +#define QURT_SIGNAL2_H + +/** + @file qurt_signal2.h + @brief Prototypes of kernel signal2 API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +#define QURT_SIGNAL_ATTR_WAIT_ANY 0x00000000 +#define QURT_SIGNAL_ATTR_WAIT_ALL 0x00000001 + +/*===================================================================== + Typedefs + ======================================================================*/ + +/** @addtogroup signals2_types +@{ */ +/** qurt_signal2 type. + */ +typedef union { + /** @cond */ + struct{ + unsigned int cur_mask; /* Current set of signal bits that are set. */ + unsigned int sig_state; /* Current state. */ + /* Bit 0 -- in anysignal wait. */ + /* Bit 1 -- in allsignal wait. */ + /* Bit 2 -- in interrupt wait. */ + /* Bits 31-3 -- reference count field. */ + unsigned int queue; /* Kernel-maintained futex queue value. */ + unsigned int wait_mask; /* When sig_state indicates a waiter is present, this is the wait mask. */ + }; + unsigned long long int raw; + /** @endcond */ +} qurt_signal2_t; +/* @} */ /* end_addtogroup signals2_types */ + +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_init + + @deprecated use #qurt_signal_init instead. + + Initializes a signal2 object. + Signal returns the initialized object. + The signal object is initially cleared. + + Objects of type signal2 solve a potential race condition between + set() and destroy() operations. + + @datatypes + #qurt_signal2_t + + @param[in] *signal Pointer to the initialized object. + + @return + None. 
+ + @dependencies + Each mutex-based object has an associated + kernel resource(s), therefore users must call qurt_signal2_destroy() + when this object no longer in use. + */ +/* ======================================================================*/ +void qurt_signal2_init(qurt_signal2_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_destroy + + @deprecated use #qurt_signal_destroy instead. + + Destroys the specified signal object. + + @note1cont Signal objects must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + @note1cont Application code should destroy a signal2 object prior to deallocating it. + Calling qurt_signal2_destroy() before deallocating a + signal2 object ensures completion of all qurt_signal2_set() calls. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to destroy. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal2_destroy(qurt_signal2_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_wait + + @deprecated use #qurt_signal_wait instead. + + Suspends the current thread until the specified signals are set. + + Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 indicates + a signal to wait on. + + If a thread calls this API with QURT_SIGNAL_ATTR_WAIT_ANY, the thread will be awakened when + any of the signals specified in the mask are set. + + If a thread calls this API with QURT_SIGNAL_ATTR_WAIT_ALL, the thread will be awakened only + when all the signals specified in the mask are set. + + @note1hang At most, one thread can wait on a signal object at any given time. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to wait on. + @param[in] attribute Specifies whether the thread waits for any of the signals to be set, or for all of + them to be set. Values:\n + - QURT_SIGNAL_ATTR_WAIT_ANY \n + - QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend + @return + A 32-bit word with current signals. + + @dependencies + None. +*/ +/* ======================================================================*/ +unsigned int qurt_signal2_wait(qurt_signal2_t *signal, unsigned int mask, + unsigned int attribute); + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_wait_any + + @deprecated use #qurt_signal_wait_any instead. + + Suspends the current thread until any of the specified signals are set. + + Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 indicates + a signal to wait on. + + The thread will be awakened when any of the signals specified in the mask are set. + + @note1hang At most, one thread can wait on a signal object at any given time. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to + wait on. + + @return + 32-bit word with current signals. + + @dependencies + None. 
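Since every qurt_signal2_* entry point in this header is marked deprecated in favor of its qurt_signal_* counterpart, migration is mechanical; the following sketch (not from the SDK sources) shows the one-to-one mapping.

```c
#include "qurt.h"

void migration_example(void)
{
    /* Deprecated signal2 path. */
    qurt_signal2_t s2;
    qurt_signal2_init(&s2);
    qurt_signal2_set(&s2, 0x1u);
    (void)qurt_signal2_wait(&s2, 0x1u, QURT_SIGNAL_ATTR_WAIT_ANY);
    qurt_signal2_destroy(&s2);

    /* Recommended replacement: same shape, qurt_signal_* names. */
    qurt_signal_t s;
    qurt_signal_init(&s);
    qurt_signal_set(&s, 0x1u);
    (void)qurt_signal_wait(&s, 0x1u, QURT_SIGNAL_ATTR_WAIT_ANY);
    qurt_signal_destroy(&s);
}
```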
+*/ +/* ======================================================================*/ +static inline unsigned int qurt_signal2_wait_any(qurt_signal2_t *signal, unsigned int mask) +{ + return qurt_signal2_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ANY); +} + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_wait_all + + @deprecated use #qurt_signal_wait_all instead. + + Suspends the current thread until all of the specified signals are set. + + Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 indicates + a signal to wait on. + + The thread will be awakened only when all the signals specified in the mask are set. + + @note1hang At most one thread can wait on a signal object at any given time. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to + wait on. + + @return + 32-bit word with current signals. + + @dependencies + None. +*/ +/* ======================================================================*/ +static inline unsigned int qurt_signal2_wait_all(qurt_signal2_t *signal, unsigned int mask) +{ + return qurt_signal2_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ALL); +} + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_set + + @deprecated use #qurt_signal_set instead. + + Sets signals in the specified signal object. + + Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 indicates + that a signal must be set, and 0 indicates not to set the signal. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to set in the signal + object. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_signal2_set(qurt_signal2_t *signal, unsigned int mask); + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_get + + @deprecated use #qurt_signal_get instead. + + Gets a signal from a signal object. + + Returns the current signal values of the specified signal object. + + @datatypes + #qurt_signal2_t + + @param[in] *signal Pointer to the signal object to access. + + @return + 32-bit word with current signals. + + @dependencies + None. +*/ +/* ======================================================================*/ +unsigned int qurt_signal2_get(qurt_signal2_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_clear + + @deprecated use #qurt_signal_clear instead. + + Clear signals in the specified signal object. + + Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be cleared, and 0 indicates not to clear the signal. + + @note1hang Signals must be explicitly cleared by a thread when it is awakened -- the wait + operations do not automatically clear them. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to clear in the signal object. + + @return + None. + + @dependencies + None. 
+ */ +/* ======================================================================*/ +void qurt_signal2_clear(qurt_signal2_t *signal, unsigned int mask); + +/**@ingroup func_qurt_signal2_wait_cancellable + + @deprecated use #qurt_signal_wait_cancellable instead. + + Suspends the current thread until either the specified signals are set or the wait operation is cancelled. + The operation is cancelled if the user process of the calling thread is killed, or if the calling thread + must finish its current QDI invocation and return to user space. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates + that a signal must be waited on, and 0 indicates not to wait on it. + + If a thread is waiting on a signal object for any of the specified set of signals to be set, and one or + more of those signals is set in the signal object, the thread is awakened. + + If a thread is waiting on a signal object for all of the specified set of signals to be set, and all of + those signals are set in the signal object, the thread is awakened. + + @note1hang At most, one thread can wait on a signal object at any given time. + + @note1cont When the operation is cancelled, the caller must assume that the signal is never set. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to + wait on. + @param[in] attribute Indicates whether the thread must wait until any of the signals are set, or until all of + them are set. Values:\n + - #QURT_SIGNAL_ATTR_WAIT_ANY \n + - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend + @param[out] p_returnmask Pointer to the 32-bit mask value that was originally passed to the function. + + + @return + #QURT_EOK -- Wait completed. \n + #QURT_ECANCEL -- Wait cancelled. + + + @dependencies + None. +*/ +int qurt_signal2_wait_cancellable(qurt_signal2_t *signal, + unsigned int mask, + unsigned int attribute, + unsigned int *p_returnmask); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_SIGNAL2_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_space.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_space.h new file mode 100755 index 0000000000000..2c3f9e4496697 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_space.h @@ -0,0 +1,230 @@ +#ifndef QURT_SPACE_H +#define QURT_SPACE_H +/** + @file qurt_space.h + @brief Prototypes of QuRT process control APIs + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** This flag is a request to the OS to suspend the processes just before calling main() +But it is going to be obsoleted and replaced by QURT_PROCESS_SUSPEND_ON_STARTUP */ +#define SPAWNN_FLAG_SUSPEND_ON_STARTUP QURT_PROCESS_SUSPEND_ON_STARTUP + +/** + * Creates and starts a process from ELF of a specified name. The slash symbols + * "/" or "\" are ignored. Do not include the directory name in the input. This function + * accepts the the SPAWN flags. Multiple SPAWN flags can be specified by OR'ing the flags. + * + * @param name ELF name of the executable. 
Name shall not contain directories, + * use "dsp2.elf", instead of "/prj/qct/.../dsp2.elf" + * + * @param return + Process ID -- Success \n + Negative error code -- failure\n + #QURT_EPRIVILEGE -- Caller does not have enough privilege for this operation\n + #QURT_EMEM -- Not enough memory to perform the operation \n + #QURT_EFAILED -- Operation failed \n + #QURT_ENOTALLOWED -- Operation not allowed \n + #QURT_ENOREGISTERED -- Not registered \n + #QURT_ENORESOURCE -- Resource exhaustion \n + #QURT_EINVALID -- Invalid argument value +*/ + +int qurt_spawn_flags(const char * name, int flags); + +/** + Creates and starts a process from an ELF of the specified name. The slash symbols + "/" or "\" are ignored. Do not include the directory name in the input. + + @param name ELF name of the executable. Name shall not contain directories, + use "dsp2.elf", instead of "/prj/qct/.../dsp2.elf". + + @return + Process ID -- Success. \m + Negative error code -- Failure. + +*/ +static inline int qurt_spawn(const char *name) +{ + return qurt_spawn_flags(name,0); +} + +/** + * Returns the process ID of the current process. + * + * @return + * Process ID + * +*/ +#define qurt_getpid qurt_process_get_id + +/** + * The qurt_wait() function waits for status change in a child process. It could be used by parent + * process to block on any child process terminates. + * + * This API returns error if there are no user processes or all user processes got detached. + * + * @param status Pointer to status variable. The variable provides the status value of child process. + * The value comes from exit() system call made by child process. + * + * @return + Process ID of the child process that changes status -- Success \n + * Negative error code -- Failure + * +*/ + +int qurt_wait(int *status); + + +/** @cond */ +/* APIs that allow registering callbacks on spawn of user pd */ +typedef void (*QURT_SPAWN_PFN)(int client_handle, void *data_ptr); //no return, since we won't be error checking it in spawn +typedef int (*QURT_CB_PFN)(int client_handle, void *user_data, void *info); +typedef union { + QURT_SPAWN_PFN spawn_pfn; + QURT_CB_PFN cb_pfn; +} qurt_process_callback_pfn_t; +/** @endcond */ + +/** @cond internal_only */ + +/**@ingroup func_qurt_event_register +Sets the specified bits by mask in the signal passed by the caller. The signal gets set +when the client handle indicated by value goes away (at process exit). Multiple clients can register for the signal +to be set. + +@datatypes + +@param[in] type QURT_PROCESS_EXIT is the only event that can be registered for. +@param[in] value Indicates the client handle of the process for which the event is registered. +@param[in] signal Pointer to the signal object to set when the event occurs. +@param[in] mask Mask bits to set in the signal. +@param[out] data Pointer to the variable that would receive the exit code of the exiting process. +@param[in] datasize Size of the data variable. + +@return +#QURT_EOK -- Success \n +#QURT_EMEM -- Not enough memory to allocate resources \n +#QURT_EVAL -- Invalid values passed to the API + +@dependencies +None. +*/ +int qurt_event_register(int type, int value, qurt_signal_t *psig, unsigned int mask, void *data, unsigned int data_size); + +/**@ingroup func_qurt_callback_register_onspawn +Allows registering for a callback on spawn of any user process. + +@datatypes +#QURT_SPAWN_PFN + +@param[in] pFn Callback function to call when any user process is spawned. +@param[in] user_data Pointer to the argument that the callback must be called with. 
+
+
+@return If a positive value is obtained, it is the handle to use when deregistering the callback.
+        Multiple clients can register for the callback on spawn, and some clients might choose to deregister.
+
+        If the call fails, QURT_EFATAL is returned.
+
+@dependencies
+None.
+*/
+int qurt_callback_register_onspawn(QURT_SPAWN_PFN pFn, void *user_data);
+
+/**@ingroup func_qurt_callback_deregister_onspawn
+Allows de-registering the on-spawn callback.
+
+@param[in] callback_handle Handle returned by qurt_callback_register_onspawn.
+
+@return
+#QURT_EOK -- De-registering was successful.
+
+@dependencies
+None.
+*/
+int qurt_callback_deregister_onspawn(int callback_handle);
+
+/**@ingroup func_qurt_process_callback_register
+Allows registering for a callback during or after image loading.
+Generic callback types:
+   Functions similarly to qurt_callback_register_onspawn(). The callback is called after the process is
+   loaded, before the process thread starts. The callback has no return value and has no info provided
+   from the OS.
+   pFn  - QURT_SPAWN_PFN
+   type - QURT_PROCESS_CB_GENERIC
+   arg1 - not used
+   arg2 - not used
+   arg3 - not used
+Note callback types:
+   The callback is called during process loading: before segment loading (QURT_PROCESS_NOTE_CB_PRE_MAP),
+   or after segment loading (QURT_PROCESS_NOTE_CB_POST_MAP). The OS provides info to the callback. The info
+   argument in the callback is populated with a pointer to the mapped note corresponding to the callback.
+   The callback has a return value; the loader fails if the callback returns a value that is not QURT_EOK.
+   pFn  - QURT_CB_PFN
+   type - QURT_PROCESS_NOTE_CB_PRE_MAP or QURT_PROCESS_NOTE_CB_POST_MAP
+   arg1 - note type (ex: NOTE_TYPE_POOL_INFO, NOTE_TYPE_SEGMENT_INFO, NOTE_TYPE_ARB_INFO)
+   arg2 - note name
+   arg3 - not used
+
+@datatypes
+
+@param[in] pFn       Callback function to call.
+@param[in] type      Callback type.
+@param[in] user_data Pointer to the argument that the callback must be called with.
+@param[in] arg1      Argument interpreted by the OS based on the callback type.
+@param[in] arg2      Argument interpreted by the OS based on the callback type.
+@param[in] arg3      Argument interpreted by the OS based on the callback type (currently not used).
+
+
+@return If a positive value is obtained, it is the handle to use when deregistering the callback.
+        Multiple clients can register for the callback on spawn, and some clients might choose to deregister.
+
+        If the call fails, QURT_EFATAL is returned.
+
+@dependencies
+None.
+*/
+int qurt_process_callback_register(qurt_process_callback_pfn_t pFn,
+                                   qurt_process_cb_type_t type,
+                                   void *user_data,
+                                   qurt_process_callback_arg_t arg1,
+                                   qurt_process_callback_arg_t arg2,
+                                   qurt_process_callback_arg_t arg3);
+
+
+
+/**@ingroup func_qurt_process_callback_deregister
+Allows de-registering a callback for image loading.
+@param[in] callback_handle Handle returned by qurt_process_callback_register.
+
+@return
+#QURT_EOK -- De-registering was successful.
+
+@dependencies
+None.
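Tying the process APIs together, here is a hypothetical sketch of registering an on-spawn callback, spawning an image, and reaping it with qurt_wait(). The ELF name follows the documented convention (name only, no directory components); printf is assumed available purely for illustration.

```c
#include "qurt.h"
#include <stdio.h>

static void on_spawn_cb(int client_handle, void *data_ptr)
{
    (void)data_ptr;
    printf("process spawned, client handle %d\n", client_handle);
}

int spawn_and_reap(void)
{
    int cb = qurt_callback_register_onspawn(on_spawn_cb, NULL);
    if (cb < 0) {
        return cb;                        /* QURT_EFATAL on failure */
    }

    int pid = qurt_spawn("dsp2.elf");     /* name only, no directories */
    if (pid < 0) {
        (void)qurt_callback_deregister_onspawn(cb);
        return pid;                       /* negative error code */
    }

    int status = 0;
    int child = qurt_wait(&status);       /* block until a child exits */

    (void)qurt_callback_deregister_onspawn(cb);
    return (child == pid) ? status : child;
}
```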
+*/ +int qurt_process_callback_deregister(int callback_handle); +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_SPACE_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_srm_consts.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_srm_consts.h new file mode 100755 index 0000000000000..48a8b6a38c402 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_srm_consts.h @@ -0,0 +1,32 @@ +#ifndef QURT_SRM_CONSTS_H +#define QURT_SRM_CONSTS_H +/** + @file qurt_srm_consts.h + @brief Type definitions for srm + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2020-2021, 2022 by Qualcomm Technologies, Inc. All Rights Reserved +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** @cond */ +#define QURT_SRM_WAKEUP_REQUEST 1U << 0 /**< Value = 1: Send wakeup request to the SRM server. */ +#define QURT_SRM_SET_HANDLE 1U << 1 /**< Value = 2: Set the client handle for a new SRM client. */ +#define QURT_SRM_ALLOC_KERNEL_PAGES 1U << 2 /**< Value = 4: Allocate pages from the kernel VA space. */ +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_SRM_CONSTS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_srm_driver.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_srm_driver.h new file mode 100755 index 0000000000000..5489e3dddbcca --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_srm_driver.h @@ -0,0 +1,140 @@ +#ifndef QURT_SRM_DRIVER_H +#define QURT_SRM_DRIVER_H +/** + @file qurt_srm_driver.h + @brief Definitions, macros, and prototypes used by SRM drivers. + + EXTERNAL FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2021-2023 by Qualcomm Technologies, Inc. All Rights Reserved. + + =============================================================================*/ +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* +|| Define qurt_srm_driver_t structure, which represents +|| the "registration" object for an SRM driver. +*/ +/** @cond internal_only */ +struct _qurt_srm_driver { + const char *name; + qurt_qdi_obj_t *obj; +}; + +typedef struct _qurt_srm_driver qurt_srm_driver_t; + +/* +|| qurt_srm_object_invoke() is an internal equivalent to qurt_qdi_handle_invoke(). +|| It behaves the same, but it takes a QDI object pointer instead of a handle. +*/ + +#define qurt_srm_object_invoke(o,m,...) 
\ + _QDMPASTE(_QDMSOI,_QDMCNT(QDI_HANDLE_LOCAL_CLIENT,o,m,##__VA_ARGS__))(QDI_HANDLE_LOCAL_CLIENT,o,m,##__VA_ARGS__) +#define _QDMSOI3(a,b,c) qurt_srm_oi3(a,b,c) +#define _QDMSOI4(a,b,c,d) qurt_srm_oi4(a,b,c,(int)(d)) +#define _QDMSOI5(a,b,c,d,e) qurt_srm_oi5(a,b,c,(int)(d),(int)(e)) +#define _QDMSOI6(a,b,c,d,e,f) qurt_srm_oi6(a,b,c,(int)(d),(int)(e),(int)(f)) +#define _QDMSOI7(a,b,c,d,e,f,g) qurt_srm_oi7(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g)) +#define _QDMSOI8(a,b,c,d,e,f,g,h) qurt_srm_oi8(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h)) +#define _QDMSOI9(a,b,c,d,e,f,g,h,i) qurt_srm_oi9(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i)) +#define _QDMSOI10(a,b,c,d,e,f,g,h,i,j) qurt_srm_oi10(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j)) +#define _QDMSOI11(a,b,c,d,e,f,g,h,i,j,k) qurt_srm_oi11(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j),(int)(k)) +#define _QDMSOI12(a,b,c,d,e,f,g,h,i,j,k,l) qurt_srm_oi12(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j),(int)(k),(int)(l)) + +int qurt_srm_oi3(int, qurt_qdi_obj_t *, int); +int qurt_srm_oi4(int, qurt_qdi_obj_t *, int, int); +int qurt_srm_oi5(int, qurt_qdi_obj_t *, int, int, int); +int qurt_srm_oi6(int, qurt_qdi_obj_t *, int, int, int, int); +int qurt_srm_oi7(int, qurt_qdi_obj_t *, int, int, int, int, int); +int qurt_srm_oi8(int, qurt_qdi_obj_t *, int, int, int, int, int, int); +int qurt_srm_oi9(int, qurt_qdi_obj_t *, int, int, int, int, int, int, int); +int qurt_srm_oi10(int, qurt_qdi_obj_t *, int, int, int, int, int, int, int, int); +int qurt_srm_oi11(int, qurt_qdi_obj_t *, int, int, int, int, int, int, int, int, int); +int qurt_srm_oi12(int, qurt_qdi_obj_t *, int, int, int, int, int, int, int, int, int, int); + +#define QDI_SRM_INIT 192 + +/* +|| QURT_SRM_DECLARE_DRIVER() declares an SRM driver to the SRM infrastructure. +|| +|| The three arguments are: +|| unique_id -- Unique C identifier, unused but must be a unique global symbol. +|| name -- Name of the driver by which an SRM client attempts to open it. +|| obj -- Pointer to the singleton object of the driver, which handles things such as +|| initialization and QDI_OPEN requests. +*/ + +#define QURT_SRM_DECLARE_DRIVER(unique_id, xname, xobj) \ + __attribute__((section(".srm.rodata.user.main.DECL"))) const qurt_srm_driver_t unique_id = \ + { .name = xname, .obj = xobj } + + +/*@ingroup func_qurt_srm_mapping_create + Creates a memory mapping in pagetable with specified attributes + + @param[in] client_handle Client handle representing the process for which + mapping would be created. + @param[in] pageno_virt pointer to the virtual page. NULL indicates SRM + would indicate the virtual memory. + @param[in] pageno_phys physical page to be used for the mapping + @param[in] page_count number of 4k pages to be mapped + @param[in] cache_attr cache attributes for the mapping + @param[in] perm permissions to be used for the mapping + + @return value greater than 0 indicates a handle which can be passed to + qdi_close() to remove the mapping. Negative value indicates + an error. + + @dependencies + None. +*/ +int qurt_srm_mapping_create(int client_handle, + unsigned *pageno_virt, + unsigned pageno_phys, + unsigned page_count, + qurt_mem_cache_mode_t cache_attr, + qurt_perm_t perm); + + +/**@ingroup func_qurt_srm_get_pid + Gets the PID for the client_handle that is passed. + + @param[in] client_handle Client handle for which PID is required. 
+
+   @return PID of the client.
+           A negative PID value '-1' is returned in case of error.
+
+   @dependencies
+   None.
+*/
+unsigned qurt_srm_get_pid(int client_handle);
+
+
+/*@ingroup func_qurt_srm_get_thread_id
+   Gets the thread ID of the client requesting a service from SRM.
+
+   @param[in] None.
+
+   @return Thread ID of the client thread.
+
+   @dependencies
+   None.
+*/
+qurt_thread_t qurt_srm_get_client_thread_id(void);
+
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_SRM_DRIVER_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_stid.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_stid.h
new file mode 100755
index 0000000000000..379f46aaa4b80
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_stid.h
@@ -0,0 +1,73 @@
+#ifndef QURT_STID_H
+#define QURT_STID_H
+/**
+  @file qurt_stid.h
+  Prototypes of software thread identifier (stid) interface APIs.
+  An stid is an 8-bit identifier that can be assigned to a software thread.
+  The performance monitor logic uses the stid as a counting match criterion
+  for maskable events. The stid is also used by the hardware debugger
+  (ISDB) to match breakpoints.
+
+  EXTERNAL FUNCTIONS
+   None.
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+  Copyright (c) 2024 Qualcomm Technologies, Inc.
+  All rights reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_stid_alloc
+  Allocates a unique stid.
+
+  @param[in]  pid  Process identifier.
+  @param[out] stid Pointer to a variable in which to return the stid.
+
+  @return
+  QURT_EOK - Allocation success
+  QURT_ENORESOURCE - No stid available for allocation
+  QURT_EINVALID - Invalid input
+
+  @dependencies
+  None.
+ */
+int qurt_stid_alloc(unsigned int pid, unsigned int *stid);
+
+/**@ingroup func_qurt_stid_release
+  Releases the stid.
+
+
+  @param[in] pid  Process identifier.
+  @param[in] stid STID to release.
+
+  @note1hang
+  The user shall clear the released stid from the process or thread(s),
+  resetting it to the default value (QURT_STID_DEFAULT), before releasing
+  that stid.
+
+  @return
+  QURT_EOK - Release success
+  QURT_ENOTALLOWED - Operation not allowed for a pid
+  QURT_EINVALID - Invalid stid
+
+  @dependencies
+  None.
+ */
+int qurt_stid_release(unsigned int pid, unsigned int stid);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_STID_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_thread.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_thread.h
new file mode 100755
index 0000000000000..499699e7c72e2
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_thread.h
@@ -0,0 +1,1260 @@
+#ifndef QURT_THREAD_H
+#define QURT_THREAD_H
+/**
+  @file qurt_thread.h
+  @brief Prototypes of Thread API
+
+  EXTERNAL FUNCTIONS
+   None.
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2018, 2020-2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+ +=============================================================================*/ + + +/* The followings are for C code only */ +#ifndef __ASSEMBLER__ +#include +#include "qurt_pmu.h" +#include "qurt_api_version.h" +#endif /* __ASSEMBLER__ */ +#include "qurt_consts.h" +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ + + +/* + Bitmask configuration to select DSP hardware threads. + To select all the hardware threads, use #QURT_THREAD_CFG_BITMASK_ALL + and the following: \n + - For QDSP6 V2/V3, all six hardware threads are selected \n + - For QDSP6 V3L, all four hardware threads are selected \n + - For QDSP6 V4, all three hardware threads are selected + */ + +#define QURT_THREAD_CFG_BITMASK_HT0 0x00000001 /**< HTO. */ +#define QURT_THREAD_CFG_BITMASK_HT1 0x00000002 /**< HT1. */ +#define QURT_THREAD_CFG_BITMASK_HT2 0x00000004 /**< HT2. */ +#define QURT_THREAD_CFG_BITMASK_HT3 0x00000008 /**< HT3. */ +#define QURT_THREAD_CFG_BITMASK_HT4 0x00000010 /**< HT4. */ +#define QURT_THREAD_CFG_BITMASK_HT5 0x00000020 /**< HT5. */ +/** @cond rest_reg_dist */ +/** @addtogroup thread_macros +@{ */ +/** @xreflabel{sec:qurt_thread_cfg} */ + +#define QURT_THREAD_CFG_BITMASK_ALL 0x000000ffU /**< Select all the hardware threads. */ +/** @} */ /* end_addtogroup thread_macros */ +/** @endcond */ + +#define QURT_THREAD_CFG_USE_RAM 0x00000000 /**< Use RAM. */ +#define QURT_THREAD_CFG_USE_TCM 0x00000100 /**< Use TCM. */ +/** @cond rest_reg_dist */ +/** @addtogroup thread_macros +@{ */ +#define QURT_THREAD_BUS_PRIO_DISABLED 0 /**< Thread internal bus priority disabled. */ +#define QURT_THREAD_BUS_PRIO_ENABLED 1 /**< Thread internal bus priority enabled. */ +/** @} */ /* end_addtogroup thread_macros */ +/** @endcond */ + +#define QURT_THREAD_AUTOSTACK_DISABLED 0 /**< Thread has autostack v2 feature disabled. */ +#define QURT_THREAD_AUTOSTACK_ENABLED 1 /**< Thread has autostack v2 feature enabled. */ + +/* + Macros for QuRT thread attributes. + */ +#define QURT_HTHREAD_L1I_PREFETCH 0x1 /**< Enables hardware L1 instruction cache prefetching. */ +#define QURT_HTHREAD_L1D_PREFETCH 0x2 /**< Enables hardware L1 data cache prefetching. */ +#define QURT_HTHREAD_L2I_PREFETCH 0x4 /**< Enables hardware L2 instruction cache prefetching. */ +#define QURT_HTHREAD_L2D_PREFETCH 0x8 /**< Enables hardware L2 data cache prefetching. */ +#define QURT_HTHREAD_DCFETCH 0x10 /**< Enables DC fetch to the provided virtual address. + DC fetch indicates the hardware that a data memory access is likely. + Instructions are dropped when there is high bus utilization. */ +/** @addtogroup thread_macros +@{ */ +/** @xreflabel{hdr:partition_tcm} */ +/* + Below value is used to create legacy QuRT threads by default. + If a thread has this as the detach_state, the thread can be joined + on until it exits. When we are able to change default behavior of all + QuRT threads to JOINABLE (posix default), we can remove this legacy + behavior. +*/ +#define QURT_THREAD_ATTR_CREATE_LEGACY 0U /**< Create a legacy QuRT thread by default. If a thread has this as a detach state, the thread can be joined on until it exits. */ +#define QURT_THREAD_ATTR_CREATE_JOINABLE 1U /**< Create a joinable thread. */ +#define QURT_THREAD_ATTR_CREATE_DETACHED 2U /**< Create a detached thread. 
*/ +/** @} */ /* end_addtogroup thread_macros */ + + +#define QURT_THREAD_ATTR_NAME_MAXLEN 16 /**< Maximum name length. */ +#define QURT_THREAD_ATTR_TCB_PARTITION_RAM 0 /**< Creates threads in RAM/DDR. */ +#define QURT_THREAD_ATTR_TCB_PARTITION_TCM 1 /**< Creates threads in TCM. */ +/** @cond rest_reg_dist */ +/** @addtogroup thread_macros +@{ */ +#define QURT_THREAD_ATTR_TCB_PARTITION_DEFAULT QURT_THREAD_ATTR_TCB_PARTITION_RAM /**< Backward compatibility. */ +#define QURT_THREAD_ATTR_PRIORITY_DEFAULT 254 /**< Priority.*/ +#define QURT_THREAD_ATTR_ASID_DEFAULT 0 /**< ASID. */ +#define QURT_THREAD_ATTR_AFFINITY_DEFAULT (-1) /**< Affinity. */ +#define QURT_THREAD_ATTR_BUS_PRIO_DEFAULT 255 /**< Bus priority. */ +#define QURT_THREAD_ATTR_AUTOSTACK_DEFAULT 0 /**< Default autostack v2 disabled thread. */ +#define QURT_THREAD_ATTR_TIMETEST_ID_DEFAULT (-2) /**< Timetest ID. */ +#define QURT_THREAD_ATTR_STID_DEFAULT QURT_STID_DEFAULT /**< STID. */ +#define QURT_THREAD_ATTR_STID_ENABLE 1 /**< Indicate to allocate STID during thread creation. */ + +#define QURT_PRIORITY_FLOOR_DEFAULT 255U /**< Default floor. */ +/** @} */ /* end_addtogroup thread_macros */ + +// Option for suspending thread +#define QURT_THREAD_SUSPEND_SYNCHRONOUS 0x0U // bit#0 +#define QURT_THREAD_SUSPEND_ASYNCHRONOUS 0x1U // bit#0 +#define QURT_THREAD_SUSPEND_KEEP_HMX 0x0U // bit#1 +#define QURT_THREAD_SUSPEND_DETACH_HMX 0x2U // bit#1 + +// Option for resuming thread +#define QURT_THREAD_RESUME_DEFAULT 0x0 + +// Thread property IDs +#define QURT_THREAD_PROPERTY_SUSPENDABLE 0x0U +#define QURT_THREAD_PROPERTY_RESUMABLE 0x1 + +// Thread group +#define QURT_THREAD_DEFAULT_GROUP_ID 0x0U +#define QURT_THREAD_GROUP_ID_MASK 0x3FU + +/** @endcond*/ + + +/* The followings are for C code only */ +#ifndef __ASSEMBLER__ +/*============================================================================= + TYPEDEFS +=============================================================================*/ +/** @addtogroup thread_types +@{ */ +/** @cond rest_reg_dist */ +typedef unsigned int qurt_cache_partition_t; /**< QuRT cache partition type. */ + +#define CCCC_PARTITION 0U /**< Use the CCCC page attribute bits to determine the main or auxiliary partition. */ +#define MAIN_PARTITION 1U /**< Use the main partition. */ +#define AUX_PARTITION 2U /**< Use the auxiliary partition. */ +#define MINIMUM_PARTITION 3U /**< Use the minimum. Allocates the least amount of cache (no-allocate policy possible) for this thread. */ +/** @endcond */ + +/** Thread ID type. */ +typedef unsigned int qurt_thread_t; + +/** @cond rest_reg_dist */ +/** Thread attributes. */ +typedef struct _qurt_thread_attr { + + char name[QURT_THREAD_ATTR_NAME_MAXLEN]; /**< Thread name. */ + unsigned char tcb_partition; /**< Indicates whether the thread TCB resides in RAM or + on chip memory (TCM). */ + unsigned char stid; /**< Software thread ID used to configure the stid register + for profiling purposes. */ + unsigned short priority; /**< Thread priority. */ + unsigned char autostack:1; /**< Autostack v2 enabled thread. */ + unsigned char group_id:6; /**< Group ID. */ + unsigned char reserved:1; /**< Reserved bits. */ + unsigned char bus_priority; /**< Internal bus priority. */ + unsigned short timetest_id; /**< Timetest ID. */ + unsigned int stack_size; /**< Thread stack size. */ + void *stack_addr; /**< Pointer to the stack address base. The range of the stack is + (stack_addr, stack_addr+stack_size-1). */ + unsigned short detach_state; /**< Detach state of the thread. 
*/
+
+} qurt_thread_attr_t;
+/** @endcond */
+
+/** @cond rest_reg_dist */
+/** Dynamic TLS attributes. */
+typedef struct qurt_tls_info {
+    unsigned int module_id;    /**< Module ID of the loaded dynamic linked library. */
+    unsigned int tls_start;    /**< Start address of the TLS data. */
+    unsigned int tls_data_end; /**< End address of the TLS RW data. */
+    unsigned int tls_end;      /**< End address of the TLS data. */
+}qurt_tls_info;
+/** @endcond */
+
+/** @} */ /* end_addtogroup thread_types */
+
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+/**@ingroup func_qurt_thread_attr_init
+  Initializes the structure used to set the thread attributes when a thread is created.
+  After an attribute structure is initialized, explicitly set the individual attributes in the structure
+  using the thread attribute operations.
+
+  The initialize operation sets the following default attribute values: \n
+  - Name -- NULL string \n
+  - TCB partition -- QURT_THREAD_ATTR_TCB_PARTITION_DEFAULT
+  - Priority -- QURT_THREAD_ATTR_PRIORITY_DEFAULT \n
+  - Autostack -- QURT_THREAD_ATTR_AUTOSTACK_DEFAULT \n
+  - Bus priority -- QURT_THREAD_ATTR_BUS_PRIO_DEFAULT \n
+  - Timetest ID -- QURT_THREAD_ATTR_TIMETEST_ID_DEFAULT \n
+  - stack_size -- 0 \n
+  - stack_addr -- NULL \n
+  - detach state -- #QURT_THREAD_ATTR_CREATE_LEGACY \n
+  - STID -- #QURT_THREAD_ATTR_STID_DEFAULT
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in,out] attr Pointer to the thread attribute structure.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_init (qurt_thread_attr_t *attr)
+{
+
+    attr->name[0] = '\0';
+    attr->tcb_partition = QURT_THREAD_ATTR_TCB_PARTITION_DEFAULT;
+    attr->priority = QURT_THREAD_ATTR_PRIORITY_DEFAULT;
+    attr->autostack = QURT_THREAD_ATTR_AUTOSTACK_DEFAULT; /* Default attribute for autostack v2*/
+    attr->bus_priority = QURT_THREAD_ATTR_BUS_PRIO_DEFAULT;
+    attr->timetest_id = (unsigned short)QURT_THREAD_ATTR_TIMETEST_ID_DEFAULT;
+    attr->stack_size = 0;
+    attr->stack_addr = NULL;
+    attr->detach_state = QURT_THREAD_ATTR_CREATE_LEGACY;
+    attr->stid = QURT_THREAD_ATTR_STID_DEFAULT;
+    attr->group_id = QURT_THREAD_DEFAULT_GROUP_ID;
+}
+
+/**@ingroup func_qurt_thread_attr_set_name
+  Sets the thread name attribute.\n
+  This function specifies the name a thread uses.
+  Thread names identify a thread during debugging or profiling.
+  The maximum name length is 16 characters. \n
+  @note1hang Thread names differ from the kernel-generated thread identifiers used to
+             specify threads in the API thread operations.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in,out] attr Pointer to the thread attribute structure.
+  @param[in]     name Pointer to the character string containing the thread name.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_set_name (qurt_thread_attr_t *attr, const char *name)
+{
+    strlcpy (attr->name, name, QURT_THREAD_ATTR_NAME_MAXLEN);
+    attr->name[QURT_THREAD_ATTR_NAME_MAXLEN - 1] = '\0';
+}
+
+
+/**@ingroup func_qurt_thread_attr_set_tcb_partition
+  Sets the thread TCB partition attribute.
+  Specifies the memory type where a TCB of a thread is allocated.
+  Allocates TCBs in RAM or TCM/LPM.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in,out] attr          Pointer to the thread attribute structure.
+  @param[in]     tcb_partition TCB partition.
Values:\n + - 0 -- TCB resides in RAM \n + - 1 -- TCB resides in TCM/LCM @tablebulletend + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_thread_attr_set_tcb_partition (qurt_thread_attr_t *attr, unsigned char tcb_partition) +{ + attr->tcb_partition = tcb_partition; +} + +/**@ingroup func_qurt_thread_attr_set_priority + Sets the thread priority to assign to a thread. + Thread priorities are specified as numeric values in the range 1 to 254, with 1 representing + the highest priority. + Priority 0 and 255 are internally used by the kernel for special purposes. + + @datatypes + #qurt_thread_attr_t + + @param[in,out] attr Pointer to the thread attribute structure. + @param[in] priority Thread priority. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_thread_attr_set_priority (qurt_thread_attr_t *attr, unsigned short priority) +{ + attr->priority = priority; +} + +/**@ingroup func_qurt_thread_attr_set_detachstate + Sets the thread detach state with which thread is created. + Thread detach state is either joinable or detached; specified by the following values: + - #QURT_THREAD_ATTR_CREATE_JOINABLE \n + - #QURT_THREAD_ATTR_CREATE_DETACHED \n + + When a detached thread is created (QURT_THREAD_ATTR_CREATE_DETACHED), its thread + ID and other resources are reclaimed as soon as the thread exits. When a joinable thread + is created (QURT_THREAD_ATTR_CREATE_JOINABLE), it is assumed that some + thread waits to join on it using a qurt_thread_join() call. + By default, detached state is QURT_THREAD_ATTR_CREATE_LEGACY + If detached state is QURT_THREAD_ATTR_CREATE_LEGACY then other + thread can join before thread exits but it will not wait other thread to join. + + @note1hang For a joinable thread (QURT_THREAD_ATTR_CREATE_JOINABLE), it is very + important that some thread joins on it after it terminates, otherwise + the resources of that thread are not reclaimed, causing memory leaks. + + @datatypes + #qurt_thread_attr_t + + @param[in,out] attr Pointer to the thread attribute structure. + @param[in] detachstate Thread detach state. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_thread_attr_set_detachstate (qurt_thread_attr_t *attr, unsigned short detachstate) +{ + if(detachstate == QURT_THREAD_ATTR_CREATE_JOINABLE || detachstate == QURT_THREAD_ATTR_CREATE_DETACHED){ + attr->detach_state = detachstate; + } +} + + +/**@ingroup func_qurt_thread_attr_set_timetest_id + Sets the thread timetest attribute.\n + Specifies the timetest identifier to use by a thread. + + Timetest identifiers are used to identify a thread during debugging or profiling. \n + @note1hang Timetest identifiers differ from the kernel-generated thread identifiers used to + specify threads in the API thread operations. + + @datatypes + #qurt_thread_attr_t + + @param[in,out] attr Pointer to the thread attribute structure. + @param[in] timetest_id Timetest identifier value. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_thread_attr_set_timetest_id (qurt_thread_attr_t *attr, unsigned short timetest_id) +{ + attr->timetest_id = timetest_id; +} + +/**@ingroup func_qurt_thread_attr_set_stack_size + @xreflabel{sec:set_stack_size} + Sets the thread stack size attribute.\n + Specifies the size of the memory area to use for a call stack of a thread. + + The thread stack address (Section @xref{sec:set_stack_addr}) and stack size specify the memory area used as a + call stack for the thread. 
The user is responsible for allocating the memory area used for + the stack. + + @datatypes + #qurt_thread_attr_t + + @param[in,out] attr Pointer to the thread attribute structure. + @param[in] stack_size Size (in bytes) of the thread stack. + + @return + None. + + @dependencies + None. +*/ + +static inline void qurt_thread_attr_set_stack_size (qurt_thread_attr_t *attr, unsigned int stack_size) +{ + attr->stack_size = stack_size; +} + +/**@ingroup func_qurt_thread_attr_set_stack_size2 + @xreflabel{sec:set_stack_size} + Sets the thread stack size attribute for island threads that require a higher guest OS stack size than the stack size + defined in the configuration XML.\n + Specifies the size of the memory area to use for a call stack of an island thread in User and Guest mode. + + The thread stack address (Section @xref{sec:set_stack_addr}) and stack size specify the memory area used as a + call stack for the thread. The user is responsible for allocating the memory area used for + the stack. + + @datatypes + #qurt_thread_attr_t + + @param[in,out] attr Pointer to the thread attribute structure. + @param[in] user_stack_size Size (in bytes) of the stack usage in User mode. + @param[in] root_stack_size Size (in bytes) of the stack usage in Guest mode. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_thread_attr_set_stack_size2 (qurt_thread_attr_t *attr, unsigned short user_stack_size, unsigned short root_stack_size) +{ + union qurt_thread_stack_info{ + unsigned int raw_size; + struct{ + unsigned short user_stack; + unsigned short root_stack; + }; + }user_root_stack_size; + user_root_stack_size.user_stack = user_stack_size; + user_root_stack_size.root_stack = root_stack_size; + + attr->stack_size = user_root_stack_size.raw_size; +} + +/**@ingroup func_qurt_thread_attr_set_stack_addr + @xreflabel{sec:set_stack_addr} + Sets the thread stack address attribute. \n + Specifies the base address of the memory area to use for a call stack of a thread. + + stack_addr must contain an address value that is 8-byte aligned. + + The thread stack address and stack size (Section @xref{sec:set_stack_size}) specify the memory area used as a + call stack for the thread. \n + @note1hang The user is responsible for allocating the memory area used for the thread + stack. The memory area must be large enough to contain the stack that the thread + creates. + + @datatypes + #qurt_thread_attr_t + + @param[in,out] attr Pointer to the thread attribute structure. + @param[in] stack_addr Pointer to the 8-byte aligned address of the thread stack. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_thread_attr_set_stack_addr (qurt_thread_attr_t *attr, void *stack_addr) +{ + attr->stack_addr = stack_addr; +} + +/**@ingroup func_qurt_thread_attr_set_bus_priority + Sets the internal bus priority state in the Hexagon core for this software thread attribute. + Memory requests generated by the thread with bus priority enabled are + given priority over requests generated by the thread with bus priority disabled. + The default value of bus priority is disabled. + + @note1hang Sets the internal bus priority for Hexagon processor version V60 or greater. + The priority is not propagated to the bus fabric. + + @datatypes + #qurt_thread_attr_t + + @param[in] attr Pointer to the thread attribute structure. + + @param[in] bus_priority Enabling flag. Values: \n + - #QURT_THREAD_BUS_PRIO_DISABLED \n + - #QURT_THREAD_BUS_PRIO_ENABLED @tablebulletend + + @return + None + + @dependencies + None. 
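As a concrete illustration of the stack attributes described above, the sketch below carves a caller-owned stack out of a static buffer; the 8-byte alignment requirement is met by the buffer's element type. Sizes and names are invented for the example.

```c
#include "qurt.h"

#define MY_STACK_BYTES 4096u

/* unsigned long long elements guarantee 8-byte alignment. */
static unsigned long long my_stack[MY_STACK_BYTES / sizeof(unsigned long long)];

void fill_stack_attrs(qurt_thread_attr_t *attr)
{
    qurt_thread_attr_set_stack_addr(attr, my_stack);
    qurt_thread_attr_set_stack_size(attr, MY_STACK_BYTES);
}
```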
+*/
+static inline void qurt_thread_attr_set_bus_priority ( qurt_thread_attr_t *attr, unsigned short bus_priority)
+{
+    attr->bus_priority = (unsigned char)bus_priority;
+}
+
+/**@ingroup func_qurt_thread_attr_set_autostack
+  Enables the autostack v2 feature in the thread attributes.
+
+  When autostack is enabled by the subsystem and an autostack-enabled
+  thread takes a framelimit exception, the kernel allocates more stack for
+  the thread and returns it to normal execution.
+
+  If autostack is not enabled by the subsystem, or is not enabled
+  for the thread, the framelimit exception is fatal.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in] attr Pointer to the thread attribute structure.
+  @param[in] autostack Autostack enable or disable flag. Values: \n
+         - #QURT_THREAD_AUTOSTACK_DISABLED \n
+         - #QURT_THREAD_AUTOSTACK_ENABLED @tablebulletend
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_set_autostack ( qurt_thread_attr_t *attr, unsigned short autostack)
+{
+    attr->autostack = (unsigned char)autostack;
+}
+/**@ingroup qurt_thread_attr_enable_stid
+  Sets the STID in the thread attributes.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in] attr Pointer to the thread attribute structure.
+  @param[in] enable_stid STID to set. Values: \n
+         - #QURT_THREAD_ATTR_STID_DEFAULT (0): Default STID. \n
+         - #QURT_THREAD_ATTR_STID_ENABLE (1): QuRT assigns an STID that is not already in use. \n
+         - #2 through #255 : User-provided STID. @tablebulletend
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_enable_stid ( qurt_thread_attr_t *attr, char enable_stid)
+{
+    if (enable_stid != '\0') {
+        attr->stid = enable_stid;
+    }
+    else
+    {
+        attr->stid = QURT_THREAD_ATTR_STID_DEFAULT;
+    }
+}
+
+/**@ingroup func_qurt_thread_attr_set_stid
+  Sets the STID thread attribute.
+  The default STID value is #QURT_THREAD_ATTR_STID_DEFAULT.
+
+  @note1hang When a thread is created with a nondefault STID,
+             the STID set in the thread attribute is assigned to the thread.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in] attr Pointer to the thread attribute structure.
+  @param[in] stid STID to set for the thread.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_set_stid( qurt_thread_attr_t *attr, unsigned int stid){
+    attr->stid = stid;
+}
+
+/**@ingroup func_qurt_thread_attr_set_group_id
+  Sets the group ID in the thread attributes.
+  The primordial (first) thread has group ID 0.
+  If a new thread is created without assigning a group ID, it
+  inherits the group ID of its parent thread.
+
+  @note1hang
+  1) The group ID can only be set before creating a thread. It cannot be
+     changed after the thread is created.
+  2) If a nonactivated group_id is passed, thread creation fails.
+  3) Only a thread with group ID 0 can set the group ID for its child threads.
+  4) If a thread with a nonzero group ID sets the group ID for its child threads,
+     QuRT ignores this parameter and the child threads inherit the parent
+     thread's group ID. However, if the passed group ID is not activated, thread
+     creation still fails.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in] attr Pointer to the thread attribute structure.
+  @param[in] group_id Group identifier. The valid range is 0 through 63.
+
+  @return
+  None.
+
+  @dependencies
+  None.
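+
+  Illustrative sketch (hypothetical values; assumes group ID 3 was activated
+  beforehand and that the calling thread has group ID 0):
+  @code
+  qurt_thread_attr_set_group_id (&attr, 3u);  /* attr: a qurt_thread_attr_t */
+  @endcode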
+*/
+static inline void qurt_thread_attr_set_group_id(qurt_thread_attr_t *attr, unsigned int group_id)
+{
+    attr->group_id = group_id & QURT_THREAD_GROUP_ID_MASK;
+}
+
+/**@ingroup func_qurt_thread_set_autostack
+   Sets the autostack enable flag in the TCB.
+
+   @param[in] ugp Pointer to the UGP.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+
+void qurt_thread_set_autostack(void *);
+
+
+/**@ingroup func_qurt_thread_get_name
+  Gets the thread name of the current thread.\n
+  Returns the thread name of the current thread.
+  Thread names are assigned to threads as thread attributes, see qurt_thread_attr_set_name(). Thread names
+  identify a thread during debugging or profiling.
+
+  @param[out] name Pointer to a character string, which specifies the address where the returned thread name is stored.
+  @param[in] max_len Maximum length of the character string that can be returned.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_thread_get_name (char *name, unsigned char max_len);
+
+/**@ingroup func_qurt_thread_create
+  @xreflabel{hdr:qurt_thread_create}
+  Creates a thread with the specified attributes, and makes it executable.
+
+  @datatypes
+  #qurt_thread_t \n
+  #qurt_thread_attr_t
+
+  @param[out] thread_id Returns a pointer to the thread identifier if the thread was
+                        successfully created.
+  @param[in] attr Pointer to the initialized thread attribute structure that specifies
+                  the attributes of the created thread.
+  @param[in] entrypoint C function pointer, which specifies the main function of a thread.
+  @param[in] arg Pointer to a thread-specific argument structure.
+
+  @return
+  #QURT_EOK -- Thread created. \n
+  #QURT_EFAILED -- Thread not created.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_create (qurt_thread_t *thread_id, qurt_thread_attr_t *attr, void (*entrypoint) (void *), void *arg);
+
+/**@ingroup func_qurt_thread_stop
+  Stops the current thread, frees the kernel TCB, and yields to the next highest-priority ready thread.
+
+  @return
+  void
+
+  @dependencies
+  None.
+ */
+void qurt_thread_stop(void);
+
+/** @cond internal_only */
+/**@ingroup func_qurt_thread_resume
+  When a demand-loading paging solution is enabled, this function
+  resumes the execution of a thread that was suspended due to
+  a page miss.
+
+  @param[in] thread_id Thread identifier.
+
+  @return
+  #QURT_EOK -- Thread successfully resumed. \n
+  #QURT_EFATAL -- Resume operation failed.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_resume(unsigned int thread_id);
+/** @endcond */
+
+/**@ingroup func_qurt_thread_get_id
+  Gets the identifier of the current thread.\n
+  Returns the thread identifier for the current thread.
+
+  @return
+  Thread identifier -- Identifier of the current thread.
+
+  @dependencies
+  None.
+ */
+qurt_thread_t qurt_thread_get_id (void);
+
+
+/**@ingroup func_qurt_thread_get_l2cache_partition
+  Returns the current value of the L2 cache partition assigned to the caller thread.\n
+
+  @return
+  Value of the #qurt_cache_partition_t data type.
+
+  @dependencies
+  None.
+ */
+qurt_cache_partition_t qurt_thread_get_l2cache_partition (void);
+
+/**@ingroup func_qurt_thread_set_timetest_id
+  Sets the timetest identifier of the current thread.
+  Timetest identifiers are used to identify a thread during debugging or profiling.\n
+  @note1hang Timetest identifiers differ from the kernel-generated thread identifiers used to
+             specify threads in the API thread operations.
+
+  @param[in] tid Timetest identifier.
+
+  @return
+  None.
+
+  @dependencies
+  None.
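+
+  Illustrative sketch (the tag value is arbitrary):
+  @code
+  qurt_thread_set_timetest_id (0x42u);  /* tag the current thread for profiling */
+  @endcode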
+ */
+void qurt_thread_set_timetest_id (unsigned short tid);
+
+/**@ingroup func_qurt_thread_set_cache_partition
+  Sets the cache partition for the current thread. This function uses the qurt_cache_partition_t type
+  to select the cache partition of the current thread for the L1 Icache, L1 Dcache, and L2 cache.
+
+  @datatypes
+  #qurt_cache_partition_t
+
+  @param[in] l1_icache L1 I cache partition.
+  @param[in] l1_dcache L1 D cache partition.
+  @param[in] l2_cache L2 cache partition.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_thread_set_cache_partition(qurt_cache_partition_t l1_icache, qurt_cache_partition_t l1_dcache, qurt_cache_partition_t l2_cache);
+
+
+/**@ingroup func_qurt_thread_get_timetest_id
+  Gets the timetest identifier of the current thread.\n
+  Returns the timetest identifier of the current thread.\n
+  Timetest identifiers are used to identify a thread during debugging or profiling. \n
+  @note1hang Timetest identifiers differ from the kernel-generated thread identifiers used to
+             specify threads in the API thread operations.
+
+  @return
+  Integer -- Timetest identifier.
+
+  @dependencies
+  None.
+ */
+unsigned short qurt_thread_get_timetest_id (void);
+
+/**@ingroup func_qurt_thread_exit
+  @xreflabel{sec:qurt_thread_exit}
+  Stops the current thread, awakens threads joined to it, then destroys the stopped
+  thread.
+
+  Threads that are suspended on the current thread (by performing a thread join,
+  Section @xref{sec:thread_join}) are awakened and passed a user-defined status value
+  that indicates the status of the stopped thread.
+
+  @note1hang Exit must be called in the context of the thread to stop.
+
+  @param[in] status User-defined thread exit status value.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_thread_exit(int status);
+
+/**@ingroup func_qurt_thread_join
+  @xreflabel{sec:thread_join}
+  Waits for a specified thread to finish; the specified thread is another thread within
+  the same process.
+  The caller thread is suspended until the specified thread exits. When the specified thread
+  exits, the caller thread is awakened. \n
+  @note1hang If the specified thread has already exited, this function returns immediately
+             with the result value #QURT_ENOTHREAD. \n
+  @note1cont Two threads cannot call qurt_thread_join to wait for the same thread to finish.
+             If this occurs, QuRT generates an exception (see Section @xref{sec:exceptionHandling}).
+
+  @param[in] tid Thread identifier.
+  @param[out] status Destination variable for thread exit status. Returns an application-defined
+                     value that indicates the termination status of the specified thread.
+
+  @return
+  #QURT_ENOTHREAD -- Thread has already exited. \n
+  #QURT_EOK -- Thread successfully joined with valid status value.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_join(unsigned int tid, int *status);
+
+/**@ingroup qurt_thread_detach
+  @xreflabel{sec:thread_detach}
+  Detaches a joinable thread. The specified thread is another thread within the
+  same process. Create the thread as a joinable thread; only joinable threads
+  can be detached.
+  If a joinable thread is detached, it finishes execution and exits.
+
+  @param[in] tid Thread identifier.
+
+  @return
+  #QURT_ENOTHREAD -- Thread specified by TID does not exist. \n
+  #QURT_EOK -- Thread successfully detached.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_detach(unsigned int tid);
+
+
+/**@ingroup func_qurt_thread_get_priority
+  Gets the priority of the specified thread. \n
+  Returns the thread priority of the specified thread.\n
+  Thread priorities are specified as numeric values in a range that can be as large as 1 through 254, with lower
+  values representing higher priorities. 1 represents the highest possible thread priority. \n
+  Priorities 0 and 255 are used internally by the kernel for special purposes.
+
+  @note1hang QuRT can be configured to have different priority ranges.
+
+  @datatypes
+  #qurt_thread_t
+
+  @param[in] threadid Thread identifier.
+
+  @return
+  -1 -- Invalid thread identifier. \n
+  1 through 254 -- Thread priority value.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_get_priority (qurt_thread_t threadid);
+
+/**@ingroup func_qurt_thread_set_priority
+  Sets the priority of the specified thread.\n
+  Thread priorities are specified as numeric values in a range that can be as large as 1 through 254, with lower
+  values representing higher priorities. 1 represents the highest possible thread priority.
+  Priorities 0 and 255 are used internally by the kernel for special purposes.
+
+  @note1hang QuRT can be configured to have different priority ranges. For more
+             information, see Section @xref{sec:AppDev}.
+
+  @datatypes
+  #qurt_thread_t
+
+  @param[in] threadid Thread identifier.
+  @param[in] newprio New thread priority value.
+
+  @return
+  0 -- Priority successfully set. \n
+  -1 -- Invalid thread identifier. \n
+
+  @dependencies
+  None.
+ */
+int qurt_thread_set_priority (qurt_thread_t threadid, unsigned short newprio);
+
+
+
+/**@ingroup func_qurt_thread_attr_get
+  Gets the attributes of the specified thread.
+
+  @datatypes
+  #qurt_thread_t \n
+  #qurt_thread_attr_t
+
+  @param[in] thread_id Thread identifier.
+  @param[out] attr Pointer to the destination structure for thread attributes.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EINVALID -- Invalid argument.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_attr_get (qurt_thread_t thread_id, qurt_thread_attr_t *attr);
+
+
+
+/**@ingroup func_qurt_thread_get_tls_base
+  Gets the base address of thread local storage (TLS) of a dynamically loaded module
+  for the current thread.
+
+  @datatypes
+  #qurt_tls_info
+
+  @param[in] info Pointer to the TLS information for a module.
+
+  @return
+  Pointer to the TLS object for the dynamically loaded module.\n
+  NULL -- TLS information is invalid.
+
+  @dependencies
+  None.
+ */
+void * qurt_thread_get_tls_base(qurt_tls_info* info);
+
+/**@ingroup func_qurt_thread_pktcount_get
+  Gets the PKTCOUNT of a specified thread.
+
+  @datatypes
+  #qurt_thread_t
+
+  @param[in] thread_id Thread identifier.
+
+  @return
+  PKTCOUNT of the specified thread.
+
+  @dependencies
+  None.
+ */
+
+long long int qurt_thread_pktcount_get (qurt_thread_t thread_id);
+
+/**@ingroup func_qurt_thread_pktcount_set
+  Sets the PKTCOUNT for the current QuRT thread.
+
+  @return
+  Value to which PKTCOUNT is set.
+
+  @dependencies
+  None.
+ */
+
+long long int qurt_thread_pktcount_set (long long int);
+
+/**@ingroup func_qurt_thread_stid_get
+  Gets the STID for a specified thread.
+
+  @datatypes
+  #qurt_thread_t
+
+  @param[in] thread_id Thread identifier.
+
+  @return
+  STID of the specified thread.
+
+  @dependencies
+  None.
+ */
+
+char qurt_thread_stid_get(qurt_thread_t thread_id);
+
+/**@ingroup func_qurt_thread_stid_get2
+  Returns the STID set for a thread.
+
+  @param[in] thread_id Thread identifier.
+  @param[out] stid Pointer to the variable in which to return the STID.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_ENOTALLOWED -- Operation not allowed for the thread. \n
+  #QURT_EINVALID -- Invalid input.
+
+  @dependencies
+  None.
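+
+  Illustrative sketch (not from the original header; assumes qurt_thread_t
+  converts to unsigned int, as the surrounding prototypes suggest):
+  @code
+  unsigned int stid;
+  if (qurt_thread_stid_get2 ((unsigned int)qurt_thread_get_id (), &stid) == QURT_EOK) {
+      /* stid now holds the STID of the current thread */
+  }
+  @endcode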
+ */
+int qurt_thread_stid_get2(unsigned int thread_id, unsigned int *stid);
+
+/**@ingroup func_qurt_thread_stid_set
+  Sets the STID for a thread.
+
+  @param[in] stid STID value to set.
+
+  @return
+  #QURT_EOK -- STID successfully set. \n
+  #QURT_EFAILED -- STID not set.
+
+  @dependencies
+  None.
+ */
+
+int qurt_thread_stid_set(char stid);
+
+/**@ingroup qurt_thread_stid_set2
+  Sets the STID for a specified thread.
+
+  @param[in] thread_id Thread identifier.
+  @param[in] stid STID to set for the thread.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation. \n
+  #QURT_EVAL -- Failure because of invalid inputs.
+
+  @dependencies
+  None.
+*/
+int qurt_thread_stid_set2(unsigned int thread_id, unsigned int stid);
+
+/** @cond internal_only */
+/**@ingroup func_qurt_thread_get_running_ids
+  Returns the thread IDs of the running threads in the system; use only during fatal error handling.
+
+  @datatypes
+  #qurt_thread_t
+
+  @param[in,out] * Array of thread identifiers of size #QURT_MAX_HTHREAD_LIMIT + 1.
+
+  @return
+  #QURT_EINVALID -- Incorrect argument. \n
+  #QURT_ENOTALLOWED -- API not called during error handling. \n
+  #QURT_EOK -- Success; returns a NULL-terminated array of thread IDs.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_get_running_ids(qurt_thread_t *);
+/** @endcond */
+
+
+/**@ingroup func_qurt_thread_get_thread_id
+  Gets the thread identifier of the thread with the matching name in the same process
+  as the caller.
+
+  @datatypes
+  #qurt_thread_t
+
+  @param[out] thread_id Pointer to the thread identifier.
+  @param[in] name Pointer to the name of the thread.
+
+  @return
+  #QURT_EINVALID -- No thread with a matching name in the process of the caller. \n
+  #QURT_EOK -- Success.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_get_thread_id (qurt_thread_t *thread_id, char *name);
+
+/**@ingroup func_qurt_sleep
+  Suspends the current thread for the specified amount of time.
+
+  @note1hang Because QuRT timers are deferrable, this call is guaranteed to block
+             at least for the specified amount of time. If power collapse is
+             enabled, the maximum amount of time this call can block depends on
+             the earliest wakeup from power collapse past the specified duration.
+
+  @param[in] duration Duration (in microseconds) for which the thread is suspended.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_sleep (unsigned long long int duration);
+
+
+/**@ingroup func_qurt_system_set_priority_floor
+  Sets a priority floor to move threads with thread priority lower than the floor out of the running state.
+  Running threads with thread priority lower than the priority floor are moved into the kernel ready queue, and they
+  are not scheduled to run while their thread priority is lower than the floor.
+  The caller should later reset the priority floor back to the default value #QURT_PRIORITY_FLOOR_DEFAULT.
+  Threads in the kernel ready queue are scheduled to run when their thread priority is higher than the floor.
+
+  The priority floor is set and associated with the user process of the caller. When the caller gets into QuRTOS and
+  sets a new floor, the new floor is associated with its original user process, not the QuRTOS process.
+  The floor associated with the user process is reset when the user process exits or is killed, but not at the time
+  when the user thread of the caller exits.
+
+  The priority floor cannot be set to a priority higher than the thread priority of the caller.
+
+  The priority floor cannot be set to a priority lower than the default #QURT_PRIORITY_FLOOR_DEFAULT system floor.
+
+  This function is not supported in Island mode.
+
+  After the system floor is set above #QURT_PRIORITY_FLOOR_DEFAULT, power collapse is skipped, and the sleep task
+  is not scheduled to run.
+
+  @param[in] priority_floor Priority floor.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_ENOTALLOWED -- Floor setting is not allowed.
+
+  @dependencies
+  None.
+ */
+int qurt_system_set_priority_floor (unsigned int priority_floor);
+
+
+/**@ingroup func_qurt_thread_suspend_thread
+  Suspends a QuRT thread specified by its thread identifier.
+  The target thread can be in a signed user process or an unsigned user process.
+  The caller thread can be a thread from the same user process as the target thread, or from its parent process.
+  After the target thread is suspended, the kernel does not schedule it to run until it is resumed later.
+
+  If the target thread is set as non-suspendable, this function call returns an error without suspending
+  the target thread.
+
+  If the target thread is already suspended, this function call returns success to confirm that
+  the target thread is suspended.
+
+  If the target thread is in a secure user process or CPZ process, this function call returns an error without
+  suspending the target thread.
+
+  If the target thread is running in the guest OS/root process via a QDI call, this function call does not suspend
+  the target thread in the guest OS, but marks the target thread as suspend-pending. The target thread is
+  suspended when it exits the guest OS, before executing the first instruction in the user process.
+  In this case, the function returns success even with the #QURT_THREAD_SUSPEND_SYNCHRONOUS option, while the target
+  thread can still run in the guest OS; it is suspended when exiting the guest OS.
+
+  QuRT debug monitor threads that are in a user process are non-suspendable. This function does not suspend
+  those threads.
+
+  @param[in] thread_id Thread identifier.
+  @param[in] option Optional argument; multiple options can be ORed. \n
+         #QURT_THREAD_SUSPEND_SYNCHRONOUS (default) -- synchronous function call;
+         the function returns after the thread is completely suspended. \n
+         #QURT_THREAD_SUSPEND_ASYNCHRONOUS -- asynchronous function call; the function returns
+         after the kernel acts to suspend the target thread. The target thread
+         might still be running before it is completely suspended. \n
+         #QURT_THREAD_SUSPEND_KEEP_HMX (default) -- keep the HMX attachment on the target thread
+         if it locks the HMX with qurt_hmx_lock(). In this case, the HMX cannot be re-used by other threads. \n
+         #QURT_THREAD_SUSPEND_DETACH_HMX -- detach the HMX from the target thread if it locks the HMX with qurt_hmx_lock().
+         Later, when the target thread resumes, the HMX is re-attached to the thread. Note that this option is only
+         supported for a caller from the same user process as the target thread, not for a caller from the parent
+         process of the target thread, or other processes. With the HMX detach option, QuRT does not save the HMX
+         context, so the HMX context state is lost. It is the responsibility of the caller to manage HMX operations
+         and save the HMX context state when calling qurt_thread_suspend_thread() with the HMX detach option.
+  If a thread from another process uses this detach option, #QURT_EHMXNOTDETACHABLE is returned; in this
+  case, if the caller is qualified to suspend the target thread, the target thread is moved to the suspended
+  state without the HMX detached.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EINVALID -- Failure because of an invalid thread_id input. \n
+  #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, in a secure process/CPZ process. \n
+  #QURT_EHMXNOTDETACHABLE -- Failure because the HMX is not detachable from the target thread.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_suspend_thread (unsigned int thread_id, unsigned int option);
+
+
+/**@ingroup func_qurt_thread_resume_thread
+  Resumes a QuRT thread specified by its thread identifier.
+  The target thread can be in a signed user process or an unsigned user process.
+  The caller thread can be a thread from the same user process as the target thread, or from its parent
+  process. After the target thread resumes, the kernel scheduler can schedule the thread to run based on
+  the thread priority.
+
+  The option argument currently supports a single default option,
+  #QURT_THREAD_RESUME_DEFAULT, which resumes the target thread in the default way.
+
+  By default, this is an asynchronous function. The function returns after the kernel moves the
+  target thread from the suspended state to the runnable state. The thread is scheduled to run based on its
+  thread priority.
+
+  If the target thread is set as non-resumable, this function call does not resume the target thread.
+
+  If the target thread has already resumed, this function confirms this by returning success.
+
+  If the target thread is in a secure user process or CPZ process, this function call returns an error without
+  resuming the target thread.
+
+  If the target thread runs in the guest OS/root process via a QDI call, this function call clears the
+  suspend-pending mark on the target thread, and the target thread is not suspended when it exits the
+  guest OS.
+
+  @param[in] thread_id Thread identifier.
+  @param[in] option Optional argument, #QURT_THREAD_RESUME_DEFAULT, which resumes the target thread.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EINVALID -- Failure because of an invalid thread_id input. \n
+  #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, in a secure process/CPZ process. \n
+  #QURT_EHMXNOTAVAIL -- Failure because, when resuming an HMX thread, the HMX is not available for the thread to resume.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_resume_thread (unsigned int thread_id, unsigned int option);
+
+
+/**@ingroup func_qurt_thread_set_thread_property
+  Sets a QuRT thread property for the thread specified by its thread identifier.
+  The target thread can be in a signed user process or an unsigned user process.
+  The caller thread can be from the same user process as the target thread, or from its parent process.
+
+  If the target thread is in a secure user process or CPZ process, this function call returns an error without
+  changing the property of the target thread.
+
+  @param[in] thread_id Thread identifier. \n
+  @param[in] property_id Thread property identifier. \n
+         #QURT_THREAD_PROPERTY_SUSPENDABLE -- thread is suspendable. Default is TRUE. \n
+         #QURT_THREAD_PROPERTY_RESUMEABLE -- thread is resumable. Default is TRUE.
+  @param[in] value Property value. Values: \n
+         TRUE (1) -- TRUE for the property \n
+         FALSE (0) -- FALSE for the property
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EINVALID -- Failure because of an invalid thread_id input. \n
+  #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, in a secure process/CPZ process.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_set_thread_property( unsigned int thread_id, unsigned int property_id, unsigned int value );
+
+/**@ingroup func_qurt_thread_get_group_id
+  Gets the group ID of the thread specified by thread_id.\n
+
+  @param[in] thread_id Thread identifier.
+  @param[out] group_id Pointer to the destination variable for the group identifier.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EINVALID -- Thread ID is invalid, or the process has no groups enabled. \n
+  #QURT_ENOTALLOWED -- Operation is not allowed. \n
+
+  @dependencies
+  None.
+*/
+int qurt_thread_get_group_id(qurt_thread_t thread_id, unsigned int* group_id);
+
+#endif /* __ASSEMBLER__ */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_THREAD_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_thread_context.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_thread_context.h
new file mode 100755
index 0000000000000..bab09deec8889
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_thread_context.h
@@ -0,0 +1,234 @@
+#ifndef QURT_THREAD_CONTEXT_H
+#define QURT_THREAD_CONTEXT_H
+/**
+  @file qurt_thread_context.h
+  @brief Kernel thread context structure
+
+EXTERNAL FUNCTIONS
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2018-2022 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @cond internal_only */
+
+#define THREAD_ITERATOR_END ((qurt_thread_t)(-1)) /**< Thread iterator is complete. */
+
+
+/**@ingroup func_qurt_thread_iterator_create
+Enables the caller to enumerate the threads in the system.
+
+@return
+Handle of the newly created iterator; pass this handle to
+subsequent operations on the iterator.
+
+@dependencies
+None.
+*/
+static inline int qurt_thread_iterator_create(void)
+{
+    return qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC, QDI_OS_THREAD_ITERATOR_CREATE);
+}
+
+/**@ingroup func_qurt_thread_iterator_next
+Iterates over the list of threads in the system.
+
+@datatypes
+#qurt_thread_t
+
+@param[in] iter Iterator handle returned by qurt_thread_iterator_create().
+
+@return
+#THREAD_ITERATOR_END -- Iterator has reached the end of the thread list. \n
+Other values indicate a valid thread ID.
+
+@dependencies
+None.
+*/
+static inline qurt_thread_t qurt_thread_iterator_next(int iter)
+{
+    return (qurt_thread_t)qurt_qdi_handle_invoke(iter, QDI_OS_THREAD_ITERATOR_NEXT);
+}
+
+/**@ingroup func_qurt_thread_iterator_destroy
+Cleans up thread iterator resources.
+
+@param[in] iter Iterator handle returned by qurt_thread_iterator_create().
+
+@return
+#QURT_EOK -- Successful completion of operation \n
+#QURT_EFATAL -- Invalid handle passed
+
+@dependencies
+None.
+*/
+static inline int qurt_thread_iterator_destroy(int iter)
+{
+    return qurt_qdi_close(iter);
+}
+
+/**@ingroup func_qurt_thread_context_get_tname
+Gets the name of the thread from the specified thread ID.
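+
+Illustrative sketch (not from the original header) that combines the thread
+iterator with this call; QURT_MAX_NAME_LEN is used as the buffer size:
+@code
+char tname[QURT_MAX_NAME_LEN];
+int it = qurt_thread_iterator_create();
+qurt_thread_t tid;
+while ((tid = qurt_thread_iterator_next(it)) != THREAD_ITERATOR_END) {
+    if (qurt_thread_context_get_tname((unsigned int)tid, tname, sizeof(tname)) == QURT_EOK) {
+        /* inspect tname */
+    }
+}
+qurt_thread_iterator_destroy(it);
+@endcode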
+ +@param[in] thread_id Thread for which name is returned. +@param[in,out] name Pointer to the local buffer where name is copied back. +@param[in] max_len Size of the local buffer. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_tname(unsigned int thread_id, char *name, unsigned char max_len); + +/**@ingroup func_qurt_thread_context_get_prio +Gets the priority for the specified thread. + +@param[in] thread_id Thread for which priority is returned. +@param[in,out] prio Pointer to the local variable where priority is written. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_prio(unsigned int thread_id, unsigned char *prio); + +/**@ingroup func_qurt_thread_context_get_pcycles +Gets pcycles for the specified thread. + +@param[in] thread_id Thread for which processor cycles are returned. +@param[in,out] pcycles Pointer to the local variable where processor cycles are written. + +@return +#QURT_EOK -- Success \n +Failure otherwise. + +@dependencies +None. +*/ +int qurt_thread_context_get_pcycles(unsigned int thread_id, unsigned long long int *pcycles); + +/**@ingroup func_qurt_thread_context_get_stack_base +Gets the stack base address for the specified thread. + +@param[in] thread_id Thread for which stack base address is returned. +@param[in,out] sbase Pointer to the local variable where stack base address is written. + +@return +QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_stack_base(unsigned int thread_id, unsigned int *sbase); + +/**@ingroup func_qurt_thread_context_get_stack_size +Gets the stack size for the specified thread. + +@param[in] thread_id Thread for which stack size is returned. +@param[in,out] ssize Pointer to the local variable where stack size is written. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_stack_size(unsigned int thread_id, unsigned int *ssize); + +/**@ingroup func_qurt_thread_context_get_pid +Gets the process ID for the specified thread. + +@param[in] thread_id Thread for which process ID is returned. +@param[in,out] pid Pointer to the local variable where process id is written. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_pid(unsigned int thread_id, unsigned int *pid); + +/**@ingroup func_qurt_thread_context_get_pname +Gets the process name for the specified thread. + +@param[in] thread_id Represents the thread for which process name is returned. +@param[in, out] name Pointer to the local buffer where process name is copied back. +@param[in] len Length allocated to the local buffer. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_pname(unsigned int thread_id, char *name, unsigned int len); + +/** @addtogroup thread_types +@{ */ +/** Structure that defines how TCB is interpreted to crash dump tools.*/ +/* Keys are defined in consts.h */ +struct qurt_debug_thread_info { +/** @cond */ + char name[QURT_MAX_NAME_LEN]; /**< Name of the thread. */ + struct { + unsigned key; + unsigned val; + } os_info[40]; + unsigned gen_regs[32]; /**< General mode registers. */ + unsigned user_cregs[32]; /**< User mode registers. */ + unsigned guest_cregs[32]; /**< Guest mode registers. */ + unsigned monitor_cregs[64]; /**< Monitor mode registers. 
*/
+/** @endcond */
+}; /* should add up to 1K */
+/** @} */ /* end_addtogroup thread_types */
+
+
+/**@ingroup func_qurt_system_tcb_dump_get
+Copies the debug thread information (TCB dump) of the specified thread into the provided buffer.
+
+@datatypes
+#qurt_thread_t
+
+@param[in] thread_id Thread on which the operation must be performed.
+@param[in, out] ptr Pointer to the local buffer where contents are written.
+@param[in] size Size of the debug thread information structure obtained by calling
+                qurt_system_tcb_dump_get_size().
+
+@return
+#QURT_EOK -- Success \n
+Failure otherwise
+
+@dependencies
+None.
+*/
+int qurt_system_tcb_dump_get(qurt_thread_t thread_id, void *ptr, size_t size);
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_THREAD_CONTEXT_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_timer.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_timer.h
new file mode 100755
index 0000000000000..7bdfdb8f3c3df
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_timer.h
@@ -0,0 +1,560 @@
+#ifndef QURT_TIMER_H
+#define QURT_TIMER_H
+/**
+  @file qurt_timer.h
+  @brief Prototypes of qurt_timer API
+
+EXTERNAL FUNCTIONS
+   None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+
+#include "qurt_anysignal.h"
+#include "qurt_signal2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+                        CONSTANTS AND MACROS
+=============================================================================*/
+/**@addtogroup timer_const_macros
+@{ */
+/**
+  Default values.
+*/
+/** @xreflabel{hdr:QURT_TIMER_ONESHOT}*/
+#define QURT_TIMER_DEFAULT_TYPE QURT_TIMER_ONESHOT /**< One shot.*/
+#define QURT_TIMER_DEFAULT_DURATION 1000uL /**< Default duration. */
+#define QURT_TIMER_DEFAULT_EXPIRY 0uL /**< Default expiration. */
+
+/**
+  Conversion from microseconds to timer ticks.
+ */
+#define QURT_TIMER_TIMETICK_FROM_US(us) QURT_SYSCLOCK_TIMETICK_FROM_US(us)
+
+/**
+  Conversion from timer ticks to microseconds at the nominal frequency.
+*/
+#define QURT_TIMER_TIMETICK_TO_US(ticks) qurt_timer_timetick_to_us(ticks)
+
+/** Minimum microseconds value is 100 microseconds (sleep timer).*/
+#define QURT_TIMER_MIN_DURATION 100uL
+
+/**
+  Maximum microseconds value for Qtimer is 1,042,499 hours.
+*/
+#define QURT_TIMER_MAX_DURATION QURT_SYSCLOCK_MAX_DURATION
+
+/**
+  Timer clock for Qtimer is 19.2 MHz.
+*/
+#define QURT_TIMER_MAX_DURATION_TICKS QURT_SYSCLOCK_MAX_DURATION_TICKS
+
+/**
+  Sleep timer error margin for Qtimer is 1,000 ticks ~52 us.
+*/
+#define QURT_TIMETICK_ERROR_MARGIN QURT_SYSCLOCK_ERROR_MARGIN
+
+/*
+  qurt_timer group defines.
+*/
+#define QURT_TIMER_MAX_GROUPS 5U /**< Maximum groups.*/
+#define QURT_TIMER_DEFAULT_GROUP 0U /**< Default group. */
+/** @} */ /* end_addtogroup timer_const_macros */
+
+/** @addtogroup timer_types
+@{ */
+/**
+  QuRT timer types.
+ */
+typedef enum
+{
+    QURT_TIMER_ONESHOT = 0, /**< One shot.*/
+    /** @xreflabel{hdr:QURT_TIMER_PERIODIC}*/
+    QURT_TIMER_PERIODIC /**< Periodic.
*/ +} qurt_timer_type_t; + + +/*============================================================================= + TYPEDEFS +=============================================================================*/ + +/** QuRT timer type.*/ +typedef unsigned int qurt_timer_t; + +/** QuRT timer duration type. */ +typedef unsigned long long qurt_timer_duration_t; + +/** QuRT timer time type. */ +typedef unsigned long long qurt_timer_time_t; + +typedef void (*pfn_t)(void); +/** QuRT timer attribute type. */ +typedef struct +{ + /** @cond */ + unsigned int magic; /**< Magic number to verify the qmsgq_attr_t pointer. */ + + qurt_timer_duration_t duration; /**< Specifies the duration of the new timer. */ + + qurt_timer_time_t expiry; /**< Specifies the absolute expiry of the new timer. */ + + qurt_timer_duration_t remaining; /**< Specifies the remaining time of an active timer. */ + + qurt_timer_type_t type; /**< Specifies the timer type; only #QURT_TIMER_ONESHOT and + #QURT_TIMER_PERIODIC are supported. */ + + unsigned int group; /**< Group number of the timer; the criterion used to disable or enable the set + of timers. */ + pfn_t pFn; /**< Callback other than the signal set */ + /** @endcond */ +} +qurt_timer_attr_t; + +/** @} */ /* end_addtogroup timer_types */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_timer_stop + @xreflabel{sec:qurt_timer_stop} + Stops a running timer. + The timer must be a one-shot timer. + + @note1hang Restart stopped timers with the timer restart operation, + see Section @xref{sec:qurt_timer_restart}. + + @datatypes + #qurt_timer_t + + @param[in] timer Timer object. + + @return + #QURT_EOK -- Success. \n + #QURT_EINVALID -- Invalid timer ID or duration value. \n + #QURT_ENOTALLOWED -- Timer is not a one shot timer. \n + #QURT_EMEM -- Out of memory error. + + @dependencies + None. + */ +int qurt_timer_stop (qurt_timer_t timer); + +/**@ingroup func_qurt_timer_restart + @xreflabel{sec:qurt_timer_restart} + Restarts a stopped timer with the specified duration. The timer must be a one-shot timer. + Timers stop after they have expired or after they are explicitly stopped with qurt_timer_stop(). + A restarted timer expires after the specified duration, the starting time is when the function is called. + + @note1hang Timers stop after they have expired or after they are explicitly + stopped with the timer stop operation, see Section @xref{sec:qurt_timer_stop}. + + @datatypes + #qurt_timer_t \n + #qurt_timer_duration_t + + @param[in] timer Timer object. + @param[in] duration Timer duration (in microseconds) before the restarted timer + expires again. + The valid range is #QURT_TIMER_MIN_DURATION to + #QURT_TIMER_MAX_DURATION. + + @return + #QURT_EOK -- Success. \n + #QURT_EINVALID -- Invalid timer ID or duration value. \n + #QURT_ENOTALLOWED -- Timer is not a one-shot timer. \n + #QURT_EMEM -- Out-of-memory error. + + @dependencies + None. + */ +int qurt_timer_restart (qurt_timer_t timer, qurt_timer_duration_t duration); + + +/**@ingroup func_qurt_timer_create + Creates a timer.\n + Allocates and initializes a timer object, and starts the timer. + + @note1hang A timer event handler must be defined to wait on the specified signal + to handle the timer event. + + @datatypes + #qurt_timer_t \n + #qurt_timer_attr_t \n + #qurt_anysignal_t + + @param[out] timer Pointer to the created timer object. 
+ @param[in] attr Pointer to the timer attribute structure. + @param[in] signal Pointer to the signal object set when timer expires. + @param[in] mask Signal mask, which specifies the signal to set in the signal object when the + time expires. + + @return + #QURT_EOK -- Success. \n + #QURT_EMEM -- Not enough memory to create the timer. \n + #QURT_EINVALID -- One of the arguments in the attr field is invalid. \n + Other error code -- Operation failed. \n + + @dependencies + None. + */ +int qurt_timer_create (qurt_timer_t *timer, const qurt_timer_attr_t *attr, + const qurt_anysignal_t *signal, unsigned int mask); + +int qurt_timer_create_sig2 (qurt_timer_t *timer, const qurt_timer_attr_t *attr, + const qurt_signal2_t *signal, unsigned int mask); + +/**@ingroup func_qurt_timer_attr_init + Initializes the specified timer attribute structure with default attribute values: \n + - Timer duration -- #QURT_TIMER_DEFAULT_DURATION (Section @xref{dox:timers}) \n + - Timer type -- #QURT_TIMER_ONESHOT \n + - Timer group -- #QURT_TIMER_DEFAULT_GROUP + + @datatypes + #qurt_timer_attr_t + + @param[in,out] attr Pointer to the destination structure for the timer attributes. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_init(qurt_timer_attr_t *attr); + + +/*Tech Comm note: removed qurt_timer_attr_set_pfn from documentation 9/10/2020 +@ingroup func_qurt_timer_attr_set_pfn + + @datatypes + #qurt_timer_attr_t + + @param[in,out] attr Pointer to the destination structure for the timer attributes. + @param[in] pFn pFn. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_pfn(qurt_timer_attr_t *attr, pfn_t pFn); + + +/**@ingroup func_qurt_timer_attr_set_duration + Sets the timer duration in the specified timer attribute structure.\n + + The timer duration specifies the interval (in microseconds) between the creation of the + timer object and the generation of the corresponding timer event. + + The timer duration value must be between #QURT_TIMER_MIN_DURATION and + #QURT_TIMER_MAX_DURATION (Section @xref{dox:timers}). Otherwise, the set operation is ignored. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_duration_t + + @param[in,out] attr Pointer to the timer attribute structure. + @param[in] duration Timer duration (in microseconds). + Valid range is #QURT_TIMER_MIN_DURATION to + #QURT_TIMER_MAX_DURATION. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_duration(qurt_timer_attr_t *attr, qurt_timer_duration_t duration); + +/**@ingroup func_qurt_timer_attr_set_expiry + Sets the absolute expiry time in the specified timer attribute structure.\n + The timer expiry specifies the absolute time (in microseconds) of the generation of the + corresponding timer event.\n + Timer expiries are relative to when the system first began executing. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_time_t + + @param[in,out] attr Pointer to the timer attribute structure. + @param[in] time Timer expiry. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_expiry(qurt_timer_attr_t *attr, qurt_timer_time_t time); + +/**@ingroup func_qurt_timer_attr_get_duration + Gets the timer duration from the specified timer attribute structure. + The value returned is the duration that was originally set for the timer. + + @note1hang This function does not return the remaining time of an active timer; + use qurt_timer_attr_get_remaining() to get the remaining time. 
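+
+  Illustrative sketch (my_timer is a hypothetical, previously created timer):
+  @code
+  qurt_timer_attr_t attr;
+  qurt_timer_duration_t duration;
+  if (qurt_timer_get_attr (my_timer, &attr) == QURT_EOK) {
+      qurt_timer_attr_get_duration (&attr, &duration);
+  }
+  @endcode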
+ + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_duration_t + + @param[in] attr Pointer to the timer attributes object + @param[out] duration Pointer to the destination variable for timer duration. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_get_duration(qurt_timer_attr_t *attr, qurt_timer_duration_t *duration); + +/**@ingroup func_qurt_timer_attr_get_remaining + Gets the timer remaining duration from the specified timer attribute structure. \n + + The timer remaining duration indicates (in microseconds) how much time remains before + the generation of the next timer event on the corresponding timer. + In most cases this function assumes that the timer attribute structure was obtained by + calling qurt_timer_get_attr(). + + @note1hang This attribute is read-only and thus has no set operation defined for it. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_duration_t + + @param[in] attr Pointer to the timer attribute object. + @param[out] remaining Pointer to the destination variable for remaining time. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_get_remaining(qurt_timer_attr_t *attr, qurt_timer_duration_t *remaining); + +/**@ingroup func_qurt_timer_attr_set_type + Sets the timer type in the specified timer attribute structure. + + The timer type specifies the functional behavior of the timer: \n + - A one-shot timer (#QURT_TIMER_ONESHOT) waits for the specified timer duration + and then generates a single timer event. After this the timer is nonfunctional. \n + - A periodic timer (#QURT_TIMER_PERIODIC) repeatedly waits for the specified + timer duration and then generates a timer event. The result is a series of timer + events with interval equal to the timer duration. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_type_t + + @param[in,out] attr Pointer to the timer attribute structure. + @param[in] type Timer type. Values are: \n + - #QURT_TIMER_ONESHOT -- One-shot timer. \n + - #QURT_TIMER_PERIODIC -- Periodic timer. @tablebulletend + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_type(qurt_timer_attr_t *attr, qurt_timer_type_t type); + +/**@ingroup func_qurt_timer_attr_get_type + Gets the timer type from the specified timer attribute structure. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_type_t + + @param[in] attr Pointer to the timer attribute structure. + @param[out] type Pointer to the destination variable for the timer type. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_get_type(qurt_timer_attr_t *attr, qurt_timer_type_t *type); + +/**@ingroup func_qurt_timer_attr_set_group + Sets the timer group identifier in the specified timer attribute structure.\n + The timer group identifier specifies the group that the timer belongs to. Timer groups are + used to enable or disable one or more timers in a single operation. \n + The timer group identifier value must be between 0 and (#QURT_TIMER_MAX_GROUPS - 1). + See Section @xref{dox:timers}. + + @datatypes + #qurt_timer_attr_t + + @param[in,out] attr Pointer to the timer attribute object. + @param[in] group Timer group identifier; + Valid range is 0 to (#QURT_TIMER_MAX_GROUPS - 1). + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_group(qurt_timer_attr_t *attr, unsigned int group); + +/**@ingroup func_qurt_timer_attr_get_group + Gets the timer group identifier from the specified timer attribute structure. 
+ + @datatypes + #qurt_timer_attr_t + + @param[in] attr Pointer to the timer attribute structure. + @param[out] group Pointer to the destination variable for the timer group identifier. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_get_group(qurt_timer_attr_t *attr, unsigned int *group); + +/**@ingroup func_qurt_timer_get_attr + @xreflabel{hdr:qurt_timer_get_attr} + Gets the timer attributes of the specified timer when it was created. + + @datatypes + #qurt_timer_t \n + #qurt_timer_attr_t + + @param[in] timer Timer object. + @param[out] attr Pointer to the destination structure for timer attributes. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Argument passed is not a valid timer. + + @dependencies + None. + */ +int qurt_timer_get_attr(qurt_timer_t timer, qurt_timer_attr_t *attr); + +/**@ingroup func_qurt_timer_delete + Deletes the timer.\n + Destroys the specified timer and deallocates the timer object. + + @datatypes + #qurt_timer_t + + @param[in] timer Timer object. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Argument passed is not a valid timer. + + @dependencies + None. + */ +int qurt_timer_delete(qurt_timer_t timer); + +/**@ingroup func_qurt_timer_sleep + Suspends the current thread for the specified amount of time. + The sleep duration value must be between #QURT_TIMER_MIN_DURATION and + #QURT_TIMER_MAX_DURATION (Section @xref{dox:timers}). + + @datatypes + #qurt_timer_duration_t + + @param[in] duration Interval (in microseconds) between when the thread is suspended + and when it is re-awakened. + + @return + #QURT_EOK -- Success. \n + #QURT_EMEM -- Not enough memory to perform the operation. + + @dependencies + None. + */ + +int qurt_timer_sleep(qurt_timer_duration_t duration); + +/**@ingroup func_qurt_timer_group_disable + Disables all timers that are assigned to the specified timer group. + If a specified timer is already disabled, ignore it. + If a specified timer is expired, do not process it. + If the specified timer group is empty, do nothing. + + @note1hang When a timer is disabled its remaining time does not change, thus it + cannot generate a timer event. + + @param[in] group Timer group identifier. + + @return + #QURT_EOK -- Success. + + @dependencies + None. + */ +int qurt_timer_group_disable (unsigned int group); + +/**@ingroup func_qurt_timer_group_enable + Enables all timers that are assigned to the specified timer group. + If a specified timer is already enabled, ignore it. + If a specified timer is expired, process it. + If the specified timer group is empty, do nothing. + + @param[in] group Timer group identifier. + + @return + #QURT_EOK -- Success. + + @dependencies + None. + */ +int qurt_timer_group_enable (unsigned int group); + + +/** + Notifies the timer server recovery from power collapse. The server + must account for any missed interrupts during power collapse. + */ +void qurt_timer_recover_pc (void); + +/** + Determines whether the Qtimer is initialized. + + @return + 0 -- Not initialized. \n + Nonzero -- Initialized. + */ +static inline int qurt_timer_is_init (void) {return 1;} + +/**@ingroup func_qurt_timer_get_ticks + Gets current ticks. The ticks are accumulated since the RTOS + has started. Each tick is equal to a single timer clock + cycle, where the frequency is 32 KHz on RGPT or 19.2 MHz on Qtimer. + + @return + Ticks since system started. 
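+
+  Illustrative sketch: timing a code region and converting ticks to
+  microseconds with QURT_TIMER_TIMETICK_TO_US() (do_work() is hypothetical):
+  @code
+  unsigned long long start = qurt_timer_get_ticks ();
+  do_work ();
+  unsigned long long elapsed_us = QURT_TIMER_TIMETICK_TO_US (qurt_timer_get_ticks () - start);
+  @endcode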
+ */
+unsigned long long qurt_timer_get_ticks (void);
+
+#define qurt_timer_timetick_from_us(us) QURT_SYSCLOCK_TIMETICK_FROM_US(us)
+
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_TIMER_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_tlb.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_tlb.h
new file mode 100755
index 0000000000000..b1b2d261d31c0
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_tlb.h
@@ -0,0 +1,215 @@
+#ifndef QURT_TLB_H
+#define QURT_TLB_H
+
+/**
+  @file qurt_tlb.h
+  @brief Prototypes of TLB API
+  The TLB APIs allow explicit control of the portion of the TLB between TLB_first_replaceable and TLB_LAST_REPLACEABLE.
+  Both are nonconfigurable for the time being. This portion of the TLB is permanently assigned/locked unless manually removed
+  by qurt_tlb_remove. The implementation does not change depending on the configuration, such as whether CONFIG_STATIC is set or not.
+  In CONFIG_STATIC=y, TLB_LAST_REPLACEABLE is set to the last TLB index, which indicates that the entire TLB is permanently
+  assigned and is not backed by a page table (no page table exists). TLB indices are maintained through a 64-bit bitmask.
+  A new entry is placed in the first available slot.
+
+EXTERNAL FUNCTIONS
+   None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2013, 2021, 2023
+All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+=============================================================================*/
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_tlb_entry_create
+  Creates a new TLB entry with the specified mapping attributes in the TLB of the Hexagon processor. \n
+  @note1hang If the specified attributes are not valid (such as if the address is not aligned with the
+             size), the entry is not created and an error result is returned.\n
+  @note1cont To set the G bit in the new TLB entry, set the ASID argument to -1.
+
+  @datatypes
+  #qurt_addr_t \n
+  #qurt_paddr_t \n
+  #qurt_mem_cache_mode_t \n
+  #qurt_perm_t
+
+  @param[out] entry_id TLB entry identifier.
+  @param[in] vaddr Virtual memory address.
+  @param[in] paddr Physical memory address.
+  @param[in] size Size of memory region to map (in bytes).
+  @param[in] cache_attribs Cache mode (writeback, and so on).
+  @param[in] perms Access permissions.
+  @param[in] asid ASID (space ID).
+
+  @return
+  #QURT_EOK -- TLB entry successfully created.\n
+  #QURT_EFATAL -- Entry is not created; the TLB is full. \n
+  #QURT_ETLBCREATESIZE -- Entry is not created; the incorrect size was specified. \n
+  #QURT_ETLBCREATEUNALIGNED -- Entry is not created; an unaligned address was specified. \n
+  #QURT_EINVALID -- Invalid cache attributes / permissions provided.
+
+ */
+int qurt_tlb_entry_create (unsigned int *entry_id, qurt_addr_t vaddr, qurt_paddr_t paddr, qurt_size_t size, qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perms, int asid);
+
+/**@ingroup func_qurt_tlb_entry_create_64
+  Creates a new TLB entry with the specified mapping attributes in the TLB of the Hexagon processor.
\n + @note1hang If the specified attributes are not valid (the address is not aligned with the + size), the entry is not created, and an error result is returned.\n + @note1cont To set the G bit in the new TLB entry, set the asid argument to -1. + + @param[out] entry_id TLB entry identifier. + @param[in] vaddr Virtual memory address. + @param[in] paddr_64 64-bit physical memory address. + @param[in] size Size of memory region to map (in bytes). + @param[in] cache_attribs Cache mode (writeback, and so on). + @param[in] perms Access permissions. + @param[in] asid ASID (space ID). + + @return + #QURT_EOK -- TLB entry successfully created.\n + #QURT_EFATAL -- Entry was not created; the TLB is full. \n + #QURT_ETLBCREATESIZE -- Entry was not created; the incorrect size was specified. \n + #QURT_ETLBCREATEUNALIGNED -- Entry was not created; an unaligned address was specified. \n + #QURT_EINVALID -- Invalid cache attributes / permissions provided. + + */ +int qurt_tlb_entry_create_64 (unsigned int *entry_id, qurt_addr_t vaddr, qurt_paddr_64_t paddr_64, qurt_size_t size, qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perms, int asid); + +/**@ingroup func_qurt_tlb_entry_delete + Deletes the specified TLB entry from the TLB of the Hexagon processor. + If the specified entry does not exist, no deletion occurs and an error result is returned. + + @param[in] entry_id TLB entry identifier. + + @return + #QURT_EOK -- TLB entry successfully deleted. \n + #QURT_EFATAL -- TLB entry does not exist. + + @dependencies + None. + **/ +int qurt_tlb_entry_delete (unsigned int entry_id); + +/**@ingroup func_qurt_tlb_entry_query + Searches for the specified TLB entry in the TLB of the Hexagon processor. + If the TLB entry is found, its entry identifier is returned. + + @datatypes + #qurt_addr_t + + @param[out] entry_id TLB entry identifier. + @param[in] vaddr Virtual memory address. + @param[in] asid ASID (space ID). + + @return + #QURT_EOK -- TLB entry successfully returned. \n + #QURT_EFATAL -- TLB entry does not exist. + + @dependencies + None. + **/ +int qurt_tlb_entry_query (unsigned int *entry_id, qurt_addr_t vaddr, int asid); + +/**@ingroup func_qurt_tlb_entry_set + Sets the TLB entry by storing an entry at the specified location + in the TLB of the Hexagon processor. + + @param[in] entry_id TLB entry identifier. + @param[in] entry 64-bit TLB entry to store. + + @return + #QURT_EOK -- Entry successfully stored in the TLB. \n + #QURT_EFATAL -- Entry not set at the specified location. + + @dependencies + None. + **/ +int qurt_tlb_entry_set (unsigned int entry_id, unsigned long long int entry); + +/**@ingroup func_qurt_tlb_entry_get + Gets the TLB entry. \n + Returns the specified 64-bit TLB entry in the TLB of the Hexagon processor. + + @param[in] entry_id TLB entry identifier. + @param[out] entry 64-bit TLB entry. + + @return + #QURT_EOK -- TLB entry successfully returned. \n + #QURT_EFATAL -- TLB entry does not exist. + + @dependencies + None. + **/ +int qurt_tlb_entry_get (unsigned int entry_id, unsigned long long int *entry); + +/**@ingroup func_qurt_tlb_get_pager_physaddrs + Searches the TLB of the Hexagon processor, and returns all physical addresses that belong to the pager. + Each returned address indicates the starting address of an active page. + +The function return value indicates the number of addresses returned. + + @param[out] pager_phys_addrs Pointer to the return array of pager physical addresses. + + @return + Integer -- Number of addresses returned in array. + + @dependencies + None. 
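+
+ Illustrative sketch:
+ @code
+ unsigned int *phys_addrs;
+ unsigned int count = qurt_tlb_get_pager_physaddr (&phys_addrs);
+ for (unsigned int i = 0U; i < count; i++) {
+     /* each phys_addrs[i] is the start of an active page */
+ }
+ @endcode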
+*/ + +unsigned int qurt_tlb_get_pager_physaddr(unsigned int** pager_phys_addrs); + +/**@ingroup func_qurt_tlb_get_pager_virtaddr + Searches the TLB of the Hexagon processor, and returns all virtual addresses that belong to the pager. + Each returned address indicates the starting address of an active page. + +The function return value indicates the number of addresses returned. + + @param[out] pager_virt_addrs Pointer to the return array of pager virtual addresses. + + @return + Integer -- Number of addresses returned in the array. + + @dependencies + None. +*/ + +unsigned int qurt_tlb_get_pager_virtaddr(unsigned int** pager_virt_addrs); + + +/**@ingroup func_qurt_tlb_entry_set2 + Sets the TLB entry by storing an entry at the specified location + in the TLB of the Hexagon processor. An additional option can be passed + to lock the TLB entry in the TLB of the Hexagon processor. + + @param[in] id TLB entry identifier. + @param[in] tlb 64-bit TLB entry to store. + @param[in] lock Nonzero value indicates that the TLB entry must be locked in the hardware TLB. + + @return + #QURT_EOK -- Entry successfully stored in the TLB. \n + #QURT_EFATAL -- Entry not set at the specified location. + + @dependencies + None. + **/ +int qurt_tlb_entry_set2(unsigned id, unsigned long long tlb, unsigned lock); + + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_TLB_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_tls.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_tls.h new file mode 100755 index 0000000000000..6ec3b39ff5cb0 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_tls.h @@ -0,0 +1,100 @@ +#ifndef QURT_TLS_H +#define QURT_TLS_H +/** + @file qurt_tls.h + @brief Prototypes of TLS APIs + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + + +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_tls_create_key + @xreflabel{sec:tls_create_key} + Creates a key for accessing a thread local storage data item.\n + Subsequent get and set operations use the key value. + + @note1hang The destructor function performs any clean-up operations needed by a thread + local storage item when its containing thread is deleted (Section @xref{sec:qurt_thread_exit}). + + @param[out] key Pointer to the newly created thread local storage key value. + @param[in] destructor Pointer to the key-specific destructor function. Passing NULL + specifies that no destructor function is defined for the key. + + @return + #QURT_EOK -- Key successfully created. \n + #QURT_ETLSAVAIL -- No free TLS key available. + + @dependencies + None. + */ +int qurt_tls_create_key (int *key, void (*destructor)(void *)); + +/**@ingroup func_qurt_tls_set_specific + Stores a data item to thread local storage along with the specified key. + + @param[in] key Thread local storage key value. + @param[in] value Pointer to user data value to store. + + @return + #QURT_EOK -- Data item successfully stored. \n + #QURT_EINVALID -- Invalid key. \n + #QURT_EFAILED -- Invoked from a non-thread context. 
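+
+  Illustrative sketch combining qurt_tls_create_key() with the set/get
+  operations (per_thread_state is a hypothetical per-thread object; no
+  destructor is used):
+  @code
+  static int per_thread_state;  /* hypothetical per-thread object */
+  int key;
+  if (qurt_tls_create_key (&key, NULL) == QURT_EOK) {
+      qurt_tls_set_specific (key, &per_thread_state);
+      void *p = qurt_tls_get_specific (key);
+  }
+  @endcode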
+ */ +int qurt_tls_set_specific (int key, const void *value); + +/**@ingroup func_qurt_tls_get_specific + Loads the data item from thread local storage. \n + Returns the data item that is stored in thread local storage with the specified key. + The data item is always a pointer to user data. + + @param[in] key Thread local storage key value. + + @return + Pointer -- Data item indexed by key in thread local storage. \n + 0 (NULL) -- Key out of range. + + @dependencies + None. + */ +void * __attribute__((section(".text.qurt_tls_get_specific "))) qurt_tls_get_specific (int key); + + +/**@ingroup func_qurt_tls_delete_key + Deletes the specified key from thread local storage. + + @note1hang Explicitly deleting a key does not execute any destructor function that is + associated with the key (Section @xref{sec:tls_create_key}). + + @param[in] key Thread local storage key value to delete. + + @return + #QURT_EOK -- Key successfully deleted. \n + #QURT_ETLSENTRY -- Key already free. + + @dependencies + None. + */ +int qurt_tls_delete_key (int key); + + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_TLS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_trace.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_trace.h new file mode 100755 index 0000000000000..541f8f1d34bf6 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_trace.h @@ -0,0 +1,317 @@ +#ifndef QURT_TRACE_H +#define QURT_TRACE_H +/** + @file qurt_trace.h + @brief Prototypes of system call tracing helpers API + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021-2023 by Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + GLOBAL VARIABLES +=============================================================================*/ +/** @cond internal_only */ +/** @addtogroup etm_macros +@{ */ +/* ETM trace types. */ +#define QURT_ETM_TYPE_PC_ADDR (1U<<0) /**< PC address.*/ +#define QURT_ETM_TYPE_MEMORY_ADDR (1U<<1) /**< Memory address. */ +#define QURT_ETM_TYPE_TESTBUS (1U<<2) /**< Test bus. */ +#define QURT_ETM_TYPE_CYCLE_ACCURATE (1U<<3) /**< Cycle accurate. */ +#define QURT_ETM_TYPE_CYCLE_COARSE (1U<<4) /**< Cycle coarse. */ +#define QURT_ETM_TYPE_PC_AND_MEMORY_ADDR (QURT_ETM_TYPE_PC_ADDR|QURT_ETM_TYPE_MEMORY_ADDR) /**< PC and memory address. */ +#define QURT_ETM_TYPE_PC_ADDR_AND_TESTBUS (QURT_ETM_TYPE_PC_ADDR|QURT_ETM_TYPE_TESTBUS) /**< PC address and test bus. */ +#define QURT_ETM_TYPE_MEMORY_ADDR_AND_TESTBUS (QURT_ETM_TYPE_MEMORY_ADDR|QURT_ETM_TYPE_TESTBUS) /**< Memory address and test bus.*/ +#define QURT_ETM_TYPE_PC_AND_MEMORY_ADDR_AND_TESTBUS (QURT_ETM_TYPE_PC_ADDR|QURT_ETM_TYPE_MEMORY_ADDR|QURT_ETM_TYPE_TESTBUS) /**< PC, memory address, and test bus. */ + +/* ETM routes. */ +#define QURT_ETM_ROUTE_TO_QDSS 0U /**< ETM route to QDSS. */ +#define QURT_ETM_ROUTE_TO_Q6ETB 1U /**< ETM route to Q6ETB. */ + +/* ETM filters. */ +#define QURT_ETM_TRACE_FILTER_ALL_DEFAULT 0U /*< Filter all as default. */ +#define QURT_ETM_TRACE_FILTER_HNUM0 (1U<<0) /*< Filter HNUM0. */ +#define QURT_ETM_TRACE_FILTER_HNUM1 (1U<<1) /*< Filter HNUM1. */ +#define QURT_ETM_TRACE_FILTER_HNUM2 (1U<<2) /*< Filter HNUM2. 
*/ +#define QURT_ETM_TRACE_FILTER_HNUM3 (1U<<3) /*< Filter HNUM3. */ +#define QURT_ETM_TRACE_FILTER_HNUM4 (1U<<4) /*< Filter HNUM4. */ +#define QURT_ETM_TRACE_FILTER_HNUM5 (1U<<5) /*< Filter HNUM5. */ +#define QURT_ETM_TRACE_FILTER_HNUM6 (1U<<6) /*< Filter HNUM6. */ +#define QURT_ETM_TRACE_FILTER_HNUM7 (1U<<7) /*< Filter HNUM7. */ +#define QURT_ETM_TRACE_FILTER_HNUM8 (1U<<8) /*< Filter HNUM8. */ +#define QURT_ETM_TRACE_FILTER_HNUM9 (1U<<9) /*< Filter HNUM9. */ +#define QURT_ETM_TRACE_FILTER_HNUM10 (1U<<10) /*< Filter HNUM10. */ +#define QURT_ETM_TRACE_FILTER_HNUM11 (1U<<11) /*< Filter HNUM11. */ +#define QURT_ETM_TRACE_FILTER_HNUM12 (1U<<12) /*< Filter HNUM12. */ +#define QURT_ETM_TRACE_FILTER_HNUM13 (1U<<13) /*< Filter HNUM13. */ +#define QURT_ETM_TRACE_FILTER_HNUM14 (1U<<14) /*< Filter HNUM14. */ +#define QURT_ETM_TRACE_FILTER_HNUM15 (1U<<15) /*< Filter HNUM15. */ +#define QURT_ETM_TRACE_FILTER_ALL QURT_ETM_TRACE_FILTER_ALL_DEFAULT + +#define QURT_ETM_TRACE_FILTER_CLUSTER0 (1<<16) /*< Filter trace cluster0 address. */ +#define QURT_ETM_TRACE_FILTER_CLUSTER1 (1<<17) /*< Filter trace cluster1 address. */ +#define QURT_ETM_TRACE_FILTER_PC_RANGE (1<<19) /*< Filter PC address range. */ + +/* ETM memory source - PC or data access */ +#define QURT_ETM_SOURCE_PC 0U /**< ETM memory source of SAC* is PC. */ +#define QURT_ETM_SOURCE_DATA 1U /**< ETM memory source of SAC* is data. */ + +/* Period between synchronization traces */ +#define QURT_ETM_ASYNC_PERIOD 0 /**< Async.*/ +#define QURT_ETM_ISYNC_PERIOD 1 /**< Isync.*/ +#define QURT_ETM_GSYNC_PERIOD 2 /**< Gsync. */ + +/* ETM enable flags */ +#define QURT_ETM_OFF 0U /**< ETM off. */ +#define QURT_ETM_ON 1U /**< ETM on. */ +/** @endcond */ +/** @} */ /* end_addtogroup etm_macros */ + +/** @addtogroup function_tracing_macro +@{ */ +/* ETM setup return values */ +#define QURT_ETM_SETUP_OK 0 /**< ETM setup OK. */ +#define QURT_ETM_SETUP_ERR 1 /**< ETM setup error. */ +/** @} */ /* end_addtogroup function_tracing_macro */ +/* ETM breakpoint types */ +#define QURT_ETM_READWRITE_BRKPT 0U /**< ETM read/write breakpoint. */ +#define QURT_ETM_READ_BRKPT 1U /**< ETM read breakpoint. */ +#define QURT_ETM_WRITE_BRKPT 2U /**< ETM write breakpoint. */ +#define QURT_ETM_BRKPT_INVALIDATE 3U /**< Invalidate breakpoint. */ +/** @addtogroup function_tracing_macro +@{ */ +/* ATB status flags */ +#define QURT_ATB_OFF 0 /**< ATB off. */ +#define QURT_ATB_ON 1 /**< ATB on. */ +/** @} */ /* end_addtogroup function_tracing_macro */ +/* DTM enable flags */ +#define QURT_DTM_OFF 0 /**< DTM off. */ +#define QURT_DTM_ON 1 /**< DTM on. */ + +/** @addtogroup function_tracing_datatypes +@{ */ +/**STM trace information. */ +typedef struct qurt_stm_trace_info { + /** @cond */ + unsigned int stm_port_addr[6]; /* STM port address to which trace data must be written.*/ + unsigned int thread_event_id; /* Event ID for context switches.*/ + unsigned int interrupt_event_id; /* Event ID for interrupts. */ + unsigned int marker; /* Marker value that must be written at the beginning of the trace. */ + /** @endcond */ +} qurt_stm_trace_info_t; +/** @} */ /* end_addtogroup function_tracing_datatypes */ +/*============================================================================= + GLOBAL FUNCTIONS +=============================================================================*/ + + +/**@ingroup func_qurt_trace_get_marker + Gets the kernel trace marker.\n + Returns the current value of the kernel trace marker. 
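+
+ A typical pattern, sketched below, brackets a code region with this
+ function and qurt_trace_changed(); do_work() is hypothetical, and the
+ mask 0x3 assumes interrupt and context-switch tracing are enabled:
+ @code
+ unsigned int marker = qurt_trace_get_marker();
+ do_work();
+ if (qurt_trace_changed(marker, 0x3)) {
+     // An interrupt or context switch occurred during do_work().
+ }
+ @endcode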
+ The marker consists of a hardware thread identifier and an index into the kernel trace + buffer. The trace buffer records kernel events. + + @note1hang Using this function with qurt_trace_changed() + determines whether certain kernel events occurred in a block of code. + + @return + Integer -- Kernel trace marker. + + @dependencies + None. +*/ +unsigned int qurt_trace_get_marker(void); + +/**@ingroup func_qurt_trace_changed + Determines whether specific kernel events have occurred. \n + Returns a value that indicates whether the specified kernel events are recorded in the + kernel trace buffer since the specified kernel trace marker was obtained. + + The prev_trace_marker parameter specifies a kernel trace marker that was obtained by calling + qurt_trace_get_marker(). + @cond rest_dist For more information on the mask value, see the description of the trace_mask element in + @xhyperref{80VB41992,80-VB419-92}. \n @endcond + + @note1hang Used with qurt_trace_get_marker(), this function determines whether + certain kernel events occurred in a block of code.\n + @note1cont This function cannot determine whether a specific kernel event type has + occurred unless that event type has been enabled in the trace_mask element + of the system configuration file. \n + @note1cont QuRT supports the recording of interrupt and context switch events only (such as + a trace_mask value of 0x3). + + @param[in] prev_trace_marker Previous kernel trace marker. + @param[in] trace_mask Mask value that indicates which kernel events to check for. + + @returns + 1 -- Kernel events of the specified type have occurred since the + specified trace marker was obtained.\n + 0 -- No kernel events of the specified type have occurred since the + specified trace marker was obtained. + + @dependencies + None. +*/ +int qurt_trace_changed(unsigned int prev_trace_marker, unsigned int trace_mask); + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/** @addtogroup function_tracing_macro +@{ */ +#ifndef QURT_DEBUG +#define QURT_TRACE(str, ...) __VA_ARGS__ + /**< Function tracing is implemented with the QURT_TRACE debug macro, which + optionally generates printf statements both before and after every function call that is + passed as a macro argument. + + For example, in the following macro calls in the source code: + @code + QURT_TRACE(myfunc, my_func(33)) + + @endcode + generates the following debug output: + @code + myfile:nnn: my_func >>> calling my_func(33) + myfile:nnn: my_func >>> returned my_func(33) + @endcode + The debug output includes the source file and line number of the function call, along with + the text of the call. Compile the client source file with -D __FILENAME__ + defined for its file name. + + The library function qurt_printf() generates the debug output. + The QURT_DEBUG symbol controls generation of the debug output. If this symbol is + not defined, function tracing is not generated.\n + @note1hang The debug macro is accessed through the QuRT API header file. + */ +#else +#define QURT_TRACE(str, ...) \ + do { \ + qurt_printf("%s:%d: %s: >>> calling %s\n",__FILENAME__,__LINE__,(str),#__VA_ARGS__); \ + __VA_ARGS__; \ + qurt_printf("%s:%d: %s: <<< %s returned\n",__FILENAME__,__LINE__,(str),#__VA_ARGS__); \ + } while (0); +#endif +/** @} */ /* end_addtogroup function_tracing_macro */ + +/**@ingroup func_qurt_etm_set_pc_range + Sets the PC address range for ETM filtering. 
+ Depending on the Hexagon core design, a maximum of four PC ranges are supported. + + @param[in] range_num 0 to 3. + @param[in] low_addr Lower boundary of PC address range. + @param[in] high_addr Higher boundary of PC address range. + + @returns + #QURT_ETM_SETUP_OK -- Success. \n + #QURT_ETM_SETUP_ERR -- Failure. + + @dependencies + None. +*/ +unsigned int qurt_etm_set_pc_range(unsigned int range_num, unsigned int low_addr, unsigned int high_addr); + +/**@ingroup func_qurt_etm_set_range + Sets the address range for ETM filtering. + The source type of the addresses can be selected: #QURT_ETM_SOURCE_PC or #QURT_ETM_SOURCE_DATA. + + @param[in] addr_source_type Type of the address source:\n + - #QURT_ETM_SOURCE_PC \n + - #QURT_ETM_SOURCE_DATA @tablebulletend + @param[in] trig_block_num 0 to 3. + @param[in] pid PID of the process: \n + - Any valid PID number enables ASID-based trace filtering. \n + - QURT_ETM_NO_PID disables ASID-based trace filtering. + @param[in] low_addr Lower boundary of PC address range. + @param[in] high_addr Higher boundary of PC address range. + + @returns + #QURT_ETM_SETUP_OK -- Success. \n + #QURT_ETM_SETUP_ERR -- Failure. + + @dependencies + None. +*/ +unsigned int qurt_etm_set_range(unsigned int addr_source_type, unsigned int trig_block_num, unsigned int pid, unsigned int low_addr, unsigned int high_addr); + +/**@ingroup func_qurt_etm_set_atb + Sets the advanced trace bus (ATB) state to notify QuRT that the ATB is actively enabled or disabled. + QuRT performs the corresponding actions at low power management. + + @param[in] flag Values: \n + #QURT_ATB_ON \n + #QURT_ATB_OFF + + @returns + #QURT_ETM_SETUP_OK -- Success. \n + #QURT_ETM_SETUP_ERR -- Failure. + + @dependencies + None. +*/ +unsigned int qurt_etm_set_atb(unsigned int flag); + +/**@ingroup func_qurt_etm_set_sync_period + Sets the period for each type of synchronization trace packet. \n + ASYNC defines the period between alignment synchronization packets; the period is in terms of bytes in the packet stream. \n + ISYNC defines the period between instruction synchronization packets; the period is per thread and is defined as the bytes sent out for that thread. \n + GSYNC is the defined period in thread cycles between GSYNC packets. + + @param[in] sync_type Type of synchronization packets: \n + #QURT_ETM_ASYNC_PERIOD \n + #QURT_ETM_ISYNC_PERIOD \n + #QURT_ETM_GSYNC_PERIOD + @param[in] period Period value. + + @return + #QURT_ETM_SETUP_OK -- Success. \n + #QURT_ETM_SETUP_ERR -- Failure. + + @dependencies + None. + */ +unsigned int qurt_etm_set_sync_period(unsigned int sync_type, unsigned int period); + +/**@ingroup func_qurt_stm_trace_set_config + Sets up an STM port for tracing events. + + @datatypes + #qurt_stm_trace_info_t + + @param[in] stm_config_info Pointer to the STM trace information used to set up the trace + in the kernel. + The structure must have the following:\n + - One port address per hardware thread \n + - Event ID for context switches \n + - Event ID for interrupt tracing \n + - Header or marker to identify the beginning of the trace. @tablebulletend + + @return + #QURT_EOK -- Success. \n + #QURT_EINVALID -- Failure; possibly because the passed port address is not in the page table. + + @dependencies + None.
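+
+ A hedged configuration sketch; every value below is a placeholder, since
+ real STM port addresses and event IDs are target-specific:
+ @code
+ qurt_stm_trace_info_t cfg = {0};
+ cfg.stm_port_addr[0]   = 0xFE000000u; // hypothetical port for hardware thread 0
+ cfg.thread_event_id    = 1u;          // hypothetical context-switch event ID
+ cfg.interrupt_event_id = 2u;          // hypothetical interrupt event ID
+ cfg.marker             = 0x534D54u;   // hypothetical trace-start marker
+ unsigned int rc = qurt_stm_trace_set_config(&cfg);
+ @endcode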
+ */ +unsigned int qurt_stm_trace_set_config(qurt_stm_trace_info_t *stm_config_info); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_TRACE_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_types.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_types.h new file mode 100755 index 0000000000000..bdb83a3fe2fb2 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_types.h @@ -0,0 +1,294 @@ +#ifndef QURT_TYPES_H +#define QURT_TYPES_H +/** + @file qurt_types.h + @brief Contains types common to all configurations + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + + +//#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +#define PGA_BITFIELD_MASK(hi,lo) (((~0u)>>(31U-((hi)-(lo))))<<(lo)) +#define PGA_BITFIELD_GET(x,hi,lo) (((x)&PGA_BITFIELD_MASK((hi),(lo)))>>(lo)) +#define PGA_BITFIELD_INS(hi,lo,v) (((v)<<(lo))&PGA_BITFIELD_MASK((hi),(lo))) +#define PGA_BITFIELD_SET(x,hi,lo,v) ((x)=((x)&~PGA_BITFIELD_MASK((hi),(lo)))|PGA_BITFIELD_INS((hi),(lo),(v))) +#define QURT_PGATTR_C_GET(pga) PGA_BITFIELD_GET((pga).pga_value, 3U, 0U) /* Bits 3-0: cache */ +#define QURT_PGATTR_A_GET(pga) PGA_BITFIELD_GET((pga).pga_value, 5U, 4U) /* Bits 5-4: bus attr */ +#define QURT_PGATTR_C_SET(pga,v) PGA_BITFIELD_SET((pga).pga_value, 3U, 0U, (v)) /* Bits 3-0: cache */ +#define QURT_PGATTR_A_SET(pga,v) PGA_BITFIELD_SET((pga).pga_value, 5U, 4U, (v)) /* Bits 5-4: bus attr */ +#define QURT_PGATTR_MKRAW(v) ((qurt_pgattr_t){.pga_value = (v)}) +#define QURT_PGATTR_MK(c,a) QURT_PGATTR_MKRAW(PGA_BITFIELD_INS(3U,0U,(c))|PGA_BITFIELD_INS(5U,4U,(a))) + +/*return types for qurt_island_get_status2*/ +#define QURT_ISLAND_MODE_NORMAL 0U /**< Normal operating mode */ +#define QURT_ISLAND_MODE_ISLAND 1U /**< Island mode */ +#define QURT_ISLAND_MODE_EXITING 2U /**< In transition from Island mode to Normal mode */ + +/*============================================================================= + FORWARD DECLARATIONS & TYPEDEFS +=============================================================================*/ +/** @addtogroup memory_management_types +@{ */ +typedef unsigned int qurt_addr_t; /**< QuRT address type.*/ +typedef unsigned int qurt_paddr_t; /**< QuRT physical memory address type. */ +/** @cond rest_reg_dist */ +typedef unsigned long long qurt_addr_64_t; /**< QuRT 64-bit memory address type. */ +typedef unsigned long long qurt_paddr_64_t; /**< QuRT 64-bit physical memory address type. */ +typedef unsigned int qurt_mem_region_t; /**< QuRT memory regions type. */ +typedef unsigned int qurt_mem_fs_region_t; /**< QuRT memory FS region type. */ +/**@endcond */ +typedef unsigned int qurt_mem_pool_t; /**< QuRT memory pool type.*/ +typedef unsigned int qurt_size_t; /**< QuRT size type. */ +/** @cond */ +typedef unsigned long long qurt_mmu_entry_t;/**< QuRT MMU entry type. 
*/ +#define QURT_PHYSPOOL_NAME_LEN (32) +typedef char qurt_physpool_name_t[QURT_PHYSPOOL_NAME_LEN]; + + +/* + * Mapping type + * + * QMEM_MAPPING_VIRTUAL is the default mode, in which the system + * picks an available range of virtual addresses and maps it to + * available contiguous physical addresses. Physical-to-virtual + * is not guaranteed to be 1:1; both virtual and physical memory are + * contiguous. + * + * In QMEM_MAPPING_IDEMPOTENT mode, the user provides the physical address; + * the kernel allocates 1:1 physical-to-virtual memory. The primary use + * of this mapping is to allocate physical-to-virtual memory 1:1. + * + * In QMEM_MAPPING_PHYS_CONTIGUOUS mode, the virtual address might + * not be the same as the physical address, but the physical memory of the + * region is guaranteed to be contiguous, starting at the provided + * address; a fixed physical address must be provided. The primary + * use of this mapping is to allocate physical memory from a particular + * address, where 1:1 physical-to-virtual is not required. + * + * QMEM_MAPPING_NONE mode must be used to reserve a virtual memory + * area (VMA); no physical memory is reserved or mapped to this virtual + * space; all standard qmem_region APIs apply to a VMA, however the physical + * address is always INVALID_ADDR. qmem_region_create() in this mode + * returns a handle to the VMA; both virt_addr and phys_addr must + * be set to INVALID_ADDR, and the kernel allocates any available virtual + * memory of the specified size. Obtain the starting virtual address + * of the VMA through qmem_region_attr_getvirtaddr(). + * The primary purpose of this mapping mode is to provide a mechanism for + * delayed binding in QuRT, for example, reserving virtual memory and mapping it at + * some later time to possibly discontiguous physical blocks. Thus, a + * single VMA can be partitioned among several physical-virtual mappings + * created via qmem_region_create() with QMEM_VIRTUAL_FIXED mapping mode. + * Each VMA keeps track of associated mapped regions. + * Deletion of a VMA succeeds only if all associated "virtual_fixed" + * regions are freed prior to VMA deletion. + * + * Use QMEM_MAPPING_VIRTUAL_FIXED mode to create a region + * from virtual space that has been reserved via qmem_region_create() + * with QMEM_MAPPING_NONE mapping. A valid virt_addr is required; if + * phys_addr is specified, the kernel attempts to map it accordingly, and + * if no phys_addr is specified, the kernel maps any available physical + * memory. All standard qmem_region APIs apply to such a region. Remapping + * a virtual range without first freeing the region is not permitted. + * When such a region is deleted, its corresponding VMA remains intact. + * + * QMEM_MAPPING_PHYS_DISCONTIGUOUS mode obtains contiguous + * virtual memory, while the physical memory can be discontiguous. This method + * tries to combine small physical memory blocks to satisfy the requested + * size and is useful when no contiguous block of the requested size is + * available. If the client does not need contiguous physical memory + * (for example, if it does not use physical addressing), this helps + * use smaller physical memory blocks rather than contiguous memory. + * Note: When memory is allocated through this method, a physical address is + * not returned to the caller by the qurt_mem_region_attr_get() API, as there might + * not be a single physical address. + * + */ +/**@endcond */ +/** QuRT memory region mapping type. */ +typedef enum { + QURT_MEM_MAPPING_VIRTUAL=0, /**< Default mode.
The region virtual address range maps to an + available contiguous area of physical memory. For the most + efficient use of virtual memory, the QuRT system + chooses the base address in physical memory. This works for most memory + use cases.*/ + QURT_MEM_MAPPING_PHYS_CONTIGUOUS = 1, /**< The region virtual address space must be mapped to a + contiguous area of physical memory. This is necessary when the + memory region is accessed by external devices that bypass Hexagon + virtual memory addressing. The base address in physical + memory must be explicitly specified.*/ + QURT_MEM_MAPPING_IDEMPOTENT=2, /**< Region virtual address space maps + to the identical area of physical memory. */ + QURT_MEM_MAPPING_VIRTUAL_FIXED=3, /**< Virtual address space of the region maps either to the + specified area of physical memory or (if no area is specified) + to available physical memory. Use this mapping to create + regions from virtual space that was reserved by calling + qurt_mem_region_create() with mapping. */ + QURT_MEM_MAPPING_NONE=4, /**< Reserves a virtual memory area (VMA). Remapping a virtual range is not + permitted without first deleting the memory region. When such a region is + deleted, its corresponding virtual memory addressing remains intact. */ + QURT_MEM_MAPPING_VIRTUAL_RANDOM=7, /**< System chooses a random virtual address and + maps it to available contiguous physical addresses.*/ + QURT_MEM_MAPPING_PHYS_DISCONTIGUOUS=8, /**< While virtual memory is contiguous, allocates in discontiguous physical + memory blocks. This helps when there are smaller contiguous blocks + than the requested size. + Physical address is not provided as part of the get_attr call */ + QURT_MEM_MAPPING_INVALID=10, /**< Reserved as an invalid mapping type. */ +} qurt_mem_mapping_t; + + +/** QuRT cache mode type. */ +typedef enum { + QURT_MEM_CACHE_WRITEBACK=7, /**< Write back. */ + QURT_MEM_CACHE_NONE_SHARED=6, /**< Normal uncached memory that can be shared with other subsystems.*/ + QURT_MEM_CACHE_WRITETHROUGH=5, /**< Write through. */ + QURT_MEM_CACHE_WRITEBACK_NONL2CACHEABLE=0, /**< Write back non-L2-cacheable.*/ + QURT_MEM_CACHE_WRITETHROUGH_NONL2CACHEABLE=1, /**< Write through non-L2-cacheable. */ + QURT_MEM_CACHE_WRITEBACK_L2CACHEABLE=QURT_MEM_CACHE_WRITEBACK, /**< Write back L2 cacheable. */ + QURT_MEM_CACHE_WRITETHROUGH_L2CACHEABLE=QURT_MEM_CACHE_WRITETHROUGH, /**< Write through L2 cacheable. */ + QURT_MEM_CACHE_DEVICE = 4, /**< Volatile memory-mapped device. Access to device memory cannot be cancelled by interrupts, re-ordered, or replayed.*/ + QURT_MEM_CACHE_NONE = 4, /**< Deprecated -- use #QURT_MEM_CACHE_DEVICE instead. */ + QURT_MEM_CACHE_DEVICE_SFC = 2, /**< Enables placing limitations on the number of outstanding transactions. */ + QURT_MEM_CACHE_INVALID=10, /**< Reserved as an invalid cache type. */ +} qurt_mem_cache_mode_t; + +/** Memory access permission. */ +#define QURT_PERM_NONE 0x0U /**< No permission. */ +#define QURT_PERM_READ 0x1U /**< Read permission. */ +#define QURT_PERM_WRITE 0x2U /**< Write permission. */ +#define QURT_PERM_EXECUTE 0x4U /**< Execution permission. */ +#define QURT_PERM_NODUMP 0x8U + /**< Skip dumping the mapping. During process domain dump, must skip + some mappings on host memory to avoid a race condition + where the memory is removed from the host and DSP process + crashed before the mapping is removed. */ +#define QURT_PERM_FULL QURT_PERM_READ | QURT_PERM_WRITE | QURT_PERM_EXECUTE /**< Read, write, and execute permission. 
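+      Because the macro expands without surrounding parentheses, combined
+      permissions are typically built explicitly, for example:
+ @code
+ qurt_perm_t perms = (qurt_perm_t)(QURT_PERM_READ | QURT_PERM_WRITE);
+ @endcode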
*/ + +typedef unsigned char qurt_perm_t; + + +/** @cond rest_reg_dist*/ +/** QuRT cache type; specifies data cache or instruction cache. */ +typedef enum { + QURT_MEM_ICACHE, /**< Instruction cache.*/ + QURT_MEM_DCACHE /**< Data cache.*/ +} qurt_mem_cache_type_t; + +/** QuRT cache operation code type. */ +typedef enum { + QURT_MEM_CACHE_FLUSH, /**< Flush. */ + QURT_MEM_CACHE_INVALIDATE, /**< Invalidate */ + QURT_MEM_CACHE_FLUSH_INVALIDATE, /**< Flush invalidate. */ + QURT_MEM_CACHE_FLUSH_ALL, /**< Flush all. */ + QURT_MEM_CACHE_FLUSH_INVALIDATE_ALL, /**< Flush invalidate all. */ + QURT_MEM_CACHE_TABLE_FLUSH_INVALIDATE, /**< Table flush invalidate. */ + QURT_MEM_CACHE_FLUSH_INVALIDATE_L2, /**< L2 flush invalidate.*/ +} qurt_mem_cache_op_t; + +/** QuRT memory region type. */ +typedef enum { + QURT_MEM_REGION_LOCAL=0, /**< Local. */ + QURT_MEM_REGION_SHARED=1, /**< Shared.*/ + QURT_MEM_REGION_USER_ACCESS=2, /**< User access. */ + QURT_MEM_REGION_FS=4, /**< FS. */ + QURT_MEM_REGION_INVALID=10, /**< Reserved as an invalid region type. */ +} qurt_mem_region_type_t; + +/* Cache and bus attributes are combined into a value of this type for convenience, + and macros for combining and extracting fields are defined here. */ +/** @cond */ +struct qurt_pgattr { + unsigned pga_value; /**< PGA value.*/ +}; +typedef struct qurt_pgattr qurt_pgattr_t; +/** @endcond */ +/** QuRT memory region attributes type.*/ +/* QMEM_MAPPING_IDEMPOTENT and QMEM_MAPPING_PHYS_CONTIGUOUS mode can specify physaddr. + virtaddr cannot be specified for a memory region, it can only be queried by the + qmem_attr_getvirtaddr() function. + */ +typedef struct { + /** @cond */ + qurt_mem_mapping_t mapping_type; + unsigned char perms; + unsigned short owner; + qurt_pgattr_t pga; + unsigned ppn; //physical page number (physical>>12) + qurt_addr_t virtaddr; + qurt_mem_region_type_t type; + qurt_size_t size; + /** @endcond */ +} qurt_mem_region_attr_t; + + +/** QuRT user physical memory pool type. */ +typedef struct { + /** @cond */ + char name[32]; + struct ranges{ + unsigned int start; + unsigned int size; + } ranges[MAX_POOL_RANGES]; + /** @endcond */ +} qurt_mem_pool_attr_t; + +/** QuRT memory pool status type.*/ +typedef struct _qurt_mem_pool_status { + + qurt_size_t contig_size; /**< Largest contiguous free memory in bytes. */ + qurt_size_t free_size; /**< Total free memory in bytes. */ + qurt_size_t total_size; /**< Total declared memory in bytes. */ + +} qurt_mem_pool_status_t; + +typedef enum { + HEXAGON_L1_I_CACHE = 0, /**< Hexagon L1 instruction cache. */ + HEXAGON_L1_D_CACHE = 1, /**< Hexagon L1 data cache. */ + HEXAGON_L2_CACHE = 2 /**< Hexagon L2 cache. */ +} qurt_cache_type_t; + +typedef enum { + FULL_SIZE = 0, /**< Fully shared cache, without partitioning. */ + HALF_SIZE = 1, /**< 1/2 for main, 1/2 for auxiliary. */ + THREE_QUARTER_SIZE = 2, /**< 3/4 for main, 1/4 for auxiliary. */ + SEVEN_EIGHTHS_SIZE = 3 /**< 7/8 for main, 1/8 for auxiliary; for L2 cache only. */ +} qurt_cache_partition_size_t; + +typedef enum { + QURT_PROCESS_CB_GENERIC, /**< generic unconditional cb called after image loading. */ + QURT_PROCESS_NOTE_CB_PRE_MAP, /**< note cb called before segment loading. */ + QURT_PROCESS_NOTE_CB_POST_MAP /**< note cb called after segment loading. 
*/ +} qurt_process_cb_type_t; + +typedef union { + void *ptr; + int num; +} qurt_process_callback_arg_t; + + +/**@endcond*/ + +/** @} */ /* end_addtogroup memory_management_types */ +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_TYPES_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_user_dma.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_user_dma.h new file mode 100755 index 0000000000000..e05a6429fd703 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_user_dma.h @@ -0,0 +1,44 @@ +#ifndef QURT_USER_DMA_H +#define QURT_USER_DMA_H + +/** + @file qurt_user_dma.h + @brief Definitions, macros, and prototypes used for handling user DMA. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup qurt_user_dma_dmsyncht + Sends the DMSyncht command to the user DMA engine. + + Call this function to ensure all posted DMA memory operations are + complete. + + This stalls the current thread until the instruction + is complete and returns. + + @return + QURT_EOK - On dmsyncht completion \n + QURT_ENOTSUPPORTED - User DMA not supported + + @dependencies + None. +*/ +int qurt_user_dma_dmsyncht(void); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_vtlb.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_vtlb.h new file mode 100755 index 0000000000000..e064042e447ac --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_vtlb.h @@ -0,0 +1,76 @@ +/*============================================================================= + + qurt_vtlb.h + +GENERAL DESCRIPTION + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2019, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. +=============================================================================*/ +#ifndef QURT_VTLB_H +#define QURT_VTLB_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* +|| Names starting with "qurt_i_vtlb" are the internal low-level functions. +|| These should be considered subject to change. 
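+||
+|| For example, a statistics query might look like the sketch below
+|| (array layout per the note at the declaration that follows):
+||
+||     unsigned stats[3];
+||     (void)qurt_i_vtlb_statistics(stats);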
+*/ + +int qurt_i_vtlb_entry_create(unsigned *pIndex, + unsigned tlb_lo, + unsigned tlb_hi, + unsigned extension); + +int qurt_i_vtlb_entry_create_with_pid(unsigned *pIndex, + unsigned tlb_lo, + unsigned tlb_hi, + unsigned extension, + unsigned target_pid); + +int qurt_i_vtlb_entry_delete(unsigned index); + +int qurt_i_vtlb_entry_read(unsigned index, unsigned *tlbinfo); + +int qurt_i_vtlb_entry_write(unsigned index, unsigned tlb_lo, unsigned tlb_hi, unsigned extension); + +int qurt_i_vtlb_entry_write_with_pid(unsigned index, unsigned tlb_lo, unsigned tlb_hi, unsigned extension, unsigned target_pid); + +int qurt_i_vtlb_entry_probe(const void *vaddr, unsigned *tlbinfo, unsigned *pIndex); + +int qurt_i_vtlb_entry_probe_with_pid(const void *vaddr, unsigned *tlbinfo, unsigned *pIndex, unsigned target_pid); + + +int qurt_i_vtlb_statistics(unsigned *stats); // Returns stats[0] -- total number of VTLB entries + // stats[1] -- number of available VTLB entries + // stats[2] -- max size of VTLB tree since boot + +//can return index to an entry that was specialed, change it to take addresses instead of pages +int qurt_i_vtlb_set_special(int index, unsigned pageno, unsigned asid, unsigned size); + +int qurt_i_vtlb_queue_ppage(unsigned pageno, unsigned vtlb_index); + +#define QURT_VTLB_EXT_DEFAULT 0U +#define QURT_VTLB_EXT_LOCKED 1U +#define QURT_VTLB_EXT_EXCLUDE_DUMP 2U /* Temporary ability to skip certain mappings in pd dump */ +#define QURT_VTLB_EXT_FREELIST 0x800000u + +#define QURT_VTLB_ERR_OVERLAP -64 +#define QURT_VTLB_ERR_TREE_NO_SPACE -65 +#define QURT_VTLB_ERR_INVALID_SIZE -68 +#define QURT_VTLB_ERR_INVALID_EXT -69 +#define QURT_VTLB_ERR_DEL_PGT_LOCKED -70 +#define QURT_VTLB_ERR_PGT_LOCK_CNT -71 + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif // QURT_VTLB_H diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/libposix.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/libposix.a new file mode 100755 index 0000000000000..fd0c274b7ca0e Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/libposix.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/libqurt.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/libqurt.a new file mode 100755 index 0000000000000..23238a59eaa87 Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/libqurt.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/libqurtcfs.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/libqurtcfs.a new file mode 100755 index 0000000000000..85f9ad9d41bce Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/libqurtcfs.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/libtimer_island.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/libtimer_island.a new file mode 100755 index 0000000000000..b4a6a40af02a8 Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/libtimer_island.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/libtimer_main.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/libtimer_main.a new file mode 100755 index 0000000000000..472857ff02a1f Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/libtimer_main.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/pic/libposix.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/pic/libposix.a new file mode 100755 
index 0000000000000..566d5c66d3f03 Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/pic/libposix.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/pic/libqurt.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/pic/libqurt.a new file mode 100755 index 0000000000000..fffad1d70a51c Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/pic/libqurt.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/pic/libqurtcfs.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/pic/libqurtcfs.a new file mode 100755 index 0000000000000..85f9ad9d41bce Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/pic/libqurtcfs.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/pic/libtimer.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/pic/libtimer.a new file mode 100755 index 0000000000000..89aa8ae9e03bb Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/pic/libtimer.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/bits/confname.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/bits/confname.h new file mode 100755 index 0000000000000..d9ca3135501e3 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/bits/confname.h @@ -0,0 +1,528 @@ +#ifndef CONFNAME_H +#define CONFNAME_H +/** + @file confname.h + @brief Named literals for 'name' argument of sysconf, pathconf + +EXTERNAL FUNCTIONS + None + +INITIALIZATION AND SEQUENCING REQUIREMENTS + DONT include this header directly. Instead include unistd.h. For now since + toolchain doesnt provide a hook by including bits/confname.h, we stick this + header in QuRT's sys/types.h + +Copyright (c) 2018, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +==============================================================================*/ +/* Values for the NAME argument to `pathconf' and `fpathconf'. */ +enum +{ + _PC_LINK_MAX, +#define _PC_LINK_MAX _PC_LINK_MAX + _PC_MAX_CANON, +#define _PC_MAX_CANON _PC_MAX_CANON + _PC_MAX_INPUT, +#define _PC_MAX_INPUT _PC_MAX_INPUT + _PC_NAME_MAX, +#define _PC_NAME_MAX _PC_NAME_MAX + _PC_PATH_MAX, +#define _PC_PATH_MAX _PC_PATH_MAX + _PC_PIPE_BUF, +#define _PC_PIPE_BUF _PC_PIPE_BUF + _PC_CHOWN_RESTRICTED, +#define _PC_CHOWN_RESTRICTED _PC_CHOWN_RESTRICTED + _PC_NO_TRUNC, +#define _PC_NO_TRUNC _PC_NO_TRUNC + _PC_VDISABLE, +#define _PC_VDISABLE _PC_VDISABLE + _PC_SYNC_IO, +#define _PC_SYNC_IO _PC_SYNC_IO + _PC_ASYNC_IO, +#define _PC_ASYNC_IO _PC_ASYNC_IO + _PC_PRIO_IO, +#define _PC_PRIO_IO _PC_PRIO_IO + _PC_SOCK_MAXBUF, +#define _PC_SOCK_MAXBUF _PC_SOCK_MAXBUF + _PC_FILESIZEBITS, +#define _PC_FILESIZEBITS _PC_FILESIZEBITS + _PC_REC_INCR_XFER_SIZE, +#define _PC_REC_INCR_XFER_SIZE _PC_REC_INCR_XFER_SIZE + _PC_REC_MAX_XFER_SIZE, +#define _PC_REC_MAX_XFER_SIZE _PC_REC_MAX_XFER_SIZE + _PC_REC_MIN_XFER_SIZE, +#define _PC_REC_MIN_XFER_SIZE _PC_REC_MIN_XFER_SIZE + _PC_REC_XFER_ALIGN, +#define _PC_REC_XFER_ALIGN _PC_REC_XFER_ALIGN + _PC_ALLOC_SIZE_MIN, +#define _PC_ALLOC_SIZE_MIN _PC_ALLOC_SIZE_MIN + _PC_SYMLINK_MAX, +#define _PC_SYMLINK_MAX _PC_SYMLINK_MAX + _PC_2_SYMLINKS +#define _PC_2_SYMLINKS _PC_2_SYMLINKS +}; + +/* Values for the argument to `sysconf'. 
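+   For example, a typical query (assuming the C library provides sysconf()):
+
+       long pagesz = sysconf(_SC_PAGESIZE);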
*/ +enum +{ + _SC_ARG_MAX, +#define _SC_ARG_MAX _SC_ARG_MAX + _SC_CHILD_MAX, +#define _SC_CHILD_MAX _SC_CHILD_MAX + _SC_CLK_TCK, +#define _SC_CLK_TCK _SC_CLK_TCK + _SC_NGROUPS_MAX, +#define _SC_NGROUPS_MAX _SC_NGROUPS_MAX + _SC_OPEN_MAX, +#define _SC_OPEN_MAX _SC_OPEN_MAX + _SC_STREAM_MAX, +#define _SC_STREAM_MAX _SC_STREAM_MAX + _SC_TZNAME_MAX, +#define _SC_TZNAME_MAX _SC_TZNAME_MAX + _SC_JOB_CONTROL, +#define _SC_JOB_CONTROL _SC_JOB_CONTROL + _SC_SAVED_IDS, +#define _SC_SAVED_IDS _SC_SAVED_IDS + _SC_REALTIME_SIGNALS, +#define _SC_REALTIME_SIGNALS _SC_REALTIME_SIGNALS + _SC_PRIORITY_SCHEDULING, +#define _SC_PRIORITY_SCHEDULING _SC_PRIORITY_SCHEDULING + _SC_TIMERS, +#define _SC_TIMERS _SC_TIMERS + _SC_ASYNCHRONOUS_IO, +#define _SC_ASYNCHRONOUS_IO _SC_ASYNCHRONOUS_IO + _SC_PRIORITIZED_IO, +#define _SC_PRIORITIZED_IO _SC_PRIORITIZED_IO + _SC_SYNCHRONIZED_IO, +#define _SC_SYNCHRONIZED_IO _SC_SYNCHRONIZED_IO + _SC_FSYNC, +#define _SC_FSYNC _SC_FSYNC + _SC_MAPPED_FILES, +#define _SC_MAPPED_FILES _SC_MAPPED_FILES + _SC_MEMLOCK, +#define _SC_MEMLOCK _SC_MEMLOCK + _SC_MEMLOCK_RANGE, +#define _SC_MEMLOCK_RANGE _SC_MEMLOCK_RANGE + _SC_MEMORY_PROTECTION, +#define _SC_MEMORY_PROTECTION _SC_MEMORY_PROTECTION + _SC_MESSAGE_PASSING, +#define _SC_MESSAGE_PASSING _SC_MESSAGE_PASSING + _SC_SEMAPHORES, +#define _SC_SEMAPHORES _SC_SEMAPHORES + _SC_SHARED_MEMORY_OBJECTS, +#define _SC_SHARED_MEMORY_OBJECTS _SC_SHARED_MEMORY_OBJECTS + _SC_AIO_LISTIO_MAX, +#define _SC_AIO_LISTIO_MAX _SC_AIO_LISTIO_MAX + _SC_AIO_MAX, +#define _SC_AIO_MAX _SC_AIO_MAX + _SC_AIO_PRIO_DELTA_MAX, +#define _SC_AIO_PRIO_DELTA_MAX _SC_AIO_PRIO_DELTA_MAX + _SC_DELAYTIMER_MAX, +#define _SC_DELAYTIMER_MAX _SC_DELAYTIMER_MAX + _SC_MQ_OPEN_MAX, +#define _SC_MQ_OPEN_MAX _SC_MQ_OPEN_MAX + _SC_MQ_PRIO_MAX, +#define _SC_MQ_PRIO_MAX _SC_MQ_PRIO_MAX + _SC_VERSION, +#define _SC_VERSION _SC_VERSION + _SC_PAGESIZE, +#define _SC_PAGESIZE _SC_PAGESIZE +#define _SC_PAGE_SIZE _SC_PAGESIZE + _SC_RTSIG_MAX, +#define _SC_RTSIG_MAX _SC_RTSIG_MAX + _SC_SEM_NSEMS_MAX, +#define _SC_SEM_NSEMS_MAX _SC_SEM_NSEMS_MAX + _SC_SEM_VALUE_MAX, +#define _SC_SEM_VALUE_MAX _SC_SEM_VALUE_MAX + _SC_SIGQUEUE_MAX, +#define _SC_SIGQUEUE_MAX _SC_SIGQUEUE_MAX + _SC_TIMER_MAX, +#define _SC_TIMER_MAX _SC_TIMER_MAX + + /* Values for the argument to `sysconf' + corresponding to _POSIX2_* symbols. 
*/ + _SC_BC_BASE_MAX, +#define _SC_BC_BASE_MAX _SC_BC_BASE_MAX + _SC_BC_DIM_MAX, +#define _SC_BC_DIM_MAX _SC_BC_DIM_MAX + _SC_BC_SCALE_MAX, +#define _SC_BC_SCALE_MAX _SC_BC_SCALE_MAX + _SC_BC_STRING_MAX, +#define _SC_BC_STRING_MAX _SC_BC_STRING_MAX + _SC_COLL_WEIGHTS_MAX, +#define _SC_COLL_WEIGHTS_MAX _SC_COLL_WEIGHTS_MAX + _SC_EQUIV_CLASS_MAX, +#define _SC_EQUIV_CLASS_MAX _SC_EQUIV_CLASS_MAX + _SC_EXPR_NEST_MAX, +#define _SC_EXPR_NEST_MAX _SC_EXPR_NEST_MAX + _SC_LINE_MAX, +#define _SC_LINE_MAX _SC_LINE_MAX + _SC_RE_DUP_MAX, +#define _SC_RE_DUP_MAX _SC_RE_DUP_MAX + _SC_CHARCLASS_NAME_MAX, +#define _SC_CHARCLASS_NAME_MAX _SC_CHARCLASS_NAME_MAX + + _SC_2_VERSION, +#define _SC_2_VERSION _SC_2_VERSION + _SC_2_C_BIND, +#define _SC_2_C_BIND _SC_2_C_BIND + _SC_2_C_DEV, +#define _SC_2_C_DEV _SC_2_C_DEV + _SC_2_FORT_DEV, +#define _SC_2_FORT_DEV _SC_2_FORT_DEV + _SC_2_FORT_RUN, +#define _SC_2_FORT_RUN _SC_2_FORT_RUN + _SC_2_SW_DEV, +#define _SC_2_SW_DEV _SC_2_SW_DEV + _SC_2_LOCALEDEF, +#define _SC_2_LOCALEDEF _SC_2_LOCALEDEF + + _SC_PII, +#define _SC_PII _SC_PII + _SC_PII_XTI, +#define _SC_PII_XTI _SC_PII_XTI + _SC_PII_SOCKET, +#define _SC_PII_SOCKET _SC_PII_SOCKET + _SC_PII_INTERNET, +#define _SC_PII_INTERNET _SC_PII_INTERNET + _SC_PII_OSI, +#define _SC_PII_OSI _SC_PII_OSI + _SC_POLL, +#define _SC_POLL _SC_POLL + _SC_SELECT, +#define _SC_SELECT _SC_SELECT + _SC_UIO_MAXIOV, +#define _SC_UIO_MAXIOV _SC_UIO_MAXIOV + _SC_IOV_MAX = _SC_UIO_MAXIOV, +#define _SC_IOV_MAX _SC_IOV_MAX + _SC_PII_INTERNET_STREAM, +#define _SC_PII_INTERNET_STREAM _SC_PII_INTERNET_STREAM + _SC_PII_INTERNET_DGRAM, +#define _SC_PII_INTERNET_DGRAM _SC_PII_INTERNET_DGRAM + _SC_PII_OSI_COTS, +#define _SC_PII_OSI_COTS _SC_PII_OSI_COTS + _SC_PII_OSI_CLTS, +#define _SC_PII_OSI_CLTS _SC_PII_OSI_CLTS + _SC_PII_OSI_M, +#define _SC_PII_OSI_M _SC_PII_OSI_M + _SC_T_IOV_MAX, +#define _SC_T_IOV_MAX _SC_T_IOV_MAX + + /* Values according to POSIX 1003.1c (POSIX threads). 
*/ + _SC_THREADS, +#define _SC_THREADS _SC_THREADS + _SC_THREAD_SAFE_FUNCTIONS, +#define _SC_THREAD_SAFE_FUNCTIONS _SC_THREAD_SAFE_FUNCTIONS + _SC_GETGR_R_SIZE_MAX, +#define _SC_GETGR_R_SIZE_MAX _SC_GETGR_R_SIZE_MAX + _SC_GETPW_R_SIZE_MAX, +#define _SC_GETPW_R_SIZE_MAX _SC_GETPW_R_SIZE_MAX + _SC_LOGIN_NAME_MAX, +#define _SC_LOGIN_NAME_MAX _SC_LOGIN_NAME_MAX + _SC_TTY_NAME_MAX, +#define _SC_TTY_NAME_MAX _SC_TTY_NAME_MAX + _SC_THREAD_DESTRUCTOR_ITERATIONS, +#define _SC_THREAD_DESTRUCTOR_ITERATIONS _SC_THREAD_DESTRUCTOR_ITERATIONS + _SC_THREAD_KEYS_MAX, +#define _SC_THREAD_KEYS_MAX _SC_THREAD_KEYS_MAX + _SC_THREAD_STACK_MIN, +#define _SC_THREAD_STACK_MIN _SC_THREAD_STACK_MIN + _SC_THREAD_THREADS_MAX, +#define _SC_THREAD_THREADS_MAX _SC_THREAD_THREADS_MAX + _SC_THREAD_ATTR_STACKADDR, +#define _SC_THREAD_ATTR_STACKADDR _SC_THREAD_ATTR_STACKADDR + _SC_THREAD_ATTR_STACKSIZE, +#define _SC_THREAD_ATTR_STACKSIZE _SC_THREAD_ATTR_STACKSIZE + _SC_THREAD_PRIORITY_SCHEDULING, +#define _SC_THREAD_PRIORITY_SCHEDULING _SC_THREAD_PRIORITY_SCHEDULING + _SC_THREAD_PRIO_INHERIT, +#define _SC_THREAD_PRIO_INHERIT _SC_THREAD_PRIO_INHERIT + _SC_THREAD_PRIO_PROTECT, +#define _SC_THREAD_PRIO_PROTECT _SC_THREAD_PRIO_PROTECT + _SC_THREAD_PROCESS_SHARED, +#define _SC_THREAD_PROCESS_SHARED _SC_THREAD_PROCESS_SHARED + + _SC_NPROCESSORS_CONF, +#define _SC_NPROCESSORS_CONF _SC_NPROCESSORS_CONF + _SC_NPROCESSORS_ONLN, +#define _SC_NPROCESSORS_ONLN _SC_NPROCESSORS_ONLN + _SC_PHYS_PAGES, +#define _SC_PHYS_PAGES _SC_PHYS_PAGES + _SC_AVPHYS_PAGES, +#define _SC_AVPHYS_PAGES _SC_AVPHYS_PAGES + _SC_ATEXIT_MAX, +#define _SC_ATEXIT_MAX _SC_ATEXIT_MAX + _SC_PASS_MAX, +#define _SC_PASS_MAX _SC_PASS_MAX + + _SC_XOPEN_VERSION, +#define _SC_XOPEN_VERSION _SC_XOPEN_VERSION + _SC_XOPEN_XCU_VERSION, +#define _SC_XOPEN_XCU_VERSION _SC_XOPEN_XCU_VERSION + _SC_XOPEN_UNIX, +#define _SC_XOPEN_UNIX _SC_XOPEN_UNIX + _SC_XOPEN_CRYPT, +#define _SC_XOPEN_CRYPT _SC_XOPEN_CRYPT + _SC_XOPEN_ENH_I18N, +#define _SC_XOPEN_ENH_I18N _SC_XOPEN_ENH_I18N + _SC_XOPEN_SHM, +#define _SC_XOPEN_SHM _SC_XOPEN_SHM + + _SC_2_CHAR_TERM, +#define _SC_2_CHAR_TERM _SC_2_CHAR_TERM + _SC_2_C_VERSION, +#define _SC_2_C_VERSION _SC_2_C_VERSION + _SC_2_UPE, +#define _SC_2_UPE _SC_2_UPE + + _SC_XOPEN_XPG2, +#define _SC_XOPEN_XPG2 _SC_XOPEN_XPG2 + _SC_XOPEN_XPG3, +#define _SC_XOPEN_XPG3 _SC_XOPEN_XPG3 + _SC_XOPEN_XPG4, +#define _SC_XOPEN_XPG4 _SC_XOPEN_XPG4 + + _SC_CHAR_BIT, +#define _SC_CHAR_BIT _SC_CHAR_BIT + _SC_CHAR_MAX, +#define _SC_CHAR_MAX _SC_CHAR_MAX + _SC_CHAR_MIN, +#define _SC_CHAR_MIN _SC_CHAR_MIN + _SC_INT_MAX, +#define _SC_INT_MAX _SC_INT_MAX + _SC_INT_MIN, +#define _SC_INT_MIN _SC_INT_MIN + _SC_LONG_BIT, +#define _SC_LONG_BIT _SC_LONG_BIT + _SC_WORD_BIT, +#define _SC_WORD_BIT _SC_WORD_BIT + _SC_MB_LEN_MAX, +#define _SC_MB_LEN_MAX _SC_MB_LEN_MAX + _SC_NZERO, +#define _SC_NZERO _SC_NZERO + _SC_SSIZE_MAX, +#define _SC_SSIZE_MAX _SC_SSIZE_MAX + _SC_SCHAR_MAX, +#define _SC_SCHAR_MAX _SC_SCHAR_MAX + _SC_SCHAR_MIN, +#define _SC_SCHAR_MIN _SC_SCHAR_MIN + _SC_SHRT_MAX, +#define _SC_SHRT_MAX _SC_SHRT_MAX + _SC_SHRT_MIN, +#define _SC_SHRT_MIN _SC_SHRT_MIN + _SC_UCHAR_MAX, +#define _SC_UCHAR_MAX _SC_UCHAR_MAX + _SC_UINT_MAX, +#define _SC_UINT_MAX _SC_UINT_MAX + _SC_ULONG_MAX, +#define _SC_ULONG_MAX _SC_ULONG_MAX + _SC_USHRT_MAX, +#define _SC_USHRT_MAX _SC_USHRT_MAX + + _SC_NL_ARGMAX, +#define _SC_NL_ARGMAX _SC_NL_ARGMAX + _SC_NL_LANGMAX, +#define _SC_NL_LANGMAX _SC_NL_LANGMAX + _SC_NL_MSGMAX, +#define _SC_NL_MSGMAX _SC_NL_MSGMAX + _SC_NL_NMAX, +#define _SC_NL_NMAX _SC_NL_NMAX + 
_SC_NL_SETMAX, +#define _SC_NL_SETMAX _SC_NL_SETMAX + _SC_NL_TEXTMAX, +#define _SC_NL_TEXTMAX _SC_NL_TEXTMAX + + _SC_XBS5_ILP32_OFF32, +#define _SC_XBS5_ILP32_OFF32 _SC_XBS5_ILP32_OFF32 + _SC_XBS5_ILP32_OFFBIG, +#define _SC_XBS5_ILP32_OFFBIG _SC_XBS5_ILP32_OFFBIG + _SC_XBS5_LP64_OFF64, +#define _SC_XBS5_LP64_OFF64 _SC_XBS5_LP64_OFF64 + _SC_XBS5_LPBIG_OFFBIG, +#define _SC_XBS5_LPBIG_OFFBIG _SC_XBS5_LPBIG_OFFBIG + + _SC_XOPEN_LEGACY, +#define _SC_XOPEN_LEGACY _SC_XOPEN_LEGACY + _SC_XOPEN_REALTIME, +#define _SC_XOPEN_REALTIME _SC_XOPEN_REALTIME + _SC_XOPEN_REALTIME_THREADS, +#define _SC_XOPEN_REALTIME_THREADS _SC_XOPEN_REALTIME_THREADS + + _SC_ADVISORY_INFO, +#define _SC_ADVISORY_INFO _SC_ADVISORY_INFO + _SC_BARRIERS, +#define _SC_BARRIERS _SC_BARRIERS + _SC_BASE, +#define _SC_BASE _SC_BASE + _SC_C_LANG_SUPPORT, +#define _SC_C_LANG_SUPPORT _SC_C_LANG_SUPPORT + _SC_C_LANG_SUPPORT_R, +#define _SC_C_LANG_SUPPORT_R _SC_C_LANG_SUPPORT_R + _SC_CLOCK_SELECTION, +#define _SC_CLOCK_SELECTION _SC_CLOCK_SELECTION + _SC_CPUTIME, +#define _SC_CPUTIME _SC_CPUTIME + _SC_THREAD_CPUTIME, +#define _SC_THREAD_CPUTIME _SC_THREAD_CPUTIME + _SC_DEVICE_IO, +#define _SC_DEVICE_IO _SC_DEVICE_IO + _SC_DEVICE_SPECIFIC, +#define _SC_DEVICE_SPECIFIC _SC_DEVICE_SPECIFIC + _SC_DEVICE_SPECIFIC_R, +#define _SC_DEVICE_SPECIFIC_R _SC_DEVICE_SPECIFIC_R + _SC_FD_MGMT, +#define _SC_FD_MGMT _SC_FD_MGMT + _SC_FIFO, +#define _SC_FIFO _SC_FIFO + _SC_PIPE, +#define _SC_PIPE _SC_PIPE + _SC_FILE_ATTRIBUTES, +#define _SC_FILE_ATTRIBUTES _SC_FILE_ATTRIBUTES + _SC_FILE_LOCKING, +#define _SC_FILE_LOCKING _SC_FILE_LOCKING + _SC_FILE_SYSTEM, +#define _SC_FILE_SYSTEM _SC_FILE_SYSTEM + _SC_MONOTONIC_CLOCK, +#define _SC_MONOTONIC_CLOCK _SC_MONOTONIC_CLOCK + _SC_MULTI_PROCESS, +#define _SC_MULTI_PROCESS _SC_MULTI_PROCESS + _SC_SINGLE_PROCESS, +#define _SC_SINGLE_PROCESS _SC_SINGLE_PROCESS + _SC_NETWORKING, +#define _SC_NETWORKING _SC_NETWORKING + _SC_READER_WRITER_LOCKS, +#define _SC_READER_WRITER_LOCKS _SC_READER_WRITER_LOCKS + _SC_SPIN_LOCKS, +#define _SC_SPIN_LOCKS _SC_SPIN_LOCKS + _SC_REGEXP, +#define _SC_REGEXP _SC_REGEXP + _SC_REGEX_VERSION, +#define _SC_REGEX_VERSION _SC_REGEX_VERSION + _SC_SHELL, +#define _SC_SHELL _SC_SHELL + _SC_SIGNALS, +#define _SC_SIGNALS _SC_SIGNALS + _SC_SPAWN, +#define _SC_SPAWN _SC_SPAWN + _SC_SPORADIC_SERVER, +#define _SC_SPORADIC_SERVER _SC_SPORADIC_SERVER + _SC_THREAD_SPORADIC_SERVER, +#define _SC_THREAD_SPORADIC_SERVER _SC_THREAD_SPORADIC_SERVER + _SC_SYSTEM_DATABASE, +#define _SC_SYSTEM_DATABASE _SC_SYSTEM_DATABASE + _SC_SYSTEM_DATABASE_R, +#define _SC_SYSTEM_DATABASE_R _SC_SYSTEM_DATABASE_R + _SC_TIMEOUTS, +#define _SC_TIMEOUTS _SC_TIMEOUTS + _SC_TYPED_MEMORY_OBJECTS, +#define _SC_TYPED_MEMORY_OBJECTS _SC_TYPED_MEMORY_OBJECTS + _SC_USER_GROUPS, +#define _SC_USER_GROUPS _SC_USER_GROUPS + _SC_USER_GROUPS_R, +#define _SC_USER_GROUPS_R _SC_USER_GROUPS_R + _SC_2_PBS, +#define _SC_2_PBS _SC_2_PBS + _SC_2_PBS_ACCOUNTING, +#define _SC_2_PBS_ACCOUNTING _SC_2_PBS_ACCOUNTING + _SC_2_PBS_LOCATE, +#define _SC_2_PBS_LOCATE _SC_2_PBS_LOCATE + _SC_2_PBS_MESSAGE, +#define _SC_2_PBS_MESSAGE _SC_2_PBS_MESSAGE + _SC_2_PBS_TRACK, +#define _SC_2_PBS_TRACK _SC_2_PBS_TRACK + _SC_SYMLOOP_MAX, +#define _SC_SYMLOOP_MAX _SC_SYMLOOP_MAX + _SC_STREAMS, +#define _SC_STREAMS _SC_STREAMS + _SC_2_PBS_CHECKPOINT, +#define _SC_2_PBS_CHECKPOINT _SC_2_PBS_CHECKPOINT + + _SC_V6_ILP32_OFF32, +#define _SC_V6_ILP32_OFF32 _SC_V6_ILP32_OFF32 + _SC_V6_ILP32_OFFBIG, +#define _SC_V6_ILP32_OFFBIG _SC_V6_ILP32_OFFBIG + _SC_V6_LP64_OFF64, +#define 
_SC_V6_LP64_OFF64 _SC_V6_LP64_OFF64 + _SC_V6_LPBIG_OFFBIG, +#define _SC_V6_LPBIG_OFFBIG _SC_V6_LPBIG_OFFBIG + + _SC_HOST_NAME_MAX, +#define _SC_HOST_NAME_MAX _SC_HOST_NAME_MAX + _SC_TRACE, +#define _SC_TRACE _SC_TRACE + _SC_TRACE_EVENT_FILTER, +#define _SC_TRACE_EVENT_FILTER _SC_TRACE_EVENT_FILTER + _SC_TRACE_INHERIT, +#define _SC_TRACE_INHERIT _SC_TRACE_INHERIT + _SC_TRACE_LOG, +#define _SC_TRACE_LOG _SC_TRACE_LOG + + _SC_LEVEL1_ICACHE_SIZE, +#define _SC_LEVEL1_ICACHE_SIZE _SC_LEVEL1_ICACHE_SIZE + _SC_LEVEL1_ICACHE_ASSOC, +#define _SC_LEVEL1_ICACHE_ASSOC _SC_LEVEL1_ICACHE_ASSOC + _SC_LEVEL1_ICACHE_LINESIZE, +#define _SC_LEVEL1_ICACHE_LINESIZE _SC_LEVEL1_ICACHE_LINESIZE + _SC_LEVEL1_DCACHE_SIZE, +#define _SC_LEVEL1_DCACHE_SIZE _SC_LEVEL1_DCACHE_SIZE + _SC_LEVEL1_DCACHE_ASSOC, +#define _SC_LEVEL1_DCACHE_ASSOC _SC_LEVEL1_DCACHE_ASSOC + _SC_LEVEL1_DCACHE_LINESIZE, +#define _SC_LEVEL1_DCACHE_LINESIZE _SC_LEVEL1_DCACHE_LINESIZE + _SC_LEVEL2_CACHE_SIZE, +#define _SC_LEVEL2_CACHE_SIZE _SC_LEVEL2_CACHE_SIZE + _SC_LEVEL2_CACHE_ASSOC, +#define _SC_LEVEL2_CACHE_ASSOC _SC_LEVEL2_CACHE_ASSOC + _SC_LEVEL2_CACHE_LINESIZE, +#define _SC_LEVEL2_CACHE_LINESIZE _SC_LEVEL2_CACHE_LINESIZE + _SC_LEVEL3_CACHE_SIZE, +#define _SC_LEVEL3_CACHE_SIZE _SC_LEVEL3_CACHE_SIZE + _SC_LEVEL3_CACHE_ASSOC, +#define _SC_LEVEL3_CACHE_ASSOC _SC_LEVEL3_CACHE_ASSOC + _SC_LEVEL3_CACHE_LINESIZE, +#define _SC_LEVEL3_CACHE_LINESIZE _SC_LEVEL3_CACHE_LINESIZE + _SC_LEVEL4_CACHE_SIZE, +#define _SC_LEVEL4_CACHE_SIZE _SC_LEVEL4_CACHE_SIZE + _SC_LEVEL4_CACHE_ASSOC, +#define _SC_LEVEL4_CACHE_ASSOC _SC_LEVEL4_CACHE_ASSOC + _SC_LEVEL4_CACHE_LINESIZE, +#define _SC_LEVEL4_CACHE_LINESIZE _SC_LEVEL4_CACHE_LINESIZE + /* Leave room here, maybe we need a few more cache levels some day. */ + + _SC_IPV6 = _SC_LEVEL1_ICACHE_SIZE + 50, +#define _SC_IPV6 _SC_IPV6 + _SC_RAW_SOCKETS, +#define _SC_RAW_SOCKETS _SC_RAW_SOCKETS + + _SC_V7_ILP32_OFF32, +#define _SC_V7_ILP32_OFF32 _SC_V7_ILP32_OFF32 + _SC_V7_ILP32_OFFBIG, +#define _SC_V7_ILP32_OFFBIG _SC_V7_ILP32_OFFBIG + _SC_V7_LP64_OFF64, +#define _SC_V7_LP64_OFF64 _SC_V7_LP64_OFF64 + _SC_V7_LPBIG_OFFBIG, +#define _SC_V7_LPBIG_OFFBIG _SC_V7_LPBIG_OFFBIG + + _SC_SS_REPL_MAX, +#define _SC_SS_REPL_MAX _SC_SS_REPL_MAX + + _SC_TRACE_EVENT_NAME_MAX, +#define _SC_TRACE_EVENT_NAME_MAX _SC_TRACE_EVENT_NAME_MAX + _SC_TRACE_NAME_MAX, +#define _SC_TRACE_NAME_MAX _SC_TRACE_NAME_MAX + _SC_TRACE_SYS_MAX, +#define _SC_TRACE_SYS_MAX _SC_TRACE_SYS_MAX + _SC_TRACE_USER_EVENT_MAX, +#define _SC_TRACE_USER_EVENT_MAX _SC_TRACE_USER_EVENT_MAX + + _SC_XOPEN_STREAMS, +#define _SC_XOPEN_STREAMS _SC_XOPEN_STREAMS + + _SC_THREAD_ROBUST_PRIO_INHERIT, +#define _SC_THREAD_ROBUST_PRIO_INHERIT _SC_THREAD_ROBUST_PRIO_INHERIT + _SC_THREAD_ROBUST_PRIO_PROTECT +#define _SC_THREAD_ROBUST_PRIO_PROTECT _SC_THREAD_ROBUST_PRIO_PROTECT + +}; +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/bits/posix1_lim.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/bits/posix1_lim.h new file mode 100755 index 0000000000000..0739958c5a6c4 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/bits/posix1_lim.h @@ -0,0 +1,34 @@ +#ifndef POSIX1_LIM_H +#define POSIX1_LIM_H +/** + @file posix1_lim.h + @brief POSIX Minimum values + +EXTERNAL FUNCTIONS + None + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None + +TODO + This header should be ideally relocated under api/posix/bits (something that + doesnt exist today) and be included from api/posix/bits/limits.h which inturn + should be 
included from toolchain's limits.h + +Copyright (c) 2018, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +==============================================================================*/ + +#ifndef _POSIX_PATH_MAX +/** @brief Maximum number of bytes in a pathname, including the terminating + nul character */ +#define _POSIX_PATH_MAX 256 +#endif + +#ifndef _POSIX_SEM_NSEMS_MAX +/** @brief Maximum number of semaphores that a process may have */ +#define _POSIX_SEM_NSEMS_MAX 16 +#endif +#endif + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/common/time.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/common/time.h new file mode 100755 index 0000000000000..76b0d39ab7039 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/common/time.h @@ -0,0 +1 @@ +#include \ No newline at end of file diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/fcntl.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/fcntl.h new file mode 100755 index 0000000000000..c80ec98a449b6 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/fcntl.h @@ -0,0 +1,51 @@ +#ifndef _FCNTL_H +#define _FCNTL_H + +/*========================================================================== + * FILE: fcntl.h + * + * SERVICES: POSIX fcntl.h + * + * DESCRIPTION: The header is needed by the open() and fcntl() + * system calls, which have a variety of parameters and + * flags. They are described here. + * + * The formats of the calls to each of these are: + * + * open(path, oflag [,mode]) open a file + * fcntl(fd, cmd [,arg]) get or set file attributes + * + * Copyright (c) 2013,2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential. + + *==========================================================================*/ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* Oflag values for open(). POSIX Table 6-4. */ +#define POSIX_O_CREAT 0x100 /* creat file if it doesn't exist */ +#define POSIX_O_EXCL 0x200 /* exclusive use flag */ +#define POSIX_O_NOCTTY 0x400 /* do not assign a controlling terminal */ +#define POSIX_O_TRUNC 0x1000 /* truncate flag */ + +/* File status flags for open() and fcntl(). POSIX Table 6-5. */ +#define POSIX_O_APPEND 0x2000 /* set append mode */ +#define POSIX_O_NONBLOCK 0x4000 /* no delay */ + +/* File access modes for open() and fcntl(). POSIX Table 6-6. */ +#define POSIX_O_RDONLY 0 /* open(name, POSIX_O_RDONLY) opens read only */ +#define POSIX_O_WRONLY 1 /* open(name, POSIX_O_WRONLY) opens write only */ +#define POSIX_O_RDWR 2 /* open(name, POSIX_O_RDWR) opens read/write */ + +/* Mask for use with file access modes. POSIX Table 6-7. 
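+   For example, the access mode bits of an oflag value passed to open()
+   are extracted by masking:
+
+       if ((oflag & POSIX_O_ACCMODE) == POSIX_O_RDWR) ...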
*/ +#define POSIX_O_ACCMODE 0x3 /* mask for file access modes */ + +#ifdef __cplusplus +} +#endif + +#endif /* _FCNTL_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/hooks/unistd.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/hooks/unistd.h new file mode 100755 index 0000000000000..1c618bfe36b4f --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/hooks/unistd.h @@ -0,0 +1,115 @@ +#ifndef UNISTD_H +#define UNISTD_H +/** + @file posix/hooks/unistd.h + @brief POSIX related declarations in that are missing in toolchain + header + +EXTERNAL FUNCTIONS + None + +INITIALIZATION AND SEQUENCING REQUIREMENTS + DONT include this header directly! Instead include unistd.h. + +Copyright (c) 2018, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +==============================================================================*/ +#include /* For various POSIX ID types from toolchain headers */ + +#ifdef __cplusplus +extern "C" { +#endif +extern long pathconf (char const * path, int name); + +/* Process*/ + +/** The getppid() function shall return the parent process ID of the calling process. + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] the parent process ID + */ +pid_t getppid(void); + +/** The getpgid() function shall return the process group ID of the process whose process ID is equal to pid + * Please refer to POSIX standard for details. + * @param thread [in] process ID + * @param value_ptr [out] process group ID + */ +pid_t getpgid(pid_t pid); + +/** The getpgrp() function shall return the process group ID of the calling process + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] process group ID of the calling process + */ +pid_t getpgrp(void); + +/**The getuid() function shall return the real user ID of the calling process. + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] the real user ID of the calling process. + */ +uid_t getuid(void); + +/** The geteuid() function shall return the effective user ID of the calling process + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] effective user ID of the calling process + */ +uid_t geteuid(void); + +/** The getegid() function shall return the effective group ID of the calling process. + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] effective group ID of the calling process. + */ +gid_t getegid(void); + +/** The getgid() function shall return the real group ID of the calling process + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] real group ID of the calling process. + */ + gid_t getgid(void); + +/** seteuid set effective user ID + * Please refer to POSIX standard for details. + * @param thread [in] effective user ID + * @param value_ptr [out] Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error. + */ +int seteuid(uid_t uid); + +/** setpgrp - set the process group ID + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error. 
+ */
+pid_t setpgrp(void);
+
+/** setuid - set user ID
+ * Please refer to POSIX standard for details.
+ * @param thread [in] user ID
+ * @param value_ptr [out] Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error.
+ */
+int setuid(uid_t uid);
+
+/** setpgid - set process group ID for job control
+ * Please refer to POSIX standard for details.
+ * @param thread [in] PID of process, PGID to be set
+ * @param value_ptr [out] Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error.
+ */
+int setpgid(pid_t pid, pid_t pgid);
+
+/** setsid - create session and set process group ID
+ * Please refer to POSIX standard for details.
+ * @param thread [in] none
+ * @param value_ptr [out] Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error.
+ */
+pid_t setsid(void);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/mqueue.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/mqueue.h
new file mode 100755
index 0000000000000..74dcc2fa202c6
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/mqueue.h
@@ -0,0 +1,203 @@
+#ifndef _POSIX_MQUEUE_H_
+#define _POSIX_MQUEUE_H_
+
+/*==========================================================================
+ * FILE: mqueue.h
+ *
+ * SERVICES: POSIX Message Queue API interface
+ *
+ * DESCRIPTION: POSIX Message Queue API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013, 2016, 2023 Qualcomm Technologies, Inc.
+ * All Rights Reserved.
+ * Confidential and Proprietary - Qualcomm Technologies, Inc.
+ *==========================================================================*/
+
+#include <sys/types.h> /* ssize_t */
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MQ_PRIO_MAX 255 /* max priority */
+#define MQ_PRIO_DEFAULT 0 /* default priority */
+
+typedef int mqd_t;
+
+struct mq_attr
+{
+ long mq_flags; /* message queue flags */
+ long mq_maxmsg; /* maximum number of messages */
+ long mq_msgsize; /* maximum message size */
+ long mq_curmsgs; /* number of messages currently queued */
+};
+
+typedef struct mq_attr mqueue_attr;
+
+/** \details
+ * This provides POSIX Message Queue API.
+ *
+ * mq_notify is not supported.
+ *
+ * Since this implementation of POSIX kernel API is a subset of PSE51,
+ * it only supports message sending and receiving within one process.
+ * Message sending and receiving among processes are not supported.
+ */
+
+/** \defgroup mqueue POSIX Message Queue API */
+/** \ingroup mqueue */
+/** @{ */
+
+/** Open a message queue.
+ * Please refer to POSIX standard for details.
+ */
+mqd_t mq_open(const char *name, int oflag, /* mode_t mode, struct mq_attr *attr */...);
+
+/** Close a message queue.
+ * Please refer to POSIX standard for details.
+ */
+int mq_close(mqd_t mq_desc);
+
+/** Remove a message queue.
+ * Please refer to POSIX standard for details.
+ */
+int mq_unlink(const char *name);
+
+/** Send a message to a message queue.
+ * Please refer to POSIX standard for details.
+ *
+ * If the queue is full, instead of blocking the sender, this function
+ * will return -1 with errno EAGAIN, in this implementation. This behavior
+ * may change in the future.
+ */
+int mq_send(mqd_t mqdes, const char *msg_ptr, size_t msg_len, unsigned int msg_prio);
+
+/** Send a message to a message queue with timeout.
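+ *
+ * Example of the non-blocking mq_send() behavior described above
+ * (editor's sketch, not vendor text; mqd, buf, and len are illustrative):
+ * @code
+ *   if (mq_send(mqd, buf, len, MQ_PRIO_DEFAULT) == -1 && errno == EAGAIN) {
+ *       // queue full: this implementation returns instead of blocking
+ *   }
+ * @endcode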
+ * Please refer to POSIX standard for details.
+ * @param abs_timeout [in] Only abs_timeout={0,0} is supported in this
+ * implementation. This behavior may change in the future.
+ */
+int mq_timedsend(mqd_t mqdes, const char *msg_ptr, size_t msg_len, unsigned int msg_prio, const struct timespec *abs_timeout);
+
+/** Receive a message from a message queue.
+ * Please refer to POSIX standard for details.
+ */
+ssize_t mq_receive(mqd_t mqdes, char *msg_ptr, size_t msg_len, unsigned int *msg_prio);
+
+/** Receive a message from a message queue with timeout.
+ * Please refer to POSIX standard for details.
+ * @param abs_timeout [in] Only abs_timeout={0,0} is supported in this
+ * implementation. This behavior may change in the future.
+ */
+ssize_t mq_timedreceive(mqd_t mqdes, char *restrict msg_ptr, size_t msg_len, unsigned int *restrict msg_prio, const struct timespec *restrict abs_timeout);
+
+/** Get message queue attributes.
+ * Please refer to POSIX standard for details.
+ */
+int mq_getattr(mqd_t mqdes, struct mq_attr *mqstat);
+
+/** Set message queue attributes.
+ * Please refer to POSIX standard for details.
+ */
+int mq_setattr(mqd_t mqdes, const struct mq_attr *restrict mqstat, struct mq_attr *restrict omqstat);
+
+/** @} */
+
+#define NBBY 8U /* number of bits in a byte */
+
+/*
+ * Select uses bit masks of file descriptors in longs. These macros
+ * manipulate such bit fields (the filesystem macros use chars).
+ * FD_SETSIZE may be defined by the user, but the default here should
+ * be enough for most uses.
+ */
+#ifndef FD_SETSIZE
+#define FD_SETSIZE 256U
+#endif
+
+typedef unsigned long fd_mask;
+#define NFDBITS (sizeof(fd_mask) * (unsigned int)NBBY) /* bits per mask */
+
+#ifndef howmany
+#define howmany(x, y) (((x) + ((y) - 1U)) / (y))
+#endif
+
+// equivalent of fd_set for WinNT env
+typedef struct fd_set
+{
+ fd_mask fds_bits[howmany(FD_SETSIZE, NFDBITS)];
+} fd_set;
+
+/** \addtogroup mqueue */
+/** @{ */
+
+/** Sets the bit for the file descriptor fd in the file descriptor set fdset.
+ */
+#define FD_SET(n, p) ((p)->fds_bits[((unsigned int) (n)) / NFDBITS] |= (1UL << (((unsigned int) (n)) % NFDBITS)))
+
+/** Clears the bit for the file descriptor fd in the file descriptor set fdset.
+ */
+#define FD_CLR(n, p) ((p)->fds_bits[((unsigned int) (n)) / NFDBITS] &= ~(1UL << (((unsigned int) (n)) % NFDBITS)))
+
+/** Returns a non-zero value if the bit for the file descriptor fd is set in the file descriptor set pointed to by fdset, and 0 otherwise.
+ */
+#define FD_ISSET(n, p) ((unsigned long)(p)->fds_bits[((unsigned int) (n)) / NFDBITS] & (unsigned long)((unsigned)1U << (((unsigned int) (n)) % NFDBITS)))
+
+/** Copies the file descriptor set.
+ */
+#define FD_COPY(f, t) (void)(memcpy)((t), (f), sizeof(*(f)))
+
+/** Initializes the file descriptor set fdset to have zero bits for all file descriptors.
+ */
+#define FD_ZERO(p) (void)memset((p), 0, sizeof(*(p)))
+
+/** Error check the file descriptor set.
+ */
+#define FD_BAD(fd) ((fd) < 0 /*|| fd >= fd_arraylen || fd_array[fd].obj == 0*/)
+
+/*! Wait for both message queues and signals. In this implementation, only
+ * message queue file descriptors are supported.
+ * @param nfds [in] This is an integer one more than the maximum of any file
+ *   descriptor in any of the sets. In other words, while you are busy
+ *   adding file descriptors to your sets, you must calculate the maximum
+ *   integer value of all of them, then increment this value by one, and
+ *   then pass this as nfds to select().
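+ *
+ * Example of the nfds rule above (editor's sketch, not vendor text):
+ * with descriptors 3 and 7 in the read set, nfds must be 8.
+ * @code
+ *   fd_set rfds;
+ *   FD_ZERO(&rfds);
+ *   FD_SET(3, &rfds);
+ *   FD_SET(7, &rfds);
+ *   // nfds = max fd + 1 = 8; only a {0,0} timeout is supported here
+ * @endcode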
+ * @param readfds [in] the file descriptor set on all message queues. + * @param writefds [in] ignored in this implementation. + * @param errorfds [in] ignored in this implementation. + * @param timeout [in] Only timeout={0,0} is supported in this + * implementation. This behavior may change in the future. + */ +int pselect(int nfds, fd_set *restrict readfds, + fd_set *restrict writefds, fd_set *restrict errorfds, + const struct timespec *restrict timeout, + const sigset_t *restrict sigmask); + +/*! Wait for multiple message queues. In this implementation, only + * message queue file descriptors are supported. + * @param nfds [in] This is an integer one more than the maximum of any file + * descriptor in any of the sets. In other words, while you are busy + * adding file descriptors to your sets, you must calculate the maximum + * integer value of all of them, then increment this value by one, and + * then pass this as nfds to select(). + * @param readfds [in] the file descriptor set on all message queues. + * @param writefds [in] ignored in this implementation. + * @param errorfds [in] ignored in this implementation. + * @param timeout [in] Only timeout={0,0} is supported in this + * implementation. This behavior may change in the future. + */ +int select(int nfds, fd_set *restrict readfds, + fd_set *restrict writefds, fd_set *restrict errorfds, + struct timeval *restrict timeout); + +/** @} */ + +/* this function is needed for test framework which needs to clean up memory when teardown */ +void _mq_teardown(void); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/pthread.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/pthread.h new file mode 100755 index 0000000000000..f64242e8dc683 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/pthread.h @@ -0,0 +1,287 @@ +#ifndef QURT_PTHREAD_H +#define QURT_PTHREAD_H + +/*========================================================================== + * FILE: pthread.h + * + * SERVICES: POSIX pthread API interface + * + * DESCRIPTION: POSIX pthread API interface based upon POSIX 1003.1-2004 + * + * Copyright (c) 2013,2016,2023 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential. + *========================================================================== + * + * EDIT HISTORY FOR MODULE + * + * This section contains comments describing changes made to the module. + * Notice that changes are listed in reverse chronological order. + * + * + * + * when who what, where, why + * -------- --- ------------------------------------------------------- + * 10/13/08 cz Initial version. + *==========================================================================*/ + +#include +#include "sys/sched.h" /* For struct sched_param */ +#include "sys/errno.h" /* error values */ +#include +#include +#include +#include +#include +#include "pthread_types.h" +#ifdef __cplusplus +extern "C" { +#endif + +/* the range of the set supported by the kernel data type used to represent CPU sets. */ +#define CONFIG_NR_CPUS QURT_THREAD_CFG_BITMASK_ALL + +#define UNIMPLEMENTED(FUNC, RETURNTYPE, ARGS) static inline RETURNTYPE FUNC ARGS { qurt_printf("Unimplemented: %s... 
exiting\n", __FUNCTION__); exit(1); } + +/** @brief Magic (non-portable) value for a stack's address to enable usage + of auto-stack feature (if available) */ +#define PTHREAD_AUTO_STACK_MAGIC_ADDR_NP ((void *)0xFFF) + +/** \details + * This provides POSIX thread API. + * + */ + +/** \defgroup pthread POSIX pthread API */ +/** \ingroup pthread */ +/** @{ */ + +/** Compare Two Threads. + * Please refer to POSIX standard for details. + */ +static inline int pthread_equal(pthread_t t1, pthread_t t2) +{ + return (t1 == t2) ? 1 : 0; +} + +/** Create Thread. + * Please refer to POSIX standard for details. + */ +int pthread_create(pthread_t * tid, const pthread_attr_t * attr, void *(*start)(void *), void *arg); + +/** Terminate Calling Thread. + * Please refer to POSIX standard for details. + */ +void pthread_exit(void *value_ptr); + +/** Wait for thread termination. + * Please refer to POSIX standard for details. + * @param thread [in] the thread to be joined + * @param value_ptr [out] the pointer of the exit status + */ +int pthread_join(pthread_t thread, void **value_ptr); + +/** Detach a joinable thread. + * Please refer to POSIX standard for details. + * @param id [in] id of the tread the thread to be detached. + */ +int pthread_detach(pthread_t id); + +/** Dynamic package initialisation + * Please refer to POSIX standard for details. + */ +int pthread_once(pthread_once_t *once_control, void (*init_routine)(void)); + +pthread_t pthread_self(void); +int pthread_cancel(pthread_t thread); +static inline void pthread_yield(void) +{ + return; +} + +int pthread_kill(pthread_t thread, int sig); + +/** + * @brief Return name of thread + * @warning Donot call this in the error handling path as it may cause deadlock + * due to underlying OS calls + * @param thread [in] thread Thread whose name is to be retrieved + * @param name [out] name Buffer used to return thread name + * @param len [in] len Number of bytes available in name + * @return 0 on success, ESRCH, ERANGE on failure + */ +extern int pthread_getname_np (pthread_t thread, char * name, size_t len); + +int pthread_getschedparam(pthread_t thread, int *restrict policy, struct sched_param *restrict param); +int pthread_setschedparam(pthread_t thread, int policy, const struct sched_param *param); +int pthread_setschedprio(pthread_t thread, int prio); +int pthread_setcancelstate(int state, int *oldstate); +int pthread_setcanceltype(int type, int *oldtype); + +/* Attribute functions */ +int pthread_attr_init(pthread_attr_t *attr); +int pthread_attr_destroy(pthread_attr_t *attr); +int pthread_attr_setschedparam(pthread_attr_t *restrict attr, const sched_param *restrict param); +int pthread_attr_getschedparam(const pthread_attr_t *restrict attr, sched_param *restrict param); +int pthread_attr_setstacksize(pthread_attr_t *attr, size_t stacksize); +int pthread_attr_getstacksize(const pthread_attr_t *attr, size_t *stacksize); +int pthread_attr_setstackaddr(pthread_attr_t *attr, void * stackaddr); +int pthread_attr_getstackaddr(const pthread_attr_t *attr, void ** stackaddr); +int pthread_attr_getdetachstate(const pthread_attr_t *attr, int *detachstate); +int pthread_attr_setdetachstate(pthread_attr_t *attr, int detachstate); +int pthread_attr_setstack(pthread_attr_t *attr, void *stackaddr, size_t stacksize); +int pthread_attr_getstack(const pthread_attr_t *attr, void **stackaddr, size_t *stacksize); +int pthread_attr_setscope(pthread_attr_t *attr, int scope); +int pthread_attr_getscope(const pthread_attr_t *attr, int *scope); +int 
pthread_attr_setinheritsched(pthread_attr_t *attr, int inheritsched); +int pthread_attr_getinheritsched(const pthread_attr_t *attr, int *inheritsched); +int pthread_attr_getguardsize(const pthread_attr_t * attr, size_t * guardsize); +int pthread_attr_setautostack(pthread_attr_t *attr); +int pthread_attr_setbuspriority(pthread_attr_t *attr, unsigned short bus_priority); + +/* Qualcomm additions to pthread get/set attribute functions */ +int pthread_attr_setthreadname(pthread_attr_t *attr, const char * name); +int pthread_attr_getthreadname(const pthread_attr_t *attr, char * name, int size); +int pthread_attr_settimetestid(pthread_attr_t *attr, unsigned int tid); +int pthread_attr_gettimetestid(const pthread_attr_t *attr, unsigned int* tid); + +/* Mutexes */ +int pthread_mutex_init(pthread_mutex_t *mutex, pthread_mutexattr_t *attr); +int pthread_mutex_lock(pthread_mutex_t *mutex); +int pthread_mutex_unlock(pthread_mutex_t *mutex); +int pthread_mutex_trylock(pthread_mutex_t *mutex); +int pthread_mutex_destroy(pthread_mutex_t *mutex); +int pthread_mutex_getprioceiling(const pthread_mutex_t *restrict mutex, int *restrict prioceiling); +int pthread_mutex_setprioceiling(pthread_mutex_t *restrict mutex, int prioceiling, int *restrict old_ceiling); + +/* For Mutex with type PTHREAD_MUTEX_NORMAL, Priority Inheritance is not + * supported even PTHREAD_PRIO_INHERIT is defined since QURT does not support + * this kind of Mutex */ +int pthread_mutexattr_init(pthread_mutexattr_t *attr); +int pthread_mutexattr_destroy(pthread_mutexattr_t *attr); +int pthread_mutexattr_gettype(const pthread_mutexattr_t *restrict, int *restrict); +int pthread_mutexattr_settype(pthread_mutexattr_t *attr, int type); +int pthread_mutexattr_getprotocol(const pthread_mutexattr_t *restrict, int *restrict); +int pthread_mutexattr_setprotocol(pthread_mutexattr_t *attr, int protocol); +int pthread_mutexattr_getpshared(const pthread_mutexattr_t *restrict, int *restrict); +int pthread_mutexattr_setpshared(pthread_mutexattr_t *, int); +int pthread_mutexattr_getprioceiling(const pthread_mutexattr_t *restrict attr, int *restrict prioceiling); +int pthread_mutexattr_setprioceiling(pthread_mutexattr_t *attr, int prioceiling); + +/* Spinlocks */ +int pthread_spin_init(pthread_spinlock_t *lock, int pshared); +int pthread_spin_destroy(pthread_spinlock_t *lock); +int pthread_spin_lock(pthread_spinlock_t *lock); +int pthread_spin_trylock(pthread_spinlock_t *lock); +int pthread_spin_unlock(pthread_spinlock_t *lock); + +/* Condition variables */ +int pthread_condattr_init(pthread_condattr_t *attr); +int pthread_condattr_destroy(pthread_condattr_t *attr); +int pthread_condattr_setpshared(pthread_condattr_t *attr, int pshared); +int pthread_condattr_getpshared(const pthread_condattr_t *restrict attr, int *restrict pshared); +int pthread_condattr_setclock(pthread_condattr_t *attr, clockid_t clock); +int pthread_condattr_getclock(const pthread_condattr_t *restrict attr, clockid_t *restrict clock); +int pthread_cond_init(pthread_cond_t *cond, pthread_condattr_t *attr); +int pthread_cond_destroy(pthread_cond_t *cond); +int pthread_cond_signal(pthread_cond_t *cond); +int pthread_cond_broadcast(pthread_cond_t *cond); +int pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex); +int pthread_cond_timedwait(pthread_cond_t * cond, pthread_mutex_t * mutex, const struct timespec *time); + +/* Barriers */ +int pthread_barrier_init(pthread_barrier_t *restrict barrier, const pthread_barrierattr_t *restrict attr, unsigned count); +int 
pthread_barrier_destroy(pthread_barrier_t *barrier);
+int pthread_barrier_wait(pthread_barrier_t *barrier);
+int pthread_barrierattr_init(pthread_barrierattr_t *attr);
+int pthread_barrierattr_destroy(pthread_barrierattr_t *attr);
+int pthread_barrierattr_getpshared(const pthread_barrierattr_t *restrict attr, int *restrict pshared);
+
+
+/*Read-Write locks*/
+int pthread_rwlock_init(pthread_rwlock_t *, const pthread_rwlockattr_t *);
+int pthread_rwlock_destroy(pthread_rwlock_t *);
+int pthread_rwlockattr_init(pthread_rwlockattr_t *);
+int pthread_rwlockattr_destroy(pthread_rwlockattr_t *);
+int pthread_rwlockattr_getpshared(const pthread_rwlockattr_t *, int *);
+int pthread_rwlockattr_setpshared(pthread_rwlockattr_t *, int);
+int pthread_rwlock_rdlock(pthread_rwlock_t *);
+int pthread_rwlock_tryrdlock(pthread_rwlock_t *);
+int pthread_rwlock_wrlock(pthread_rwlock_t *);
+int pthread_rwlock_trywrlock(pthread_rwlock_t *);
+int pthread_rwlock_unlock(pthread_rwlock_t *);
+
+
+/** please refer to POSIX standard document
+ */
+int pthread_barrierattr_setpshared(pthread_barrierattr_t *attr, int pshared);
+
+/** set CPU affinity attribute in thread attributes object.
+
+ * @param attr [in] pthread attributes
+ * @param cpusetsize [in] The argument cpusetsize is the length (in bytes)
+                     of the buffer pointed to by cpuset. Typically,
+                     this argument would be specified as
+                     sizeof(cpu_set_t).
+ * @param cpuset [in] This data set is a bitset where each bit represents
+                  a CPU (hw thread). How the system's CPUs are mapped
+                  to bits in the bitset is system dependent.
+                  For QURT kernel, bit 0 corresponds to hw thread 0,
+                  and so on. If the corresponding bit is set to 1, then
+                  the software thread is eligible to run on that hw
+                  thread. 0x3f means it can run on any hw thread; 0x0
+                  also means it can run on any hw thread.
+   @return On success, this function returns 0; on error, it returns a
+           non-zero error number.
+           EINVAL - cpuset specified a CPU that was outside the set supported
+                    by the kernel. (The kernel configuration option
+                    CONFIG_NR_CPUS defines the range of the set supported by
+                    the kernel data type used to represent CPU sets.)
+ * @note This function is a non-standard GNU extension; hence the suffix "_np"
+   (non-portable) in the name.
+ */
+int pthread_attr_setaffinity_np(pthread_attr_t *attr, size_t cpusetsize, const cpu_set_t *cpuset);
+
+/** get CPU affinity attribute in thread attributes object.
+ * @param attr [in] pthread attributes
+ * @param cpusetsize [in] The argument cpusetsize is the length (in bytes)
+                     of the buffer pointed to by cpuset. Typically,
+                     this argument would be specified as
+                     sizeof(cpu_set_t).
+ * @param cpuset [out] This data set is a bitset where each bit represents
+                  a CPU (hw thread). How the system's CPUs are mapped
+                  to bits in the bitset is system dependent.
+                  For QURT kernel, bit 0 corresponds to hw thread 0,
+                  and so on. If the corresponding bit is set to 1, then
+                  the software thread is eligible to run on that hw
+                  thread. 0x3f means it can run on any hw thread; 0x0
+                  also means it can run on any hw thread.
+   @return On success, this function returns 0; on error, it returns a
+           non-zero error number.
+           EINVAL - cpusetsize is smaller than the size of the affinity mask
+                    used by the kernel.
+ * @note This function is a non-standard GNU extension; hence the suffix "_np"
+   (non-portable) in the name.
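+ *
+ * Usage sketch for both affinity calls (editor's addition; the
+ * two-hw-thread mask is purely illustrative):
+ * @code
+ *   pthread_attr_t attr;
+ *   cpu_set_t mask = 0x3;          // eligible on hw threads 0 and 1
+ *   pthread_attr_init(&attr);
+ *   pthread_attr_setaffinity_np(&attr, sizeof(mask), &mask);
+ *   pthread_attr_getaffinity_np(&attr, sizeof(mask), &mask);
+ * @endcode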
+ */
+int pthread_attr_getaffinity_np(pthread_attr_t *attr, size_t cpusetsize, cpu_set_t *cpuset);
+
+/* TLS */
+int pthread_key_create(pthread_key_t *key, void (*destructor)(void*));
+int pthread_key_delete(pthread_key_t key);
+int pthread_setspecific(pthread_key_t key, const void *value);
+void *pthread_getspecific(pthread_key_t key);
+int pthread_getattr_np(pthread_t thread, pthread_attr_t * restrict attr);
+
+/** @} */
+
+/* Non-pthread code calls this function to create a pthread TCB w/o creating an actual thread */
+int pthread_fake(pthread_t * restrict thread, const pthread_attr_t * restrict attr);
+int pthread_fake_destroy(pthread_t thread);
+
+//amitkulk: move these to unistd.h after we move that header within qurt
+int posix_memalign(void **memptr, size_t alignment, size_t size);
+void exit(int status);
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* QURT_PTHREAD_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/pthread_types.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/pthread_types.h
new file mode 100755
index 0000000000000..51c3b9dbca243
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/pthread_types.h
@@ -0,0 +1,193 @@
+#ifndef _PTHREAD_TYPES_H_
+#define _PTHREAD_TYPES_H_
+
+/*==========================================================================
+ * FILE: pthread_types.h
+ *
+ * SERVICES: types used in POSIX API interface
+ *
+ * DESCRIPTION: POSIX API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2016, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+
+ *==========================================================================*/
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __GNUC__
+#define restrict __restrict__
+#else
+#define restrict
+#endif
+
+#define _SSIZE_T
+
+#ifndef TRUE
+#define TRUE 1
+#endif
+
+#ifndef FALSE
+#define FALSE 0
+#endif
+
+#define PTHREAD_MAX_THREADS 512U
+
+#define PTHREAD_NAME_LEN 16
+#define PTHREAD_MIN_STACKSIZE 512 //4096
+#define PTHREAD_MAX_STACKSIZE 1048576
+#define PTHREAD_DEFAULT_STACKSIZE 16384
+
+#define PTHREAD_STACK_MIN (4096U*2U)
+#define PTHREAD_MIN_PRIORITY 0U
+#define PTHREAD_MAX_PRIORITY 255U
+#define PTHREAD_DEFAULT_PRIORITY 1
+
+/* Mutex initialization status */
+#define PTHREAD_MUTEX_ATTR_UNINITIALIZED 0
+#define PTHREAD_MUTEX_ATTR_INITIALIZED 1
+
+/* Conditional attributes initialization status */
+#define PTHREAD_COND_ATTR_UNINITIALIZED 0
+#define PTHREAD_COND_ATTR_INITIALIZED 1
+
+#define PTHREAD_DEFAULT_NAME "Anonymous"
+
+#define PTHREAD_MUTEX_INITIALIZER ((pthread_mutex_t) 0xFFFFFFFFU)
+
+#define PTHREAD_COND_INITIALIZER ((pthread_cond_t) 0xFFFFFFFFU)
+
+/* mutex and cond_var shared */
+#define PTHREAD_PROCESS_PRIVATE 0
+#define PTHREAD_PROCESS_SHARED 1
+
+/* mutex type */
+#define PTHREAD_MUTEX_ERRORCHECK 0
+#define PTHREAD_MUTEX_NORMAL 1
+#define PTHREAD_MUTEX_RECURSIVE 2
+#define PTHREAD_MUTEX_DEFAULT 3
+
+/* mutex protocol */
+#define PTHREAD_PRIO_NONE 0
+#define PTHREAD_PRIO_INHERIT 1
+#define PTHREAD_PRIO_PROTECT 2
+
+#define PTHREAD_SPINLOCK_UNLOCKED 0
+#define PTHREAD_SPINLOCK_LOCKED 1
+
+#define PTHREAD_ONCE_INIT (0)
+
+#define PTHREAD_MUTEX_OPAQUE //ToDo: amitkulk: debug
+
+typedef signed int ssize_t;
+
+/* detachstate of a pthread */
+#define PTHREAD_CREATE_JOINABLE 1
+#define PTHREAD_CREATE_DETACHED 0
+
+/* contention scope */
+#define PTHREAD_SCOPE_PROCESS 1
+#define PTHREAD_SCOPE_SYSTEM 0
+
+/* scheduler */
+#define PTHREAD_INHERIT_SCHED 1
+#define PTHREAD_EXPLICIT_SCHED 0
+
+/*
+ * Types and structure definitions
+ *
+ */
+typedef unsigned int cpu_set_t;
+
+typedef unsigned int pthread_t;
+
+typedef struct pthread_attr_t
+{
+    void *stackaddr;
+    int internal_stack; /* this flag==1 means the stack needs to be freed by posix */
+    size_t stacksize;
+    int priority;
+    unsigned short timetest_id;
+    /* This flag indicates whether the thread will be an autostack thread */
+    unsigned short autostack:1;
+    /* This flag indicates the thread's bus_priority high/low
+       bus_priority = 0 -- Bus_priority is low
+       bus_priority = 1 -- Bus_priority is high
+       bus_priority = 3 -- Bus_priority is default (takes the default set for the process)
+    */
+    unsigned short bus_priority:2;
+    unsigned short reserved:13;
+    cpu_set_t cpumask;
+    char name[PTHREAD_NAME_LEN];
+    /* This flag indicates whether pthread lib should create thread contexts for other OSALs */
+    /* This is used internally by POSIX and not available for general usage */
+    int ext_context;
+    int detachstate;
+} pthread_attr_t;
+
+//mutex attr
+typedef struct pthread_mutexattr_t pthread_mutexattr_t;
+struct pthread_mutexattr_t
+{
+    int is_initialized;
+    int type;
+    int pshared;
+    int protocol;
+};
+
+typedef unsigned int pthread_mutex_t;
+
+typedef unsigned int pthread_spinlock_t;
+
+typedef struct pthread_condattr_t
+{
+    int is_initialized;
+    int pshared;
+    clockid_t clock_id;
+} pthread_condattr_t;
+
+typedef unsigned int pthread_cond_t;
+
+typedef struct pthread_barrierattr_t
+{
+    int is_initialized;
+    int pshared;
+} pthread_barrierattr_t;
+
+typedef unsigned int pthread_barrier_t;
+
+typedef int pthread_key_t;
+
+typedef int pthread_once_t;
+
+
+/*Read-Write locks*/
+#define PTW32_RWLOCK_MAGIC 0xfacade2
+#define PTHREAD_RWLOCK_INITIALIZER ((pthread_rwlock_t)(size_t) -1)
+
+struct pthread_rwlockattr_t_
+{
+    int pshared;
+};
+
+struct pthread_rwlock_t_
+{
+    pthread_mutex_t mtxExclusiveAccess;
+    pthread_mutex_t mtxSharedAccessCompleted;
+    pthread_cond_t cndSharedAccessCompleted;
+    int nSharedAccessCount;
+    int nExclusiveAccessCount;
+    int nCompletedSharedAccessCount;
+    int nMagic;
+};
+
+typedef struct pthread_rwlock_t_ * pthread_rwlock_t;
+typedef struct pthread_rwlockattr_t_ * pthread_rwlockattr_t;
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _PTHREAD_TYPES_H_ */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/sched.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/sched.h
new file mode 100755
index 0000000000000..faf3365be9f82
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/sched.h
@@ -0,0 +1,21 @@
+/*=============================================================================
+
+                sched.h
+
+GENERAL DESCRIPTION
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+  Copyright (c) 2013,2016 by Qualcomm Technologies, Inc. All Rights Reserved.
+=============================================================================*/
+#ifndef __SCHED_H__
+#define __SCHED_H__
+
+#include "sys/sched.h"
+
+#endif //__SCHED_H__
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/semaphore.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/semaphore.h
new file mode 100755
index 0000000000000..d9145b295ae62
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/semaphore.h
@@ -0,0 +1,114 @@
+#ifndef SEMAPHORE_H
+#define SEMAPHORE_H
+
+/*==========================================================================
+ * FILE: semaphore.h
+ *
+ * SERVICES: POSIX semaphore API interface
+ *
+ * DESCRIPTION: POSIX semaphore API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013, 2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+
+ *==========================================================================*/
+#include <sys/types.h> // Get all C sys types - includes POSIX specific
+#include "sys/errno.h" // error values
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+                    TYPEDEFS
+=============================================================================*/
+/** User facing semaphore container with opaque pointer to implementation */
+typedef struct
+{
+    unsigned int *opaque;
+} sem_t;
+#define _SEM_T
+
+/*=============================================================================
+                    CONSTANTS AND MACROS
+=============================================================================*/
+/* constant definitions */
+#define SEM_FAILED ((sem_t*) 0)
+
+/* @todo siqbal Should we put such configuration items in a common place
+   instead of this user-facing header? */
+#define SEM_VALUE_MAX ((unsigned int) 30) // If need be increase this
+
+/*=============================================================================
+                    FUNCTIONS
+=============================================================================*/
+
+/** \details
+ * POSIX standard comes with two kinds of semaphores: named and unnamed
+ * semaphores.
+ *
+ * This implementation of POSIX kernel API provides both unnamed and named
+ * semaphores.
+ *
+ *
+ * sem_timedwait() is not provided.
+ */
+
+/** \defgroup semaphore POSIX Semaphore API */
+
+/** \ingroup semaphore */
+/** @{ */
+
+/** Initialize an unnamed semaphore.
+ * Please refer to POSIX standard for details.
+ * @param pshared [in] This implementation does not support non-zero value,
+ * i.e., semaphore cannot be shared between processes in this implementation.
+ */
+int sem_init(sem_t *sem, int pshared, unsigned int value);
+
+/** Lock a semaphore.
+ * Please refer to POSIX standard for details.
+ */
+int sem_wait(sem_t *sem);
+
+/** Try to lock a semaphore without blocking.
+ * Please refer to POSIX standard for details.
+ */
+int sem_trywait(sem_t *sem);
+
+/** Unlock a semaphore.
+ * Please refer to POSIX standard for details.
+ */
+int sem_post(sem_t *sem);
+
+/** Get the value of a semaphore.
+ * Please refer to POSIX standard for details.
+ */
+int sem_getvalue(sem_t *sem, int *value);
+
+/** Destroy an unnamed semaphore.
+ * Please refer to POSIX standard for details.
+ */
+int sem_destroy(sem_t *sem);
+
+/** Creates and initializes a named semaphore.
+ * Please refer to POSIX standard for details.
+ */
+sem_t * sem_open(const char* name , int oflag , ...);
+
+/** Closes a semaphore.
+ * Please refer to POSIX standard for details.
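+ *
+ * Example of the unnamed-semaphore lifecycle described above (editor's
+ * sketch, not vendor text; pshared must be 0 in this implementation):
+ * @code
+ *   sem_t s;
+ *   if (sem_init(&s, 0, 1U) == 0) {
+ *       sem_wait(&s);
+ *       // ...critical section...
+ *       sem_post(&s);
+ *       sem_destroy(&s);
+ *   }
+ * @endcode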
+ */
+int sem_close(sem_t *sem);
+
+/** Unlinks a named semaphore.
+ * Please refer to POSIX standard for details.
+ */
+int sem_unlink(const char *name);
+/** @} */
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* SEMAPHORE_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/signal.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/signal.h
new file mode 100755
index 0000000000000..35cb1f1a9a319
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/signal.h
@@ -0,0 +1,201 @@
+#ifndef _SIGNAL_H_
+#define _SIGNAL_H_
+
+/*==========================================================================
+ * FILE: signal.h
+ *
+ * SERVICES: POSIX Signal API interface
+ *
+ * DESCRIPTION: POSIX Signal API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013, 2016, 2023 Qualcomm Technologies, Inc.
+ * All Rights Reserved.
+ * Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+ *==========================================================================*/
+
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* POSIX signal bits */
+
+#define POSIX_MSG 7 /* POSIX msg type used in Qube API */
+#define POSIX_NOTIF 8 /* POSIX msg type used in Qube API */
+#define SIGKILL 9 /* kill (cannot be caught or ignored) */
+
+#define SIGRTMIN 10
+#define SIGRTMAX 32
+
+/* Notification Types. */
+/* No asynchronous notification is delivered when the event of interest occurs. */
+#define SIGEV_NONE 0
+/* The signal specified in sigev_signo shall be generated for the process when
+   the event of interest occurs. */
+#define SIGEV_SIGNAL 1
+/* A notification function is called to perform notification. */
+#define SIGEV_THREAD 2
+#define SA_SIGINFO 1
+
+/*
+ * Flags for sigprocmask:
+ */
+#define SIG_BLOCK 1 /* block specified signal set */
+#define SIG_UNBLOCK 2 /* unblock specified signal set */
+#define SIG_SETMASK 3 /* set specified signal set */
+
+typedef unsigned long int sigset_t;
+
+union sigval
+{
+    int sival_int; /* Integer signal value. */
+    void *sival_ptr; /* Pointer signal value. */
+};
+
+typedef struct sigevent sigevent;
+struct sigevent
+{
+    int sigev_notify; /* Notification type. */
+    int sigev_signo; /* Signal number. */
+    union sigval sigev_value; /* Signal value. */
+    void (*sigev_notify_function)(union sigval); /* Notification function. */
+    pthread_attr_t *sigev_notify_attributes;
+};
+
+typedef struct siginfo_t siginfo_t;
+struct siginfo_t
+{
+    int si_signo;
+    int si_code;
+    union sigval si_value;
+/*  int si_errno;
+    pid_t si_pid;
+    uid_t si_uid;
+    void *si_addr;
+    int si_status;
+    long si_band;*/
+};
+struct sigaction
+{
+    void (*sa_handler)(int);
+    sigset_t sa_mask;
+    int sa_flags;
+    void (*sa_sigaction)(int, siginfo_t *, void *);
+};
+
+/* Signal functions */
+
+/** \details
+ * This provides POSIX Signal API. Please note that this
+ * implementation does not fully comply with POSIX standard.
+ *
+ * In POSIX standard, Signal can be used as 'interrupt', which means
+ * an incoming signal will interrupt a running thread. After the
+ * registered signal handler is executed, the thread will resume.
+ * This behavior cannot be implemented w/o modifying the L4 or QuRT kernel.
+ * On the other hand, applications need to be carefully written to avoid
+ * problems caused by 'interrupting' signals.
+ * + * Therefore, in this implementation of POSIX signal, thread will + * only receive signals when it explicitly waits for signals, i.e., when + * the thread calls either sigwait() or sigsuspend(). + * + * Therefore, pthread_sigmask(), which set or get signal mask for a thread, + * is not supported, since the signal mask will be set by sigwait() and + * sigsuspend(). + * + * Since this implementation of POSIX kernel API is a subset of PSE51, + * only threads can send and receive signals. The functions related to + * signal operations with processes, such as kill(), sigqueue(), + * sigprocmask(), are not provided. + * + * Queued signal is not supported. + * + * Applications will use signals from SIGRTMIN to SIGRTMAX. + * + * SIGEV_SIGNAL and SIGEV_THREAD are supported. SIGEV_NONE is not + * supported. + * + */ + +/** \defgroup signal POSIX Signal API */ +/** \ingroup signal */ +/** @{ */ + +/** Wait for signals. This implementation does not support queued signals. + * + * Please refer to POSIX standard for details. + */ +int sigwait(const sigset_t *restrict set, int *restrict sig); + +/** Examine and Change Signal Action. + * Please refer to POSIX standard for details. + * + * @param act [in] A pointer to the sigaction structure that describes the + * action to be taken for the signal. Can be NULL. + * The following flags for sa_flags field in struct sigaction are not + * supported: SA_NOCLDSTOP, SA_ONSTACK, SA_RESETHAND, SA_RESTART, + * SA_NOCLDWAIT and SA_NODEFER. Only flag SA_SIGINFO is supported. + * + * @note Define sigaction as macro to avoid a warning when included from + * C++ code - it's causing a "sigaction(...) hides constructor for + * 'struct sigaction'" warning. + */ +/*lint -esym(123,sigaction) Suppress "macro used with no arguments" */ +#define sigaction(sig,act,oact) _sigaction((sig),(act),(oact)) + +/** Wait for signals. + * Please refer to POSIX standard for details. + */ +int sigsuspend(const sigset_t *sigmask); + +/** Add Signal to Signal Set. + * Please refer to POSIX standard for details. + */ +int sigaddset(sigset_t *set, int signo); + +/** Delete Signal from Signal Set. + * Please refer to POSIX standard for details. + */ +int sigdelset(sigset_t *set, int signo); + +/** Initialize and Empty Signal Set. + * Please refer to POSIX standard for details. + */ +int sigemptyset(sigset_t *set); + +/** Initialize and Fill Signal Set. + * Please refer to POSIX standard for details. + */ +int sigfillset(sigset_t *set); + +/** Test for Signal in Signal Set. + * Please refer to POSIX standard for details. + */ +int sigismember(const sigset_t *set, int signo); + +/** @} */ + +/* this is not a public api function */ +int _sigaction(int sig, const struct sigaction *act, struct sigaction *oact); + +/* have to move #include here to solve circular include problems between time.h and signal.h */ +#include + +/** Wait for the time interval specified in the timespec structure referenced + * by timeout. This implementation does not support queued signals. + * For struct siginfo_t, si_code and si_value are ignored in this implementation. + * + * Please refer to POSIX standard for details. 
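+ *
+ * Example of the explicit-wait model described above (editor's sketch,
+ * not vendor text):
+ * @code
+ *   sigset_t set;
+ *   int sig;
+ *   sigemptyset(&set);
+ *   sigaddset(&set, SIGRTMIN);
+ *   if (sigwait(&set, &sig) == 0) {
+ *       // sig now holds the delivered signal number
+ *   }
+ * @endcode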
+ */
+int sigtimedwait(const sigset_t *restrict set, siginfo_t *restrict info,
+                 const struct timespec *restrict timeout);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SIGNAL_H_ */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/sys/errno.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/sys/errno.h
new file mode 100755
index 0000000000000..b9edf57bab6c3
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/sys/errno.h
@@ -0,0 +1,20 @@
+#ifndef _SYS_ERRNO_H_
+#define _SYS_ERRNO_H_
+
+/*==========================================================================
+ * FILE: errno.h
+ *
+ * SERVICES: POSIX errno header file
+ *
+ * DESCRIPTION: POSIX errno based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013, 2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+
+ *==========================================================================*/
+
+#include <errno.h>
+#ifndef EOK
+#define EOK 0
+#endif
+
+#endif /* _SYS_ERRNO_H_ */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/sys/sched.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/sys/sched.h
new file mode 100755
index 0000000000000..2acc34d821725
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/sys/sched.h
@@ -0,0 +1,67 @@
+#ifndef _POSIX_SCHED_H_
+#define _POSIX_SCHED_H_
+
+/*==========================================================================
+ * FILE: sched.h
+ *
+ * SERVICES: POSIX Thread sched API interface
+ *
+ * DESCRIPTION: POSIX Thread sched API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013, 2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+
+
+ *==========================================================================*/
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define SCHED_FIFO 0 /* First in, first out (FIFO) scheduling policy. */
+#define SCHED_RR 1 /* Round robin scheduling policy. */
+#define SCHED_SPORADIC 2 /* Sporadic server scheduling policy. */
+#define SCHED_OTHER 3 /* Another scheduling policy. */
+
+typedef struct sched_param sched_param;
+struct sched_param
+{
+    void *unimplemented;
+    int sched_priority;
+};
+
+/** \details
+ * This provides POSIX sched API.
+ */
+
+/** \defgroup sched POSIX sched API */
+/** \ingroup sched */
+/** @{ */
+
+/** Relinquish the CPU.
+ * Please refer to POSIX standard for details.
+ */
+static inline int sched_yield(void)
+{
+    return 0;
+}
+
+/** Get the maximum priority.
+ * Please refer to POSIX standard for details.
+ * @param policy [in] SCHED_FIFO is the only valid input for this implementation.
+ */
+int sched_get_priority_max(int policy);
+
+/** Get the minimum priority.
+ * Please refer to POSIX standard for details.
+ * @param policy [in] SCHED_FIFO is the only valid input for this implementation.
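+ *
+ * Example of querying the priority range (editor's sketch, not vendor
+ * text):
+ * @code
+ *   int lo = sched_get_priority_min(SCHED_FIFO);
+ *   int hi = sched_get_priority_max(SCHED_FIFO);  // expect hi >= lo
+ * @endcode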
+ */
+int sched_get_priority_min(int policy);
+
+/** @} */
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _POSIX_SCHED_H_ */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/sys/types.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/sys/types.h
new file mode 100755
index 0000000000000..700026f9f9e4e
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/sys/types.h
@@ -0,0 +1,35 @@
+#ifndef _SYS_TYPES_H_
+#define _SYS_TYPES_H_
+
+/*==========================================================================
+ * FILE: types.h
+ *
+ * SERVICES: types used in POSIX API interface
+ *
+ * DESCRIPTION: POSIX API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013, 2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+
+ *==========================================================================*/
+
+#if !defined( _PID_T ) || !defined( __pid_t_defined )
+/* POSIX defines pid_t as signed 32-bit type. Hexagon toolchain's header
+   defines it as unsigned 32-bit type citing conflict with QuRT POSIX
+   compatibility layer. If any such conflicts exist, we should fix them.
+   pid_t is being defined *BEFORE* inclusion of generic/sys/types.h
+   *INTENTIONALLY* to fix this */
+typedef int pid_t;
+#define _PID_T
+#define __pid_t_defined
+#endif
+#include
+#include
+#include
+#include
+
+#ifndef __DEFINED_off_t
+typedef long off_t;
+#define __DEFINED_off_t
+#endif
+
+#endif /* _SYS_TYPES_H_ */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/time.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/time.h
new file mode 100755
index 0000000000000..13aeb1ea9920d
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/time.h
@@ -0,0 +1,142 @@
+#ifndef _POSIX_TIME_H_
+#define _POSIX_TIME_H_
+
+/*==========================================================================
+ * FILE: time.h
+ *
+ * SERVICES: POSIX Timer API interface
+ *
+ * DESCRIPTION: POSIX Timer API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013,2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+ *==========================================================================*/
+
+
+#include
+
+typedef int clockid_t; /* ignored */
+#define _CLOCKID_T
+#define _PROVIDE_POSIX_TIME_DECLS 1
+#include <time.h>
+/* @todo anandj sys/time.h has definition for struct timeval but is not
+   included by generic/time.h */
+#include <sys/time.h>
+
+#define CLOCK_FREQ_NOT_DEFINED -1
+/* Frequency of Sclk used */
+#define TIME_CONV_SCLK_FREQ 19200000
+
+#define RES_CONV_FACTOR1 1
+#define RES_CONV_FACTOR2 1000000000
+
+#if !defined(CLOCK_REALTIME)
+# define CLOCK_REALTIME 0
+#endif
+
+#if !defined(CLOCK_MONOTONIC)
+# define CLOCK_MONOTONIC 1
+#endif
+
+#if !defined(CLOCK_THREAD_CPUTIME_ID)
+# define CLOCK_THREAD_CPUTIME_ID 2
+#endif
+
+#if !defined(CLOCK_PROCESS_CPUTIME_ID)
+# define CLOCK_PROCESS_CPUTIME_ID 3
+#endif
+
+#if !defined(CLOCK_MONOTONIC_RAW)
+# define CLOCK_MONOTONIC_RAW 4
+#endif
+
+#if !defined(CLOCK_REALTIME_COARSE)
+# define CLOCK_REALTIME_COARSE 5
+#endif
+
+#if !defined(CLOCK_MONOTONIC_COARSE)
+# define CLOCK_MONOTONIC_COARSE 6
+#endif
+
+#if !defined(CLOCK_BOOTTIME)
+# define CLOCK_BOOTTIME 7
+#endif
+
+struct itimerspec
+{
+    struct timespec it_interval; /* Timer period. */
+    struct timespec it_value; /* Timer expiration.
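+                                  A zero it_interval arms a one-shot
+                                  timer; a non-zero it_interval re-arms
+                                  the timer periodically (editor's note;
+                                  see the \details section below).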
*/ +}; + +/* have to move #include here to solve circular include problems between time.h and signal.h */ +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* Timer functions */ + +/** \details + * POSIX timers can be either of two types: a one-shot type or a periodic + * type. + * + * A one-shot is an armed timer that is set to an expiration time relative + * to either a current time or an absolute time. The timer expires once and + * is disarmed. + * + * A periodic timer is armed with an initial expiration time and a repetition + * interval. Every time the interval timer + * expires, the timer is reloaded with the repetition interval. The timer + * is then rearmed. + */ + +/** \defgroup timer POSIX Timer API */ + +/** \ingroup timer */ +/** @{ */ + +/** Create a POSIX timer. + * Please refer to POSIX standard for details. + * @param clockid [in] ignored in this implementation + * @param evp [in] if non-NULL, points to a sigevent structure. This + * structure, allocated by the application, defines the asynchronous + * notification to occur when the timer expires. If the evp argument is + * NULL, the effect is as if the evp argument pointed to a sigevent + * structure with the sigev_notify member having the value SIGEV_SIGNAL, + * the sigev_signo having a default signal number (SIGALRM), and the + * sigev_value member having the value of the timer ID. + */ +int timer_create(clockid_t clockid, struct sigevent *restrict evp, + timer_t *restrict timerid); + +/** Delete a POSIX timer. + * Please refer to POSIX standard for details. + */ +int timer_delete(timer_t timerid); + +/** Get the time remaining on a POSIX timer. + * Please refer to POSIX standard for details. + */ +int timer_gettime(timer_t timerid, struct itimerspec *value); + + +/** Set the time remaining on a POSIX timer. + * Please refer to POSIX standard for details. + * @param flags [in] ignored in this implementation + */ +int timer_settime(timer_t timerid, int flags, + const struct itimerspec *restrict value, + struct itimerspec *restrict ovalue); +/** Obtain ID of a process CPU-time clock + * @param pid [in] Process ID + * @param clock_id [out] Clock ID + * @return Error values as per POSIX standard + */ +int clock_getcpuclockid (pid_t pid, clockid_t * clock_id); +/** @} */ + +#ifdef __cplusplus +} +#endif + +#endif /* _POSIX_TIME_H_ */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qube/qube.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qube/qube.h new file mode 100755 index 0000000000000..1e31e2deedb38 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qube/qube.h @@ -0,0 +1,51 @@ +#ifndef QUBE_H +#define QUBE_H +/*============================================================================= + + qube.h -- H E A D E R F I L E + +GENERAL DESCRIPTION + Prototypes of qpd API + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2013 by Qualcomm Technologies, Inc. All Rights Reserved. 
+ +=============================================================================*/ + + + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/* Define Error codes as QuRT error codes preceed with QURT_ */ +#ifndef EOK +#define EOK QURT_EOK +#endif /* EOK */ +#ifndef EVAL +#define EVAL QURT_EVAL +#endif /* EVAL */ +#ifndef EMEM +#define EMEM QURT_EMEM +#endif /* EMEM */ +#ifndef EINVALID +#define EINVALID QURT_EINVALID +#endif /* EINVALID */ + + +/*============================================================================= + FUNCTION DECLARATIONS +=============================================================================*/ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QUBE_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/atomic_ops.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/atomic_ops.h new file mode 100755 index 0000000000000..0a9a9f8ba7db5 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/atomic_ops.h @@ -0,0 +1,197 @@ +#ifndef ATOMIC_OPS_H +#define ATOMIC_OPS_H +/** + @file atomic_ops.h + + @brief Type definitions backwards compatible. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved +Confidential and Proprietary - Qualcomm Technologies, Inc. +=============================================================================*/ + + +/* + * Australian Public Licence B (OZPLB) + * + * Version 1-0 + * + * Copyright (c) 2007, Open Kernel Labs, Inc. + * + * All rights reserved. + * + * Developed by: Embedded, Real-time and Operating Systems Program (ERTOS) + * National ICT Australia + * http://www.ertos.nicta.com.au + * + * Permission is granted by National ICT Australia, free of charge, to + * any person obtaining a copy of this software and any associated + * documentation files (the "Software") to deal with the Software without + * restriction, including (without limitation) the rights to use, copy, + * modify, adapt, merge, publish, distribute, communicate to the public, + * sublicense, and/or sell, lend or rent out copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject + * to the following conditions: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimers. + * + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimers in the documentation and/or other materials provided + * with the distribution. + * + * * Neither the name of National ICT Australia, nor the names of its + * contributors, may be used to endorse or promote products derived + * from this Software without specific prior written permission. + * + * EXCEPT AS EXPRESSLY STATED IN THIS LICENCE AND TO THE FULL EXTENT + * PERMITTED BY APPLICABLE LAW, THE SOFTWARE IS PROVIDED "AS-IS", AND + * NATIONAL ICT AUSTRALIA AND ITS CONTRIBUTORS MAKE NO REPRESENTATIONS, + * WARRANTIES OR CONDITIONS OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO ANY REPRESENTATIONS, WARRANTIES OR CONDITIONS + * REGARDING THE CONTENTS OR ACCURACY OF THE SOFTWARE, OR OF TITLE, + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, + * THE ABSENCE OF LATENT OR OTHER DEFECTS, OR THE PRESENCE OR ABSENCE OF + * ERRORS, WHETHER OR NOT DISCOVERABLE. 
+ * + * TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL + * NATIONAL ICT AUSTRALIA OR ITS CONTRIBUTORS BE LIABLE ON ANY LEGAL + * THEORY (INCLUDING, WITHOUT LIMITATION, IN AN ACTION OF CONTRACT, + * NEGLIGENCE OR OTHERWISE) FOR ANY CLAIM, LOSS, DAMAGES OR OTHER + * LIABILITY, INCLUDING (WITHOUT LIMITATION) LOSS OF PRODUCTION OR + * OPERATION TIME, LOSS, DAMAGE OR CORRUPTION OF DATA OR RECORDS; OR LOSS + * OF ANTICIPATED SAVINGS, OPPORTUNITY, REVENUE, PROFIT OR GOODWILL, OR + * OTHER ECONOMIC LOSS; OR ANY SPECIAL, INCIDENTAL, INDIRECT, + * CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES, ARISING OUT OF OR IN + * CONNECTION WITH THIS LICENCE, THE SOFTWARE OR THE USE OF OR OTHER + * DEALINGS WITH THE SOFTWARE, EVEN IF NATIONAL ICT AUSTRALIA OR ITS + * CONTRIBUTORS HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH CLAIM, LOSS, + * DAMAGES OR OTHER LIABILITY. + * + * If applicable legislation implies representations, warranties, or + * conditions, or imposes obligations or liability on National ICT + * Australia or one of its contributors in respect of the Software that + * cannot be wholly or partly excluded, restricted or modified, the + * liability of National ICT Australia or the contributor is limited, to + * the full extent permitted by the applicable legislation, at its + * option, to: + * a. in the case of goods, any one or more of the following: + * i. the replacement of the goods or the supply of equivalent goods; + * ii. the repair of the goods; + * iii. the payment of the cost of replacing the goods or of acquiring + * equivalent goods; + * iv. the payment of the cost of having the goods repaired; or + * b. in the case of services: + * i. the supplying of the services again; or + * ii. the payment of the cost of having the services supplied again. + * + * The construction, validity and performance of this licence is governed + * by the laws in force in New South Wales, Australia. + */ + +/* + * Author: Malcolm Purvis + * Author: Carlos Dyonisio + */ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef unsigned int atomic_plain_word_t; + +/*-------------------------------------------------------------------------*/ + /* Atomic Ops API. */ + +/* + * IMPORTANT! + * If you plan to change the structure atomic_word_t, please add the new + * elements after value. For more information, read the comment in + * arch/arm/libs/atomic_ops/v5/src/arm_atomic_ops.spp:66 + */ + +typedef struct { + volatile atomic_plain_word_t value; +} atomic_word_t; + +#define ATOMIC_INIT(i) { (i) } + +static inline void +atomic_init(atomic_word_t *a, atomic_plain_word_t v) +{ + a->value = v; +} + +#if defined(ARCH_ARM) && defined(ARCH_VER) && (ARCH_VER < 6) && \ + (!defined(__ATOMIC_OPS_IN_KERNEL__) || defined(MACHINE_SMP)) + +/* + * If it is ARMv4/v5, the function declarations may change + * and are defined in the arch specific header file, + * as some of then cannot be declared static because of + * the assembler implementation. + */ + +#else + +/* Arithmetic operations. */ + +void atomic_sub(atomic_word_t *target, atomic_plain_word_t v); + +/* Architecture independent definitions. 
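+
+   Example use of the 32-bit atomic type (editor's sketch, not vendor
+   text):
+
+     atomic_word_t counter = ATOMIC_INIT(5);
+     atomic_sub(&counter, 1);                        // counter.value == 4
+     atomic_plain_word_t v = atomic_read(&counter);  // defined just below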
*/ + +static inline atomic_plain_word_t atomic_read(atomic_word_t *target) +{ + return target->value; +} + +typedef unsigned long long atomic64_plain_word_t; + +typedef struct { + volatile atomic64_plain_word_t value; +} atomic64_word_t; + +static inline void +atomic64_init(atomic64_word_t *a, atomic64_plain_word_t v) +{ + a->value = v; +} + +/********************* + Support 64-bit + *********************/ + +atomic64_plain_word_t atomic64_set(atomic64_word_t* target, + atomic64_plain_word_t value); + +void atomic64_xor(atomic64_word_t* target, + atomic64_plain_word_t mask); + +/*---------------------------------------------------------------------------*/ + +/* Architecture independent definitions. */ + +static inline atomic64_plain_word_t atomic64_read(atomic64_word_t *target) +{ + return target->value; +} + +#endif + + +/* Architecture dependent definitions. */ +#include + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* ATOMIC_OPS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/atomic_ops_plat.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/atomic_ops_plat.h new file mode 100755 index 0000000000000..b54b3ff83d978 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/atomic_ops_plat.h @@ -0,0 +1,86 @@ +#ifndef ATOMIC_OPS_PLAT_H +#define ATOMIC_OPS_PLAT_H +/** + @file atomic_ops_plat.h + + @brief Prototypes of atomic operations API backwards compatible. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2013, 2023 Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. +=============================================================================*/ + + +#include + +#ifdef __cplusplus +extern "C" { +#endif +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +#define atomic_set(a,b) qurt_atomic_set((unsigned int *)(a),(unsigned int)(b)) +#define atomic_and(a,b) qurt_atomic_and((unsigned int *)(a),(unsigned int)(b)) +#define atomic_and_return(a,b) qurt_atomic_and_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_or(a,b) qurt_atomic_or((unsigned int *)(a),(unsigned int)(b)) +#define atomic_or_return(a,b) qurt_atomic_or_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_xor(a,b) qurt_atomic_xor((unsigned int *)(a),(unsigned int)(b)) +#define atomic_xor_return(a,b) qurt_atomic_xor_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_set_bit(a,b) qurt_atomic_set_bit((unsigned int *)(a),(unsigned int)(b)) +#define atomic_clear_bit(a,b) qurt_atomic_clear_bit((unsigned int *)(a),(unsigned int)(b)) +#define atomic_change_bit(a,b) qurt_atomic_change_bit((unsigned int *)(a),(unsigned int)(b)) +#define atomic_add(a,b) qurt_atomic_add((unsigned int *)(a),(unsigned int)(b)) +#define atomic_add_return(a,b) qurt_atomic_add_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_add_unless(a,b,c) qurt_atomic_add_unless((unsigned int *)(a),(unsigned int)(b),(unsigned int)(c)) +#define atomic_sub(a,b) qurt_atomic_sub((unsigned int *)(a),(unsigned int)(b)) +#define atomic_sub_return(a,b) qurt_atomic_sub_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_inc(a) qurt_atomic_inc((unsigned int *)(a)) +#define atomic_inc_return(a) qurt_atomic_inc_return((unsigned int *)(a)) +#define atomic_dec(a) qurt_atomic_dec((unsigned 
int *)(a)) +#define atomic_dec_return(a) qurt_atomic_dec_return((unsigned int *)(a)) +#define atomic_compare_and_set(a,b,c) qurt_atomic_compare_and_set((unsigned int *)(a),(unsigned int)(b),(unsigned int)(c)) +#define atomic_barrier qurt_atomic_barrier +#define atomic_barrier_write qurt_atomic_barrier_write +#define atomic_barrier_write_smp qurt_atomic_barrier_write_smp +#define atomic_barrier_read_smp qurt_atomic_barrier_read_smp +#define atomic_barrier_smp qurt_atomic_barrier_smp + +/*============================ + * 64 bits support + *============================ */ +#define atomic64_set(a,b) qurt_atomic64_set((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_and(a,b) qurt_atomic64_and((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_and_return(a,b) qurt_atomic64_and_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_or(a,b) qurt_atomic64_or((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_or_return(a,b) qurt_atomic64_or_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_xor(a,b) qurt_atomic64_xor((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_xor_return(a,b) qurt_atomic64_xor_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_set_bit(a,b) qurt_atomic64_set_bit((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_clear_bit(a,b) qurt_atomic64_clear_bit((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_change_bit(a,b) qurt_atomic64_change_bit((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_add(a,b) qurt_atomic64_add((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_add_return(a,b) qurt_atomic64_add_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_sub(a,b) qurt_atomic64_sub((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_sub_return(a,b) qurt_atomic64_sub_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_inc(a) qurt_atomic64_inc((unsigned long long *)(a)) +#define atomic64_inc_return(a) qurt_atomic64_inc_return((unsigned long long *)(a)) +#define atomic64_dec(a) qurt_atomic64_dec((unsigned long long *)(a)) +#define atomic64_dec_return(a) qurt_atomic64_dec_return((unsigned long long *)(a)) +#define atomic64_compare_and_set(a,b,c) qurt_atomic64_compare_and_set((unsigned long long *)(a),(unsigned long long )(b),(unsigned long long )(c)) +#define atomic64_barrier qurt_atomic64_barrier +#define atomic64_barrier_write qurt_atomic64_barrier_write +#define atomic64_barrier_write_smp qurt_atomic64_barrier_write_smp +#define atomic64_barrier_read_smp qurt_atomic64_barrier_read_smp +#define atomic64_barrier_smp qurt_atomic64_barrier_smp + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* ATOMIC_OPS_PLAT_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt.h new file mode 100755 index 0000000000000..4d25c9b2b6243 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt.h @@ -0,0 +1,111 @@ +#ifndef QURT_H +#define QURT_H + +/** + @file qurt.h + @brief Contains kernel header files that provide kernel OS API functions, constants, and + definitions + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013,2021,2023 by Qualcomm Technologies, Inc. All Rights Reserved. 
+ Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ +/*====================================================================== + * + * EDIT HISTORY FOR FILE + * + * This section contains comments describing changes made to the + * module. Notice that changes are listed in reverse chronological + * order. + * + * + * + * + * when who what, where, why + * ---------- --- ------------------------------------------------ + * 2011-02-25 op Add Header file + 2012-12-16 cm (Tech Pubs) Edited/added Doxygen comments and markup. + ======================================================================*/ + + +#ifdef __cplusplus +extern "C" { +#endif + +#include "qurt_consts.h" +#include "qurt_api_version.h" +#include "qurt_alloc.h" +#include "qurt_futex.h" +#include "qurt_mutex.h" +#include "qurt_pipe.h" +#include "qurt_printf.h" +#include "qurt_assert.h" +#include "qurt_thread.h" +#include "qurt_trace.h" +#include "qurt_cycles.h" +#include "qurt_profile.h" +#include "qurt_sem.h" +#include "qurt_cond.h" +#include "qurt_barrier.h" +#include "qurt_fastint.h" +#include "qurt_allsignal.h" +#include "qurt_anysignal.h" +#include "qurt_signal.h" +#include "qurt_rmutex.h" +#include "qurt_pimutex.h" +#include "qurt_signal2.h" +#include "qurt_rmutex2.h" +#include "qurt_pimutex2.h" +#include "qurt_int.h" +#include "qurt_lifo.h" +#include "qurt_power.h" +#include "qurt_event.h" +#include "qurt_pmu.h" +#include "qurt_stid.h" +//#include "qurt_version.h" +#include "qurt_tlb.h" +#include "qurt_vtlb.h" +#include "qurt_memory.h" +#include "qurt_qdi.h" +#include "qurt_sclk.h" +#include "qurt_space.h" +#include "qurt_process.h" +#include "qurt_timer.h" +#include "qurt_tls.h" +#include "qurt_thread_context.h" +#include "qurt_hvx.h" +#include "qurt_hmx.h" +#include "qurt_mailbox.h" +#include "qurt_island.h" +#include "qurt_qdi_proxy.h" +#include "qurt_l2cfg.h" +#include "qurt_mmap.h" +#include "qurt_isr.h" +#include "qurt_busywait.h" +#include "qurt_ecc.h" +#include "qurt_callback.h" +#include "qurt_error.h" +#include "qurt_except.h" +#include "qurt_mq.h" +#include "qurt_user_dma.h" +#include "qurt_fs_hub.h" +#include "qurt_os_services.h" + +#ifndef MAIN_ONLY +#define INCLUDE_ISLAND_CONTENTS +#endif +#ifndef ISLAND_ONLY +#define INCLUDE_MAIN_CONTENTS +#endif + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_alloc.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_alloc.h new file mode 100755 index 0000000000000..da37a4c0a714e --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_alloc.h @@ -0,0 +1,145 @@ +#ifndef QURT_ALLOC_H +#define QURT_ALLOC_H + +/** + @file qurt_alloc.h + @brief Prototypes of kernel memory allocation API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021, 2023 Qualcomm Technologies, Inc. + All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +/*======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup func_qurt_malloc + Dynamically allocates the specified array on the QuRT system heap. + The return value is the address of the allocated memory area. 
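+
+ As an illustration, a minimal allocate/use/free sketch (the 256-byte
+ size is arbitrary; the caller owns the NULL check):
+
+ @code
+ char *buf = (char *)qurt_malloc(256U);
+ if (buf != NULL) {
+     buf[0] = 'x';     // use the area
+     qurt_free(buf);   // return it to the system heap
+ }
+ @endcode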
+ + @note1hang The allocated memory area is automatically initialized to zero. + + @param[in] size Size (in bytes) of the memory area. + + @return + Nonzero -- Pointer to the allocated memory area. \n + 0 -- Not enough memory in heap to allocate memory area. + + @dependencies + None. + + */ +/* ======================================================================*/ +void *qurt_malloc( unsigned int size); + +/*======================================================================*/ +/**@ingroup func_qurt_calloc + Dynamically allocates the specified array on the QuRT system heap. + The return value is the address of the allocated array. + + @note1hang The allocated memory area is automatically initialized to zero. + + @param[in] elsize Size (in bytes) of each array element. + @param[in] num Number of array elements. + + @return + Nonzero -- Pointer to allocated array.\n + Zero -- Not enough memory in heap to allocate array. + + @dependencies + None. + + */ + /* ======================================================================*/ +void *qurt_calloc(unsigned int elsize, unsigned int num); + +/*======================================================================*/ +/**@ingroup func_qurt_realloc + Reallocates memory on the heap. \n + Changes the size of a memory area that is already allocated on the QuRT system heap. + The reallocate memory operation is functionally similar to realloc. It accepts a pointer + to an existing memory area on the heap, and resizes the memory area to the specified size + while preserving the original contents of the memory area. + + @note1hang This function might change the address of the memory area. + If the value of ptr is NULL, this function is equivalent to + qurt_malloc(). + If the value of new_size is 0, it is equivalent to qurt_free(). + If the memory area is expanded, the added memory is not initialized. + + @param[in] *ptr Pointer to the address of the memory area. + @param[in] newsize Size (in bytes) of the reallocated memory area. + + @return + Nonzero -- Pointer to reallocated memory area. \n + 0 -- Not enough memory in heap to reallocate the memory area. + + @dependencies + None. + + */ + /* ======================================================================*/ +void *qurt_realloc(void *ptr, int newsize); + +/*======================================================================*/ +/**@ingroup func_qurt_free + Frees allocated memory from the heap.\n + Deallocates the specified memory from the QuRT system heap. + + @param[in] *ptr Pointer to the address of the memory to deallocate. + + @return + None. + + @dependencies + The memory item that the ptr value specifies must have been previously + allocated using one of the qurt_calloc(), + qurt_malloc(), or qurt_realloc() memory allocation functions. + Otherwise the behavior of QuRT is undefined. + + */ + /* ======================================================================*/ +void qurt_free( void *ptr); + + +void *qurt_memalign(unsigned int alignment, unsigned int size); + +/* +|| Macro to define a static heap for a QuRT program. +|| +|| Usage: +|| Declare at the top-level of any C source file that +|| is part of the build (and is guaranteed +|| to actually be pulled into the build). Place +|| it in the same function with main(): +|| +|| QURT_DECLARE_STATIC_HEAP(512000); +|| +|| The only argument is the size in bytes, and it is +|| rounded up to the nearest 64 bytes (size of an +|| L2 cache block). 
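+||
+|| For example, under the 64-byte rounding described above, a
+|| hypothetical request of 1000 bytes reserves 1024 bytes
+|| (16 L2 cache blocks):
+||
+||     QURT_DECLARE_STATIC_HEAP(1000);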
+|| +*/ + +#define QURT_DECLARE_STATIC_HEAP(sz) \ + static struct qurt_static_heap { \ + char space[(sz)] __attribute__((aligned(64))); \ + } static_heap[1]; \ + void * const override_heap_Base = &static_heap[0]; \ + void * const override_heap_Limit = &static_heap[1] + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ALLOC_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_allsignal.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_allsignal.h new file mode 100755 index 0000000000000..5dc89e495130d --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_allsignal.h @@ -0,0 +1,176 @@ + +#ifndef QURT_ALLSIGNAL_H +#define QURT_ALLSIGNAL_H + +/** + @file qurt_allsignal.h + @brief Prototypes of kernel signal API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup all_signal_types +@{ */ +/*===================================================================== + Typedefs + ======================================================================*/ + +/** +qurt_signal_t supersedes qurt_allsignal_t. This type definition was added for backwards compatibility. */ +typedef union { + /** @cond */ + unsigned long long int raw; + struct { + unsigned int waiting; /**< */ + unsigned int signals_in; /**< */ + unsigned int queue; /**< */ + unsigned int reserved; /**< */ + }X; + /** @endcond */ +} qurt_allsignal_t; +/** @} */ /* end_addtogroup all_signal_types */ + +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_init + Initializes an all-signal object.\n + The all-signal object is initially cleared. + + @datatypes + #qurt_allsignal_t + + @param[out] signal Pointer to the all-signal object to initialize. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_allsignal_init(qurt_allsignal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_destroy + Destroys the specified all-signal object.\n + @note1hang All-signal objects must be destroyed when they are no longer in use. + Failure to do this causes resource leaks in the QuRT kernel. \n + @note1cont All-signal objects must not be destroyed while they are still in use. + If this occurs, the behavior of QuRT is undefined. + + @datatypes + #qurt_allsignal_t + + @param[in] signal Pointer to the all-signal object to destroy. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_allsignal_destroy(qurt_allsignal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_get + Gets signal values from the all-signal object. + + Returns the current signal values of the specified all-signal object. + + @datatypes + #qurt_allsignal_t + + @param[in] signal Pointer to the all-signal object to access. 
+ + @return + Bitmask with current signal values. + + @dependencies + None. +*/ +/* ======================================================================*/ +static inline unsigned int qurt_allsignal_get(qurt_allsignal_t *signal) +{ return signal->X.signals_in; } + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_wait + Waits on the all-signal object.\n + Suspends the current thread until all of the specified signals are set. + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be waited on, and 0 that it is not to be waited on. + + If a signal is set in an all-signal object, and a thread is waiting on the all-signal object for + that signal, the thread is awakened. If the awakened thread has higher priority than + the current thread, a context switch can occur. + + Unlike any-signals, all-signals do not need to explicitly clear any set signals in an all-signal + object before waiting on them again -- clearing is done automatically by the wait + operation. + + @note1hang At most, one thread can wait on an all-signal object at any given time. + Because signal clearing is done by the wait operation, no clear operation is + defined for all-signals. + + @datatypes + #qurt_allsignal_t + + @param[in] signal Pointer to the all-signal object to wait on. + @param[in] mask Signal mask value, which identifies the individual signals in the all-signal object + to wait on. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_allsignal_wait(qurt_allsignal_t *signal, unsigned int mask); + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_set + Set signals in the specified all-signal object. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit + value of 1 indicates that a signal must be set, and 0 indicates not to set the signal. + + @datatypes + #qurt_allsignal_t + + @param[in] signal Pointer to the all-signal object to modify. + @param[in] mask Signal mask value identifying the individual signals to + set in the all-signal object. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_allsignal_set(qurt_allsignal_t *signal, unsigned int mask); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ALLSIGNAL_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_anysignal.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_anysignal.h new file mode 100755 index 0000000000000..9619e2de562b4 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_anysignal.h @@ -0,0 +1,225 @@ +#ifndef QURT_ANYSIGNAL_H +#define QURT_ANYSIGNAL_H +/** + @file qurt_anysignal.h + Prototypes of kernel signal API functions. + + EXTERNALIZED FUNCTIONS + None + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None + +Copyright (c) 2021 Qualcomm Technologies, Inc. +All rights reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. 
+ ======================================================================*/ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*===================================================================== +Typedefs +======================================================================*/ + +/**@ingroup anysignals_types + qurt_signal_t supersedes qurt_anysignal_t. This type definition was added for backwards compatibility. */ +typedef qurt_signal_t qurt_anysignal_t; + +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_anysignal_init + Initializes an any-signal object.\n + The any-signal object is initially cleared. + + @datatypes + #qurt_anysignal_t + + @param[out] signal Pointer to the initialized any-signal object. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +static inline void qurt_anysignal_init(qurt_anysignal_t *signal) +{ + qurt_signal_init(signal); +} + +/*======================================================================*/ +/**@ingroup func_qurt_anysignal_destroy + Destroys the specified any-signal object. + + @note1hang Any-signal objects must be destroyed when they are no longer in use. Failure + to do this causes resource leaks in the QuRT kernel.\n + @note1cont Any-signal objects must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + + @datatypes + #qurt_anysignal_t + + @param[in] signal Pointer to the any-signal object to destroy. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +static inline void qurt_anysignal_destroy(qurt_anysignal_t *signal) +{ + qurt_signal_destroy(signal); +} + +/*======================================================================*/ +/**@ingroup func_qurt_anysignal_wait + Wait on the any-signal object. \n + Suspends the current thread until any one of the specified signals is set. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be waited on, and 0 indicates not to wait on the signal. + If a signal is set in an any-signal object, and a thread is waiting on the any-signal object for + that signal, the thread is awakened. If the awakened thread has higher priority than + the current thread, a context switch can occur. + + @note1hang At most, one thread can wait on an any-signal object at any given time. + + @datatypes + #qurt_anysignal_t + + @param[in] signal Pointer to the any-signal object to wait on. + @param[in] mask Signal mask value, which specifies the individual signals in the any-signal + object to wait on. + + @return + Bitmask of current signal values. + + @dependencies + None. + */ +/* ======================================================================*/ +static inline unsigned int qurt_anysignal_wait(qurt_anysignal_t *signal, unsigned int mask) +{ + return qurt_signal_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ANY); +} + +/*======================================================================*/ +/**@ingroup func_qurt_anysignal_set + Sets signals in the specified any-signal object. \n + Signals are represented as bits 0 through 31 in the 32-bit mask value. 
A mask bit value of 1
+ indicates that a signal must be set, and 0 indicates not to set the signal.
+
+ @datatypes
+ #qurt_anysignal_t
+
+ @param[in] signal Pointer to the any-signal object to modify.
+ @param[in] mask Signal mask value identifying the individual signals to
+ set in the any-signal object.
+
+ @return
+ Bitmask of old signal values (before set).
+
+ @dependencies
+ None.
+ */
+/* ======================================================================*/
+unsigned int qurt_anysignal_set(qurt_anysignal_t *signal, unsigned int mask);
+
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_anysignal_get
+ Gets signal values from the any-signal object.\n
+ Returns the current signal values of the specified any-signal object.
+
+ @datatypes
+ #qurt_anysignal_t
+
+ @param[in] signal Pointer to the any-signal object to access.
+
+ @return
+ A bitmask with the current signal values of the specified any-signal object.
+
+ @dependencies
+ None.
+ */
+/* ======================================================================*/
+static inline unsigned int qurt_anysignal_get(qurt_anysignal_t *signal)
+{
+ return qurt_signal_get(signal);
+}
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_anysignal_clear
+ @xreflabel{sec:anysignal_clear}
+ Clears signals in the specified any-signal object.\n
+ Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1
+ indicates that a signal must be cleared, and 0 indicates not to clear the signal.
+
+ @datatypes
+ #qurt_anysignal_t
+
+ @param[in] signal Pointer to the any-signal object to modify.
+ @param[in] mask Signal mask value identifying the individual signals to
+ clear in the any-signal object.
+
+ @return
+ Bitmask -- Old signal values (before clear).
+
+ @dependencies
+ None.
+ */
+/* ======================================================================*/
+unsigned int qurt_anysignal_clear(qurt_anysignal_t *signal, unsigned int mask);
+
+/*======================================================================*/
+/**@ingroup func_qurt_anysignal_wait_timed
+ Waits on the any-signal object. \n
+ Suspends the current thread until any of the specified signals is set or the timeout expires.
+
+ Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1
+ indicates that a signal must be waited on, and 0 indicates not to wait on the signal.
+ If a signal is set in an any-signal object, and a thread is waiting on the any-signal object for
+ that signal, the thread is awakened. If the awakened thread has higher priority than
+ the current thread, a context switch can occur.
+
+ @note1hang At most, one thread can wait on an any-signal object at any given time.
+
+ @datatypes
+ #qurt_anysignal_t
+
+ @param[in] signal Pointer to the any-signal object to wait on.
+ @param[in] mask Signal mask value, which specifies the individual signals in the any-signal
+ object to wait on.
+ @param[out] signals Bitmask of current signal values.
+ @param[in] duration Interval (in microseconds); the duration value must be between #QURT_TIMER_MIN_DURATION and
+ #QURT_TIMER_MAX_DURATION.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_ETIMEDOUT -- Timeout. \n
+ #QURT_EINVALID -- Duration out of range.
+
+ @dependencies
+ None.
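+
+ A minimal usage sketch (assumes a qurt_anysignal_t sig initialized
+ with qurt_anysignal_init(); the 0x3 mask and 10 ms duration are
+ arbitrary):
+
+ @code
+ unsigned int fired = 0;
+ int rc = qurt_anysignal_wait_timed(&sig, 0x3U, &fired, 10000ULL);
+ if (rc == QURT_EOK) {
+     // at least one of signal bits 0..1 was set; values are in fired
+ }
+ @endcode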
+ */ +/* ======================================================================*/ + +int qurt_anysignal_wait_timed(qurt_anysignal_t *signal, unsigned int mask, unsigned int *signals, unsigned long long int duration); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ANYSIGNAL_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_api_version.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_api_version.h new file mode 100755 index 0000000000000..dfe53ae755054 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_api_version.h @@ -0,0 +1,77 @@ +#ifndef QURT_API_VERSION_H +#define QURT_API_VERSION_H +/*============================================================================== + +qurt_api_version.h + +GENERAL DESCRIPTION + API version file + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +==============================================================================*/ + +/*============================================================================== + CONSTANTS AND DEFINITIONS +==============================================================================*/ +/** + * Each field of the QURT_API_VERSION definitions is an 8-bit unsigned integer. + * Main release has first 3 fields updated - Major, Minor and Release. + * - QURT_API_VERSION = Major, Minor, Release. + * Patch releases are supported by adding the extra field. + * - QURT_API_VERSION = Major, Minor, Release, Patch. + */ +// Major version is incremented for incompatible API changes. +#define QURT_API_VER_MAJOR 1 + +// Minor version is incremented for backward-compatible enhancements in the API +// set. +#define QURT_API_VER_MINOR 4 + +// RELEASE version is incremented for each release within a `MAJOR.MINOR` +// release. +#define QURT_API_VER_RELEASE 1 + +// Patch version is incremented when new API content is introduced on older LTS +// release. +#define QURT_API_VER_PATCH 0 + +/* Update the QURT_API_VERSION function macro. */ +#define QURT_API_VERSION_ENCODE(major, minor, release, patch) \ + ((((major) & 0xFF) << 24) | (((minor) & 0xFF) << 16) | \ + (((release) & 0xFF) << 8) | ((patch) & 0xFF)) + +/* Update the QURT_API_VERSION Macro. */ +#define QURT_API_VERSION \ + QURT_API_VERSION_ENCODE(QURT_API_VER_MAJOR, QURT_API_VER_MINOR, \ + QURT_API_VER_RELEASE, QURT_API_VER_PATCH) + +/** Usage: + * + * #if QURT_API_VERSION >= QURT_API_VERSION_ENCODE(1,4,0,0) + * qurt_func_2(a,b,c); + * #else + * qurt_func(a); + * #endif + * + */ +/* + Gets the QuRT API version. + + @return + QuRT API version. + + @dependencies + None. + */ +unsigned int qurt_api_version(void); + +#endif /* QURT_API_VERSION_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_assert.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_assert.h new file mode 100755 index 0000000000000..13cc2afd2e973 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_assert.h @@ -0,0 +1,51 @@ +#ifndef QURT_ASSERT_H +#define QURT_ASSERT_H +/** + @file qurt_assert.h + @brief Prototypes of qurt_assert API + + EXTERNAL FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. 
+ Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/**@ingroup func_qurt_assert_error + Writes diagnostic information to the debug buffer, and raises an error to the QuRT kernel. + + @datatypes + None. + + @param[in] filename Pointer to the file name string. + @param[in] lineno Line number. + + @return + None. + + @dependencies + None. + */ +void qurt_assert_error(const char *filename, int lineno) __attribute__((noreturn)); + +#define qurt_assert(cond) ((cond)?(void)0:qurt_assert_error(__QURTFILENAME__,__LINE__)) + +/** @} */ /* end_ingroup func_qurt_assert */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ASSERT_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_atomic_ops.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_atomic_ops.h new file mode 100755 index 0000000000000..d9b2cff7d737c --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_atomic_ops.h @@ -0,0 +1,1298 @@ +#ifndef QURT_ATOMIC_OPS_H +#define QURT_ATOMIC_OPS_H +/** + @file qurt_atomic_ops.h + @brief Prototypes of kernel atomic operations API. + + EXTERNAL FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021, 2022 by Qualcomm Technologies, Inc. All Rights Reserved +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +/* + * Australian Public Licence B (OZPLB) + * + * Version 1-0 + * + * Copyright (c) 2007, Open Kernel Labs, Inc. + * + * All rights reserved. + * + * Developed by: Embedded, Real-time and Operating Systems Program (ERTOS) + * National ICT Australia + * http://www.ertos.nicta.com.au + * + * Permission is granted by National ICT Australia, free of charge, to + * any person obtaining a copy of this software and any associated + * documentation files (the "Software") to deal with the Software without + * restriction, including (without limitation) the rights to use, copy, + * modify, adapt, merge, publish, distribute, communicate to the public, + * sublicense, and/or sell, lend or rent out copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject + * to the following conditions: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimers. + * + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimers in the documentation and/or other materials provided + * with the distribution. + * + * * Neither the name of National ICT Australia, nor the names of its + * contributors, may be used to endorse or promote products derived + * from this Software without specific prior written permission. 
+ * + * EXCEPT AS EXPRESSLY STATED IN THIS LICENCE AND TO THE FULL EXTENT + * PERMITTED BY APPLICABLE LAW, THE SOFTWARE IS PROVIDED "AS-IS", AND + * NATIONAL ICT AUSTRALIA AND ITS CONTRIBUTORS MAKE NO REPRESENTATIONS, + * WARRANTIES OR CONDITIONS OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO ANY REPRESENTATIONS, WARRANTIES OR CONDITIONS + * REGARDING THE CONTENTS OR ACCURACY OF THE SOFTWARE, OR OF TITLE, + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, + * THE ABSENCE OF LATENT OR OTHER DEFECTS, OR THE PRESENCE OR ABSENCE OF + * ERRORS, WHETHER OR NOT DISCOVERABLE. + * + * TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL + * NATIONAL ICT AUSTRALIA OR ITS CONTRIBUTORS BE LIABLE ON ANY LEGAL + * THEORY (INCLUDING, WITHOUT LIMITATION, IN AN ACTION OF CONTRACT, + * NEGLIGENCE OR OTHERWISE) FOR ANY CLAIM, LOSS, DAMAGES OR OTHER + * LIABILITY, INCLUDING (WITHOUT LIMITATION) LOSS OF PRODUCTION OR + * OPERATION TIME, LOSS, DAMAGE OR CORRUPTION OF DATA OR RECORDS; OR LOSS + * OF ANTICIPATED SAVINGS, OPPORTUNITY, REVENUE, PROFIT OR GOODWILL, OR + * OTHER ECONOMIC LOSS; OR ANY SPECIAL, INCIDENTAL, INDIRECT, + * CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES, ARISING OUT OF OR IN + * CONNECTION WITH THIS LICENCE, THE SOFTWARE OR THE USE OF OR OTHER + * DEALINGS WITH THE SOFTWARE, EVEN IF NATIONAL ICT AUSTRALIA OR ITS + * CONTRIBUTORS HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH CLAIM, LOSS, + * DAMAGES OR OTHER LIABILITY. + * + * If applicable legislation implies representations, warranties, or + * conditions, or imposes obligations or liability on National ICT + * Australia or one of its contributors in respect of the Software that + * cannot be wholly or partly excluded, restricted or modified, the + * liability of National ICT Australia or the contributor is limited, to + * the full extent permitted by the applicable legislation, at its + * option, to: + * a. in the case of goods, any one or more of the following: + * i. the replacement of the goods or the supply of equivalent goods; + * ii. the repair of the goods; + * iii. the payment of the cost of replacing the goods or of acquiring + * equivalent goods; + * iv. the payment of the cost of having the goods repaired; or + * b. in the case of services: + * i. the supplying of the services again; or + * ii. the payment of the cost of having the services supplied again. + * + * The construction, validity and performance of this licence is governed + * by the laws in force in New South Wales, Australia. + */ + +/* + * Author: Malcolm Purvis + * + * This file is only included by the main atomic_ops.h, so all of that + * file's definitions are available. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ + +///* Sanity check to ensure the smp flag is set in machines.py */ +//#if defined(__ATOMIC_OPS_IN_KERNEL__) && !defined(MACHINE_SMP) && CONFIG_NUM_UNITS > 1 +//#error CONFIG_NUM_UNITS > 1 but smp not defined in machines.py. +//#endif +#define QURT_INLINE __attribute__((always_inline)) + +/*============================================================================= + FUNCTIONS +=============================================================================*/ +/**@ingroup func_qurt_atomic_set + Sets the atomic variable with the specified value. 
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] value Value to set.
+
+ @return
+ Value successfully set.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_set(unsigned int* target, unsigned int value)
+{
+    unsigned long tmp;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       memw_locked(%2, p0) = %3\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (tmp),"+m" (*target)
+        : "r" (target), "r" (value)
+        : "p0");
+    return value;
+}
+
+/**@ingroup func_qurt_atomic_and
+ Bitwise AND operation of the atomic variable with mask.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] mask Mask for bitwise AND.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE void
+qurt_atomic_and(unsigned int* target, unsigned int mask)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = and(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target),"r" (mask)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic_and_return
+ Bitwise AND operation of the atomic variable with mask.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] mask Mask for bitwise AND.
+
+ @return
+ AND result of atomic variable with mask.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_and_return(unsigned int* target, unsigned int mask)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = and(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (mask)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic_or
+ Bitwise OR operation of the atomic variable with mask.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] mask Mask for bitwise OR.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE void
+qurt_atomic_or(unsigned int* target, unsigned int mask)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = or(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (mask)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic_or_return
+ Bitwise OR operation of the atomic variable with mask.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] mask Mask for bitwise OR.
+
+ @return
+ Returns the OR result of the atomic variable with mask.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_or_return(unsigned int* target, unsigned int mask)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = or(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (mask)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic_xor
+ Bitwise XOR operation of the atomic variable with mask.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] mask Mask for bitwise XOR.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE void
+qurt_atomic_xor(unsigned int* target, unsigned int mask)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = xor(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (mask)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic_xor_return
+ Bitwise XOR operation of the atomic variable with mask.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] mask Mask for bitwise XOR.
+
+ @return
+ XOR result of atomic variable with mask.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_xor_return(unsigned int* target, unsigned int mask)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = xor(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (mask)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic_set_bit
+ Sets a bit in the atomic variable at a specified position.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] bit Bit position to set.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE void
+qurt_atomic_set_bit(unsigned int *target, unsigned int bit)
+{
+    unsigned int result;
+    unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U);
+    unsigned int sbit = bit % ((unsigned int)sizeof(unsigned int) * 8U);
+    unsigned int *wtarget = (unsigned int *)&target[aword];
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = setbit(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*wtarget)
+        : "r" (wtarget), "r" (sbit)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic_clear_bit
+ Clears a bit in the atomic variable at a specified position.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] bit Bit position to clear.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE void
+qurt_atomic_clear_bit(unsigned int *target, unsigned int bit)
+{
+    unsigned int result;
+    unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U);
+    unsigned int sbit = bit % ((unsigned int)sizeof(unsigned int) * 8U);
+    unsigned int *wtarget = (unsigned int *)&target[aword];
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = clrbit(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*wtarget)
+        : "r" (wtarget), "r" (sbit)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic_change_bit
+ Toggles a bit in an atomic variable at a specified bit position.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] bit Bit position to toggle.
+
+ @return
+ None.
+
+ @dependencies
+ None.
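+
+ For illustration, a hypothetical flag word shared between threads:
+
+ @code
+ static unsigned int flags;
+ qurt_atomic_change_bit(&flags, 3U);   // atomically flips bit 3
+ @endcode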
+*/ +static inline QURT_INLINE void +qurt_atomic_change_bit(unsigned int *target, unsigned int bit) +{ + unsigned int result; + unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int sbit = bit & 0x1fU; + unsigned int *wtarget= (unsigned int *)&target[aword]; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = togglebit(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*wtarget) + : "r" (wtarget),"r" (sbit) + : "p0"); +} + +/**@ingroup func_qurt_atomic_add + Adds an integer to atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v Integer value to add. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_add(unsigned int *target, unsigned int v) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); +} + +/**@ingroup func_qurt_atomic_add_return + Adds an integer to atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v Integer value to add. + + @return + Result of arithmetic sum. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_add_return(unsigned int *target, unsigned int v) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_add_unless + Adds the delta value to an atomic variable unless the current value in the target + matches the unless variable. + + @note1hang The function retries until load lock and store conditional + are successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] delta Value to add to the current value. + @param[in] unless Perform the addition only when the current value is not + equal to this unless value. + @return + TRUE -- 1 - Addition was performed. \n + FALSE -- 0 - Addition was not done. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_add_unless(unsigned int* target, + unsigned int delta, + unsigned int unless) +{ + unsigned int current_val; + unsigned int new_val; + + __asm__ __volatile__( + "1: %0 = memw_locked(%3)\n" + " p0 = cmp.eq(%0, %5)\n" + " if p0 jump 2f\n" + " %1 = add(%0, %4)\n" + " memw_locked(%3, p0) = %1\n" + " if !p0 jump 1b\n" + "2:\n" + : "=&r" (current_val),"=&r" (new_val),"+m" (*target) + : "r" (target), "r" (delta), "r" (unless) + : "p0"); + + return (unsigned int)(current_val != unless); +} + +/**@ingroup func_qurt_atomic_sub + Subtracts an integer from an atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v Integer value to subtract. + + @return + None. + + @dependencies + None. 
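+
+ For illustration, a hypothetical credit counter:
+
+ @code
+ static unsigned int credits = 8U;
+ qurt_atomic_sub(&credits, 2U);   // credits drops to 6 in one atomic step
+ @endcode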
+*/ +static inline QURT_INLINE void +qurt_atomic_sub(unsigned int *target, unsigned int v) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = sub(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); +} + +/**@ingroup func_qurt_atomic_sub_return + Subtracts an integer from an atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v Integer value to subtract. + + @return + Result of arithmetic subtraction. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_sub_return(unsigned int *target, unsigned int v) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = sub(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_inc + Increments an atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_inc(unsigned int *target) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, #1)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target) + : "p0"); +} + +/**@ingroup func_qurt_atomic_inc_return + Increments an atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + Incremented value. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_inc_return(unsigned int *target) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, #1)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_dec + Decrements an atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_dec(unsigned int *target) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, #-1)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target) + : "p0"); +} + +/**@ingroup func_qurt_atomic_dec_return + Decrements an atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + Decremented value. + + @dependencies + None. 
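+
+ A common reference-count sketch (refs is hypothetical and assumed to
+ start above zero):
+
+ @code
+ if (qurt_atomic_dec_return(&refs) == 0U) {
+     // last reference dropped; safe to release the resource
+ }
+ @endcode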
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_dec_return(unsigned int *target)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = add(%0, #-1)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic_compare_and_set
+ Compares the current value of the atomic variable with the
+ specified value and sets it to a new value when the compare is successful.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] old_val Old value to compare.
+ @param[in] new_val New value to set.
+
+ @return
+ FALSE -- Specified value is not equal to the current value. \n
+ TRUE -- Specified value is equal to the current value.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_compare_and_set(unsigned int* target,
+                            unsigned int old_val,
+                            unsigned int new_val)
+{
+    unsigned int current_val;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       p0 = cmp.eq(%0, %3)\n"
+        "       if !p0 jump 2f\n"
+        "       memw_locked(%2, p0) = %4\n"
+        "       if !p0 jump 1b\n"
+        "2:\n"
+        : "=&r" (current_val),"+m" (*target)
+        : "r" (target), "r" (old_val), "r" (new_val)
+        : "p0");
+
+    return (unsigned int)(current_val == old_val);
+}
+
+/**@ingroup func_qurt_atomic_barrier
+ Allows the compiler to enforce an ordering constraint on memory operations issued
+ before and after the function.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE void
+qurt_atomic_barrier(void)
+{
+    __asm__ __volatile__ (
+        ""
+        :
+        :
+        :
+        "memory");
+}
+
+
+/**@ingroup func_qurt_atomic64_set
+ Sets the 64-bit atomic variable with the specified value.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] value 64-bit value to set.
+
+ @return
+ Successfully set value.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_set(unsigned long long* target, unsigned long long value)
+{
+    unsigned long long tmp;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       memd_locked(%2, p0) = %3\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (tmp),"+m" (*target)
+        : "r" (target), "r" (value)
+        : "p0");
+    return value;
+}
+
+/**@ingroup func_qurt_atomic64_and_return
+ Bitwise AND operation of a 64-bit atomic variable with mask.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] mask 64-bit mask for bitwise AND.
+
+ @return
+ AND result of 64-bit atomic variable with mask.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_and_return(unsigned long long* target, unsigned long long mask)
+{
+    unsigned long long result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       %0 = and(%0, %3)\n"
+        "       memd_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (mask)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic64_or
+ Bitwise OR operation of a 64-bit atomic variable with mask.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] mask 64-bit mask for bitwise OR.
+
+ @return
+ None.
+
+ @dependencies
+ None.
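+
+ For illustration, a hypothetical 64-bit event mask:
+
+ @code
+ static unsigned long long events;
+ qurt_atomic64_or(&events, 1ULL << 40);   // post event bit 40
+ @endcode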
+*/ +static inline QURT_INLINE void +qurt_atomic64_or(unsigned long long* target, unsigned long long mask) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = or(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); +} + +/**@ingroup func_qurt_atomic64_or_return + Bitwise OR operation of a 64-bit atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask 64-bit mask for bitwise OR. + + @return + OR result of the atomic variable with mask. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned long long +qurt_atomic64_or_return(unsigned long long* target, unsigned long long mask) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = or(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic64_xor_return + Bitwise XOR operation of 64-bit atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask 64-bit mask for bitwise XOR. + + @return + XOR result of atomic variable with mask. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned long long +qurt_atomic64_xor_return(unsigned long long* target, unsigned long long mask) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = xor(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic64_set_bit + Sets a bit in a 64-bit atomic variable at a specified position. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] bit Bit position to set. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic64_set_bit(unsigned long long *target, unsigned int bit) +{ + unsigned int result; + unsigned int *wtarget; + unsigned int *pwtarget = (unsigned int *)target; + unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int sbit = bit & 0x1FU; + wtarget = (unsigned int *)&pwtarget[aword]; + + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = setbit(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*wtarget) + : "r" (wtarget), "r" (sbit) + : "p0"); +} + +/**@ingroup func_qurt_atomic64_clear_bit + Clears a bit in a 64-bit atomic variable at a specified position. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] bit Bit position to clear. + + @return + None. + + @dependencies + None. 
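+
+ For illustration (events is a hypothetical 64-bit mask; bit positions
+ 0 through 63 address the full 64-bit word):
+
+ @code
+ static unsigned long long events;
+ qurt_atomic64_clear_bit(&events, 40U);   // acknowledge event bit 40
+ @endcode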
+*/ +static inline QURT_INLINE void +qurt_atomic64_clear_bit(unsigned long long *target, unsigned int bit) +{ + unsigned int result; + unsigned int *wtarget; + unsigned int *pwtarget = (unsigned int *)target; + unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int sbit = bit & 0x1FU; + wtarget = (unsigned int *)&pwtarget[aword]; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = clrbit(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*wtarget) + : "r" (wtarget), "r" (sbit) + : "p0"); +} + +/**@ingroup func_qurt_atomic64_change_bit + Toggles a bit in a 64-bit atomic variable at a bit position. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] bit Bit position to toggle. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic64_change_bit(unsigned long long *target, unsigned int bit) +{ + unsigned int result; + unsigned int *wtarget; + unsigned int *pwtarget = (unsigned int *)target; + unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int sbit = bit & 0x1FU; + wtarget = (unsigned int *)&pwtarget[aword]; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = togglebit(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*wtarget) + : "r" (wtarget),"r" (sbit) + : "p0"); +} + +/**@ingroup func_qurt_atomic64_add + Adds a 64-bit integer to 64-bit atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v 64-bit integer value to add. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic64_add(unsigned long long *target, unsigned long long v) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = add(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); +} + +/**@ingroup func_qurt_atomic64_add_return + Adds a 64-bit integer to 64-bit atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v 64-bit integer value to add. + + @return + Result of arithmetic sum. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned long long +qurt_atomic64_add_return(unsigned long long *target, unsigned long long v) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = add(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic64_sub_return + Subtracts a 64-bit integer from an atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v 64-bit integer value to subtract. + + @return + Result of arithmetic subtraction. + + @dependencies + None. 
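+
+ For illustration, a hypothetical byte budget drained by consumers:
+
+ @code
+ static unsigned long long budget = 4096ULL;
+ if (qurt_atomic64_sub_return(&budget, 512ULL) < 1024ULL) {
+     // running low; apply back-pressure
+ }
+ @endcode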
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_sub_return(unsigned long long *target, unsigned long long v)
+{
+   unsigned long long result;
+
+   __asm__ __volatile__(
+       "1:     %0 = memd_locked(%2)\n"
+       "       %0 = sub(%0, %3)\n"
+       "       memd_locked(%2, p0) = %0\n"
+       "       if !p0 jump 1b\n"
+       : "=&r" (result),"+m" (*target)
+       : "r" (target), "r" (v)
+       : "p0");
+
+   return result;
+}
+
+/**@ingroup func_qurt_atomic64_inc
+   Increments a 64-bit atomic variable by one.
+
+   @note1hang The function retries until the load locked and store conditional
+              sequence succeeds.
+
+   @param[in,out] target Pointer to the atomic variable.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic64_inc(unsigned long long *target)
+{
+   unsigned long long result;
+   unsigned long long inc = 1;
+
+   __asm__ __volatile__(
+       "1:     %0 = memd_locked(%2)\n"
+       "       %0 = add(%0, %3)\n"
+       "       memd_locked(%2, p0) = %0\n"
+       "       if !p0 jump 1b\n"
+       : "=&r" (result),"+m" (*target)
+       : "r" (target),"r" (inc)
+       : "p0");
+}
+
+/**@ingroup func_qurt_atomic64_inc_return
+   Increments a 64-bit atomic variable by one.
+
+   @note1hang The function retries until the load locked and store conditional
+              sequence succeeds.
+
+   @param[in,out] target Pointer to the atomic variable.
+
+   @return
+   Incremented value.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_inc_return(unsigned long long *target)
+{
+   unsigned long long result;
+   unsigned long long inc = 1;
+
+   __asm__ __volatile__(
+       "1:     %0 = memd_locked(%2)\n"
+       "       %0 = add(%0, %3)\n"
+       "       memd_locked(%2, p0) = %0\n"
+       "       if !p0 jump 1b\n"
+       : "=&r" (result),"+m" (*target)
+       : "r" (target),"r" (inc)
+       : "p0");
+
+   return result;
+}
+
+/**@ingroup func_qurt_atomic64_dec_return
+   Decrements a 64-bit atomic variable by one.
+
+   @note1hang The function retries until the load locked and store conditional
+              sequence succeeds.
+
+   @param[in,out] target Pointer to the atomic variable.
+
+   @return
+   Decremented value.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_dec_return(unsigned long long *target)
+{
+   unsigned long long result;
+   long long minus1 = 0xFFFFFFFFFFFFFFFFLL;
+
+   __asm__ __volatile__(
+       "1:     %0 = memd_locked(%2)\n"
+       "       %0 = add(%0, %3)\n"
+       "       memd_locked(%2, p0) = %0\n"
+       "       if !p0 jump 1b\n"
+       : "=&r" (result),"+m" (*target)
+       : "r" (target),"r" (minus1)
+       : "p0");
+
+   return result;
+}
+
+/**@ingroup func_qurt_atomic64_compare_and_set
+   Compares the current value of a 64-bit atomic variable with
+   a specified value and sets it to a new value when the compare is successful.
+
+   @note1hang The function keeps retrying until the load locked and store
+              conditional sequence succeeds.
+
+   @param[in,out] target  Pointer to the atomic variable.
+   @param[in]     old_val 64-bit old value to compare.
+   @param[in]     new_val 64-bit new value to set.
+
+   @return
+   FALSE -- Specified value is not equal to the current value. \n
+   TRUE  -- Specified value is equal to the current value.
+
+   @dependencies
+   None.
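+
+   A typical retry-loop sketch (illustrative only; 'counter' and the capped
+   increment policy are assumptions, not part of the QuRT API):
+   @code
+   static unsigned long long counter;
+
+   unsigned long long old_val, new_val;
+   do {
+       old_val = counter;   // racy snapshot is fine; the CAS validates it
+       new_val = (old_val < 100ULL) ? old_val + 1ULL : old_val;
+   } while (!qurt_atomic64_compare_and_set(&counter, old_val, new_val));
+   @endcode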
+*/
+static inline QURT_INLINE int
+qurt_atomic64_compare_and_set(unsigned long long *target,
+                              unsigned long long old_val,
+                              unsigned long long new_val)
+{
+   unsigned long long current_val;
+
+   __asm__ __volatile__(
+       "1:     %0 = memd_locked(%2)\n"
+       "       p0 = cmp.eq(%0, %3)\n"
+       "       if !p0 jump 2f\n"
+       "       memd_locked(%2, p0) = %4\n"
+       "       if !p0 jump 1b\n"
+       "2:\n"
+       : "=&r" (current_val),"+m" (*target)
+       : "r" (target), "r" (old_val), "r" (new_val)
+       : "p0");
+
+   return (int)(current_val == old_val);
+}
+
+/**@ingroup func_qurt_atomic64_barrier
+   Allows the compiler to enforce an ordering constraint on memory operations
+   issued before and after the function.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic64_barrier(void)
+{
+   /** @cond */
+   __asm__ __volatile__ (
+       ""
+       :
+       :
+       :
+       "memory");
+   /** @endcond */
+}
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_ATOMIC_OPS_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_barrier.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_barrier.h
new file mode 100755
index 0000000000000..7c6f787d43bc2
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_barrier.h
@@ -0,0 +1,140 @@
+#ifndef QURT_BARRIER_H
+#define QURT_BARRIER_H
+
+/**
+  @file qurt_barrier.h
+  @brief Prototypes of kernel barrier API functions.
+
+  EXTERNALIZED FUNCTIONS
+  None.
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+  Copyright (c) 2021 Qualcomm Technologies, Inc. All rights reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+  ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup barrier_types
+@{ */
+/*=====================================================================
+ Constants and macros
+======================================================================*/
+#define QURT_BARRIER_SERIAL_THREAD 1 /**< Serial thread. */
+#define QURT_BARRIER_OTHER         0 /**< Other. */
+
+#ifndef ASM
+#include 
+
+/*=====================================================================
+Typedefs
+======================================================================*/
+
+/** QuRT barrier type.
+ */
+typedef union {
+   /** @cond */
+   struct {
+       unsigned short threads_left;
+       unsigned short count;
+       unsigned int threads_total;
+       unsigned int queue;
+       unsigned int reserved;
+   };
+   unsigned long long int raw;
+   /** @endcond */
+} qurt_barrier_t;
+
+/** @} */ /* end_addtogroup barrier_types */
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+/*======================================================================*/
+/**@ingroup func_qurt_barrier_init
+   Initializes a barrier object.
+
+   @datatypes
+   #qurt_barrier_t
+
+   @param[out] barrier       Pointer to the barrier object to initialize.
+   @param[in]  threads_total Total number of threads to synchronize on the barrier.
+
+   @return
+   Unused integer value.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+int qurt_barrier_init(qurt_barrier_t *barrier, unsigned int threads_total);
+
+/*======================================================================*/
+/**@ingroup func_qurt_barrier_destroy
+   Destroys the specified barrier.
+
+   @note1hang Barriers must be destroyed when they are no longer in use. 
Failure + to do this causes resource leaks in the QuRT kernel.\n + @note1cont Barriers must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + + @datatypes + #qurt_barrier_t + + @param[in] barrier Pointer to the barrier object to destroy. + + @return + Unused integer value. + + @dependencies + None. +*/ +/* ======================================================================*/ +int qurt_barrier_destroy(qurt_barrier_t *barrier); + +/*======================================================================*/ +/**@ingroup func_qurt_barrier_wait + Waits on the barrier.\n + Suspends the current thread on the specified barrier. \n + The function return value indicates whether the thread was the last one to + synchronize on the barrier. + When a thread waits on a barrier, it is suspended on the barrier: \n + - If the total number of threads waiting on the barrier is less than the assigned value + of the barrier, no other action occurs. \n + - If the total number of threads waiting on the barrier equals the assigned value of the + barrier, all threads currently waiting on the barrier are awakened, allowing them to + execute past the barrier. + + @note1hang After its waiting threads are awakened, a barrier is automatically reset + and can be used again in the program without the need for re-initialization. + + @datatypes + #qurt_barrier_t + + @param[in] barrier Pointer to the barrier object to wait on. + + @return + #QURT_BARRIER_OTHER -- Current thread awakened from barrier. \n + #QURT_BARRIER_SERIAL_THREAD -- Current thread is last caller of barrier. + + @dependencies + None. +*/ +/* ======================================================================*/ +int qurt_barrier_wait(qurt_barrier_t *barrier); + + +#endif + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_BARRIER_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_busywait.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_busywait.h new file mode 100755 index 0000000000000..a4dab80a2520a --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_busywait.h @@ -0,0 +1,62 @@ +#ifndef QURT_BUSYWAIT_H +#define QURT_BUSYWAIT_H + +/** + @file qurt_busywait.h + @brief Implementation of the busywait() function for + hardware based blocking waits that use the QTIMER as a reference. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ============================================================================*/ +/*============================================================================= + * + * EDIT HISTORY FOR FILE + * + * This section contains comments describing changes made to the + * module. Changes are listed in reverse chronological + * order. 
+ *
+ *
+ * when       who     what, where, why
+ * ---------- ---     -------------------------------------------------------
+ * 2018-03-20 pg      Add Header file
+ ============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+ FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_busywait
+   Pauses the execution of a thread for a specified time.\n
+   Use for small microsecond delays.
+
+   @note1hang The function does not return to the caller until
+              the time duration has expired.
+
+   @param[in] pause_time_us Time to pause in microseconds.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+void qurt_busywait (unsigned int pause_time_us);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_BUSYWAIT_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_callback.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_callback.h
new file mode 100755
index 0000000000000..dc9b896c63454
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_callback.h
@@ -0,0 +1,235 @@
+#ifndef QURT_CALLBACK_H
+#define QURT_CALLBACK_H
+
+/**
+  @file qurt_callback.h
+  Definitions, macros, and prototypes for the QuRT callback framework.
+
+  The QDI framework allows the development of root process drivers and services that
+  a user process client can interact with in a secure manner. The QDI framework does
+  this by elevating the privilege of the user process thread, temporarily allowing
+  the thread to execute in root context and letting it fall back to user context once
+  the QDI invocation is finished.
+
+  The QuRT callback framework provides a safe mechanism for root process drivers
+  to execute callback functions in a user process. The framework hosts
+  dedicated worker threads in the corresponding processes that handle the execution
+  of the callback function. This ensures that the callbacks occur in the context of
+  the appropriate process thread, thereby maintaining privilege boundaries.
+
+  Prerequisites for use of this framework are:
+  1. The driver is a QDI driver and the client communicates with the driver using QDI
+     invocations.
+  2. The appropriate callback configuration is specified in cust_config.xml for
+     the user process that intends to use this framework.
+
+  qurt_cb_data_t is the public data structure that allows a client to store all
+  the required information about the callback, including the callback function
+  and the arguments to pass to this function when it executes.
+  The client uses the QDI interface to register this structure with the root driver.
+
+  The callback framework provides the following APIs that a root driver can use to
+  invoke a callback. These functions are described in the qurt_qdi_driver.h header file.
+
+  qurt_qdi_cb_invoke_async() triggers an asynchronous callback wherein the
+  invoking thread does not wait for the callback to finish executing.
+
+  qurt_qdi_cb_invoke_sync() triggers a synchronous callback. Upon invocation,
+  the invoking thread is suspended until the callback function finishes execution.
+
+  qurt_qdi_cb_invoke_sync_with_data() invokes a synchronous callback similar to
+  qurt_qdi_cb_invoke_sync(). It allows the user to pass large data along with
+  the callback invocation, to be utilized during the callback execution.
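+
+  A minimal client-side sketch (illustrative only; my_event_cb, MY_EVENT_ID,
+  and the QDI method used to hand the data to the driver are assumptions):
+
+    qurt_cb_data_t cb_data;
+    qurt_cb_data_init(&cb_data);
+    qurt_cb_data_set_cbfunc(&cb_data, (void *)my_event_cb);
+    qurt_cb_data_set_cbarg(&cb_data, MY_EVENT_ID);
+    // Hand cb_data to the root driver through a QDI invocation; the driver
+    // can later run it with qurt_qdi_cb_invoke_async()/_sync().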
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+  ======================================================================*/
+#include "qurt_qdi.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef int qurt_cb_result_t;
+
+/* Callback framework error codes.
+   The callback framework returns a nonzero value if a callback invocation is unsuccessful.
+   The following macros describe the cause of failure in more detail.
+*/
+#define QURT_CB_ERROR            -1 /* Callback registration failed.\n*/
+#define QURT_CB_OK                0 /* Success.\n*/
+#define QURT_CB_MALLOC_FAILED    -2 /* QuRTOS malloc failure.\n*/
+#define QURT_CB_WAIT_CANCEL      -3 /* Process exit cancelled wait operation.\n*/
+#define QURT_CB_CONFIG_NOT_FOUND -4 /* Callback configuration for process was not found.\n*/
+#define QURT_CB_QUEUE_FULL       -5 /* Callback queue is serving at maximum capacity.*/
+/** @addtogroup cb_types
+@{ */
+/** Callback registration data structure.
+    This data structure is used by a client attempting to register a callback with a QDI driver.
+    It holds the address of the callback function and the argument supplied to the callback
+    function when it executes.
+*/
+typedef struct {
+   /** @cond */
+   void*    cb_func; /*< Pointer to the callback function. */
+   unsigned cb_arg;  /*< Not interpreted by the framework.*/
+   /** @endcond */
+} qurt_cb_data_t;
+
+/** @cond */
+/* Defines used as default if cust_config does not specify them. */
+#define CALLBACK_WORKER_STACK_SIZE 0x2000
+/** @endcond */
+/** @} */ /* end_addtogroup cb_types */
+/**@ingroup func_qurt_cb_data_init
+   Initializes the callback data structure.
+   An entity registering a callback with the root process driver must call this function
+   to initialize the callback registration data structure to its default value.
+
+   @datatypes
+   #qurt_cb_data_t
+
+   @param[in] cb_data Pointer to the callback data structure.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+static inline void qurt_cb_data_init (qurt_cb_data_t* cb_data){
+   cb_data->cb_func = NULL;
+   cb_data->cb_arg = 0;
+}
+
+/**@ingroup func_qurt_cb_data_set_cbfunc
+   Sets up the callback function in the callback registration data structure.
+
+   @datatypes
+   #qurt_cb_data_t
+
+   @param[in] cb_data Pointer to the callback data structure.
+   @param[in] cb_func Pointer to the callback function.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
static inline void qurt_cb_data_set_cbfunc (qurt_cb_data_t* cb_data, void* cb_func){
+   cb_data->cb_func = cb_func;
+}
+
+/**@ingroup func_qurt_cb_data_set_cbarg
+   Sets up the callback argument.
+   This function sets up the argument passed to the callback function when it executes.
+
+   @datatypes
+   #qurt_cb_data_t
+
+   @param[in] cb_data Pointer to the callback data structure.
+   @param[in] cb_arg  Argument for the callback function.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+static inline void qurt_cb_data_set_cbarg (qurt_cb_data_t* cb_data, unsigned cb_arg){
+   cb_data->cb_arg = cb_arg;
+}
+
+/** @cond */
+/**@ingroup driver_support_functions
+   Invokes an asynchronous callback for a specified process.
+   A driver that resides in the root process calls this API to launch a callback in
+   a process described by the client_handle.
+   After the callback is invoked, the framework queues the callback as per its
+   priority and subsequently executes it.
+   The caller of this function is not suspended during the callback execution period.
+   The API returns immediately with a success/failure error code.
+
+   @note1hang This function is only accessible to drivers in the root process.
+              User process invocations shall fail with a negative error code return value.
+
+   @param client_handle Obtained from the current invocation function (Section 4.3.1).
+   @param cb_data       Pointer to the callback data structure (refer to qurt_callback.h).
+   @param prio          Priority at which the callback should execute.
+                        This parameter is optional. If -1 is passed, the callback framework
+                        executes the callback at the priority of the API caller.
+   @return
+   QURT_EOK -- Callback was successfully communicated to the framework.
+   Negative error code -- Callback cannot be communicated to the framework.
+ */
+qurt_cb_result_t qurt_qdi_cb_invoke_async(int client_handle,
+                                          qurt_cb_data_t* cb_data,
+                                          int prio);
+
+
+/**@ingroup driver_support_functions
+   Invokes a synchronous callback for a specified process.
+   A driver that resides in a root process calls this API to launch a sync callback in
+   a process described by the client_handle.
+   After the callback is invoked, the framework queues the callback as per its
+   priority and subsequently executes it.
+   The caller of this function is suspended during the callback execution period.
+   If the process in which to execute the callback exits or terminates, the caller is
+   woken up with error code #QURT_CB_WAIT_CANCEL (refer to qurt_callback.h).
+
+   @note1hang This function is only accessible to drivers in the root process.
+              User process invocations shall fail with a negative error code return value.
+
+   @param client_handle Obtained from the current invocation function (Section 4.3.1).
+   @param cb_data       Pointer to the callback data structure (refer to qurt_callback.h).
+   @param prio          Priority at which the callback should execute.
+                        This parameter is optional. If -1 is passed, the callback framework
+                        executes the callback at the priority of the API caller.
+   @return
+   QURT_EOK -- Callback was successfully communicated to the framework.
+   Negative error code -- Callback cannot be communicated to the framework.
+ */
+qurt_cb_result_t qurt_qdi_cb_invoke_sync(int client_handle,
+                                         qurt_cb_data_t* cb_data,
+                                         int prio);
+
+/**@ingroup driver_support_functions
+   Invokes a synchronous callback for a specified process, passing driver data to the user PD.
+   This function is similar to qurt_qdi_cb_invoke_sync() and allows the driver to pass arbitrary data to
+   the user process as part of the callback invocation.
+
+   @param client_handle Obtained from the current invocation function (Section 4.3.1).
+   @param cb_data       Pointer to the callback data structure (refer to qurt_callback.h).
+   @param prio          Priority at which the callback should execute.
+                        This parameter is optional. If -1 is passed, the callback framework
+                        executes the callback at the priority of the API caller.
+   @param data          Arbitrary driver data to pass to the user process. Memory pointed to by data
+                        must be accessible to the user PD. The root driver can allocate such memory by
+                        using qurt_mem_mmap().
+   @param data_len      Length of the driver data.
+
+   @return
+   QURT_EOK -- Callback was successfully communicated to the framework.
+   Negative error code -- Callback cannot be communicated to the framework.
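+
+   A root-driver sketch (illustrative only; 'buf' and 'buf_len' are
+   assumptions, and the buffer must already be user-PD accessible):
+
+     // rc follows the qurt_cb_result_t error codes defined in this header.
+     qurt_cb_result_t rc = qurt_qdi_cb_invoke_sync_with_data(
+         client_handle, &cb_data, -1 /* caller priority */, buf, buf_len);
+     if (rc != QURT_CB_OK) {
+         // callback could not be delivered; inspect rc
+     }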
+ */ +qurt_cb_result_t qurt_qdi_cb_invoke_sync_with_data( int client_handle, + qurt_cb_data_t* cb_data, + int prio, + void *data, + unsigned data_len + ); +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_clade.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_clade.h new file mode 100755 index 0000000000000..d7442cf98dd94 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_clade.h @@ -0,0 +1,62 @@ +#ifndef QURT_CLADE_H +#define QURT_CLADE_H +/** + @file qurt_clade.h + @brief Prototypes of Cache Line Accelerated Decompression Engine (CLADE) API. + CLADE is a cache line level memory compression system that is used to + decrease DRAM usage. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2019-2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_clade2_get + Reads the value of the clade2 register. + + @param[in] offset Offset from the clade2 cfg base. + @param[out] *value Pointer to the register value read from the offset. + + @return + #QURT_EOK - Successfully read the value from the register at offset \n + #QURT_EINVALID - Offset passed is incorrect + + @dependencies + None. + */ +int qurt_clade2_get(unsigned short offset, unsigned int *value); + +/**@ingroup func_qurt_clade2_set + Sets the PMU register; only PMU_SEL register can be set. + + @param[in] offset Offset from the QURTK_clade2_cfg_base. + @param[in] value Value to set at offset. + + @return + #QURT_EOK -- Successfully set the value at offset. \n + #QURT_ENOTALLOWED -- Set operation performed at an offset other than CLADE2_PMU_SELECTION_REG. + + @dependencies + None. + */ +int qurt_clade2_set(unsigned short offset, unsigned int value); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_CLADE_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_cond.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_cond.h new file mode 100755 index 0000000000000..6e65ed82a8393 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_cond.h @@ -0,0 +1,219 @@ +#ifndef QURT_COND_H +#define QURT_COND_H +/** + @file qurt_cond.h + @brief Prototypes of kernel condition variable object API functions. + + EXTERNALIZED FUNCTIONS + None + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021 Qualcomm Technologies, Inc. + All rights reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup condition_variables_types +@{ */ +/*===================================================================== + Typedefs + ======================================================================*/ + +/** QuRT condition variable type. 
*/
+typedef union {
+   /** @cond */
+   unsigned long long raw;
+   struct {
+       unsigned int count;
+       unsigned int n_waiting;
+       unsigned int queue;
+       unsigned int reserved;
+   }X;
+   /** @endcond */
+} qurt_cond_t;
+
+/** @} */ /* end_addtogroup condition_variables_types */
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+/*======================================================================*/
+/**@ingroup func_qurt_cond_init
+   Initializes a condition variable object.
+
+   @datatypes
+   #qurt_cond_t
+
+   @param[out] cond Pointer to the initialized condition variable object.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+/* ======================================================================*/
+void qurt_cond_init(qurt_cond_t *cond);
+
+/*======================================================================*/
+/**@ingroup func_qurt_cond_destroy
+   Destroys the specified condition variable.
+
+   @note1hang Condition variables must be destroyed when they are no longer in use. Failure to do
+              this causes resource leaks in the QuRT kernel.\n
+   @note1cont Condition variables must not be destroyed while they are still in use. If this occurs,
+              the behavior of QuRT is undefined.
+
+   @datatypes
+   #qurt_cond_t
+
+   @param[in] cond Pointer to the condition variable object to destroy.
+
+   @return
+   None.
+
+ */
+/* ======================================================================*/
+void qurt_cond_destroy(qurt_cond_t *cond);
+
+/*======================================================================*/
+/**@ingroup func_qurt_cond_signal
+   Signals a waiting thread that the specified condition is true. \n
+
+   When a thread wishes to signal that a condition is true on a shared data item, it must
+   perform the following procedure: \n
+   -# Lock the mutex that controls access to the data item. \n
+   -# Perform the signal condition operation. \n
+   -# Unlock the mutex.
+
+   @note1hang Failure to properly lock and unlock the mutex of a condition variable can cause
+              the threads to never be suspended (or to be suspended but never awakened).
+
+   @note1cont Use condition variables only with regular mutexes -- attempting to use
+              recursive mutexes or priority inheritance mutexes results in undefined behavior.
+
+   @datatypes
+   #qurt_cond_t
+
+   @param[in] cond Pointer to the condition variable object to signal.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+/* ======================================================================*/
+void qurt_cond_signal(qurt_cond_t *cond);
+
+/*======================================================================*/
+/**@ingroup func_qurt_cond_broadcast
+   Signals multiple waiting threads that the specified condition is true.\n
+   When a thread wishes to broadcast that a condition is true on a shared data item, it must
+   perform the following procedure: \n
+   -# Lock the mutex that controls access to the data item. \n
+   -# Perform the broadcast condition operation. \n
+   -# Unlock the mutex.\n
+
+   @note1hang Failure to properly lock and unlock the mutex of a condition variable can cause
+              the threads to never be suspended (or to be suspended but never awakened).
+
+   @note1cont Use condition variables only with regular mutexes -- attempting to use
+              recursive mutexes or priority inheritance mutexes results in undefined behavior.
+
+   @datatypes
+   #qurt_cond_t
+
+   @param[in] cond Pointer to the condition variable object to signal.
+
+   @return
+   None.
+
+   @dependencies
+   None.
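+
+   A producer/consumer sketch (illustrative only; 'mtx', 'cond', and
+   'data_ready' are assumptions, not part of the QuRT API):
+   @code
+   // Producer: publish under the mutex, then broadcast.
+   qurt_mutex_lock(&mtx);
+   data_ready = 1;
+   qurt_cond_broadcast(&cond);
+   qurt_mutex_unlock(&mtx);
+
+   // Consumers: re-check the predicate after every wakeup.
+   qurt_mutex_lock(&mtx);
+   while (data_ready == 0) {
+       qurt_cond_wait(&cond, &mtx);   // unlocks mtx while suspended
+   }
+   qurt_mutex_unlock(&mtx);
+   @endcode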
+ */ +/* ======================================================================*/ +void qurt_cond_broadcast(qurt_cond_t *cond); + +/*======================================================================*/ +/**@ingroup func_qurt_cond_wait + Suspends the current thread until the specified condition is true. + When a thread wishes to wait for a specific condition on a shared data item, it must + perform the following procedure: \n + -# Lock the mutex that controls access to the data item. \n + -# If the condition is not satisfied, perform the wait condition operation on the + condition variable (suspends the thread and unlocks the mutex). + + @note1hang Failure to properly lock and unlock the mutex of a condition variable can cause + the threads to never be suspended (or suspended but never awakened). + + @note1cont Use condition variables only with regular mutexes -- attempting to use + recursive mutexes or priority inheritance mutexes results in undefined behavior. + + @datatypes + #qurt_cond_t \n + #qurt_mutex_t + + @param[in] cond Pointer to the condition variable object to wait on. + @param[in] mutex Pointer to the mutex associated with condition variable to wait on. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_cond_wait(qurt_cond_t *cond, qurt_mutex_t *mutex); + +/*======================================================================*/ +/**@ingroup func_qurt_cond_wait2 + Suspends the current thread until the specified condition is true. + When a thread wishes to wait for a specific condition on a shared data item, it must + perform the following procedure: \n + -# Lock the mutex that controls access to the data item. \n + -# If the condition is not satisfied, perform the wait condition operation on the + condition variable, which suspends the thread and unlocks the mutex. + + @note1hang Failure to properly lock and unlock the mutex of a condition variable can cause + the threads to never be suspended (or suspended but never awakened). + + @note1cont Use condition variables only with regular mutexes -- attempting to use + recursive mutexes or priority inheritance mutexes results in undefined behavior. + + @note1cont This is the same API as qurt_cond_wait(), use this version + when using mutexes of type #qurt_rmutex2_t. + + @datatypes + #qurt_cond_t \n + #qurt_rmutex2_t + + @param[in] cond Pointer to the condition variable object to wait on. + @param[in] mutex Pointer to the mutex associated with the condition variable to wait on. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_cond_wait2(qurt_cond_t *cond, qurt_rmutex2_t *mutex); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_COND_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_consts.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_consts.h new file mode 100755 index 0000000000000..b1e35998e73b6 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_consts.h @@ -0,0 +1,315 @@ +#ifndef QURT_CONSTS_H +#define QURT_CONSTS_H + +/** + @file qurt_consts.h + @brief QuRT constants and definitions + + EXTERNAL FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None + + Copyright (c) 2013-2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. 
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=====================================================================
+ Constants and macros
+ ======================================================================*/
+
+/* Definitions of system events. System events suspend
+   a thread and put it into suspending_list.
+   The system event number is saved in the CONTEXT::error::cause field
+   of the suspended thread. An event handler thread, such as the
+   page fault handler or system error handler, can wake up the suspended
+   thread.
+ */
+#define QURT_EVENT_PAGEFAULT    0x1 /* Page fault event. */
+#define QURT_EVENT_SYSTEM_ERR   0x2 /* System error event. */
+#define QURT_EVENT_SUSPEND      0x3
+#define QURT_EVENT_PROCESS_EXIT 0x4 /* Process termination event.*/
+
+#define QURT_SYSENV_MAX_THREADS_TYPE         1  /* Maximum threads object. */
+#define QURT_SYSENV_PROCNAME_TYPE            2  /* Process name object. */
+#define QURT_SYSENV_MAX_PI_PRIO_TYPE         3  /* Maximum PI priority object. */
+#define QURT_SYSENV_ARCH_REV_TYPE            4  /* Architecture version object. */
+#define QURT_SYSENV_APP_HEAP_TYPE            5  /* Application heap object. */
+#define QURT_SYSENV_REGION_ATTR_DEFAULT      7  /* Default region attributes. */
+#define QURT_SYSENV_STACK_PROFILE_COUNT_TYPE 8  /* Stack profile count type. */
+#define QURT_SYSENV_ISLAND_CONFIG_TYPE       9  /* Island configuration check. */
+#define QURT_SYSENV_HTHREADS_TYPE            10 /* Active threads object. */
+#define QURT_SYSENV_CONFIG_IMAGE_START_LO    11 /* Config image start address for DTB parsing. */
+#define QURT_SYSENV_CONFIG_IMAGE_START_HI    12 /* Config image start address for DTB parsing. */
+#define QURT_SYSENV_CHIPPARAMS_LO            13 /* ChipParams for DTB parsing. */
+#define QURT_SYSENV_CHIPPARAMS_HI            14 /* ChipParams for DTB parsing. */
+#define QURT_SYSENV_PLATPARAMS               15 /* PlatformParams for DTB parsing. */
+#define QURT_SYSENV_CONFIG_IMAGE_SIZE        16 /* Config image size for DTB parsing. */
+#define QURT_SYSENV_L2_CACHE_LINE_SIZE       17 /* L2 cache line size. */
+
+/* Get Q6 registers. */
+#define QURT_GET_SSR     1
+#define QURT_GET_CCR     2
+#define QURT_GET_CFGBASE 3
+#define QURT_GET_SYSCFG  4
+#define QURT_GET_REV     5
+
+
+/** @cond rest_reg_dist */
+/** @addtogroup performance_monitor_macros
+@{ */
+
+/* PMU */
+#define QURT_PMUCNT0   0 /**< */
+#define QURT_PMUCNT1   1 /**< */
+#define QURT_PMUCNT2   2 /**< */
+#define QURT_PMUCNT3   3 /**< */
+#define QURT_PMUCFG    4 /**< */
+#define QURT_PMUEVTCFG 5 /**< */
+
+/* New since V55. */
+#define QURT_PMUCNT4    6  /**< */
+#define QURT_PMUCNT5    7  /**< */
+#define QURT_PMUCNT6    8  /**< */
+#define QURT_PMUCNT7    9  /**< */
+#define QURT_PMUEVTCFG1 10 /**< */
+
+/* New since V61. */
+#define QURT_PMUSTID0 11 /**< */
+#define QURT_PMUSTID1 12 /**< */
+
+#define QURT_PMUCNTSTID0 13 /**< */
+#define QURT_PMUCNTSTID1 14 /**< */
+#define QURT_PMUCNTSTID2 15 /**< */
+#define QURT_PMUCNTSTID3 16 /**< */
+#define QURT_PMUCNTSTID4 17 /**< */
+#define QURT_PMUCNTSTID5 18 /**< */
+#define QURT_PMUCNTSTID6 19 /**< */
+#define QURT_PMUCNTSTID7 20 /**< */
+
+/** @} */ /* end_addtogroup performance_monitor_macros */
+/** @endcond */
+
+/*
+   Power collapse operation
+*/
+#define QURT_POWER_SHUTDOWN                 0 /**< */
+#define QURT_TCXO_SHUTDOWN                  1 /**< */
+#define QURT_POWER_CMD_PREPARE              0 /**< */
+#define QURT_POWER_CMD_PERFORM              1 /**< */
+#define QURT_POWER_CMD_EXIT                 2 /**< */
+#define QURT_POWER_CMD_FAIL_EXIT            3 /**< */
+#define QURT_POWER_CMD_PERFORM_L2_RETENTION 4 /**< */
+#define QURT_POWER_CMD_PERFORM_SAVE_TCM     5 /**< */
+#define QURT_POWER_CMD_DEEP_SLEEP           6 /**< */
+
+
+/** 
@addtogroup thread_macros +@{ */ +#define QURT_MAX_HTHREAD_LIMIT 8U /**< Limit on the maximum number of hardware threads supported by QuRT for any + Hexagon version. Use this definition to define arrays, and so on, in + target independent code. */ +/** @} */ /* end_addtogroup thread_macros */ + +/** @cond internal_only */ +/** @addtogroup power_management_macros +@{ */ +/** + L2 cache retention mode +*/ +#define QURT_POWER_SHUTDOWN_TYPE_L2NORET QURT_POWER_CMD_PERFORM /**< */ +#define QURT_POWER_SHUTDOWN_TYPE_L2RET QURT_POWER_CMD_PERFORM_L2_RETENTION /**< */ +#define QURT_POWER_SHUTDOWN_TYPE_SAVETCM QURT_POWER_CMD_PERFORM_SAVE_TCM /**< */ +/** @} */ /* end_addtogroup power_management_macros */ +/** @endcond */ + +/* + QURT_system_state + Use for debugging the shutdown/startup process. + + State transition for cold boot: + QURT_BOOT_SETUP_ISDB --> QURT_CBOOT_BSP_INIT --> + QURT_CBOOT_END_CLEAN_INIT --> QURT_CBOOT_END_OS_INIT --> + QURT_CBOOT_KERNEL_INIT_DONE --> QURT_CBOOT_PLAT_CONFIG_DONE --> + QURT_CBOOT_ROOT_TASK_STARTED + + State transition for power collapse: + QURT_PREPARE_SINGLE_MODE --> QURT_PERFORM_IPEND --> + QURT_PERFORM_SAVE_TLB --> QURT_PERFORM_SWITCH_PC --> + cache flush states (dependent on L2 retention config) + + State transition for warm boot: + QURT_BOOT_SETUP_ISDB --> QURT_WBOOT_INIT_TLB --> + QURT_WBOOT_SET_1TO1_MAP --> QURT_WBOOT_REMOVE_1TO1_MAP --> + QURT_CBOOT_END_CLEAN_INIT --> QURT_CBOOT_END_OS_INIT +*/ +#define QURT_PREPARE_SINGLE_MODE 1 /**< */ +#define QURT_PREPARE_END 2 /**< */ +#define QURT_PERFORM_IPEND 3 /**< */ +#define QURT_PERFORM_SAVE_ISDP 4 /**< */ +#define QURT_PERFORM_SAVE_PMU 5 /**< */ +#define QURT_PERFORM_SAVE_TLB 6 /**< */ +#define QURT_PERFORM_SWITCH_PC 7 /**< */ +#define QURT_PERFORM_EXIT 8 /**< */ +#define QURT_FLUSH_L1CACHE 9 /**< */ +#define QURT_FLUSH_L2CACHE 0xA /**< */ +#define QURT_FLUSH_CACHE_DONE 0xB /**< */ +#define QURT_SWITCH_PC_DONE 0xC /**< */ +#define QURT_BOOT_SETUP_ISDB 0xD /**< */ +#define QURT_WBOOT_INIT_TLB 0xE /**< */ +#define QURT_WBOOT_SET_1TO1_MAP 0xF /**< */ +#define QURT_WBOOT_CFG_ADV_SYSCFG 0x10 /**< */ +#define QURT_WBOOT_REMOVE_1TO1_MAP 0x11 /**< */ +#define QURT_CBOOT_BSP_INIT 0x12 /**< */ +#define QURT_CBOOT_END_CLEAN_L1CACHE 0x13 /**< */ +#define QURT_CBOOT_END_CLEAN_INIT 0x14 /**< */ +#define QURT_CBOOT_END_OS_INIT 0x15 /**< */ +#define QURT_CBOOT_TLB_DUMP_LOAD 0x16 /**< */ +#define QURT_CBOOT_TLB_STATIC_LOAD 0x17 /**< */ +#define QURT_CBOOT_KERNEL_INIT_DONE 0x18 /**< */ +#define QURT_CBOOT_PLAT_CONFIG_DONE 0x19 /**< */ +#define QURT_CBOOT_ROOT_TASK_STARTED 0x1A /**< */ +#define QURT_IMPRECISE_EXCEPTION 0x1B /**< */ +#define QURT_WBOOT_DEBUG_L2_START 0x1C /**< */ +#define QURT_WBOOT_DEBUG_L2_END 0x1D /**< */ +#define QURT_NMI_SAVE_L2VIC_COMPLETE 0x1E /**< */ +#define QURT_NMI_HANDLER_COMPLETE 0x1F /**< */ +#define QURT_NMI_AFTER_SAVE_GLOBAL 0x20 /**< */ +#define QURT_WBOOT_START 0x21 /**< */ +#define QURT_ENTER_ISLAND 0x22 /**< */ +#define QURT_EXIT_ISLAND 0x23 /**< */ +#define QURT_LOAD_NOTIFIER_TCB 0x24 /**< */ +#define QURT_ABNORMAL_RESET 0x25 /**< */ +/* + Thread attributes +*/ + +#define QURT_THREAD_ATTR_GP 0x00000002 /*< */ +#define QURT_THREAD_ATTR_UGP 0x00000003 /*< User general pointer (UGP)*/ +#define QURT_THREAD_ATTR_PREFETCH 0x00000004 /*< */ +#define QURT_THREAD_ATTR_TID 0x00000005 /*< */ +#define QURT_THREAD_ATTR_CACHE_PART 0x00000007 /*< */ +#define QURT_THREAD_ATTR_COPROCESSOR 0x00000008 /*< */ +#define QURT_THREAD_ATTR_GET_L2CACHE_PART 0x00000009 /*< */ +#define QURT_THREAD_ATTR_SET_FRML 
0x0000000A /*< */ +#define QURT_THREAD_ATTR_STID_GET 0x0000000B /*< */ +#define QURT_THREAD_ATTR_STID_SET 0x0000000C /*< */ +#define QURT_THREAD_ATTR_AUTOSTACK 0x0000000D /*< */ +#define QURT_THREAD_ATTR_SYSTEM_THREAD 0x0000000E /*< */ +#define QURT_THREAD_ATTR_STID_SET2 0x0000000F /*< */ +#define QURT_THREAD_ATTR_STID_SET2_ACKNOWLEDGE 0x00000010 /*< */ +#define QURT_THREAD_ATTR_STID_GET2 0x00000011 /*< */ + +/** Cache operations*/ +#define QURT_DCCLEAN 0U /* Clean Dcache. */ +#define QURT_DCINV 1U /* Invalidate Dcache. */ +#define QURT_DCCLEANINV 2U /* Clean and invalidate Dcache. */ +#define QURT_ICINV 3U /* Invalidate Icache. */ +#define QURT_DUMP_DCTAGS 4U /* For testing purpose. */ +#define QURT_FLUSH_ALL 5U /* Flush entire L1 and L2 cache. */ +#define QURT_TABLE_FLUSH 6U /* Flush based on table of physical pages */ +#define QURT_CLEAN_INVALIDATE_ALL 7U /* Flush and invalidate entire L1 and L2 cache. */ +#define QURT_L2CACHE_LOCK_LINES 8U /* l2 cache lock lines */ +#define QURT_L2CACHE_UNLOCK_LINES 9U /* l2 cache unlock lines */ +#define QURT_CLEAN 10U /* Flush L1 and L2 cache */ +#define QURT_CLEAN_INVALIDATE 11U /* Flush and invalidate L1 and L2 cache. */ +#define QURT_CLEAN_INVALIDATE_L2 12U /* Flush and invalidate entire L2 cache. */ + +/**@ingroup chapter_prefined_symbols */ +/**@xreflabel{hdr:QURT_API_VERSION}*/ + + +/* Process state. */ +#define QURT_UPDATE_PROCESS_STATE 0 /**< */ +#define QURT_MP_INIT 1 /*< */ +#define QURT_MP_RUNNING 2 /*< */ +#define QURT_MP_STOPPED 3 /*< */ + +/* QuRT reset reason. */ +#define QURT_NORMAL_BOOT 0 /* Normal boot. */ +#define QURT_WARM_BOOT 1 /* Power collapse warm boot. */ +#define QURT_WARM_BOOT_L2_RETENTION 2 /* Power collapse with L2 retention warm boot. */ +#define QURT_WARM_BOOT_SAVE_TCM 3 /* Power collapse with saving TCM. */ +#define QURT_QUICK_BOOT 4 /* Deep sleep. */ + +/* QuRT Wait for Idle command */ +#define QURT_WAIT_FOR_IDLE_DISABLE 0 /*< */ +#define QURT_WAIT_FOR_IDLE_ENABLE 1 /*< */ +#define QURT_WAIT_FOR_IDLE 2 /*< */ +#define QURT_WAIT_FOR_IDLE_CANCEL 3 /*< */ + +/*QuRT island exit stages */ +#define QURT_ISLAND_EXIT_STAGE1 1 /*< */ +#define QURT_ISLAND_EXIT_STAGE2 2 /*< */ + +#define QURT_MAX_NAME_LEN 64 /*< */ + +#define MAX_POOL_RANGES 16 /*< */ + +/* key definitions for debug thread info */ +//#define MAX_TCB_KEY 40 //whatever is a good number or makes debug thread structure be 1K +#define KEY_SCHDULER_STATE 1 /*< */ +#define KEY_PRIORITY 2 /*< */ +#define KEY_PRIORITY_ORIG 3 /*< */ +#define KEY_STACK_BOTTOM 4 // Currently not populated +#define KEY_STACK_TOP 5 // Currently not populated +#define KEY_HVX_STATE 6 /*< */ +#define KEY_FUTEX_OBJECT 7 /*< */ +#define KEY_THREAD_ID 8 /*< */ +#define KEY_PROFILE_CYCLE_LO 9 // Currently not populated +#define KEY_PROFILE_CYCLE_HI 10 // Currently not populated +#define KEY_ERROR_ADDRESS 11 // This holds the BADVA +#define KEY_ERROR_CAUSE 12 // This is the same as QURT_error_info.cause +#define KEY_ERROR_CAUSE2 13 // This is the same as QURT_error_info.cause2 +#define KEY_ERROR_SSR 14 /*< Holds the SSR value */ +#define QURT_RESERVED -1 + +/* VTLB method IDs. 
*/ +#define QURT_VTLB_ENTRY_CREATE 0U +#define QURT_VTLB_ENTRY_DELETE 1U +#define QURT_VTLB_ENTRY_READ 2U +#define QURT_VTLB_ENTRY_WRITE 3U +#define QURT_VTLB_ENTRY_PROBE 4U +#define QURT_VTLB_ENTRY_SPLIT 5U +#define QURT_VTLB_ENTRY_MERGE 6U +#define QURT_VTLB_ENTRY_STATISTICS 7U +#define QURT_VTLB_ENTRY_SET_SPECIAL 8U +#define QURT_VTLB_QUEUE_PPAGE 9U +#define QURT_VTLB_RECLAIM_STACK_PAGES 10U +#define QURT_VTLB_ASID_SET_STATE_FAST 11U +#define QURT_VTLB_ASID_SET_STATE 12U +#define QURT_VTLB_ENTRY_SET_EXTENSION 13U +#define QURT_VTLB_ENTRY_CLEAR_EXTENSION 14U + +/* VTCM window access control HWIO programming. */ +#define QURT_VTCM_WINDOW_ENABLE 1U +#define QURT_VTCM_WINDOW_DISABLE 0U +#define QURT_VTCM_WINDOW_HI_OFFSET_DEFAULT 0xFFFU +#define QURT_VTCM_WINDOW_LO_OFFSET_DEFAULT 0U + +/** @cond */ +/* ETM source - PC or data access */ +#define QURT_ETM_SOURCE_PC 0U /**< Memory source of SAC* is PC. */ +#define QURT_ETM_SOURCE_DATA 1U /**< Memory source of SAC* is data. */ + +/* ETM PID status flags */ +#define QURT_ETM_NO_PID 0xFFFFFFFF /**< No PID is selected. */ +/** @endcond */ + +/* execution context */ +#define QURT_CTX_USER 1 +#define QURT_CTX_GUEST 2 + +/* Profiling STID */ +#define QURT_STID_DEFAULT 0U + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_CONSTS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_cycles.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_cycles.h new file mode 100755 index 0000000000000..b599493f5d563 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_cycles.h @@ -0,0 +1,301 @@ + +#ifndef QURT_CYCLES_H +#define QURT_CYCLES_H 1 +/** + @file qurt_cycles.h + Prototypes of kernel pcycle API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + + /*===================================================================== + Functions + ======================================================================*/ + +/*======================================================================*/ + +/**@ingroup func_qurt_profile_reset_idle_pcycles + @xreflabel{hdr:qurt_profile_reset_idle_pcycles} + Sets the per-hardware-thread idle cycle counts to zero. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_profile_reset_idle_pcycles (void); + +/*======================================================================*/ +/**@ingroup func_qurt_profile_get_thread_pcycles + @xreflabel{hdr:qurt_profile_get_thread_pcycles} + Gets the count of the running processor cycles for the current thread.\n + Returns the current running processor cycle count for the current QuRT thread. + + @note1hang Profiling shall be enabled first to start the cycle counting. + The cycles are accumulated once the profiling is enabled and + resets on #qurt_profile_reset_threadid_pcycles + + @return + Integer -- Running processor cycle count for current thread. + + @dependencies + None. 
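+
+   A minimal measurement sketch (illustrative only; do_work() is a
+   placeholder for the code being profiled):
+   @code
+   qurt_profile_enable(1);                                    // start counting
+   unsigned long long t0 = qurt_profile_get_thread_pcycles();
+   do_work();
+   unsigned long long used = qurt_profile_get_thread_pcycles() - t0;
+   qurt_profile_enable(0);                                    // stop counting
+   @endcode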
+*/
+/* ======================================================================*/
+unsigned long long int qurt_profile_get_thread_pcycles(void);
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_get_core_pcycles
+   @xreflabel{hdr:qurt_get_core_pcycles}
+   Gets the count of core processor cycles executed.\n
+   Returns the current number of running processor cycles executed since the Hexagon
+   processor was last reset.
+
+   This value is based on the hardware core clock, which varies in speed according to the
+   processor clock frequency.
+
+   @note1hang Because the hardware core clock stops running when the processor shuts
+              down (due to all of the hardware threads being idle), treat the cycle values returned
+              by this operation as relative rather than absolute.
+
+   @note1cont Thread cycle counts are valid only in the V4 Hexagon processor version.
+
+   @return
+   Integer -- Current count of core processor cycles.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+unsigned long long int qurt_get_core_pcycles(void);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_get_idle_pcycles
+
+   @deprecated Use #qurt_profile_get_idle_pcycles2 instead.
+
+   Gets the current idle processor cycle counts for a maximum of 6 hardware threads. Use
+   #qurt_profile_get_idle_pcycles2 to read pcycles without a limit on the maximum number
+   of hardware threads.
+
+   This operation accepts a pointer to a user-defined array, and writes to the array the current
+   idle cycle count for each hardware thread.
+
+   Each count value represents the number of processor cycles that have elapsed on the
+   corresponding hardware thread while that thread has been in Wait mode.\n
+
+   @note1hang This operation does not return the idle cycles that occur when the Hexagon
+              processor shuts down (due to all of the hardware threads being idle).
+              Idle cycle counts accumulate irrespective of whether profiling is enabled,
+              and reset on #qurt_profile_reset_idle_pcycles.
+
+   @param[out] pcycles User array where the function stores the current idle cycle count values.
+                       The array size should be at least the number of hardware threads intended.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+void qurt_profile_get_idle_pcycles (unsigned long long *pcycles);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_get_idle_pcycles2
+   Gets the current idle processor cycle counts for the maximum available hardware threads.
+
+   This operation accepts a pointer to a user-defined array with its length in bytes, and writes
+   to the array the current idle cycle count for each hardware thread.
+
+   Each count value represents the number of processor cycles that have elapsed on the
+   corresponding hardware thread while that thread has been in Wait mode.\n
+
+   @note1hang This operation does not return the idle cycles that occur when the Hexagon
+              processor shuts down (due to all of the hardware threads being idle).
+              Idle cycle counts accumulate irrespective of profiling enable status, and
+              reset on #qurt_profile_reset_idle_pcycles.
+
+   @param[out] pcycles User array where the function stores the current idle cycle count values.
+                       The array size should match the number of hardware threads intended.
+                       Call #qurt_sysenv_get_max_hw_threads to determine the array size required.
+
+   @param[in] length_in_bytes Length of the pcycles array in bytes. If the array is smaller
+                              than required for the maximum available hardware threads,
+                              an error code is returned.
+
+   @return
+   #QURT_EOK     -- Successful operation; all data was stored to the destination array.
+   #QURT_EFAILED -- Operation failed because the #pcycles array is too small.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+int qurt_profile_get_idle_pcycles2 (unsigned long long *pcycles, unsigned int length_in_bytes);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_get_threadid_pcycles
+
+   @deprecated Use #qurt_profile_get_threadid_pcycles2 instead.
+
+   Gets the current per-hardware-thread running cycle counts for the specified QuRT
+   thread for a maximum of 6 hardware threads.
+
+   Each count value represents the number of processor cycles that have elapsed on the
+   corresponding hardware thread while that thread has been scheduled for the specified
+   QuRT thread.
+
+   @note1hang Profiling shall be enabled first to start the cycle counting.
+              The cycles are accumulated once profiling is enabled and
+              reset on #qurt_profile_reset_threadid_pcycles.
+
+   @param[in]  thread_id Valid thread identifier.
+   @param[out] pcycles   Pointer to a user array where the function stores the current running
+                         cycle count values. The array size should be at least the number of
+                         hardware threads intended.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+void qurt_profile_get_threadid_pcycles (int thread_id, unsigned long long *pcycles);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_get_threadid_pcycles2
+
+   Gets the current per-hardware-thread running cycle counts for the specified QuRT
+   thread for the maximum available hardware threads.
+
+   Each count value represents the number of processor cycles that have elapsed on the
+   corresponding hardware thread while that thread has been scheduled for the specified
+   QuRT thread.
+
+   @note1hang Profiling shall be enabled first to start the cycle counting.
+              The cycles are accumulated once profiling is enabled and
+              reset on #qurt_profile_reset_threadid_pcycles.
+
+   @param[in]  thread_id       Thread identifier.
+   @param[out] pcycles         Pointer to a user array where the function stores the current running
+                               cycle count values. The array size should match the number of
+                               hardware threads intended.
+                               Call #qurt_sysenv_get_max_hw_threads to determine the array size required.
+   @param[in]  length_in_bytes Length of the pcycles array in bytes. If the array is smaller
+                               than required for the maximum available hardware threads,
+                               an error code is returned.
+
+   @return
+   #QURT_EOK       -- Successful operation; all data was stored to the destination array.
+   #QURT_EFAILED   -- Operation failed because the #pcycles array is too small.
+   #QURT_ENOTHREAD -- Operation failed due to an invalid #thread_id.
+
+   @dependencies
+   None.
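+
+   A sizing sketch (illustrative only; it assumes qurt_sysenv_get_max_hw_threads()
+   returns the hardware-thread count through its output parameter, and 'thread_id'
+   is the target thread's identifier):
+   @code
+   unsigned int hw_threads = 0;
+   (void)qurt_sysenv_get_max_hw_threads(&hw_threads);
+   unsigned long long pcycles[QURT_MAX_HTHREAD_LIMIT];   // worst-case bound
+   int rc = qurt_profile_get_threadid_pcycles2(thread_id, pcycles,
+                hw_threads * (unsigned int)sizeof(pcycles[0]));
+   @endcode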
+*/
+/* ======================================================================*/
+int qurt_profile_get_threadid_pcycles2 (int thread_id, unsigned long long *pcycles, unsigned int length_in_bytes);
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_reset_threadid_pcycles
+   @xreflabel{hdr:qurt_profile_reset_threadid_pcycles}
+   Sets the per-hardware-thread running cycle counts to zero for the specified QuRT thread.
+
+   @param[in] thread_id Thread identifier.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+void qurt_profile_reset_threadid_pcycles (int thread_id);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_enable
+   @xreflabel{hdr:qurt_profile_enable}
+   Enables profiling.\n
+   Enables or disables cycle counting of the running and idle processor cycles.
+   Profiling is disabled by default. \n
+
+   @note1hang Enabling profiling does not automatically reset the cycle counts -- this must be
+              done explicitly by calling the reset operations before starting cycle counting.
+              Cycle counting starts from the instant profiling is enabled using this API, and
+              halts when profiling is disabled.
+
+   @param[in] enable Profiling. Values: \n
+                     - 0 -- Disable profiling \n
+                     - 1 -- Enable profiling @tablebulletend
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+void qurt_profile_enable (int enable);
+
+/*======================================================================*/
+/**@ingroup func_qurt_get_hthread_pcycles
+   @xreflabel{hdr:qurt_get_hthread_pcycles}
+   Reads the GCYCLE_nT register to allow performance measurement when N threads are in run mode.\n
+
+   @note1hang Returns 0 when the architecture is earlier than v67 or for an invalid HW thread ID.
+
+   @param[in] n Threads in run mode. Valid values are 1 through .
+
+   @return
+   Value read from the GCYCLE_nT register. This value indicates the total number of pcycles executed
+   from reset to the current point of execution when n threads are in run mode.
+
+   @dependencies
+   PMU must be enabled.
+*/
+/* ======================================================================*/
+unsigned int qurt_get_hthread_pcycles(int n);
+
+/*======================================================================*/
+/**@ingroup func_qurt_get_hthread_commits
+   @xreflabel{hdr:qurt_get_hthread_commits}
+   Reads the GCOMMIT_nT register to allow performance measurement when N threads are in run mode.\n
+
+   @note1hang Returns 0 when the architecture is earlier than v67 or for an invalid HW thread ID.
+
+   @param[in] n Threads in run mode. Valid values: 1 through .
+
+   @return
+   Value read from the GCOMMIT_nT register. This value indicates the total number of packets
+   committed from reset to the current point of execution when n threads are in run mode.
+
+   @dependencies
+   PMU must be enabled.
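+
+   A derived-metric sketch (illustrative only): packets per cycle while
+   exactly n hardware threads are running.
+   @code
+   unsigned int cycles  = qurt_get_hthread_pcycles(n);
+   unsigned int packets = qurt_get_hthread_commits(n);
+   // Guard against a zero count (for example, pre-v67 parts return 0).
+   double ppc = (cycles != 0U) ? (double)packets / (double)cycles : 0.0;
+   @endcode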
+*/
+/* ======================================================================*/
+unsigned int qurt_get_hthread_commits(int n);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_devtree.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_devtree.h
new file mode 100755
index 0000000000000..4adee45bb44a2
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_devtree.h
@@ -0,0 +1,161 @@
+#ifndef QURT_DEVTREE_H
+#define QURT_DEVTREE_H
+/**
+  @file qurt_devtree.h
+  @brief Prototypes and structures for device tree aware QuRT library functions.
+
+Copyright (c) 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+*/
+/* qurt_callback is included by qurt_qdi_driver.h and depends on NULL being defined.
+   The callback is not used here, so define NULL here to avoid including the world. */
+#ifndef NULL
+#define NULL ((void *) 0)
+#endif
+
+#include "libfdt.h"
+#include "DTBExtnLib.h"
+#include "qurt_qdi_ext.h"
+#include "qurt_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define INVALID_BLOB_ID (-1)
+#define DEFAULT_BLOB_ID 0
+
+/** QuRT device tree mapping macros. */
+#define QURT_DT_MAPPING_FAILED (-1)
+#define QURT_DT_FLAG_ISLAND    0x1
+#define QURT_DT_FLAG_PHYSADDR  0x2
+
+/** Device tree type for the root PD device tree.
+    The root PD device tree will typically describe the hardware in the subsystem.
+    This is the /soc portion of the device tree. */
+#define QURT_DT_BLOB_TYPE_ROOT 0
+
+/** Device tree type for the local device tree.
+    The local device tree will typically contain the software settings.
+    This is the /sw portion of the device tree. */
+#define QURT_DT_BLOB_TYPE_LOCAL 1
+
+int qurt_devtree_init(void);
+
+/**@ingroup func_qurt_dt_mapping_create
+   Creates a memory mapping from the specified property of the specified device
+   tree node. Returns virtual addresses and sizes.
+
+   @param[in]  devtreeNode Device tree node.
+   @param[in]  flags       Flags to configure memory. Overloaded as the property
+                           index if regionName is NULL.
+   @param[in]  regionName  Identifies the property to use for mapping; should
+                           resemble a region.
+   @param[in]  regionIdx   Index of the range to use within the property.
+   @param[out] vaddr       Return pointer for the virtual region address.
+   @param[out] size        Return pointer for the virtual region size.
+
+   @return
+   Result code indicating success or failure. \n
+*/
+int qurt_dt_mapping_create(fdt_node_handle *devtreeNode, int flags, char *regionName, int regionIdx,
+                           unsigned long long *vaddr, unsigned long long *size);
+
+/**@ingroup func_qurt_dt_mapping_create2
+
+   Creates a memory mapping from the specified property of the specified device
+   tree node.
+
+   Returns virtual addresses and sizes according to the architecture (that is, either 32-bit or 64-bit).
+
+   @param[in] devtreeNode Device tree node.
+
+   @param[in] dt_map_flags Flags to configure the memory mapping; reserved for future use.
+                           (0) - Default value; assumes the details from the DT node are physical
+                           address and size.
+                           QURT_DT_FLAG_ISLAND
+
+                           NOTE: The PA needs to be added to the corresponding island spec to
+                           create an island mapping.
+
+   @param[in] regionName  NULL, or the name of the range to return; should
+                          resemble a region. Ex: reg-names = "base", "rx", "tx";
+
+   @param[in] regionIdx   Index of the range to return. Ex: reg = <0x1000 0x20>, <0x10000 0x100>, <0x18000 0x100 >;
+
+                          NOTE: If the client specifies both regionName and regionIdx,
+                          regionName takes precedence and regionIdx is ignored.
+
+  @param[in] dt_map_perm Mapping access permissions (R/W):
+                         QURT_PERM_READ
+                         QURT_PERM_WRITE
+
+  @param[in] cache_attr QuRT cache mode types:
+                        QURT_MEM_CACHE_DEVICE
+                        QURT_MEM_CACHE_WRITEBACK
+                        Other required cache type enums in qurt_types.h can also be passed.
+
+                        NOTE: No default value for cache & perm is present.
+                        The client always needs to pass one of the defined flags.
+
+  @param[out] vaddr Return pointer to the variable that holds the virtual address.
+  @param[out] size  Return pointer for the virtual region size.
+
+  @return
+  #QURT_EOK               Success indicating the mapping was created properly.
+  #QURT_DT_MAPPING_FAILED Failed to create the mapping.
+  #QURT_EINVALID          Mismatch in the architecture.
+
+  else FdtLib or third-party error code.
+
+*/
+int qurt_dt_mapping_create2(fdt_node_handle *devtreeNode, unsigned int dt_map_flags,
+        char *regionName, int regionIdx, unsigned int dt_map_perm, int cache_attr, void **vaddr, size_t *size);
+
+/**@ingroup func_qurt_dt_isr_register
+  Device tree aware registration of an interrupt service routine (ISR) to an ISR thread.
+  The interrupt defined in the specified device tree node is enabled when this function returns success.
+
+  @datatypes
+  #qurt_thread_t \n
+  #fdt_node_handle
+
+  @param[in] dt_node       Device tree node that specifies the interrupt property.
+  @param[in] dt_int_index  Index of the specific interrupt to use within the device tree node structure.
+                           Specify either this or dt_int_name; use -1 if the name string is used.
+  @param[in] dt_int_name   Name of the specific interrupt to use within the device tree node structure.
+                           Specify either this or dt_int_index; use NULL if the index is used.
+  @param[in] isr_thread_id ISR thread ID, returned from qurt_isr_create(), defined by qurt_isr_register2().
+  @param[in] prio          Priority of the ISR, defined by qurt_isr_register2().
+  @param[in] flags         Defines the ACK type. Values: \n
+                           #QURT_INT_NON_DELAYED_ACK - ISR is acknowledged by the interrupt handler routine
+                           in the kernel.
+                           #QURT_INT_DELAYED_ACK - Client chooses to acknowledge.
+                           Defined by qurt_isr_register2().
+  @param[in] isr           ISR with prototype void isr (void *arg, int int_num), defined by qurt_isr_register2().
+  @param[in] arg           First argument of the ISR when it is called to service the interrupt, defined by qurt_isr_register2().
+
+  @return
+  #QURT_EOK        -- Successfully registered the ISR for the interrupt \n
+  #QURT_EINT       -- Interrupt not configured \n
+  #QURT_EINVALID   -- Invalid thread ID \n
+  #QURT_EDISABLED  -- The feature is disabled \n
+  #QURT_EDUPLICATE -- Interrupt is already registered
+
+  @dependencies
+  Create the thread ID with qurt_isr_create().
+  Complete ISR registration with qurt_isr_register2().
+ */
+int qurt_dt_isr_register(fdt_node_handle *dt_node, int dt_int_index, char * dt_int_name, qurt_thread_t isr_thread_id,
+                         unsigned short prio, unsigned short flags, void (*isr) (void *, int), void *arg);
+
+/**@ingroup func_qurt_dt_blob_id_get
+  Returns the Blob ID for the Blob type passed.
+  The value returned from this API can be passed as the Blob ID parameter to DTBExtnLib APIs.
+
+  @param[in] blob_type Blob type to look up.
+  @return Blob ID for the passed Blob Type.
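+
+  A minimal usage sketch (hypothetical):
+  @code
+  int blob_id = qurt_dt_blob_id_get(QURT_DT_BLOB_TYPE_LOCAL);
+  if (blob_id != INVALID_BLOB_ID) {
+      // blob_id can be passed as the Blob ID parameter to DTBExtnLib APIs.
+  }
+  @endcode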
+*/ +int qurt_dt_blob_id_get(unsigned int blob_type); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_ecc.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_ecc.h new file mode 100755 index 0000000000000..09312684e99af --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_ecc.h @@ -0,0 +1,168 @@ +#ifndef QURT_ECC_H +#define QURT_ECC_H + + +/*===================================================================== + + @file qurt_ecc.h + @brief Prototypes of QuRT memory ECC API functions + + Copyright (c) 2018, 2020-2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + TYPEDEFS +=============================================================================*/ +/** @addtogroup exception_handling_types +@{ */ +// ECC memory definition +typedef enum { + QURT_ECC_MEM_L1_ICACHE = 0, /**< ECC memory L1 ICache. */ + QURT_ECC_MEM_L1_DCACHE = 1, /**< ECC memory L1 DCache.*/ + QURT_ECC_MEM_L2_CACHE = 2, /**< ECC memory L2 Cache.*/ + QURT_ECC_MEM_VTCM = 3 /**< ECC memory VTCM.*/ +} qurt_ecc_memory_t; +/** @} */ /* end_addtogroup exception_handling_types */ + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/** @addtogroup exception_handling_macros +@{ */ + +#define QURT_ECC_ERR_DETECTED_STATUS 0 /**< ECC error detected. */ +#define QURT_ECC_ERR_TYPE 1 /**< ECC error type.*/ +// ECC status type + +#define QURT_ECC_CORRECTABLE_COUNT (1<<0) /**< ECC correctable count.*/ +#define QURT_ECC_UNCORRECTABLE_COUNT (1<<1) /**< ECC uncorrectable count.*/ +#define QURT_ECC_REGION_LOGGING (1<<2) /**< ECC region logging.*/ +// ECC enable/disable definition + +#define QURT_ECC_PROTECTION_DISABLE (0<<0) /**< Bit 0. */ +#define QURT_ECC_PROTECTION_ENABLE (1<<0) /**< Bit 0. */ +/** @} */ /* end_addtogroup exception_handling_macros */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ + + +/**@ingroup func_qurt_ecc_enable + Enables or disables ECC protection on a specified memory. + + @datatypes + #qurt_ecc_memory_t + + @param[in] memory Set to one of the following values: + - #QURT_ECC_MEM_L1_ICACHE + - #QURT_ECC_MEM_L1_DCACHE + - #QURT_ECC_MEM_L2_CACHE + - #QURT_ECC_MEM_VTCM @tablebulletend + + @param[in] enable Set to one of the following values: + - #QURT_ECC_PROTECTION_ENABLE + - #QURT_ECC_PROTECTION_DISABLE @tablebulletend + + @return + - #QURT_EOK -- ECC enabling or disabling setup is performed successfully + - Others -- Failure + + @dependencies + None. + */ +int qurt_ecc_enable( qurt_ecc_memory_t memory, unsigned int enable ); + + +/**@ingroup func_qurt_ecc_get_error_status + Gets ECC error status for a specified memory. 
+
+  @datatypes
+  #qurt_ecc_memory_t
+
+  @param[in] memory Set to one of the following:
+                    - #QURT_ECC_MEM_L1_ICACHE
+                    - #QURT_ECC_MEM_L1_DCACHE
+                    - #QURT_ECC_MEM_L2_CACHE
+                    - #QURT_ECC_MEM_VTCM @tablebulletend
+
+  @param[in] type Set to one of the following:
+                  - #QURT_ECC_ERR_DETECTED_STATUS
+                  - #QURT_ECC_ERR_TYPE @tablebulletend
+
+  @return
+  Returns the following when the type is #QURT_ECC_ERR_DETECTED_STATUS:
+  - 0 -- No error detected \n
+  - 1 -- At least one error detected \n
+  Returns the following when the type is #QURT_ECC_ERR_TYPE: \n
+  - 0 through 1 -- Correctable error \n
+  - 2 -- Uncorrectable error
+
+  @dependencies
+  None.
+ */
+int qurt_ecc_get_error_status( qurt_ecc_memory_t memory, unsigned int type );
+
+
+/**@ingroup func_qurt_ecc_get_error_count
+  Gets the ECC error count for a specified memory.
+
+  @datatypes
+  #qurt_ecc_memory_t
+
+  @param[in] memory Set to one of the following values:\n
+                    - #QURT_ECC_MEM_L1_ICACHE \n
+                    - #QURT_ECC_MEM_L1_DCACHE \n
+                    - #QURT_ECC_MEM_L2_CACHE \n
+                    - #QURT_ECC_MEM_VTCM @tablebulletend
+
+  @param[in] type Set to one of the following values: \n
+                  - #QURT_ECC_CORRECTABLE_COUNT \n
+                  - #QURT_ECC_UNCORRECTABLE_COUNT @tablebulletend
+
+  @return
+  Error count for the specified error type.
+
+  @dependencies
+  None.
+ */
+int qurt_ecc_get_error_count( qurt_ecc_memory_t memory, unsigned int type );
+
+
+/**@ingroup func_qurt_ecc_clear_error_count
+  Clears the ECC error count or region logging for a specified memory.
+
+  @datatypes
+  #qurt_ecc_memory_t
+
+  @param[in] memory Set to one of the following values: \n
+                    - #QURT_ECC_MEM_L1_ICACHE \n
+                    - #QURT_ECC_MEM_L1_DCACHE \n
+                    - #QURT_ECC_MEM_L2_CACHE \n
+                    - #QURT_ECC_MEM_VTCM @tablebulletend
+
+  @param[in] type Set to one of the following values, or to multiple values OR'ed together: \n
+                  - #QURT_ECC_CORRECTABLE_COUNT \n
+                  - #QURT_ECC_UNCORRECTABLE_COUNT \n
+                  - #QURT_ECC_REGION_LOGGING @tablebulletend
+
+  @return
+  #QURT_EOK -- Error count successfully cleared \n
+  Others -- Failure at clearing the error count
+
+  @dependencies
+  None.
+ */
+int qurt_ecc_clear_error_count( qurt_ecc_memory_t memory, unsigned int type );
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_ECC_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_error.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_error.h
new file mode 100755
index 0000000000000..f4666b396c378
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_error.h
@@ -0,0 +1,149 @@
+#ifndef QURT_ERROR_H
+#define QURT_ERROR_H
+
+/**
+  @file qurt_error.h
+  Error results -- QuRT defines a set of standard symbols for the error result values. This file lists the
+  symbols and their corresponding values.
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2021-2022, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+#include "qurt_except.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup chapter_error
+@{ */
+
+/*=====================================================================
+Constants and macros
+======================================================================*/
+#define QURT_EOK 0 /**< Operation successfully performed. */
+#define QURT_EVAL 1 /**< Wrong values for the parameters. The specified page does not exist.
*/ +#define QURT_EMEM 2 /**< Not enough memory to perform the operation.*/ + +#define QURT_EINVALID 4 /**< Invalid argument value; invalid key. */ +/** @cond */ +#define QURT_EUNKNOWN 6 /**< Defined but never used in QuRT. */ +#define QURT_ENOMSGS 7 /**< Message queue is empty. */ +#define QURT_EBADF 9 /**< Bad message queue descriptor. */ +/** @endcond */ +#define QURT_EFAILED 12 /**< Operation failed. */ + +#define QURT_ENOTALLOWED 13 /**< Operation not allowed. */ + +/** @cond */ +#define QURT_EDUPCLSID 14 /*< Duplicate class ID. */ +/** @endcond */ +/** @cond rest_reg_dist */ +#define QURT_ENOREGISTERED 20 /**< No registered interrupts.*/ +/** @endcond */ + + +/** @cond */ +#define QURT_EISDB 21 /*< Power collapse failed due to ISDB being enabled. */ +#define QURT_ESTM 22 /*< Power collapse failed in a Single-threaded mode check. */ +/** @endcond */ + + +/** @cond rest_reg_dist */ +#define QURT_ETLSAVAIL 23 /**< No free TLS key is available. */ +#define QURT_ETLSENTRY 24 /**< TLS key is not already free. */ +/** @endcond */ + +#define QURT_EINT 26 /**< Invalid interrupt number (not registered). */ +/** @cond rest_reg_dist */ +#define QURT_ESIG 27 /**< Invalid signal bitmask (cannot set more than one signal at a time). */ +/** @endcond */ + +/** @cond */ +#define QURT_EHEAP 28 /**< No heap space is available. */ +#define QURT_ENOSPC 28 /**< No space to create another queue in the system. */ +#define QURT_EMEMMAP 29 /**< Physical address layout is not supported by the kernel. */ +/** @endcond */ +/** @cond rest_reg_dist */ +#define QURT_ENOTHREAD 30 /**< Thread no longer exists. */ +/** @endcond */ +/** @cond */ +#define QURT_EL2CACHE 31 /**< L2cachable is not supported in kernel invalidate/cleaninv. */ +/** @endcond */ +/** @cond rest_reg_dist */ +#define QURT_EALIGN 32 /**< Not aligned. */ +#define QURT_EDEREGISTERED 33 /**< Interrupt is already deregistered.*/ +/** @endcond */ + +/** @cond internal_only */ + +#define QURT_ETLBCREATESIZE 34 /**< TLB create error -- Incorrect size.*/ +#define QURT_ETLBCREATEUNALIGNED 35 /**< TLB create error -- Unaligned address.*/ +/** @endcond */ +/** @cond rest_reg_dist*/ +#define QURT_EEXISTS 35 /**< File or message queue already exists. */ +#define QURT_ENAMETOOLONG 36 /**< Name too long for message queue creation. */ +#define QURT_EPRIVILEGE 36 /**< Caller does not have privilege for this operation.*/ + +#define QURT_ECANCEL 37 /**< A cancellable request was canceled because the associated process was asked to exit.*/ +/** @endcond */ + +/** @cond */ +#define QURT_EISLANDTRAP 38 /*< Unsupported TRAP is called in Island mode.*/ + +#define QURT_ERMUTEXUNLOCKNONHOLDER 39 /*< Rmutex unlock by a non-holder.*/ +#define QURT_ERMUTEXUNLOCKFATAL 40 /*< Rmutex unlock error, all except the non-holder error.*/ +#define QURT_EMUTEXUNLOCKNONHOLDER 41 /*< Mutex unlock by a non-holder.*/ +#define QURT_EMUTEXUNLOCKFATAL 42 /*< Mutex unlock error, all except the non-holder error.*/ +#define QURT_EINVALIDPOWERCOLLAPSE 43 /*< Invalid power collapse mode requested. */ +/** @endcond */ +#define QURT_EISLANDUSEREXIT 44 /**< User call has resulted in island exit.*/ +#define QURT_ENOISLANDENTRY 45 /**< Island mode had not yet been entered.*/ +#define QURT_EISLANDINVALIDINT 46 /**< Exited Island mode due to an invalid island interrupt.*/ +/** @cond rest_reg_dist */ +#define QURT_ETIMEDOUT 47 /**< Operation timed-out. */ +#define QURT_EALREADY 48 /**< Operation already in progress. */ +/** @endcond */ + +#define QURT_ERETRY 49 /*< Retry the operation. 
*/
+#define QURT_EDISABLED 50 /*< Resource disabled. */
+#define QURT_EDUPLICATE 51 /*< Duplicate resource. */
+#define QURT_EBADR 53 /*< Invalid request descriptor. */
+#define QURT_ETLB 54 /*< Exceeded maximum allowed TLBs. */
+#define QURT_ENOTSUPPORTED 55 /*< Operation not supported. */
+/** @cond rest_reg_dist */
+#define QURT_ENORESOURCE 56 /**< No resource. */
+/** @endcond */
+
+#define QURT_EDTINIT 57 /**< Problem with device tree initialization. */
+#define QURT_EBUFLOCK 58 /*< Buffer lock failed because it was already locked many times. */
+#define QURT_ELOCKED 59 /**< Current operation failed as the buffer is locked. */
+#define QURT_EMSGSIZE 90 /*< Message queue msg_len is greater than the mq_msgsize attribute of the message queue. */
+
+
+#define QURT_ENOTCONFIGURED 91 /*< Interrupt is NOT configured. */
+
+#define QURT_EBANDWIDTHLIMIT 92 /*< Message queue send exceeds the bandwidth limit. */
+
+#define QURT_ECFIVIOLATION 93 /*< CFI violation detected. */
+
+#define QURT_EDESTROY 94 /**< A destroy request was made to waiting threads.*/
+
+#define QURT_EHMXNOTAVAIL 95 /**< HMX is not available to the target thread.*/
+#define QURT_EHMXNOTDETACHABLE 96 /**< HMX is not detachable from the target thread.*/
+
+#define QURT_EFATAL -1 /**< Fatal error. */
+
+/** @} */ /* end_addtogroup chapter_error */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_ERROR_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_event.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_event.h
new file mode 100755
index 0000000000000..987f0fe79f227
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_event.h
@@ -0,0 +1,452 @@
+#ifndef QURT_EVENT_H
+#define QURT_EVENT_H
+/**
+  @file qurt_event.h
+  @brief Prototypes of kernel event API functions.
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2018-2021, 2023 Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#include "qurt_consts.h"
+#include "qurt_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * System environment object type.
+ */
+/**@addtogroup sys_env_types
+@{ */
+/** QuRT swap pool information type. */
+typedef struct qurt_sysenv_swap_pools {
+   /** @cond */
+   unsigned int spoolsize; /* Swap pool size.*/
+   unsigned int spooladdr; /* Swap pool start address.*/
+   /** @endcond */
+}qurt_sysenv_swap_pools_t;
+
+/**QuRT application heap information type. */
+typedef struct qurt_sysenv_app_heap {
+   /** @cond */
+   unsigned int heap_base; /* Heap base address.*/
+   unsigned int heap_limit; /* Heap end address.*/
+   /** @endcond */
+} qurt_sysenv_app_heap_t ;
+
+/** QuRT architecture version information type. */
+typedef struct qurt_sysenv_arch_version {
+   /** @cond */
+   unsigned int arch_version; /*Architecture version.*/
+   /** @endcond */
+}qurt_arch_version_t;
+
+/** QuRT maximum hardware threads information type. */
+typedef struct qurt_sysenv_max_hthreads {
+   /** @cond */
+   unsigned int max_hthreads; /*Maximum number of hardware threads.*/
+   /** @endcond */
+}qurt_sysenv_max_hthreads_t;
+
+/** QuRT active hardware threads information type.
*/
+typedef struct qurt_sysenv_hthreads {
+   /** @cond */
+   unsigned int hthreads; /*Number of hardware threads initialized by QuRT.*/
+   /** @endcond */
+}qurt_sysenv_hthreads_t;
+
+/** QuRT maximum PI priority information type. */
+typedef struct qurt_sysenv_max_pi_prio {
+   /** @cond */
+   unsigned int max_pi_prio; /*Maximum priority-inheritance (PI) priority.*/
+   /** @endcond */
+}qurt_sysenv_max_pi_prio_t;
+
+/** QuRT process name information type. */
+typedef struct qurt_sysenv_procname {
+   /** @cond */
+   union {
+      unsigned int asid; /*Address space ID.*/
+      unsigned int pid; /*Process ID.*/
+   };
+   char name[QURT_MAX_NAME_LEN]; /* Process name.*/
+   /** @endcond */
+}qurt_sysenv_procname_t;
+
+/** QuRT stack profile count information type. */
+typedef struct qurt_sysenv_stack_profile_count {
+   /** @cond */
+   unsigned int count; /*Stack profile count for usage.*/
+   unsigned int count_watermark; /*Stack profile count for watermark.*/
+   /** @endcond */
+}qurt_sysenv_stack_profile_count_t;
+
+/**
+  QuRT system error event type.
+ */
+typedef struct _qurt_sysevent_error_t
+{
+    unsigned int thread_id; /**< Thread ID. */
+    unsigned int fault_pc;  /**< Fault PC. */
+    unsigned int sp;        /**< Stack pointer. */
+    unsigned int badva;     /**< Virtual data address where the exception occurred. */
+    unsigned int cause;     /**< QuRT error result. */
+    unsigned int ssr;       /**< Supervisor status register. */
+    unsigned int fp;        /**< Frame pointer. */
+    unsigned int lr;        /**< Link register. */
+    unsigned int pid;       /**< PID of the process to which this thread belongs.*/
+ } qurt_sysevent_error_t ;
+
+typedef struct _qurt_sysevent_error_1_t
+{
+    unsigned int thread_id; /**< Thread ID. */
+    unsigned int fault_pc;  /**< Fault PC. */
+    unsigned int sp;        /**< Stack pointer. */
+    unsigned int badva;     /**< Virtual data address where the exception occurred. */
+    unsigned int cause;     /**< QuRT error result. */
+    unsigned int ssr;       /**< Supervisor status register. */
+    unsigned int fp;        /**< Frame pointer. */
+    unsigned int lr;        /**< Link register. */
+    unsigned int pid;       /**< PID of the process to which this thread belongs.*/
+    unsigned int fkey;      /**< Framekey.*/
+    unsigned int reserved1; /**< Reserved.*/
+    unsigned int reserved2; /**< Reserved.*/
+    unsigned int reserved3; /**< Reserved.*/
+ } qurt_sysevent_error_1_t ;
+
+/** QuRT page fault error event information type. */
+typedef struct qurt_sysevent_pagefault {
+    qurt_thread_t thread_id; /**< Thread ID of the page fault thread. */
+    unsigned int fault_addr; /**< Accessed address that caused the page fault. */
+    unsigned int ssr_cause;  /**< SSR cause code for the page fault. */
+} qurt_sysevent_pagefault_t ;
+/** @} */ /* @endaddtogroup sys_env_types */
+/*=============================================================================
+                                 FUNCTIONS
+=============================================================================*/
+
+/*======================================================================*/
+/**
+  Gets the environment swap pool 0 information from the kernel.
+
+  @datatypes
+  #qurt_sysenv_swap_pools_t
+
+  @param[out] pools Pointer to the pools information.
+
+  @return
+  #QURT_EOK -- Success.
+
+  @dependencies
+  None.
+*/
+int qurt_sysenv_get_swap_spool0 (qurt_sysenv_swap_pools_t *pools );
+
+/*
+  Gets the environment swap pool 1 information from the kernel.
+
+  @datatypes
+  #qurt_sysenv_swap_pools_t
+
+  @param[out] pools Pointer to the pools information.
+
+  @return
+  #QURT_EOK -- Success.
+
+  @dependencies
+  None.
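+
+  A minimal usage sketch (hypothetical):
+  @code
+  qurt_sysenv_swap_pools_t pool;
+  if (qurt_sysenv_get_swap_spool1(&pool) == QURT_EOK) {
+      // pool.spooladdr and pool.spoolsize describe swap pool 1.
+  }
+  @endcode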
+*/ +int qurt_sysenv_get_swap_spool1(qurt_sysenv_swap_pools_t *pools ); + +/**@ingroup func_qurt_sysenv_get_app_heap + Gets information on the program heap from the kernel. + + @datatypes + #qurt_sysenv_app_heap_t + + @param[out] aheap Pointer to information on the program heap. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_app_heap(qurt_sysenv_app_heap_t *aheap ); + +/**@ingroup func_qurt_sysenv_get_arch_version + Gets the Hexagon processor architecture version from the kernel. + + @datatypes + #qurt_arch_version_t + + @param[out] vers Pointer to the Hexagon processor architecture version. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter + + @dependencies + None. +*/ +int qurt_sysenv_get_arch_version(qurt_arch_version_t *vers); + +/**@ingroup func_qurt_sysenv_get_max_hw_threads + Gets the maximum number of hardware threads supported in the Hexagon processor. + The API includes the disabled hardware threads to reflect the maximum + hardware thread count. + For example, if the image is configured for four hardware threads and hthread_mask is set to 0x5 in + cust_config.xml, only HW0 and HW2 are initialized by QuRT. + HW1 and HW3 are not used at all. Under such a scenario, + qurt_sysenv_get_max_hw_threads() still returns four. + + @datatypes + #qurt_sysenv_max_hthreads_t + + @param[out] mhwt Pointer to the maximum number of hardware threads supported in the Hexagon processor. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_max_hw_threads(qurt_sysenv_max_hthreads_t *mhwt ); + +/**@ingroup func_qurt_sysenv_get_hw_threads + Gets the number of hardware threads initialized by QuRT in Hexagon processor. + For example, if the image is configured for four hardware threads and hthread_mask is set to 0x5 in + cust_config.xml, QuRT only initializes HW0 and HW2. + HW1 and HW3 are not used. In this scenario, qurt_sysenv_get_hw_threads() returns 2. + + @datatypes + #qurt_sysenv_hthreads_t + + @param[out] mhwt Pointer to the number of hardware threads active in the Hexagon processor. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_hw_threads(qurt_sysenv_hthreads_t *mhwt ); + +/**@ingroup func_qurt_sysenv_get_max_pi_prio + Gets the maximum priority inheritance mutex priority from the kernel. + + @datatypes + #qurt_sysenv_max_pi_prio_t + + @param[out] mpip Pointer to the maximum priority inheritance mutex priority. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_max_pi_prio(qurt_sysenv_max_pi_prio_t *mpip ); + +/**@ingroup func_qurt_sysenv_get_process_name2 + Gets information on the system environment process names based on the client_handle argument. + + @datatypes + #qurt_sysenv_procname_t + + @param[in] client_handle Obtained from the current invocation function (Section 3.4.1). + @param[out] pname Pointer to information on the process names in the system. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_process_name2(int client_handle, qurt_sysenv_procname_t *pname ); + +/**@ingroup func_qurt_sysenv_get_process_name + Gets information on the system environment process names from the kernel. 
+ + @datatypes + #qurt_sysenv_procname_t + + @param[out] pname Pointer to information on the process names in the system. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_process_name(qurt_sysenv_procname_t *pname ); + +/**@ingroup func_qurt_sysenv_get_stack_profile_count + Gets information on the stack profile count from the kernel. + + @datatypes + #qurt_sysenv_stack_profile_count_t + + @param[out] count Pointer to information on the stack profile count. + + @return + #QURT_EOK -- Success. + + @dependencies + None. +*/ +int qurt_sysenv_get_stack_profile_count(qurt_sysenv_stack_profile_count_t *count ); + +/**@ingroup func_qurt_exception_wait + Registers the program exception handler. + This function assigns the current thread as the QuRT program exception handler and suspends the + thread until a program exception occurs. + + When a program exception occurs, the thread is awakened with error information + assigned to the parameters of this operation. + + @note1hang If no program exception handler is registered, or if the registered handler + calls exit, QuRT raises a kernel exception. + If a thread runs in Supervisor mode, any errors are treated as kernel + exceptions. + + @param[out] ip Pointer to the instruction memory address where the exception occurred. + @param[out] sp Stack pointer. + @param[out] badva Pointer to the virtual data address where the exception occurred. + @param[out] cause Pointer to the QuRT error result code. + + @return + Registry status: \n + Thread identifier -- Handler successfully registered. \n + #QURT_EFATAL -- Registration failed. + + @dependencies + None. +*/ +unsigned int qurt_exception_wait (unsigned int *ip, unsigned int *sp, + unsigned int *badva, unsigned int *cause); + +unsigned int qurt_exception_wait_ext (qurt_sysevent_error_t * sys_err); + +/**@ingroup func_qurt_exception_wait3 + Registers the current thread as the QuRT program exception handler, and suspends the thread until a + program exception occurs. + When a program exception occurs, the thread is awakened with error information assigned to the specified + error event record. + If a program exception is raised when no handler is registered (or when a handler is registered, but it calls + exit), the exception is treated as fatal.\n + @note1hang If a thread runs in Monitor mode, all exceptions are treated as kernel exceptions.\n + @note1cont This function differs from qurt_exception_wait() by returning the error information in a data + structure rather than as individual variables. It also returns additional information (for example, SSR, FP, and LR). + + @param[out] sys_err Pointer to the qurt_sysevent_error_1_t type structure. + @param[in] sys_err_size Size of the qurt_sysevent_error_1_t structure. + + @return + Registry status: \n + - #QURT_EFATAL -- Failure. \n + - Thread ID -- Success. + + @dependencies + None. +*/ + +unsigned int qurt_exception_wait3(void * sys_err, unsigned int sys_err_size); + +/**@ingroup func_qurt_exception_raise_nonfatal + Raises a nonfatal program exception in the QuRT program system. + + For more information on program exceptions, see Section @xref{dox:exception_handling}. + + This operation never returns -- the program exception handler is assumed to perform all + exception handling before terminating or reloading the QuRT program system. + + @note1hang The C library function abort() calls this operation to indicate software + errors. 
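+
+  A minimal usage sketch (hypothetical; #QURT_EFAILED used only as an
+  illustrative error result code):
+  @code
+  // Hand an unrecoverable software error to the registered program
+  // exception handler; this call does not return.
+  qurt_exception_raise_nonfatal(QURT_EFAILED);
+  @endcode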
+
+  @param[in] error QuRT error result code (Section @xref{dox:error_results}).
+
+  @return
+  Integer -- Unused.
+
+  @dependencies
+  None.
+*/
+int qurt_exception_raise_nonfatal (int error) __attribute__((noreturn));
+
+
+/**@ingroup func_qurt_exception_raise_fatal
+  Raises a fatal program exception in the QuRT system.
+
+  Fatal program exceptions terminate the execution of the QuRT system without invoking
+  the program exception handler.
+
+  For more information on fatal program exceptions, see Section @xref{dox:exception_handling}.
+
+  This operation always returns, so the calling program can perform the necessary shutdown
+  operations (data logging, and so on).
+
+  @note1hang Context switches do not work after this operation has been called.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_exception_raise_fatal (void);
+
+unsigned int qurt_enable_floating_point_exception(unsigned int mask);
+
+/**@ingroup func_qurt_exception_enable_fp_exceptions
+  Enables the specified floating point exceptions as QuRT program exceptions.
+
+  The exceptions are enabled by setting the corresponding bits in the Hexagon
+  control user status register (USR).
+
+  The mask argument specifies a mask value identifying the individual floating
+  point exceptions to set. The exceptions are represented as defined symbols
+  that map into bits 0 through 31 of the 32-bit flag value.
+  Multiple floating point exceptions are specified by OR'ing together the individual
+  exception symbols.\n
+  @note1hang This function must be called before performing any floating point operations.
+
+  @param[in] mask Floating point exception types. Values: \n
+          - #QURT_FP_EXCEPTION_ALL \n
+          - #QURT_FP_EXCEPTION_INEXACT \n
+          - #QURT_FP_EXCEPTION_UNDERFLOW \n
+          - #QURT_FP_EXCEPTION_OVERFLOW \n
+          - #QURT_FP_EXCEPTION_DIVIDE0 \n
+          - #QURT_FP_EXCEPTION_INVALID @tablebulletend
+
+  @return
+  Updated contents of the USR.
+
+  @dependencies
+  None.
+*/
+
+static inline unsigned int qurt_exception_enable_fp_exceptions(unsigned int mask)
+{
+    return qurt_enable_floating_point_exception(mask);
+}
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_EVENT_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_except.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_except.h
new file mode 100755
index 0000000000000..e1684c80e3d50
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_except.h
@@ -0,0 +1,185 @@
+#ifndef QURT_EXCEPT_H
+#define QURT_EXCEPT_H
+
+/**
+  @file qurt_except.h
+  @brief Defines Cause and Cause2 codes for error handling.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+  Copyright (c) 2021-2022 by Qualcomm Technologies, Inc. All Rights Reserved.
+
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+  QuRT supports error handling to handle CPU-detected exceptions and software errors.
+  QuRT treats all errors as either fatal errors or nonfatal errors.
+
+  @section sec1 Fatal errors
+  All supervisor mode exceptions are treated as fatal errors.
+  If a registered exception handler calls qurt_exit(), it is treated as a fatal error.
+  Fatal errors result in saving the context of the primary hardware thread to QURT_error_info and the rest of the thread contexts to the corresponding TCBs.
+  All hardware threads are eventually stopped and the cache is flushed.
+  The NMI exception is treated a little differently from other fatal errors: QuRT saves the contexts of all the hardware threads into QURT_error_info.\n
+
+  @subsection subsection1 Debugging fatal errors
+  - QURT_error_info.status.status -- Indicates that an error occurred.
+  - QURT_error_info.status.cause -- Cause code for the fatal error; Cause and Cause2 details are listed below.
+  - QURT_error_info.status.cause2 -- Cause2 code for the fatal error; Cause and Cause2 details are listed below.
+  - QURT_error_info.status.fatal -- Indicates whether a fatal error occurred. A user error can result in a fatal error if the exception handler is not registered.
+  - QURT_error_info.status.hw_tnum -- Indicates the index of QURT_error_info.local_regs[], where the context is saved when the error is a fatal error.
+  - QURT_error_info.global_regs -- Contains the values of the global registers of the Q6.
+  - QURT_error_info.local_regs[QURT_error_info.status.hw_tnum] -- Provides the CPU context when the error is a supervisor error.
+
+
+
+  @subsection subsection2 Debugging nonfatal errors
+  - QURT_error_info.user_errors -- All user errors are logged here.
+  - QURT_error_info.user_errors.counter -- Index to the last logged error.
+  - QURT_error_info.user_errors.entry[0...counter] -- Structure for the logged error.
+  - QURT_error_info.user_errors.entry[0...counter].error_tcb -- TCB for the user error.
+  - QURT_error_info.user_errors.entry[0...counter].error_tcb.error -- Information about the error; Cause, Cause2, Badva and hardware thread ID.
+  - QURT_error_info.user_errors.entry[0...counter].error_code -- ((cause2 << 8) 'Logical OR' (cause)); Cause and Cause2 details are listed below.
+  - QURT_error_info.user_errors.entry[0...counter].hw_thread -- Hardware thread ID for the error.
+  - QURT_error_info.user_errors.entry[0...counter].pcycle -- Pcycle for the error.
+
+@note
+  Important usage note:
+  Cause and Cause2 are error codes used to distinguish multiple errors.
+  SSR and BADVA are inconclusive without the vector number.
+  Cause and Cause2 can each range from 1 to 255, and every cause can have 1 to 255 error codes.
+  Hence the system can have up to 255 * 255 unique error codes.
+  The combination is represented as ((cause2 << 8) 'Logical OR' (cause)).
+  Some Cause2 codes are statically defined, whereas some are obtained from the SSR[7:0] cause codes, depending on the cause code.
+  SSR cause codes are defined in the Hexagon reference manual.
+  All possible combinations are listed below.
+*/
+/** @addtogroup chapter_error
+@{ */
+/* cause - error type - 8-bits*/
+#define QURT_EXCEPT_PRECISE 0x01U /**< Precise exception occurred. For this cause code, Cause2 is SSR[7:0].*/
+#define QURT_EXCEPT_NMI 0x02U /**< NMI occurred; Cause2 is not defined. */
+#define QURT_EXCEPT_TLBMISS 0x03U /**< TLBMISS RW occurred; for this cause code, Cause2 is SSR[7:0]. */
+#define QURT_EXCEPT_RSVD_VECTOR 0x04U /**< Interrupt raised on a reserved vector, which must never occur. Cause2 is not defined. */
+#define QURT_EXCEPT_ASSERT 0x05U /**< Kernel assert. Cause2 QURT_ABORT_* are listed below. */
+#define QURT_EXCEPT_BADTRAP 0x06U /**< trap0(num) called with unsupported num. Cause2 is 0. */
+#define QURT_EXCEPT_UNDEF_TRAP1 0x07U /**< Trap1 is not supported. Using Trap1 causes this error. Cause2 is not defined. */
+#define QURT_EXCEPT_EXIT 0x08U /**< Application called qurt_exit() or qurt_exception_raise_nonfatal(). Can be called from the C library.
Cause2 is "[Argument passed to qurt_exception_raise_nonfatal() & 0xFF]". */ +#define QURT_EXCEPT_TLBMISS_X 0x0AU /**< TLBMISS X (execution) occurred. Cause2 is not defined. */ +#define QURT_EXCEPT_STOPPED 0x0BU /**< Running thread stopped due to fatal error on other hardware thread. Cause2 is not defined. */ +#define QURT_EXCEPT_FATAL_EXIT 0x0CU /**< Application called qurt_fatal_exit(). Cause2 is not defined. */ +#define QURT_EXCEPT_INVALID_INT 0x0DU /**< Kernel received an invalid L1 interrupt. Cause2 is not defined. */ +#define QURT_EXCEPT_FLOATING_POINT 0x0EU /**< Kernel received an floating point error. Cause2 is not defined. */ +#define QURT_EXCEPT_DBG_SINGLE_STEP 0x0FU /**< Cause2 is not defined. */ +#define QURT_EXCEPT_TLBMISS_RW_ISLAND 0x10U /**< Read write miss in Island mode. Cause2 QURT_TLB_MISS_RW_MEM* are listed below. */ +#define QURT_EXCEPT_TLBMISS_X_ISLAND 0x11U /**< Execute miss in Island mode. For this cause code, Cause2 is SSR[7:0]. */ +#define QURT_EXCEPT_SYNTHETIC_FAULT 0x12U /**< Synthetic fault with user request that kernel detected. Cause2 QURT_SYNTH_* are listed below. */ +#define QURT_EXCEPT_INVALID_ISLAND_TRAP 0x13U /**< Invalid trap in Island mode. Cause2 is trap number. */ +#define QURT_EXCEPT_UNDEF_TRAP0 0x14U /**< trap0(num) was called with unsupported num. Cause2 is trap number. */ +#define QURT_EXCEPT_PRECISE_DMA_ERROR 0x28U /**< Precise DMA error. Cause2 is DM4[15:8]. Badva is DM5 register. */ + +#define QURT_ECODE_UPPER_LIBC (0U << 16) /**< Upper 16 bits is 0 for libc. */ +#define QURT_ECODE_UPPER_QURT (0U << 16) /**< Upper 16 bits is 0 for QuRT. */ +#define QURT_ECODE_UPPER_ERR_SERVICES (2U << 16) /**< Upper 16 bits is 2 for error service. */ +/** @cond */ +#define QURT_ECODE_ISLAND_INVALID_QDI 3U /**< Passing invalid QDI method in island. */ +/** @endcond */ + +/* Cause2 for QURT_EXCEPT_SYNTHETIC_FAULT cause- 8bits */ +#define QURT_SYNTH_ERR 0x01U /**< */ +#define QURT_SYNTH_INVALID_OP 0x02U /**< */ +#define QURT_SYNTH_DATA_ALIGNMENT_FAULT 0x03U /**< */ +#define QURT_SYNTH_FUTEX_INUSE 0x04U /**< */ +#define QURT_SYNTH_FUTEX_BOGUS 0x05U /**< */ +#define QURT_SYNTH_FUTEX_ISLAND 0x06U /**< */ +#define QURT_SYNTH_FUTEX_DESTROYED 0x07U /**< */ +#define QURT_SYNTH_PRIVILEGE_ERR 0x08U /**< */ + +/* Cause2 - Abort cause reason - 8 bits */ +/* ERR_ASSERT cause */ +#define QURT_ABORT_FUTEX_WAKE_MULTIPLE 0x01U /**< Abort cause - futex wake multiple. */ +#define QURT_ABORT_WAIT_WAKEUP_SINGLE_MODE 0x02U /**< Abort cause - thread waiting to wake up in Single Threaded mode. */ +#define QURT_ABORT_TCXO_SHUTDOWN_NOEXIT 0x03U /**< Abort cause - call TCXO shutdown without exit. */ +#define QURT_ABORT_FUTEX_ALLOC_QUEUE_FAIL 0x04U /**< Abort cause - futex allocation queue failure - QURTK_futexhash_lifo empty. */ +#define QURT_ABORT_INVALID_CALL_QURTK_WARM_INIT 0x05U /**< Abort cause - invalid call QURTK_warm_init() in NONE CONFIG_POWER_MGMT mode. */ +#define QURT_ABORT_THREAD_SCHEDULE_SANITY 0x06U /**< Abort cause - sanity schedule thread is not supposed to run on the current hardware thread. */ +#define QURT_ABORT_REMAP 0x07U /**< Remap in the page table; the correct behavior must remove mapping if necessary. */ +#define QURT_ABORT_NOMAP 0x08U /**< No mapping in page table when removing a user mapping. */ +#define QURT_ABORT_OUT_OF_SPACES 0x09U +#define QURT_ABORT_INVALID_MEM_MAPPING_TYPE 0x0AU /**< Invalid memory mapping type when creating qmemory. */ +#define QURT_ABORT_NOPOOL 0x0BU /**< No pool available to attach. 
*/
+#define QURT_ABORT_LIFO_REMOVE_NON_EXIST_ITEM 0x0CU /**< Cannot allocate more futex waiting queues. */
+#define QURT_ABORT_ARG_ERROR 0x0DU
+#define QURT_ABORT_ASSERT 0x0EU /**< Assert abort. */
+#define QURT_ABORT_FATAL 0x0FU /**< Fatal error; must never occur. */
+#define QURT_ABORT_FUTEX_RESUME_INVALID_QUEUE 0x10U /**< Abort cause - invalid queue ID in futex resume. */
+#define QURT_ABORT_FUTEX_WAIT_INVALID_QUEUE 0x11U /**< Abort cause - invalid queue ID in futex wait. */
+#define QURT_ABORT_FUTEX_RESUME_INVALID_FUTEX 0x12U /**< Abort cause - invalid futex object in the hashtable. */
+#define QURT_ABORT_NO_ERHNDLR 0x13U /**< No registered error handler. */
+#define QURT_ABORT_ERR_REAPER 0x14U /**< Exception in the reaper thread. */
+#define QURT_ABORT_FREEZE_UNKNOWN_CAUSE 0x15U /**< Abort in a thread freeze operation. */
+#define QURT_ABORT_FUTEX_WAIT_WRITE_FAILURE 0x16U /**< During futex wait processing, could not perform a necessary write operation to userland data; most likely due to a DLPager eviction. */
+#define QURT_ABORT_ERR_ISLAND_EXP_HANDLER 0x17U /**< Exception in the Island exception handler task. */
+#define QURT_ABORT_L2_TAG_DATA_CHECK_FAIL 0x18U /**< Detected an error in the L2 tag/data during warm boot. The L2 tag/data check is done when CONFIG_DEBUG_L2_POWER_COLLAPSE is enabled. */
+#define QURT_ABORT_ERR_SECURE_PROCESS 0x19U /**< Abort error in a secure process. */
+#define QURT_ABORT_ERR_EXP_HANDLER 0x20U /**< No exception handler, or the handler caused an exception. */
+#define QURT_ABORT_ERR_NO_PCB 0x21U /**< PCB of the thread context failed initialization; PCB was NULL. */
+#define QURT_ABORT_NO_PHYS_ADDR 0x22U /**< Unable to find the physical address for the virtual address. */
+#define QURT_ABORT_OUT_OF_FASTINT_CONTEXTS 0x23U /**< Fast interrupt contexts exhausted. */
+#define QURT_ABORT_CLADE_ERR 0x24U /**< Fatal error seen with the CLADE interrupt. */
+#define QURT_ABORT_ETM_ERR 0x25U /**< Fatal error seen with the ETM interrupt. */
+#define QURT_ABORT_ECC_DED_ASSERT 0x26U /**< ECC two-bit DED error. */
+#define QURT_ABORT_VTLB_ERR 0x27U /**< Fatal error in the VTLB layer. */
+#define QURT_ABORT_TLB_ENCODE_DECODE_FAILURE 0x28U /**< Failure during the TLB encode or decode operation. */
+#define QURT_ABORT_VTLB_WALKOBJS_BOUND_FAILURE 0x29U /**< Failure to look up an entry in the page table. */
+#define QURT_ABORT_PHY_MEMORY_OWNERSHIP_FAILURE 0x30U /**< Failure to claim physical memory ownership. */
+#define QURT_ABORT_JTLB_SIZE_CHECK_FAIL 0x31U /**< JTLB size configured is more than the actual size in hardware. */
+#define QURT_ABORT_AUTOSTACK_ASSERT 0x32U /**< Error while handling a stack flimit exception. */
+
+/* Cause2 - TLB-miss_X - 8 bits */
+#define QURT_TLB_MISS_X_FETCH_PC_PAGE 0x60U /**< */
+#define QURT_TLB_MISS_X_2ND_PAGE 0x61U /**< */
+#define QURT_TLB_MISS_X_ICINVA 0x62U /**< */
+
+/* Cause2 - TLB-miss_RW - 8 bits */
+#define QURT_TLB_MISS_RW_MEM_READ 0x70U /**< */
+#define QURT_TLB_MISS_RW_MEM_WRITE 0x71U /**< */
+
+/** @cond rest_reg_dist */
+/* Cause2 - Floating point exception - 8 bits */
+#define QURT_FLOATING_POINT_EXEC_ERR 0xBFU /**< Execute floating-point.
*/
+/** @endcond */
+
+/** Cause2 - autostackv2 - 8 bits */
+#define QURT_AUTOSTACKV2_CANARY_NOT_MATCH 0xC1U
+#define QURT_AUTOSTACKV2_POOL_IDX_OFF_RANGE 0xC2U
+
+/** Cause2 - CFI violation - 8 bits */
+#define QURT_CFI_VIOLATION 0xC3U
+
+/** @cond rest_reg_dist*/
+/* Enable floating point exceptions */
+#define QURT_FP_EXCEPTION_ALL 0x1FU << 25 /**< */
+#define QURT_FP_EXCEPTION_INEXACT 0x1U << 29 /**< */
+#define QURT_FP_EXCEPTION_UNDERFLOW 0x1U << 28 /**< */
+#define QURT_FP_EXCEPTION_OVERFLOW 0x1U << 27 /**< */
+#define QURT_FP_EXCEPTION_DIVIDE0 0x1U << 26 /**< */
+#define QURT_FP_EXCEPTION_INVALID 0x1U << 25 /**< */
+
+/** @endcond */
+/** @} */ /* end_addtogroup chapter_error */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_EXCEPT_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_fastint.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_fastint.h
new file mode 100755
index 0000000000000..ea65dc0917fc0
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_fastint.h
@@ -0,0 +1,71 @@
+#ifndef QURT_FASTINT_H
+#define QURT_FASTINT_H
+
+/**
+  @file qurt_fastint.h
+  @brief QuRT fast interrupt functions
+
+  Copyright (c) 2013-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+
+ ======================================================================*/
+
+/*======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup func_qurt_fastint_register
+  Registers the fast interrupt callback function.
+
+  The fast interrupt callback should be designed to perform the minimal necessary
+  actions for the interrupt, and/or perform some operations, such as signaling
+  another regular software thread to start any additional processing.
+  The callback should be a fast and short function. When a fast interrupt callback
+  is running, the corresponding interrupt cannot be re-enabled until the callback
+  returns.
+
+  The fast interrupt callback must not use any system blocking calls, such as
+  mutex lock or signal wait. Otherwise, it results in errors.
+
+  The fast interrupt callback function takes a single integer argument and does
+  not return a value. The argument value passed in is the interrupt
+  number, and therefore a single callback function can handle
+  multiple fast interrupts.
+
+  @param[in] intno Interrupt number to register.
+  @param[in] fn    Interrupt callback function.
+
+  @return
+  #QURT_EOK -- Fast interrupt registration is successful. \n
+  #QURT_EINVALID -- Interrupt is already registered. \n
+  #QURT_EINT -- Invalid interrupt number.
+*/
+/* ======================================================================*/
+unsigned int qurt_fastint_register(int intno, void (*fn)(int));
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_fastint_deregister
+  Deregisters the fast interrupt callback function.
+
+  @param[in] intno Level-one interrupt number to deregister. Valid range is 1 and 10 through 31
+                   (simulator only).
+
+  @return
+  #QURT_EOK -- Interrupt deregistration is successful. \n
+  #QURT_EINT -- Invalid interrupt number (not registered). \n
+  #QURT_EINVALID -- Invalid interrupt number (already deregistered).
+
+  @dependencies
+  None.
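+
+  A minimal usage sketch (hypothetical; interrupt number 23 is
+  illustrative only):
+  @code
+  static void my_fastint_cb(int intno)
+  {
+      // Keep this short: no blocking calls (mutex lock, signal wait)
+      // are allowed inside a fast interrupt callback.
+  }
+
+  void fastint_example(void)
+  {
+      if (qurt_fastint_register(23, my_fastint_cb) == QURT_EOK) {
+          // ... interrupt in service ...
+          (void)qurt_fastint_deregister(23);
+      }
+  }
+  @endcode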
+*/
+/* ======================================================================*/
+unsigned int qurt_fastint_deregister(int intno);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_FASTINT_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_fs_hub.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_fs_hub.h
new file mode 100755
index 0000000000000..aaa050a6c838b
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_fs_hub.h
@@ -0,0 +1,58 @@
+#ifndef QURT_FS_HUB_H
+#define QURT_FS_HUB_H
+
+/**
+  @file qurt_fs_hub.h
+  @brief Definitions, macros, and prototypes used when writing a
+         QDI driver that provides file-system functionality.
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+  This structure tracks a file designator for an FS-hub QDI driver.
+  A file system's QDI interface should use this object to encapsulate the
+  true file descriptor and return a QDI handle. This QDI handle
+  is then used as the file descriptor by the file-system hub.
+ */
+
+typedef struct qurt_qdi_fs_obj
+{
+    qurt_qdi_obj_t qdi_obj;
+    int client_handle;
+    int fd;
+}qurt_qdi_fs_obj_t;
+
+
+/**@ingroup fs_hub_support_functions
+  This function allows a file system to register its QDI interface with the file-system hub.
+  Once registered, all file open operations for any filenames containing the mount point will
+  be forwarded to the QDI interface.
+
+  The mount point string must be enclosed in two forward slashes, e.g., "/mountpoint/".
+
+  @param mtpoint Mount point for the file system being registered.
+  @param opener  Opener structure for the QDI driver interface.
+
+  @return
+  QURT_EOK -- Successfully registered the QDI driver with the file-system hub.
+  Negative error code -- Failed to register with the file-system hub.
+ */
+int qurt_fs_hub_mtpoint_register(const char *mtpoint, qurt_qdi_obj_t *opener);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_futex.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_futex.h
new file mode 100755
index 0000000000000..1fdcc79a43f01
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_futex.h
@@ -0,0 +1,82 @@
+#ifndef QURT_FUTEX_H
+#define QURT_FUTEX_H
+/**
+  @file qurt_futex.h
+
+  @brief Prototypes of QuRT futex API functions
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2013, 2020-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+
+/**@ingroup func_qurt_futex_wait
+  Moves the caller thread into the waiting state when a memory object address
+  contains a value that is the same as a specified value.
+
+  @param[in] lock Pointer to the object memory.
+  @param[in] val  Value to check against the object content.
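+
+  A minimal wait/wake sketch (hypothetical; a plain int used as the
+  futex word):
+  @code
+  static int futex_word = 0;
+
+  // Waiter: blocks only while futex_word still reads 0.
+  void waiter(void) { (void)qurt_futex_wait(&futex_word, 0); }
+
+  // Waker: publishes the new value, then wakes one waiting thread.
+  void waker(void) { futex_word = 1; (void)qurt_futex_wake(&futex_word, 1); }
+  @endcode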
+
+  @return
+  #QURT_EOK -- Success \n
+  Other values -- Failure
+
+  @dependencies
+  None.
+ */
+int qurt_futex_wait(void *lock, int val);
+
+
+/**@ingroup func_qurt_futex_wait_cancellable
+  If a memory object address contains a value that is the same as a specified
+  value, moves the caller thread into the waiting state.
+  The kernel can cancel the waiting state when there is a special need.
+
+  @param[in] lock Pointer to the object memory.
+  @param[in] val  Value to check against the object content.
+
+  @return
+  #QURT_EOK -- Success \n
+  Other values -- Failure
+
+  @dependencies
+  None.
+ */
+int qurt_futex_wait_cancellable(void *lock, int val);
+
+
+/**@ingroup func_qurt_futex_wake
+  Wakes up a specified number of threads that have been waiting
+  for the object change with qurt_futex_wait().
+
+  @param[in] lock      Pointer to the object memory.
+  @param[in] n_to_wake Maximum number of threads to wake up.
+
+  @return
+  Number of threads woken up by this function.
+
+  @dependencies
+  None.
+ */
+int qurt_futex_wake(void *lock, int n_to_wake);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_FUTEX_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_hmx.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_hmx.h
new file mode 100755
index 0000000000000..e4037dbeae514
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_hmx.h
@@ -0,0 +1,226 @@
+#ifndef QURT_HMX_H
+#define QURT_HMX_H
+/**
+  @file qurt_hmx.h
+  @brief Prototypes of the QuRT HMX API.
+
+Copyright (c) 2019-2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+                                 TYPEDEFS
+=============================================================================*/
+
+
+/** @addtogroup hmx_types
+@{ */
+/* HMX locking type */
+#define QURT_HMX_NON_SHARED_LOCK 0U /**< HMX locking type.*/
+#define QURT_HMX_SHARED_LOCK 1U /**< HMX locking type.*/
+
+/* HMX unlocking type */
+#define QURT_HMX_NON_SHARED_UNLOCK 0U /**< HMX unlocking type.*/
+#define QURT_HMX_SHARED_UNLOCK 1U /**< HMX unlocking type.*/
+
+/* HMX hardware context */
+#define QURT_HMX_UNIT_0 0U /**< HMX hardware context #0 */
+#define QURT_HMX_UNIT_1 1U /**< HMX hardware context #1 */
+/** @} */ /* end_addtogroup hmx_types */
+
+
+/*=============================================================================
+                                 FUNCTIONS
+=============================================================================*/
+
+
+/**@ingroup func_qurt_hmx_lock2
+  Locks a HMX unit with the specified locking type.
+
+  #QURT_HMX_NON_SHARED_LOCK:
+  - If a HMX unit is available, lock the unit and return success of #QURT_EOK.
+  - If the HMX unit is already locked by another thread, the caller thread is suspended
+    until the HMX is available and gets locked by this function.
+  - If there is no HMX hardware supported, return #QURT_EVAL.
+
+  #QURT_HMX_SHARED_LOCK:
+  - If a HMX unit is available, enables HMX access for the caller thread, and returns
+    success of #QURT_EOK.
+  - If the HMX is enabled on the caller thread, return #QURT_EFAILED.
+  - If the HMX is locked by another thread in the same user process as the caller
+    thread with locking type #QURT_HMX_SHARED_LOCK, enable HMX access for the caller
+    thread, and return success of #QURT_EOK.
+  - If the HMX is locked by another thread in the same user process as the caller
+    thread with locking type #QURT_HMX_NON_SHARED_LOCK, return #QURT_EFAILED.
+  - If the HMX is locked by a thread from another user process different from the
+    user process of the caller thread, return #QURT_EFAILED.
+  - If there is no HMX hardware supported, return #QURT_EVAL.
+
+  @param[in] type Locking type.
+
+  @return
+  #QURT_EOK -- HMX lock successful.\n
+  #QURT_EFAILED -- Failure due to wrong locking condition.\n
+  #QURT_EVAL -- Failure because no HMX hardware is supported.
+
+  @dependencies
+  None.
+
+ */
+int qurt_hmx_lock2(unsigned int type);
+
+
+/**@ingroup func_qurt_hmx_unlock2
+  Unlocks a HMX unit with the specified unlocking type.
+
+  #QURT_HMX_NON_SHARED_UNLOCK:
+  - If there is a HMX unit locked by the caller thread, unlock the HMX unit and clear the
+    HMX accumulators (assuming a fixed point type).
+  - If there is no HMX unit locked by the caller thread, return #QURT_EFAILED.
+  - If there is no HMX hardware supported, return #QURT_EVAL.
+
+  #QURT_HMX_SHARED_UNLOCK:
+  - If the caller thread has locked HMX with type #QURT_HMX_SHARED_LOCK, disable the
+    HMX access on the caller thread, and return success of #QURT_EOK.
+    Note: If the caller thread is the last thread that unlocks for #QURT_HMX_SHARED_LOCK
+    in its user process, the unlock function clears the HMX accumulators.
+  - If the caller thread has locked HMX with type #QURT_HMX_NON_SHARED_LOCK, return
+    failure of #QURT_EFAILED.
+  - If the caller thread has not locked HMX, return failure of #QURT_EFAILED.
+  - If there is no HMX hardware supported, return #QURT_EVAL.
+
+  @param[in] type Unlocking type.
+
+  @return
+  #QURT_EOK -- HMX is unlocked successfully. \n
+  #QURT_EFAILED -- Failure due to wrong unlocking condition. \n
+  #QURT_EVAL -- Failure because no HMX hardware is supported.
+
+  @dependencies
+  None.
+
+ */
+int qurt_hmx_unlock2(unsigned int type);
+
+
+/**@ingroup func_qurt_hmx_lock
+  Locks a HMX unit.
+  If a HMX unit is available, this function locks the unit and returns right away.
+  If there is no HMX unit available, the caller is blocked until a HMX unit is available
+  and is locked by the function.
+
+  @return
+  #QURT_EOK -- HMX lock successful. \n
+  #QURT_EFAILED -- Failure due to wrong locking condition. \n
+  #QURT_EVAL -- Failure because no HMX hardware is supported.
+
+  @dependencies
+  None.
+ */
+int qurt_hmx_lock(void);
+
+
+/**@ingroup func_qurt_hmx_unlock
+  Unlocks a HMX unit.
+  If a HMX unit is locked by the caller thread, unlocks the HMX unit and clears its
+  accumulators (assuming a fixed point type).
+  If there is no HMX unit locked by the caller thread, returns failure.
+
+  @return
+  #QURT_EOK -- HMX unlock successful. \n
+  #QURT_EFAILED -- Failure due to wrong unlocking condition. \n
+  #QURT_EVAL -- Failure because no HMX hardware is supported.
+
+  @dependencies
+  None.
+ */
+int qurt_hmx_unlock(void);
+
+
+/**@ingroup func_qurt_hmx_try_lock
+  Tries to lock a HMX unit.
+  If a HMX unit is available, this function locks the unit and returns right away;
+  if there is no HMX unit available, the function returns failure without blocking the caller.
+
+  @return
+  #QURT_EOK -- HMX lock successful \n
+  #QURT_EFAILED -- Failure due to wrong locking condition.\n
+  #QURT_EVAL -- Failure because no HMX hardware is supported.
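+
+  A minimal usage sketch (hypothetical):
+  @code
+  if (qurt_hmx_try_lock() == QURT_EOK) {
+      // ... issue HMX workload ...
+      (void)qurt_hmx_unlock();
+  } else {
+      // HMX busy in another thread; fall back without blocking.
+  }
+  @endcode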
+
+  @dependencies
+  None.
+ */
+int qurt_hmx_try_lock(void);
+
+
+/**@ingroup func_qurt_hmx_assign
+  Assigns a HMX unit to a target thread specified by its thread identifier.
+  The HMX unit (HMX hardware context) is specified by hmx_unit.
+  The caller of this function is limited to the SRM process.
+  If the requested hmx_unit is already assigned to another thread with QURT_HMX_NON_SHARED_LOCK,
+  the kernel detaches it from that thread and re-assigns it to the target thread.
+  If the target thread has HVX enabled, it cannot have HMX enabled.
+
+  Locking type
+  #QURT_HMX_NON_SHARED_LOCK:
+  - If the HMX unit is available, lock the HMX unit and return success of #QURT_EOK.
+  - If the HMX unit is already enabled on the target thread, return #QURT_EOK.
+  - If the HMX unit is already locked by another thread, detach the HMX from that thread,
+    re-assign the HMX unit to the target thread, and return #QURT_EOK.
+
+  @param[in] thread_id Thread identifier.
+  @param[in] type      Locking type.
+                       #QURT_HMX_NON_SHARED_LOCK -- non-shared lock
+  @param[in] hmx_unit  HMX hardware context number.
+                       #QURT_HMX_UNIT_0
+                       #QURT_HMX_UNIT_1
+
+  @return
+  #QURT_EOK -- The HMX is assigned successfully. This includes the case that \n
+               the target thread already has HMX assigned. \n
+  #QURT_EFAILED -- Failure due to wrong assigning conditions. \n
+  #QURT_EINVALID -- Failure because no HMX hardware is supported.
+
+  @dependencies
+  None.
+ */
+int qurt_hmx_assign ( unsigned int thread_id, unsigned int type, unsigned int hmx_unit );
+
+
+/**@ingroup func_qurt_hmx_release
+  Releases a HMX unit from a target thread specified by its thread identifier.
+  The HMX unit (HMX hardware context) is specified by hmx_unit.
+  The caller of this function is limited to the SRM process.
+
+  QuRT detaches the specified HMX unit from the target thread and returns success of
+  #QURT_EOK. If the HMX unit is already released from the target thread, it returns #QURT_EOK.
+
+  @param[in] thread_id Thread identifier.
+  @param[in] hmx_unit  HMX hardware context number.
+                       #QURT_HMX_UNIT_0
+                       #QURT_HMX_UNIT_1
+
+  @return
+  #QURT_EOK -- The HMX is released successfully. This includes the case that \n
+               the target thread already has the HMX released. \n
+  #QURT_EFAILED -- Failure due to wrong assigning condition. \n
+  #QURT_EINVALID -- Failure because no HMX hardware is supported.
+
+  @dependencies
+  None.
+ */
+int qurt_hmx_release ( unsigned int thread_id, unsigned int hmx_unit );
+
+
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_HMX_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_hvx.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_hvx.h
new file mode 100755
index 0000000000000..13c213d49ac84
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_hvx.h
@@ -0,0 +1,421 @@
+#ifndef QURT_HVX_H
+#define QURT_HVX_H
+/**
+  @file qurt_hvx.h
+  @brief Prototypes of the QuRT HVX API.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2021-2022 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+ TYPEDEFS
+=============================================================================*/
+/** @cond */
+
+typedef enum {
+    QURT_HVX_MODE_64B  = 0,  /**< HVX mode of 64 bytes */
+    QURT_HVX_MODE_128B = 1   /**< HVX mode of 128 bytes */
+} qurt_hvx_mode_t;
+/** @endcond */
+/*=============================================================================
+ CONSTANTS AND MACROS
+=============================================================================*/
+/** @cond internal_only*/
+/** @addtogroup hvx_macros
+@{ */
+#define QURT_HVX_HW_UNITS_2X128B_4X64B     0x00000204  /**< Bits 15 through 8 are for the number of 128B units. */
+                                                       /**< Bits 7 through 0 are for the number of 64B units. */
+#define QURT_HVX_HW_UNITS_4X128B_0X64B     0x00000400
+#define QURT_HVX_HW_UNITS_6X128B_0X64B     0x00000600
+
+/* HVX locking status */
+
+#define QURT_HVX_UNLOCKED                  (0)   /* Has not locked HVX unit */
+#define QURT_HVX_LOCKED                    (1)   /* Has locked HVX unit */
+#define QURT_HVX_ERROR                     (-1)  /* Error, no HVX support */
+
+/* Input value for HVX reservation */
+
+#define QURT_HVX_RESERVE_ALL               (4)    /* All the HVX units in terms of 64B_MODE are requested to be reserved */
+#define QURT_HVX_RESERVE_ALL_AVAILABLE     (0xff) /* All remaining unlocked HVX units in terms of 64B_MODE are requested to be reserved */
+
+/* Return values for HVX reservation */
+
+#define QURT_HVX_RESERVE_NOT_SUPPORTED     (-1)  /* There is no HVX hardware, or fewer units in the hardware than requested */
+#define QURT_HVX_RESERVE_NOT_SUCCESSFUL    (-2)  /* Some HVX units are already locked/reserved by other PD, thus not enough units left for the reservation. */
+#define QURT_HVX_RESERVE_ALREADY_MADE      (-3)  /* There is already a HVX reservation made. */
+#define QURT_HVX_RESERVE_CANCEL_ERR        (-4)  /* The action of canceling the reservation fails because this protection domain has no reservation made before. */
+
+// HVX set requests
+
+#define QURT_HVX_64B                 0 /**< */
+#define QURT_HVX_128B                1 /**< */
+#define QURT_HVX_NO_USE              2 /**< */
+#define QURT_HVX_RELEASE_CONTEXT     3 /**< */
+#define QURT_HVX_IMMEDIATE_USE       4 /**< */
+
+// HVX set masks
+
+#define QURT_HVX_64B_PREFERRED       (1<<(QURT_HVX_64B + 8))/**< */
+#define QURT_HVX_128B_PREFERRED      (1<<(QURT_HVX_128B + 8))/**< */
+#define QURT_HVX_64B_ACCEPTABLE      (1<<(QURT_HVX_64B + 12))/**< */
+#define QURT_HVX_128B_ACCEPTABLE     (1<<(QURT_HVX_128B + 12))/**< */
+
+// HVX set return "result"
+
+#define QURT_EOK                     0 /**< */
+#define QURT_HVX_SET_ERROR           0xFF /**< */
+
+// hvx_mode_assigned for QURT_HVX_IMMEDIATE_USE
+#define QURT_HVX_64B_ASSIGNED        (1<<(QURT_HVX_64B + 8)) /**< */
+#define QURT_HVX_128B_ASSIGNED       (1<<(QURT_HVX_128B + 8)) /**< */
+
+// Sizes of HVX dump buffer
+
+#define QURT_HVX_V65_64B_VSIZE       2084U  /**< 64 x 32 + 8 x 4 + 4 (version). */
+#define QURT_HVX_V65_128B_VSIZE      4164U  /**< 128 x 32 + 16 x 4 + 4 (version). */
+#define QURT_HVX_V66_128B_VSIZE      4420U  /**< 128 x (32 +2) + 16 x 4 + 4 (version). */
+#define QURT_HVX_V68_128B_VSIZE      4164U  /**< 128 x 32 + 16 x 4 + 4 (version). */
+#define QURT_HVX_V79_128B_VSIZE      4740U  /**< 128 x (32+4+1) + 4 (version).
 */
+#define QURT_HVX_VREG_BUF_SIZE       QURT_HVX_V79_128B_VSIZE /**< */
+
+// HVX dump versions
+
+#define QURT_HVX_DUMP_V65_64B        1U /**< */
+#define QURT_HVX_DUMP_V65_128B       2U /**< */
+#define QURT_HVX_DUMP_V66_128B       3U /**< */
+#define QURT_HVX_DUMP_V68_128B       4U /**< */
+#define QURT_HVX_DUMP_V79_128B       5U /**< */
+/** @} */ /* end_addtogroup hvx_macros */
+/** @endcond */
+/** @cond */
+// QuRT data struct for hvx_set input
+typedef struct qurt_hvx_set_struct_ {
+    unsigned char set_req;    // LSB
+    struct {
+        unsigned char preferred_mask:4;
+        unsigned char acceptable_mask:4;
+    };
+    unsigned short resvd;     // MSB
+} qurt_hvx_set_struct_t;      // 4 bytes
+
+
+// QuRT data struct for hvx_set return
+typedef struct qurt_hvx_set_return_str_ {
+    unsigned char result;     // LSB
+    unsigned char hvx_mode_assigned;
+    unsigned short resvd;     // MSB
+} qurt_hvx_set_return_struct_t;  // 4 bytes
+/** @endcond */
+
+
+/*=============================================================================
+ FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_hvx_lock
+  Locks one HVX unit specified by the HVX mode.
+
+  @note1hang Input variable can be 128B_MODE or 64B_MODE. If an HVX unit in this mode
+             is available, this function locks the unit and returns right away.
+             If the current HVX mode is different from the requested mode, the current
+             thread is blocked. When all HVX units become idle, QuRT changes
+             the mode, locks the HVX unit, and returns.
+
+             Starting from Q6v65 with HVX context switch support, qurt_hvx_lock() is
+             mapped as qurt_hvx_set(64_BYTE or 128_BYTE).
+
+  @datatypes
+  #qurt_hvx_mode_t
+
+  @param[in] lock_mode #QURT_HVX_MODE_64B or #QURT_HVX_MODE_128B.
+
+  @return
+  #QURT_EOK -- Success \n
+  Other value -- Failure
+
+  @dependencies
+  None.
+
+ */
+int qurt_hvx_lock(qurt_hvx_mode_t lock_mode);
+
+/**@ingroup func_qurt_hvx_unlock
+  Unlocks the HVX unit held by this software thread.
+
+  @note1hang Starting from Q6v65 with HVX context switch support, qurt_hvx_unlock()
+             maps as qurt_hvx_set(QURT_HVX_RELEASE_CONTEXT).
+
+  @return
+  #QURT_EOK -- Successful return \n
+  Other values -- Failure
+
+  @dependencies
+  None.
+
+ */
+int qurt_hvx_unlock(void);
+
+/**@ingroup func_qurt_hvx_try_lock
+  Tries to lock one HVX unit specified by the HVX mode.
+
+  @note1hang Input variable can be 128B_MODE or 64B_MODE. If an HVX unit in this mode
+             is available, this function locks the unit and returns #QURT_EOK; otherwise,
+             the function returns a failure, but does not block the current software
+             thread to wait for the HVX unit.
+             Starting from Q6v65 with HVX context switch support, qurt_hvx_try_lock()
+             maps to qurt_hvx_set(QURT_HVX_IMMEDIATE_USE | preferred_mask | acceptable_mask);
+
+  @datatypes
+  #qurt_hvx_mode_t
+
+  @param[in] lock_mode #QURT_HVX_MODE_64B or #QURT_HVX_MODE_128B.
+
+  @return
+  #QURT_EOK -- Successful return \n
+  Other values -- Failure
+
+  @dependencies
+  None.
+
+ */
+int qurt_hvx_try_lock(qurt_hvx_mode_t lock_mode);
+
+/**@ingroup func_qurt_hvx_get_mode
+  Gets the current HVX mode configured by QuRT.
+
+  @note1hang Returns #QURT_HVX_MODE_128B or #QURT_HVX_MODE_64B, based on
+             the current HVX configuration.
+
+  @param[out]
+  None.
+
+  @return
+  #QURT_HVX_MODE_128B \n
+  #QURT_HVX_MODE_64B \n
+  -1 -- Not available.
+
+  @dependencies
+  None.
+ */
+int qurt_hvx_get_mode(void);
+
+
+/**@ingroup func_qurt_hvx_get_units
+  Gets the HVX hardware configuration that the chipset supports.
+
+  @note1hang The function returns the HVX hardware configuration supported by the chipset.
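+
+  For example (an illustrative sketch; the bit layout follows the
+  QURT_HVX_HW_UNITS macros above), the unit counts can be decoded as:
+  @code
+  int units = qurt_hvx_get_units();
+  if (units > 0) {
+      unsigned num_128b = ((unsigned)units >> 8) & 0xFFU; // bits 15..8
+      unsigned num_64b  = (unsigned)units & 0xFFU;        // bits 7..0
+  }
+  @endcode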
+
+  @return
+  Bitmask of the units: 1X64, 2X64, 4X64, 1X128, 2X128, and so on.\n
+  - QURT_HVX_HW_UNITS_2X128B_4X64B -- V60, V62, or V65 HVX \n
+  - QURT_HVX_HW_UNITS_4X128B_0X64B -- V66 CDSP or newer \n
+  - 0 -- not available
+
+  @dependencies
+  None.
+
+ */
+int qurt_hvx_get_units(void);
+
+
+/**@ingroup func_qurt_hvx_reserve
+  Reserves HVX units in terms of 64-byte mode for the protection domain (PD) of the caller.
+
+  @note1hang Only one HVX reservation in the system is supported.
+             If one HVX unit is already locked by the application in the same PD, the unit is
+             added to the returned count as one reserved unit for the PD.
+             Starting from Q6v65 with HVX context switch support, qurt_hvx_reserve()
+             only does basic sanity checks on HVX units.
+
+  @datatypes
+  None.
+
+  @param[in] num_units Number of HVX units in terms of 64B_MODE to reserve for the PD.
+                       QURT_HVX_RESERVE_ALL to reserve all the HVX units.
+                       QURT_HVX_RESERVE_ALL_AVAILABLE to reserve the remaining unlocked units.
+
+  @return
+  Number of units successfully reserved, including the units already locked in the same PD. \n
+  #QURT_HVX_RESERVE_NOT_SUPPORTED \n
+  #QURT_HVX_RESERVE_NOT_SUCCESSFUL \n
+  #QURT_HVX_RESERVE_ALREADY_MADE
+
+
+  @dependencies
+  None.
+
+ */
+int qurt_hvx_reserve(int num_units);
+
+
+/**@ingroup func_qurt_hvx_cancel_reserve
+  Cancels the HVX reservation in the protection domain (PD) of the caller.
+
+  @note1hang Only one HVX reservation in the system is supported.
+
+  @return
+  0 -- Success \n
+  #QURT_HVX_RESERVE_CANCEL_ERR -- Failure
+
+  @dependencies
+  None.
+
+ */
+int qurt_hvx_cancel_reserve(void);
+
+
+/**@ingroup func_qurt_hvx_get_lock_val
+  Gets the HVX locking status value of the thread of the caller.
+
+  @note1hang Returns the status indicating whether the caller thread has already locked a HVX unit.
+
+  @datatypes
+  None.
+
+  @return
+  #QURT_HVX_UNLOCKED \n
+  #QURT_HVX_LOCKED \n
+  #QURT_HVX_ERROR
+
+  @dependencies
+  None.
+ */
+int qurt_hvx_get_lock_val(void);
+
+/** @cond internal_only*/
+/**@ingroup func_qurt_hvx_set
+  Sets the HVX configuration for the software thread of the caller.
+
+  @datatypes
+  None.
+
+  @param[in] input_arg  Composed of set_request | hvx_preferred_mode_mask
+                        | hvx_acceptable_mode_mask where set_request can be set to: \n
+                        - #QURT_HVX_64B \n
+                        - #QURT_HVX_128B \n
+                        - #QURT_HVX_NO_USE \n
+                        - #QURT_HVX_RELEASE_CONTEXT \n
+                        - #QURT_HVX_IMMEDIATE_USE \n
+                        When set_request is QURT_HVX_IMMEDIATE_USE,
+                        hvx_preferred_mode_mask can be set to: \n
+                        - #QURT_HVX_64B_PREFERRED \n
+                        - #QURT_HVX_128B_PREFERRED
+                        When set_request is QURT_HVX_IMMEDIATE_USE,
+                        hvx_acceptable_mode_mask can be set to: \n
+                        - #QURT_HVX_64B_ACCEPTABLE \n
+                        - #QURT_HVX_128B_ACCEPTABLE @tablebulletend
+
+  @return
+  Result of the HVX setting in the least significant 8 bits of the returned data. \n
+  #QURT_EOK -- 0 \n
+  #QURT_HVX_SET_ERROR -- 0xFF \n
+  When #QURT_HVX_IMMEDIATE_USE has a result of #QURT_EOK,
+  bit 8 to bit 15 of the returned data contain hvx_mode_assigned:\n
+  - #QURT_HVX_64B_ASSIGNED \n
+  - #QURT_HVX_128B_ASSIGNED
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_hvx_set(unsigned int input_arg);
+
+
+/**@ingroup func_qurt_system_hvx_regs_get_maxsize
+  Returns the maximum buffer size for saving HVX registers.
+
+  @datatypes
+  None.
+
+  @return
+  0 -- No HVX supported in the target. \n
+  #QURT_HVX_VREG_BUF_SIZE -- Maximum buffer size for saving HVX registers.
+
+  @dependencies
+  None.
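+
+  A minimal allocation sketch (illustrative only; malloc() stands in for
+  whatever allocator the caller uses):
+  @code
+  unsigned int max = qurt_system_hvx_regs_get_maxsize();
+  if (max != 0U) {
+      // Worst-case register dump plus room for 256-byte alignment.
+      unsigned char *buf = malloc(max + 256U);
+  }
+  @endcode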
+ */
+unsigned int qurt_system_hvx_regs_get_maxsize(void);
+
+
+/**@ingroup func_qurt_system_hvx_regs_get_size
+  Returns the buffer size for saving HVX registers for a specified thread.
+
+  @param[in] thread_id  Thread ID of the target thread.
+
+  @return
+  0 -- No HVX assigned to the thread. \n
+  size -- Size of the buffer in bytes for saving HVX registers for the specified thread: \n
+      - #QURT_HVX_V65_64B_VSIZE -- 64 x 32 + 8 x 4 + 4 (version) \n
+      - #QURT_HVX_V65_128B_VSIZE -- 128 x 32 + 16 x 4 + 4 (version) \n
+      - #QURT_HVX_V66_128B_VSIZE -- 128 x (32 +2) + 16 x 4 + 4 (version) \n
+      - #QURT_HVX_V68_128B_VSIZE -- 128 x 32 + 16 x 4 + 4 (version) \n
+      - #QURT_HVX_V79_128B_VSIZE -- 128 x (32+4+1) + 4 (version)
+
+
+  @dependencies
+  None.
+
+ */
+unsigned int qurt_system_hvx_regs_get_size(unsigned int thread_id);
+
+
+
+/**@ingroup func_qurt_system_hvx_regs_get
+  Saves the HVX registers into the specified buffer.
+  Returns the size of the data saved into the buffer.
+  After calling this function for the first time on a specified thread_id, the QuRT kernel removes the internal HVX saving buffer
+  from the specified thread. When calling the function on the same thread_id for the second time, this function returns 0.
+
+  @param[in] thread_id  Thread ID of the target thread.
+  @param[in] pBuf       Pointer to the buffer for HVX register saving.
+                        The first four bytes of the buffer are for saving the HVX version. HVX registers are saved from
+                        the fifth byte of the buffer. The address of the fifth byte should be 256-byte aligned.
+                        For example, a buffer can be declared at first as: \n
+                        unsigned char vbuf[QURT_HVX_VREG_BUF_SIZE+256];\n
+                        unsigned char *pBuf; \n
+                        then align the buffer pointer to: \n
+                        pBuf = vbuf; \n
+                        pBuf += (256 - 4 - (unsigned)pBuf%256);
+  @param[in] size       Size of the buffer provided, which is pointed to by pBuf. The buffer size should not be smaller than that
+                        returned from qurt_system_hvx_regs_get_size(), and pBuf should be aligned as described above.
+  @param[out] pBuf      Buffer returned with the saved HVX registers (unsigned char hvx_regs[];), which are saved from the fifth
+                        byte of the buffer, and the HVX version (unsigned int hvx_version;), which in the first four bytes
+                        contain one of the HVX dump versions:\n
+                        - #QURT_HVX_DUMP_V65_64B \n
+                        - #QURT_HVX_DUMP_V65_128B \n
+                        - #QURT_HVX_DUMP_V66_128B \n
+                        - #QURT_HVX_DUMP_V68_128B \n
+                        - #QURT_HVX_DUMP_V79_128B \n
+                        @tablebulletend
+
+  @return
+  Total bytes of the data saved in the provided buffer. \n
+  0 -- No HVX assigned to the thread \n
+  #QURT_HVX_V65_64B_VSIZE -- 64 x 32 + 8 x 4 + 4 (version) \n
+  #QURT_HVX_V65_128B_VSIZE -- 128 x 32 + 16 x 4 + 4 (version) \n
+  #QURT_HVX_V66_128B_VSIZE -- 128 x (32 +2) + 16 x 4 + 4 (version) \n
+  #QURT_HVX_V68_128B_VSIZE -- 128 x 32 + 16 x 4 + 4 (version) \n
+  #QURT_HVX_V79_128B_VSIZE -- 128 x (32+4+1) + 4 (version)
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_system_hvx_regs_get(unsigned int thread_id, void *pBuf, size_t size);
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_HVX_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_int.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_int.h
new file mode 100755
index 0000000000000..386aeda1051eb
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_int.h
@@ -0,0 +1,509 @@
+#ifndef QURT_INT_H
+#define QURT_INT_H
+/**
+  @file qurt_int.h
+  @brief QuRT interrupt functions.
+
+
+
+ Copyright (c) 2013-2021, 2023 Qualcomm Technologies, Inc.
+ All Rights Reserved.
+ Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+ CONSTANTS AND MACROS
+=============================================================================*/
+
+
+/** @cond rest_reg_dist */
+/** @addtogroup interrupts_constants
+@{ */
+#define SIG_INT_ABORT 0x80000000 /**< */
+#define QURT_INT_NON_DELAYED_ACK 0
+#define QURT_INT_DELAYED_ACK 1
+#define QURT_INT_ACK_DEFAULT QURT_INT_NON_DELAYED_ACK
+#define QURT_INT_DRV_DEFAULT 0
+#define QURT_INT_PRIORITY_DEFAULT 0xFF
+
+/** QuRT interrupt property. */
+#define QURT_INT_CONFIGID_POLARITY 0x1U /**< */
+#define QURT_INT_CONFIGID_LOCK     0x2U /**< */
+
+/** QuRT interrupt lock.*/
+#define QURT_INT_LOCK_DEFAULT 0x0 /**< Default. */
+#define QURT_INT_LOCK_DISABLE 0x0 /**< Interrupt can be enabled or disabled or deregistered. */
+#define QURT_INT_LOCK_ENABLE  0x1 /**< Interrupt is locked and cannot be enabled, disabled, or deregistered.*/
+/** @} */ /* end_addtogroup interrupts_constants */
+
+/** @addtogroup Qurt_interrupt_type
+@{ */
+/** Trigger type bit fields for a PDC interrupt:\n
+    @verbatim
+    Polarity Edge  Output\n
+    0        00    Level sensitive active low
+    0        01    Rising edge sensitive
+    0        10    Falling edge sensitive
+    0        11    Dual edge sensitive
+    1        00    Level sensitive active high
+    1        01    Falling edge sensitive
+    1        10    Rising edge sensitive
+    1        11    Dual edge sensitive
+    @endverbatim
+*/
+#define QURT_INT_TRIGGER_TYPE_SET(pol, edge)   ((((pol) & 0x01U) << 2) | ((edge) & 0x03U)) /**< */
+
+#define QURT_INT_TRIGGER_LEVEL_LOW      QURT_INT_TRIGGER_TYPE_SET(0U, 0x00U) /**< */
+#define QURT_INT_TRIGGER_LEVEL_HIGH     QURT_INT_TRIGGER_TYPE_SET(1U, 0x00U) /**< */
+#define QURT_INT_TRIGGER_RISING_EDGE    QURT_INT_TRIGGER_TYPE_SET(1U, 0x02U) /**< */
+#define QURT_INT_TRIGGER_FALLING_EDGE   QURT_INT_TRIGGER_TYPE_SET(0U, 0x02U) /**< */
+#define QURT_INT_TRIGGER_DUAL_EDGE      QURT_INT_TRIGGER_TYPE_SET(0U, 0x03U) /**< */
+#define QURT_INT_TRIGGER_USE_DEFAULT    0xffU /**< */
+/** @} */ /* end_addtogroup Qurt_interrupt_type */
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+/**@ingroup func_qurt_interrupt_register
+  @xreflabel{sec:interrupt_register}
+  Registers the interrupt.\n
+  Enables the specified interrupt and associates it with the specified QuRT signal object and
+  signal mask.
+
+  Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1
+  indicates that a signal must be waited on, and 0 indicates not to wait.
+
+  When the interrupt occurs, the signal specified in the signal mask is set in the signal
+  object. An IST conventionally waits on that signal to
+  handle the interrupt. The thread that registers the interrupt is set as the IST.
+
+  Up to 31 separate interrupts can be registered to a single signal object, as determined by
+  the number of individual signals the object can store. QuRT reserves signal 31. Thus a
+  single IST can handle several different interrupts.
+
+  QuRT reserves some interrupts for internal use -- the remainder are available for use by
+  applications, and thus are valid interrupt numbers. If the specified interrupt number is
+  outside the valid range, the register operation returns the status value #QURT_EINT.
+
+  Only one thread can be registered at a time to a specific interrupt. Attempting to register
+  an already-registered interrupt returns the status value #QURT_EVAL.
+
+  Only one signal bit in a signal object can be registered at a time to a specific interrupt.
+  Attempting to register multiple signal bits to an interrupt returns the status value
+  #QURT_ESIG.
+
+  When the signal registers an interrupt, QuRT can only set its signal bits
+  when receiving the interrupt. The QuRT signal API from another
+  software thread cannot set the signal even for unused signal bits.
+
+  @note1hang The valid range for an interrupt number can differ on target execution
+             environments other than the simulator. For more information, see the
+             appropriate hardware document.
+
+  @datatypes
+  #qurt_anysignal_t
+
+  @param[in] int_num      L2VIC interrupt to register; valid range is 0 to 1023.
+  @param[in] int_signal   Any-signal object to wait on (Section @xref{dox:any_signals}).
+  @param[in] signal_mask  Signal mask value indicating signal to receive the interrupt.
+
+  @return
+  #QURT_EOK -- Interrupt successfully registered.\n
+  #QURT_EINT -- Invalid interrupt number. \n
+  #QURT_ESIG -- Invalid signal bitmask (cannot set more than one
+                signal at a time). \n
+  #QURT_EVAL -- Interrupt already registered.
+
+  @dependencies
+  None.
+*/
+ unsigned int qurt_interrupt_register(int int_num, qurt_anysignal_t *int_signal, int signal_mask);
+
+/**@ingroup func_qurt_interrupt_register2
+  @xreflabel{sec:interrupt_register2}
+  Registers the interrupt.\n
+  Enables the specified interrupt, associates it with the specified QuRT signal object and
+  signal mask, and sets interrupt flags.
+
+  Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1
+  indicates that a signal must be waited on, and 0 indicates not to wait.
+
+  When the interrupt occurs, the signal specified in the signal mask is set in the signal
+  object. An IST conventionally waits on that signal to
+  handle the interrupt. The thread that registers the interrupt is set as the IST.
+
+  Up to 31 separate interrupts can be registered to a single signal object, as determined by
+  the number of individual signals that the object can store. QuRT reserves signal 31. Thus a
+  single IST can handle several different interrupts.
+
+  QuRT reserves some interrupts for internal use -- the remainder are available for use by
+  applications, and thus are valid interrupt numbers. If the specified interrupt number is
+  outside the valid range, the register operation returns the status value #QURT_EINT.
+
+  Only one thread can be registered at a time to a specific interrupt. Attempting to register
+  an already-registered interrupt returns the status value #QURT_EVAL.
+
+  Only one signal bit in a signal object can be registered at a time to a specific interrupt.
+  Attempting to register multiple signal bits to an interrupt returns the status value
+  #QURT_ESIG.
+
+  When the signal registers an interrupt, QuRT can only set its signal bits
+  when receiving the interrupt. The QuRT signal API from another
+  software thread cannot set the signal even for unused signal bits.
+
+  @note1hang The valid range for an interrupt number can differ on target execution
+             environments other than the simulator. For more information, see the
+             appropriate hardware document.
+
+  @datatypes
+  #qurt_anysignal_t
+
+  @param[in] int_num      L2VIC interrupt to register; valid range is 0 to 1023.
+  @param[in] int_signal   Any-signal object to wait on (Section @xref{dox:any_signals}).
+  @param[in] signal_mask  Signal mask value indicating signal to receive the interrupt.
+  @param[in] flags        Defines the interrupt property; the supported property is interrupt lock enable/disable.
+                          Possible values for flags: \n
+                          - #QURT_INT_LOCK_ENABLE
+                          - #QURT_INT_LOCK_DISABLE @tablebulletend
+
+  @return
+  #QURT_EOK -- Interrupt successfully registered.\n
+  #QURT_EINT -- Invalid interrupt number. \n
+  #QURT_ESIG -- Invalid signal bitmask (cannot set more than one
+                signal at a time). \n
+  #QURT_EVAL -- Interrupt already registered.
+
+  @dependencies
+  None.
+*/
+ unsigned int qurt_interrupt_register2(int int_num, qurt_anysignal_t *int_signal, int signal_mask, unsigned int flags);
+/*
+ * Waits for registered interrupt signal
+
+ * Suspends the current thread until one of its registered interrupts occurs. The second input mask
+ * contains the interrupt signals the IST expects to receive. The interrupt signals are registered
+ * with interrupts via the qurt_interrupt_register() API.
+ *
+ * The signals returned in the signal variable indicate which interrupts occurred. Use function
+ * qurt_anysignal_get to read the signals. The IST must locally maintain a table that maps a signal to
+ * a specific interrupt. The IST also checks if signal #SIG_INT_ABORT is received. If so, the IST
+ * must quit the interrupt receiving loop.
+ *
+ * For detailed information on this API, see QuRT User Manual Section 4.2.5
+ *
+ * Prototype
+ *
+ * unsigned int qurt_anysignal_wait(qurt_anysignal_t *int_signal, unsigned int mask)
+ */
+
+/**@ingroup func_qurt_interrupt_acknowledge
+  Acknowledges an interrupt after it has been processed.\n
+  Re-enables an interrupt and clears its pending status. This is done after an interrupt is
+  processed by an IST.
+
+  Interrupts are automatically disabled after they occur. To re-enable an interrupt, an IST
+  performs the acknowledge operation after it has finished processing the interrupt and
+  just before suspending itself (such as by waiting on the interrupt signal).
+
+  @note1hang To prevent losing or reprocessing subsequent occurrences of the interrupt,
+             an IST must clear the interrupt signal (Section @xref{sec:anysignal_clear}) before
+             acknowledging the interrupt.
+
+  @param[in] int_num Interrupt that is being re-enabled.
+
+  @return
+  #QURT_EOK -- Interrupt acknowledge was successful. \n
+  #QURT_EDEREGISTERED -- Interrupt is already de-registered.
+
+  @dependencies
+  None.
+*/
+int qurt_interrupt_acknowledge(int int_num);
+
+/**@ingroup func_qurt_interrupt_deregister
+  Disables the specified interrupt and disassociates it from a QuRT signal object.
+  If the specified interrupt was never registered (Section @xref{sec:interrupt_register}), the deregister operation
+  returns the status value #QURT_EINT.
+
+  @note1hang If an interrupt is deregistered while an IST waits
+             to receive it, the IST might wait indefinitely for the interrupt to occur. To avoid
+             this problem, the QuRT kernel sends the signal #SIG_INT_ABORT to awaken an
+             IST after determining that it has no interrupts registered.
+
+  @param[in] int_num L2VIC interrupt to deregister; valid range is 0 to 1023.
+
+  @return
+  #QURT_EOK -- Success.\n
+  #QURT_EINT -- Invalid interrupt number (not registered).
+
+  @dependencies
+  None.
+
+*/
+unsigned int qurt_interrupt_deregister(int int_num);
+/** @endcond */
+
+/**@ingroup func_qurt_interrupt_disable
+  Disables an interrupt with its interrupt number.\n
+  The interrupt must be registered prior to calling this function.
+  After qurt_interrupt_disable() returns, the Hexagon subsystem
+  can no longer send the corresponding interrupt to the Hexagon
+  core, until qurt_interrupt_enable() is called
+  for the same interrupt.
+
+  Avoid calling qurt_interrupt_disable() and qurt_interrupt_enable() frequently within
+  a short period of time.\n
+  - A pending interrupt can already be in the Hexagon core when qurt_interrupt_disable()
+    is called. Therefore, some time later, the pending interrupt is received on a Hexagon
+    hardware thread.\n
+  - After the Hexagon subsystem sends an interrupt to the Hexagon core, the Hexagon
+    hardware automatically disables the interrupt until kernel software re-enables the interrupt
+    at the interrupt acknowledgement stage. If qurt_interrupt_enable() is called from a certain
+    thread at an earlier time, the interrupt is re-enabled earlier and can trigger
+    sending a new interrupt to the Hexagon core while kernel software is still processing
+    the previous interrupt.
+
+  @param[in] int_num Interrupt number.
+
+  @return
+  #QURT_EOK -- Interrupt successfully disabled.\n
+  #QURT_EINT -- Invalid interrupt number.\n
+  #QURT_ENOTALLOWED -- Interrupt is locked. \n
+  #QURT_EVAL -- Interrupt is not registered.
+
+  @dependencies
+  None.
+*/
+ unsigned int qurt_interrupt_disable(int int_num);
+
+
+/**@ingroup func_qurt_interrupt_enable
+  Enables an interrupt with its interrupt number.\n
+  The interrupt must be registered prior to calling this function.
+
+  @param[in] int_num Interrupt number.
+
+  @return
+  #QURT_EOK -- Interrupt successfully enabled.\n
+  #QURT_EINT -- Invalid interrupt number.\n
+  #QURT_ENOTALLOWED -- Interrupt is locked. \n
+  #QURT_EVAL -- Interrupt is not registered.
+
+  @dependencies
+  None.
+
+*/
+ unsigned int qurt_interrupt_enable(int int_num);
+
+
+/**@ingroup func_qurt_interrupt_status
+  Returns a value that indicates the pending status of the specified interrupt.
+
+  @param[in] int_num  Interrupt number that is being checked.
+  @param[out] status  Interrupt status; 1 indicates that an interrupt is
+                      pending, 0 indicates that an interrupt is not pending.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EINT -- Failure; invalid interrupt number.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_interrupt_status(int int_num, int *status);
+
+
+/**@ingroup func_qurt_interrupt_get_status
+  Gets the status of the specified interrupt in L2VIC.
+
+  @param[in] int_num      Interrupt number that is being checked.
+  @param[in] status_type  0 -- interrupt pending status \n
+                          1 -- interrupt enabling status
+  @param[out] status      0 -- OFF \n
+                          1 -- ON
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EINT -- Failure; invalid interrupt number.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_interrupt_get_status(int int_num, int status_type, int *status);
+
+/** @cond rest_reg_dist */
+/**@ingroup func_qurt_interrupt_clear
+  Clears the pending status of the specified interrupt.
+
+  @note1hang This operation is intended for system-level use, and must be used with care.
+
+  @param[in] int_num Interrupt that is being re-enabled.
+
+  @return
+  #QURT_EOK -- Success.\n
+  #QURT_EINT -- Invalid interrupt number.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_interrupt_clear(int int_num);
+
+
+/**@ingroup func_qurt_interrupt_get_config
+  Gets the L2VIC interrupt configuration. \n
+  This function returns the type and polarity of the specified L2VIC interrupt.
+
+  @param[in] int_num        L2VIC interrupt that is being re-enabled.
+  @param[out] int_type      Pointer to an interrupt type.
\n
+                            0 -- Level-triggered interrupt \n
+                            1 -- Edge-triggered interrupt
+  @param[out] int_polarity  Pointer to interrupt polarity.\n
+                            0 -- Active-high interrupt \n
+                            1 -- Active-low interrupt.
+
+  @return
+  #QURT_EOK -- Configuration successfully returned.\n
+  #QURT_EINT -- Invalid interrupt number.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_interrupt_get_config(unsigned int int_num, unsigned int *int_type, unsigned int *int_polarity);
+
+/**@ingroup func_qurt_interrupt_set_config
+  Sets the type and polarity of the specified L2VIC interrupt.
+
+  @note1hang Deregister L2VIC interrupts before reconfiguring them.
+
+  @param[in] int_num       L2VIC interrupt that is being re-enabled.
+  @param[in] int_type      Interrupt type. \n
+                           0 -- Level-triggered interrupt\n
+                           1 -- Edge-triggered interrupt
+  @param[in] int_polarity  Interrupt polarity. \n
+                           0 -- Active-high interrupt \n
+                           1 -- Active-low interrupt
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_ENOTALLOWED -- Not allowed; the interrupt is being registered.\n
+  #QURT_EINT -- Invalid interrupt number.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_interrupt_set_config(unsigned int int_num, unsigned int int_type, unsigned int int_polarity);
+
+/**@ingroup func_qurt_interrupt_set_config2
+  Sets the type and polarity of the specified L2VIC interrupt.
+
+  @note1hang L2VIC interrupts must be deregistered before they can be reconfigured.
+
+  @param[in] int_num   L2VIC interrupt that is being re-enabled.
+  @param[in] int_type  Interrupt type, notified to the hardware configuration callback function and used to
+                       modify the L2VIC type. Possible values: \n
+                       - #QURT_INT_TRIGGER_USE_DEFAULT \n
+                       - #QURT_INT_TRIGGER_LEVEL_HIGH \n
+                       - #QURT_INT_TRIGGER_LEVEL_LOW \n
+                       - #QURT_INT_TRIGGER_RISING_EDGE \n
+                       - #QURT_INT_TRIGGER_FALLING_EDGE \n
+                       - #QURT_INT_TRIGGER_DUAL_EDGE @tablebulletend
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_ENOTALLOWED -- Not allowed; the interrupt is being registered.\n
+  #QURT_EINT -- Invalid interrupt number.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_interrupt_set_config2(unsigned int int_num, unsigned int int_type);
+
+/**@ingroup func_qurt_interrupt_set_config3
+  Sets the specified configuration value for the specified property of the specified L2VIC interrupt.
+
+  @note1hang L2VIC interrupts must be deregistered before they can be reconfigured for polarity.
+
+  @param[in] int_num     L2VIC interrupt to re-enable.
+  @param[in] config_id   Property to configure: \n
+                         - #QURT_INT_CONFIGID_POLARITY \n
+                         - #QURT_INT_CONFIGID_LOCK @tablebulletend
+  @param[in] config_val  Dependent on the second argument config_id, specifies the value to set. \n
+                         Values for #QURT_INT_CONFIGID_POLARITY: \n
+                         - #QURT_INT_TRIGGER_USE_DEFAULT \n
+                         - #QURT_INT_TRIGGER_LEVEL_HIGH \n
+                         - #QURT_INT_TRIGGER_LEVEL_LOW \n
+                         - #QURT_INT_TRIGGER_RISING_EDGE \n
+                         - #QURT_INT_TRIGGER_FALLING_EDGE \n
+                         - #QURT_INT_TRIGGER_DUAL_EDGE \n
+
+                         Values for #QURT_INT_CONFIGID_LOCK: \n
+                         - #QURT_INT_LOCK_ENABLE\n
+                         - #QURT_INT_LOCK_DISABLE @tablebulletend
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_ENOTALLOWED -- Not allowed; the interrupt is being registered or is locked for enable/disable.\n
+  #QURT_EINT -- Invalid interrupt number.
+
+  @dependencies
+  None.
+*/
+unsigned int qurt_interrupt_set_config3(unsigned int int_num, unsigned int config_id, unsigned int config_val);
+
+
+/**@ingroup func_qurt_interrupt_raise
+  Raises the interrupt. \n
+  This function triggers a level-triggered L2VIC
+  interrupt, and accepts interrupt numbers in the range of 0 to 1023.
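+
+  A minimal sketch (illustrative only; 45 is a hypothetical interrupt
+  number that is assumed to be valid on the target):
+  @code
+  if (qurt_interrupt_raise(45U) != QURT_EOK) {
+      // interrupt not supported on this target
+  }
+  @endcode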
+
+  @param[in] interrupt_num Interrupt number.
+
+  @return
+  #QURT_EOK -- Success \n
+  -1 -- Failure; the interrupt is not supported.
+
+  @dependencies
+  None.
+ */
+int qurt_interrupt_raise(unsigned int interrupt_num);
+
+/**@ingroup func_qurt_interrupt_raise2
+  Raises the interrupt and returns the current pcycle value.
+
+  @param[in] interrupt_num Interrupt number.
+
+  @return
+  0xFFFFFFFFFFFFFFFF -- Failure; the interrupt is not supported.\n
+  Other value -- pcycle count at the time the interrupt is raised.
+
+  @dependencies
+  None.
+ */
+unsigned long long qurt_interrupt_raise2(unsigned int interrupt_num);
+/** @endcond */
+
+/** @cond internal_only */
+/**@ingroup func_qurt_isr_subcall
+  Indicates whether the current function is called from a callback procedure (either short or long).
+
+  @return
+  #QURT_EOK -- TRUE \n
+  #QURT_EVAL -- FALSE.
+
+  @dependencies
+  None.
+ */
+int qurt_isr_subcall(void);
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_INT_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_island.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_island.h
new file mode 100755
index 0000000000000..f0c8ee27cf8b0
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_island.h
@@ -0,0 +1,122 @@
+#ifndef QURT_ISLAND_H
+#define QURT_ISLAND_H
+
+/**
+  @file qurt_island.h
+  @brief Prototypes of power API
+  The APIs allow entering and exiting island mode where the memory
+  accesses are limited to local memory.
+
+  EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2018-2021,2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+
+=============================================================================*/
+
+#include 
+#include 
+#include 
+#include 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup func_qurt_island_get_status
+  Gets Island mode status.
+
+  Returns a value that indicates whether the QuRT system executes in Island mode.
+
+  @return
+  0 - Normal mode. \n
+  1 - Island mode.
+
+  @dependencies
+  None.
+*/
+unsigned int qurt_island_get_status (void);
+
+/**@ingroup func_qurt_island_get_status2
+  Gets Island mode status, especially to differentiate between island partial exit and complete exit.
+
+  Returns a value that indicates the current state.
+
+  @note1hang Transition from NORMAL mode to ISLAND mode happens in single-threaded
+             mode, whereas transitions from ISLAND mode to other modes
+             happen in multi-threaded mode. Therefore, a thread that reads the island mode
+             status as NORMAL can assume the status stays the same while it continues to
+             run. A thread that reads the island mode status as ISLAND should
+             assume that the status can change to EXITING or NORMAL while it
+             runs. A thread that reads the island mode status as EXITING should
+             assume that the status can change to NORMAL while it runs. If
+             the thread goes to a wait state after reading the status, it should get
+             the island mode state again and not assume the previous state.
+  @note2hang This API returns more intrinsic states than qurt_island_get_status();
+             when qurt_island_get_status() returns 0, this API could return
+             QURT_ISLAND_MODE_EXITING or QURT_ISLAND_MODE_ISLAND
+
+  @param[in,out] data Field reserved for future use. If a NULL pointer is passed,
+             the field is ignored. If a valid pointer is passed,
+             QuRT returns a bitmask that can be interpreted as follows:
+             data[31] - Valid bit.
Set to 1 to indicate data[30:0] are valid.
+             Otherwise set to 0.
+             data[30:0] – Reserved for future definition.
+
+  @return
+  QURT_ISLAND_MODE_NORMAL - Normal mode \n
+  QURT_ISLAND_MODE_ISLAND - Island mode \n
+  QURT_ISLAND_MODE_EXITING - Exiting Island mode \n
+
+  @dependencies
+  None.
+*/
+unsigned int qurt_island_get_status2 (unsigned int *data);
+
+
+
+/**@ingroup func_qurt_island_get_exit_status
+  Gets the reason for the last Island mode exit.
+
+  @param[out] cause_code  Pointer that returns the cause code of the last
+                          island exit reason. \n
+         - #QURT_EISLANDUSEREXIT -- Island exit due to user call for island exit.\n
+         - #QURT_ENOISLANDENTRY -- API called before exiting island. \n
+         - #QURT_EISLANDINVALIDINT -- Island exit due to an invalid interrupt in Island mode. @tablebulletend
+
+  @param[out] int_num  Pointer that holds the invalid interrupt number that caused
+                       island exit when the cause code is #QURT_EISLANDINVALIDINT.
+                       For other cases, it is -1.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_island_get_exit_status(unsigned int *cause_code, int *int_num);
+
+/**@ingroup func_qurt_island_get_enter_timestamp
+  Gets the recent timestamp when the system exits STM during island enter.
+
+  @param[out] island_enter_timestamp  Returns a pointer to the recent timestamp
+              recorded after the system exits STM during island enter. If the system never
+              attempts to enter island, the island_enter_timestamp return pointer holds a value
+              of zero.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_island_get_enter_timestamp(unsigned long long *island_enter_timestamp);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_ISLAND_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_isr.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_isr.h
new file mode 100755
index 0000000000000..db29ea2f265d7
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_isr.h
@@ -0,0 +1,177 @@
+#ifndef QURT_ISR_H
+#define QURT_ISR_H
+
+/*=====================================================================
+
+ @file  qurt_isr.h
+
+ @brief  Prototypes of QuRT ISR API functions
+
+ EXTERNALIZED FUNCTIONS
+ none
+
+ INITIALIZATION AND SEQUENCING REQUIREMENTS
+ none
+
+ Copyright (c) 2017, 2021  by Qualcomm Technologies, Inc.  All Rights Reserved.
+ Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#include 
+#include 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+                        Functions
+=============================================================================*/
+
+
+/**@ingroup func_qurt_isr_set_hw_config_callback
+  Sets the callback function for the configuration related to interrupt hardware.
+  In a process, the callback function can be set only once.
+
+  @param[in] cb_addr  Address of the callback function.
+
+  @return
+  #QURT_EOK -- The callback function is set successfully. \n
+  #QURT_EFAILED -- Failure. The callback function has been set before.
+
+  @dependencies
+  None.
+ */
+int qurt_isr_set_hw_config_callback(unsigned int cb_addr);
+
+
+/**@ingroup func_qurt_isr_set_hw_enable_callback
+  Sets the callback function for enabling the configuration related to interrupt hardware.
+  In a process, the callback function can be set only once.
+
+  @param[in] cb_addr  Address of the callback function.
+
+  @return
+  #QURT_EOK -- The callback function is set successfully. \n
+  #QURT_EFAILED -- Failure. The callback function has been set before.
+
+  @dependencies
+  None.
+ */
+int qurt_isr_set_hw_enable_callback(unsigned int cb_addr);
+
+
+/**@ingroup func_qurt_isr_set_hw_disable_callback
+  Sets the callback function for disabling the configuration related to interrupt hardware.
+  In a process, the callback function can be set only once.
+
+  @param[in] cb_addr  Address of the callback function.
+
+  @return
+  #QURT_EOK -- The callback function is set successfully. \n
+  #QURT_EFAILED -- Failure. The callback function has been set before.
+
+  @dependencies
+  None.
+ */
+int qurt_isr_set_hw_disable_callback(unsigned int cb_addr);
+
+
+/**@ingroup func_qurt_isr_create
+  Creates an ISR thread with the specified attributes, and makes it executable.
+
+  @datatypes
+  #qurt_thread_t \n
+  #qurt_thread_attr_t
+
+  @param[out] thread_id  Returns a pointer to the thread identifier if the thread was
+                         successfully created.
+  @param[in]  pAttr      Pointer to the initialized thread attribute structure that specifies
+                         the attributes of the created thread.
+
+  @return
+  #QURT_EVAL -- Invalid arguments. \n
+  #QURT_EOK -- Thread created. \n
+  #QURT_EFAILED -- Thread not created.
+
+  @dependencies
+  None.
+ */
+int qurt_isr_create (qurt_thread_t *thread_id, qurt_thread_attr_t *pAttr);
+
+/**@ingroup func_qurt_isr_register2
+  Registers an interrupt service routine (ISR) callback with the specified attributes to an ISR thread.
+  The interrupt is enabled when this function returns success.
+
+  @datatypes
+  qurt_thread_t
+
+  @param[in] isr_thread_id  ISR thread ID, returned from qurt_isr_create()
+  @param[in] int_num        The interrupt number
+  @param[in] prio           Priority of the ISR
+  @param[in] flags          Defines the ACK type. Values: \n
+                            QURT_INT_NON_DELAYED_ACK - The ISR is acknowledged by the interrupt handling routine
+                            in the kernel.
+                            QURT_INT_DELAYED_ACK - The client chooses to acknowledge.
+  @param[in] int_type       Interrupt trigger type, notified to the registered function. Values: \n
+                            - QURT_INT_TRIGGER_USE_DEFAULT
+                            - QURT_INT_TRIGGER_LEVEL_HIGH
+                            - QURT_INT_TRIGGER_LEVEL_LOW
+                            - QURT_INT_TRIGGER_RISING_EDGE
+                            - QURT_INT_TRIGGER_FALLING_EDGE
+                            - QURT_INT_TRIGGER_DUAL_EDGE
+  @param[in] isr            Interrupt service routine with prototype void isr (void *arg, int int_num)
+  @param[in] arg            First argument passed to the ISR when it is called to service the interrupt
+
+  @return
+  QURT_EOK -- Successfully registered the ISR for the interrupt
+  QURT_EINT -- Interrupt not configured
+  QURT_EINVALID -- Invalid Thread ID
+  QURT_EDISABLED -- The feature is disabled
+  QURT_EDUPLICATE -- Interrupt is already registered
+
+  @dependencies
+  Thread ID should be created using qurt_isr_create()
+ */
+int qurt_isr_register2 (qurt_thread_t isr_thread_id, int int_num, unsigned short prio, unsigned short flags, unsigned int int_type, void (*isr) (void *, int), void *arg);
+
+/**@ingroup func_qurt_isr_deregister2
+  De-registers the ISR for the specified interrupt.
+  The interrupt is disabled when this function returns success.
+
+  @param[in] int_num  The interrupt number
+
+  @return
+  QURT_EOK -- ISR deregistered successfully
+  QURT_ENOREGISTERED -- Interrupt with int_num is not registered
+
+  @dependencies
+  None.
+ */
+int qurt_isr_deregister2 (int int_num);
+
+/**@ingroup func_qurt_isr_delete
+  Causes the ISR thread to exit and releases its kernel resources.
+
+  @note1hang The ISR thread must not be actively processing interrupts;
+             otherwise the call fails and returns an error.
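+
+  A minimal lifecycle sketch (illustrative only; my_isr and interrupt
+  number 45 are hypothetical, error checks are elided, and the thread
+  attributes are assumed to be initialized with the usual
+  qurt_thread_attr_init() initializer):
+  @code
+  extern void my_isr(void *arg, int int_num); // hypothetical handler
+
+  qurt_thread_t tid;
+  qurt_thread_attr_t attr;
+  qurt_thread_attr_init(&attr);
+  qurt_isr_create(&tid, &attr);
+  qurt_isr_register2(tid, 45, 100, QURT_INT_NON_DELAYED_ACK,
+                     QURT_INT_TRIGGER_USE_DEFAULT, my_isr, NULL);
+  // ... my_isr(arg, 45) runs on the ISR thread when the interrupt fires ...
+  qurt_isr_deregister2(45);
+  qurt_isr_delete(tid);
+  @endcode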
+
+  @param[in] isr_tid  Thread ID of the ISR thread to delete.
+
+  @return
+  QURT_ENOTALLOWED -- ISR thread is processing an interrupt
+  QURT_EINVALID -- Invalid ISR thread ID
+  QURT_EOK -- Success
+
+  @dependencies
+  Thread ID should be created using qurt_isr_create()
+ */
+int qurt_isr_delete (qurt_thread_t isr_tid);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_ISR_H */
+
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_l2cfg.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_l2cfg.h
new file mode 100755
index 0000000000000..7e26b30a580d9
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_l2cfg.h
@@ -0,0 +1,98 @@
+#ifndef QURT_L2CFG_H
+#define QURT_L2CFG_H
+/**
+  @file qurt_l2cfg.h
+  @brief QuRT APIs for L2 configuration and system configuration
+
+EXTERNAL FUNCTIONS
+   qurt_l2cfg_set
+   qurt_l2cfg_get
+   qurt_system_config_get
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2019-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+ CONSTANTS AND MACROS
+=============================================================================*/
+
+/* Definition for system configuration */
+/** @addtogroup l2cfg_macros
+@{ */
+#define QURT_CORE_CFG_HMX_INT8_SPATIAL  0x78 /**< HMX fixed-point spatial size */
+#define QURT_CORE_CFG_HMX_INT8_DEPTH    0x7C /**< HMX fixed-point output depth */
+/** @} */ /* end_addtogroup l2cfg_macros */
+/*=============================================================================
+ FUNCTIONS
+=============================================================================*/
+/**@ingroup func_qurt_l2cfg_set
+  Sets the value of a L2 configuration register. A register can be set *IFF* its
+  initial value is configured.
+
+  @param[in] offset  Offset of L2 configuration register; must be multiple of 4.
+  @param[in] value   Value to set the register to.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EFAILED -- Internal mapping that covers L2CFG register file absent; likely
+                   a configuration problem. \n
+  #QURT_EINVALID -- Argument error. \n
+  #QURT_ENOTALLOWED -- Setting this register is prohibited.
+
+  @dependencies
+  None.
+ */
+int qurt_l2cfg_set (unsigned short offset, unsigned int value);
+
+/**@ingroup func_qurt_l2cfg_get
+  Gets the value of a L2 configuration register.
+
+  @param[in] offset  Offset of L2 configuration register; must be multiple of 4.
+  @param[out] value  Pointer to value of the register.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EFAILED -- Internal mapping that covers L2CFG register file absent;
+                   likely a configuration problem. \n
+  #QURT_EINVALID -- Argument error.
+
+  @dependencies
+  None.
+
+ */
+int qurt_l2cfg_get (unsigned short offset, unsigned int * value);
+
+
+/**@ingroup func_qurt_system_config_get
+  Gets the system configuration information.
+
+  @param[in] index  Index to system configuration. Values:\n
+                    - #QURT_CORE_CFG_HMX_INT8_SPATIAL \n
+                    - #QURT_CORE_CFG_HMX_INT8_DEPTH @tablebulletend
+
+  @param[out] data  Pointer to a word for returned data.
+
+  @return
+  #QURT_EOK -- Configuration data retrieved successfully. \n
+  Other values -- Failure (no such configuration available).
+
+  @dependencies
+  None.
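+
+  A minimal usage sketch (illustrative only):
+  @code
+  unsigned int spatial = 0;
+  if (qurt_system_config_get(QURT_CORE_CFG_HMX_INT8_SPATIAL, &spatial) == QURT_EOK) {
+      // spatial now holds the HMX fixed-point spatial size
+  }
+  @endcode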
+
+ */
+int qurt_system_config_get(int index, unsigned int *data);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_L2CFG_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_lifo.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_lifo.h
new file mode 100755
index 0000000000000..dc399fccc5f0f
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_lifo.h
@@ -0,0 +1,71 @@
+#ifndef QURT_LIFO_H
+#define QURT_LIFO_H
+/**
+  @file qurt_lifo.h
+
+  @brief
+  Provides a lock-free last-in-first-out (LIFO) algorithm, which can be used in a
+  variety of situations to allocate and free fixed-size buffers.
+  This implementation touches the first word of a FREED buffer. Although it
+  does not matter how the buffer is used while it is allocated, be careful
+  not to place a MAGIC number in the first field, because that field will
+  not hold the magic value while the buffer is freed.
+
+  EXTERNALIZED FUNCTIONS
+  None
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None
+
+  Copyright (c) 2013, 2021  by Qualcomm Technologies, Inc.  All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ /*=====================================================================
+  Functions
+ ======================================================================*/
+
+/*======================================================================*/
+/**
+  Pops an element out of the LIFO.
+
+  @param[in] freelist  Pointer to the head of your list.
+
+  @return
+  Top object from the list.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+void * qurt_lifo_pop(void *freelist);
+
+
+/*======================================================================*/
+/**
+  Pushes an element into the LIFO.
+
+  @param[in] freelist  Pointer to the head of your list.
+  @param[in] buf       Pointer to your buffer to push into the list.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+void qurt_lifo_push(void *freelist, void *buf);
+
+/* Removes the specified buffer from the LIFO free list. */
+void qurt_lifo_remove(void *freelist, void *buf);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_LIFO_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_mailbox.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_mailbox.h
new file mode 100755
index 0000000000000..a6cd91c611782
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_mailbox.h
@@ -0,0 +1,176 @@
+#ifndef QURT_MAILBOX_H
+#define QURT_MAILBOX_H
+
+/**
+  @file qurt_mailbox.h
+  @brief Definitions, macros, and prototypes used for the QuRT mailbox
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2015, 2021-2023  by Qualcomm Technologies, Inc.  All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/* Definitions on typedef and return values */ + +#define QURT_MAILBOX_ID_NULL 0 +#define QURT_MAILBOX_ERROR -1 +#define QURT_MAILBOX_ID_ERROR -2 +#define QURT_MAILBOX_NON_VALID_DATA -3 +#define QURT_MAILBOX_FULL -4 +#define QURT_MAILBOX_DELETED -5 +#define QURT_MAILBOX_RECEIVE_HALTED -6 +#define QURT_MAILBOX_BANDWIDTH_LIMIT -7 + + +/*============================================================================= + FORWARD DECLARATIONS & TYPEDEFS +=============================================================================*/ + +#define QURT_MAILBOX_AT_QURTOS 0U // Receiver is QurtOS +#define QURT_MAILBOX_AT_ROOTPD 1U // Receiver is RootPD (ASID=0) +#define QURT_MAILBOX_AT_USERPD 2U // Receiver is User PD (ASID!=0) +#define QURT_MAILBOX_AT_SECUREPD 3U // Receiver is Secure PD + +typedef unsigned char qurt_mailbox_receiver_cfg_t; + +#define QURT_MAILBOX_SEND_OVERWRITE 0U // When there is already valid content, overwrite it +#define QURT_MAILBOX_SEND_NON_OVERWRITE 1U // When there is already valid content, return failure + +typedef unsigned char qurt_mailbox_send_option_t; + + +#define QURT_MAILBOX_RECV_WAITING 0U // When there is no valid content, wait for it +#define QURT_MAILBOX_RECV_NON_WAITING 1U // When there is no valid content, return failure immediately +#define QURT_MAILBOX_RECV_PEEK_NON_WAITING 2U // Read the content, but doesn't remove it from the mailbox. No waiting. + +typedef unsigned char qurt_mailbox_recv_option_t; + + +/*============================================================================= + EXTERNS & FUNCTIONS +=============================================================================*/ +/* Function prototype */ + +/**@ingroup qurt_mailbox_create + Creates a QuRT mailbox. + + @param name Mailbox name up to 8 characters. + @param recv_opt Configuration on the receiver process. + + @return + Mailbox ID -- Mailbox Identifier \n + #QURT_MAILBOX_ID_NULL -- NULL, failure at creating mailbox + + @dependencies + None. +*/ +unsigned long long qurt_mailbox_create(char *name, qurt_mailbox_receiver_cfg_t recv_opt); + + +/**@ingroup qurt_mailbox_get_id + Gets a QuRT mailbox identifier. + + @param name Mailbox name up to 8 characters. + + @return + Mailbox ID -- Mailbox identifier \n + #QURT_MAILBOX_ID_NULL -- NULL, failure at getting mailbox ID + + @dependencies + None. +*/ +unsigned long long qurt_mailbox_get_id(char *name); + + +/**@ingroup qurt_mailbox_send + Sends data to a QuRT mailbox. + + @param mailbox_id Mailbox identifier. + @param send_opt Option for mailbox send. + @param data Data to send. + + + @return + #QURT_EOK Success \n + #QURT_MAILBOX_ID_ERROR Mailbox ID error.\n + #QURT_MAILBOX_ERROR Other errors.\n + #QURT_MAILBOX_FULL Valid data already exists, non-overwriting.\n + #QURT_MAILBOX_BANDWIDTH_LIMIT Reached the bandwidth limitation. + + @dependencies + None. 
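+
+  A minimal send sketch (illustrative only; "navmbx" is a hypothetical
+  mailbox name):
+  @code
+  unsigned long long id = qurt_mailbox_get_id("navmbx");
+  if (id != QURT_MAILBOX_ID_NULL) {
+      int rc = qurt_mailbox_send(id, QURT_MAILBOX_SEND_NON_OVERWRITE, 0x1234ULL);
+      if (rc == QURT_MAILBOX_FULL) {
+          // valid data already present; retry later or overwrite
+      }
+  }
+  @endcode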
+*/
+int qurt_mailbox_send(unsigned long long mailbox_id, qurt_mailbox_send_option_t send_opt, unsigned long long data);
+
+
+/**@ingroup qurt_mailbox_receive
+  Receives data from a QuRT mailbox.
+
+  @param mailbox_id  Mailbox identifier
+  @param recv_opt    Option for mailbox receiving
+  @param data        Pointer to data buffer for receiving
+
+  @return
+  #QURT_EOK                     Success \n
+  #QURT_MAILBOX_ID_ERROR        Mailbox ID error. \n
+  #QURT_MAILBOX_ERROR           Other errors. \n
+  #QURT_MAILBOX_NON_VALID_DATA  No current valid data; the previous content is placed in the buffer. \n
+  #QURT_MAILBOX_RECEIVE_HALTED  Receive halted, the waiting thread is woken up. \n
+  #QURT_MAILBOX_DELETED         Mailbox is deleted, and the waiting thread is woken up.
+
+  @dependencies
+  None.
+*/
+int qurt_mailbox_receive(unsigned long long mailbox_id, qurt_mailbox_recv_option_t recv_opt, unsigned long long *data);
+
+
+/**@ingroup qurt_mailbox_delete
+  Deletes a QuRT mailbox.
+
+  A mailbox can only be deleted from the process that created the mailbox.
+
+  @param mailbox_id  Mailbox identifier.
+
+  @return
+  #QURT_EOK               Success. \n
+  #QURT_MAILBOX_ID_ERROR  Mailbox ID error. \n
+  #QURT_MAILBOX_ERROR     Other errors.
+
+  @dependencies
+  None.
+*/
+int qurt_mailbox_delete(unsigned long long mailbox_id);
+
+
+/**@ingroup qurt_mailbox_receive_halt
+  Halts receiving on a QuRT mailbox and wakes up waiting threads.
+
+  @param mailbox_id  Mailbox identifier.
+
+  @return
+  #QURT_EOK               Success. \n
+  #QURT_MAILBOX_ID_ERROR  Mailbox ID error.\n
+  #QURT_MAILBOX_ERROR     Other errors.
+
+  @dependencies
+  None.
+*/
+int qurt_mailbox_receive_halt(unsigned long long mailbox_id);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif // QURT_MAILBOX_H
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_memory.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_memory.h
new file mode 100755
index 0000000000000..90ce2586fec50
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_memory.h
@@ -0,0 +1,1487 @@
+#ifndef QURT_MEMORY_H
+#define QURT_MEMORY_H
+/**
+  @file qurt_memory.h
+  @brief Prototypes of kernel memory API functions.
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) Qualcomm Technologies, Inc.
+  All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+
+#include 
+#include 
+//#include 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup memory_management_macros
+@{ */
+#define QURT_SYSTEM_ALLOC_VIRTUAL 1 /**< Allocates available virtual memory in the address space of all
+                                         processes.*/
+/** @} */ /* end_addtogroup memory_management_macros */
+/**@cond rest_reg_dist */
+/** @addtogroup memory_management_types
+@{ */
+/** @xreflabel{hdr:qurt_mem_default_pool} */
+extern qurt_mem_pool_t qurt_mem_default_pool __attribute__((section(".data"))); /**< Memory pool object.*/
+/** @} */ /* end_addtogroup memory_management_types */
+
+/** @cond rest_reg_dist */
+/** Mapping attribute information*/
+typedef struct{
+    qurt_paddr_64_t paddr;
+    qurt_size_t size ;
+    qurt_mem_cache_mode_t cache_mode;
+    qurt_perm_t perms ;
+}qurt_mapping_attr_t;
+/** @endcond */
+/** @} */ /* end_addtogroup mapping_attribute_types*/
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+/**@ingroup func_qurt_mem_cache_clean
+  Performs a cache clean operation on the data stored in the specified memory area.
+  Performs a syncht on all the data cache operations when the Hexagon processor version is V60 or greater.
+
+  @note1hang Perform the flush all operation only on the data cache.
+
+  @note1cont This operation flushes and invalidates the contents of all cache lines from start address
+             to end address (start address + size). The contents of the adjoining buffer can be
+             flushed and invalidated if it falls in any of the cache lines.
+
+  @datatypes
+  #qurt_addr_t \n
+  #qurt_size_t \n
+  #qurt_mem_cache_op_t \n
+  #qurt_mem_cache_type_t
+
+  @param[in] addr    Address of data to flush.
+  @param[in] size    Size (in bytes) of data to flush.
+  @param[in] opcode  Type of cache clean operation. Values:
+                     - #QURT_MEM_CACHE_FLUSH
+                     - #QURT_MEM_CACHE_INVALIDATE
+                     - #QURT_MEM_CACHE_FLUSH_INVALIDATE
+                     - #QURT_MEM_CACHE_FLUSH_ALL\n
+                     @note1 #QURT_MEM_CACHE_FLUSH_ALL is valid only when the type is #QURT_MEM_DCACHE @tablebulletend
+  @param[in] type    Cache type. Values:
+                     - #QURT_MEM_ICACHE
+                     - #QURT_MEM_DCACHE @tablebulletend
+
+  @return
+  #QURT_EOK -- Cache operation performed successfully.\n
+  #QURT_EVAL -- Invalid cache type.\n
+
+  @dependencies
+  None.
+*/
+int qurt_mem_cache_clean(qurt_addr_t addr, qurt_size_t size, qurt_mem_cache_op_t opcode, qurt_mem_cache_type_t type);
+
+/**@ingroup func_qurt_mem_cache_clean2
+  Performs a data cache clean operation on the data stored in the specified memory area.
+
+  This API only performs the following data cache operations:\n
+  - #QURT_MEM_CACHE_FLUSH\n
+  - #QURT_MEM_CACHE_INVALIDATE\n
+  - #QURT_MEM_CACHE_FLUSH_INVALIDATE -- flushes/invalidates the contents of all cache lines from start address
+    to end address (start address + size). The contents of the adjoining buffer can be
+    flushed/invalidated if it falls in any of the cache lines.
+
+  @datatypes
+  #qurt_addr_t \n
+  #qurt_size_t \n
+  #qurt_mem_cache_op_t \n
+  #qurt_mem_cache_type_t
+
+  @param[in] addr    Address of data to flush.
+  @param[in] size    Size (in bytes) of data to flush.
+  @param[in] opcode  Type of cache clean operation. Values:\n #QURT_MEM_CACHE_FLUSH\n #QURT_MEM_CACHE_INVALIDATE\n
+                     #QURT_MEM_CACHE_FLUSH_INVALIDATE
+  @param[in] type    Cache type. Values: \n #QURT_MEM_DCACHE
+
+  @return
+  #QURT_EOK -- Cache operation performed successfully.\n
+  #QURT_EVAL -- Invalid cache type.
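+
+  A minimal flush sketch (illustrative only; buf is a hypothetical
+  cacheable buffer):
+  @code
+  char buf[256];
+  // ... producer fills buf ...
+  qurt_mem_cache_clean2((qurt_addr_t)buf, sizeof(buf),
+                        QURT_MEM_CACHE_FLUSH, QURT_MEM_DCACHE);
+  @endcode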
+
+  @dependencies
+  None.
+*/
+int qurt_mem_cache_clean2(qurt_addr_t addr, qurt_size_t size, qurt_mem_cache_op_t opcode, qurt_mem_cache_type_t type);
+
+/**@ingroup func_qurt_mem_cache_phys_clean
+  Performs a cache clean operation on the data stored in the specified memory area based on an address match and mask.
+  Operates on a cache line when (LINE.PhysicalPageNumber & mask) == addrmatch.
+
+  @note1hang The addrmatch value should be the upper 24-bit physical address to match against.
+
+  @datatypes
+  #qurt_mem_cache_op_t \n
+
+  @param[in] mask      24-bit address mask.
+  @param[in] addrmatch Physical page number (24 bits) of memory to use as an address match.
+  @param[in] opcode    Type of cache clean operation. Values:
+             - #QURT_MEM_CACHE_FLUSH
+             - #QURT_MEM_CACHE_INVALIDATE @tablebulletend
+
+  @return
+  #QURT_EOK -- Cache operation performed successfully.\n
+  #QURT_EVAL -- Invalid operation.
+
+  @dependencies
+  None.
+*/
+
+int qurt_mem_cache_phys_clean(unsigned int mask, unsigned int addrmatch, qurt_mem_cache_op_t opcode);
+
+/**@ingroup func_qurt_mem_l2cache_line_lock
+  Performs an L2 cache line locking operation. This function locks selective lines in the L2 cache memory.
+
+  @note1hang Perform the line lock operation only on a 32-byte aligned size and address.
+
+  @datatypes
+  #qurt_addr_t \n
+  #qurt_size_t
+
+  @param[in] addr Address of the L2 cache memory line to lock; the address must be 32-byte aligned.
+  @param[in] size Size (in bytes) of L2 cache memory to line lock; size must be a multiple of 32 bytes.
+
+  @return
+  #QURT_EOK -- Success.\n
+  #QURT_EALIGN -- Data alignment or address failure. \n
+  #QURT_EINVALID -- Improper addr and size passed (for example, integer overflow due to addr + size). \n
+  #QURT_EFAILED -- Failed to lock the cache line because all the ways were locked for the corresponding set of an address
+                   in the range of addr and addr+size, or the address range is not L2 cacheable.
+
+  @dependencies
+  None.
+*/
+int qurt_mem_l2cache_line_lock(qurt_addr_t addr, qurt_size_t size);
+
+/**@ingroup func_qurt_mem_l2cache_line_unlock
+  Performs an L2 cache line unlocking operation. This function unlocks selective lines in the L2 cache memory.
+
+  @note1hang Perform the line unlock operation only on a 32-byte aligned size and address.
+
+  @datatypes
+  #qurt_addr_t \n
+  #qurt_size_t
+
+  @param[in] addr Address of the L2 cache memory line to unlock; the address must be 32-byte aligned.
+  @param[in] size Size (in bytes) of the L2 cache memory line to unlock; size must be a multiple of 32 bytes.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EALIGN -- Data alignment or address failure. \n
+  #QURT_EFAILED -- Operation failed; cannot find the matching tag.
+
+  @dependencies
+  None.
+*/
+int qurt_mem_l2cache_line_unlock(qurt_addr_t addr, qurt_size_t size);
+
+/**@ingroup func_qurt_mem_region_attr_init
+  @xreflabel{sec:qurt_mem_region_attr_init}
+  Initializes the specified memory region attribute structure with default attribute values: \n
+  - Mapping -- #QURT_MEM_MAPPING_VIRTUAL \n
+  - Cache mode -- #QURT_MEM_CACHE_WRITEBACK \n
+  - Physical address -- -1 \n
+  - Virtual address -- -1 \n
+  - Memory type -- #QURT_MEM_REGION_LOCAL \n
+  - Size -- -1
+
+  @note1hang The memory physical address attribute must be explicitly set by calling the
+             qurt_mem_region_attr_set_physaddr() function. The size and pool attributes are set directly
+             as parameters in the memory region create operation.
+
+  @datatypes
+  #qurt_mem_region_attr_t
+
+  @param[in,out] attr Pointer to the destination structure for the memory region attributes.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_mem_region_attr_init(qurt_mem_region_attr_t *attr);
+
+/**@ingroup func_qurt_mem_pool_attach
+  Initializes a memory pool object to attach to a pool predefined in the system
+  configuration file.
+
+  Memory pool objects assign memory regions to physical memory in different
+  Hexagon memory units. They are specified in memory region create operations
+  (Section @xref{sec:mem_region_create}).
+
+  @note1hang QuRT predefines the memory pool object #qurt_mem_default_pool
+             (Section @xref{dox:mem_management}) for allocating memory regions in SMI memory. The pool attach
+             operation is necessary only when allocating memory regions in nonstandard
+             memory units such as TCM.
+
+  @datatypes
+  #qurt_mem_pool_t
+
+  @param[in] name Pointer to the memory pool name.
+  @param[out] pool Pointer to the memory pool object.
+
+  @return
+  #QURT_EOK -- Attach operation successful.
+
+  @dependencies
+  None.
+*/
+int qurt_mem_pool_attach(char *name, qurt_mem_pool_t *pool);
+
+/**@ingroup func_qurt_mem_pool_attach2
+  Gets the identifier that corresponds to a pool object created specifically for a client, for example, HLOS_PHYSPOOL.
+  The client_handle is used to look up the client-specific pool.
+
+  Memory pool objects assign memory regions to physical memory in different
+  Hexagon memory units. Memory pool objects are specified during mapping creation operations
+  (qurt_mem_mmap() and qurt_mem_region_create()).
+
+  @note1hang QuRT predefines the memory pool object #qurt_mem_default_pool
+             (Section @xref{dox:mem_management}) for allocating memory regions in SMI memory. The pool_attach2
+             operation is necessary only when allocating memory regions in memory units specific to the client.
+
+  @datatypes
+  #qurt_mem_pool_t
+
+  @param[in] client_handle Client identifier used by the OS to look up the identifier
+                           for the client-specific pool.
+  @param[in] name Pointer to the memory pool name.
+  @param[out] pool Pointer to the memory pool object.
+
+  @return
+  #QURT_EOK -- Attach operation successful.
+
+  @dependencies
+  None.
+*/
+int qurt_mem_pool_attach2(int client_handle, char *name, qurt_mem_pool_t *pool);
+
+/**@ingroup func_qurt_mem_pool_create
+  @xreflabel{hdr:qurt_mem_pool_create}
+  Dynamically creates a memory pool object from a physical address range.
+
+  The pool is assigned a single memory region with the specified base address and size.
+
+  The base address and size values passed to this function must be aligned to 4K byte
+  boundaries, and must be expressed as the actual base address and size values divided by 4K.
+
+  For example, the function call:
+  @code
+  qurt_mem_pool_create ("TCM_PHYSPOOL", 0xd8020, 0x20, &pool)
+  @endcode
+  ... is equivalent to the following static pool definition in the QuRT system configuration file
+  (reconstructed here from the call above; the element names follow the documented
+  configuration format):
+  @code
+  <physical_pool name="TCM_PHYSPOOL">
+      <region base="0xd8020000" size="0x20000" />
+  </physical_pool>
+  @endcode
+
+  @cond rest_dist For more information on the system configuration file, see @xhyperref{80VB41979,80-VB419-79}. @endcond
+
+  @note1hang Dynamically created pools are not identical to static pools. In particular,
+             qurt_mem_pool_attr_get() is not valid with dynamically created pools.
+
+  @note1cont Dynamic pool creation permanently consumes system resources, and cannot be undone.
+
+  @datatypes
+  #qurt_mem_pool_t
+
+  @param[in] name Pointer to the memory pool name.
+  @param[in] base Base address of the memory region (divided by 4K).
+  @param[in] size Size (in bytes) of the memory region (divided by 4K).
+  @param[out] pool Pointer to the memory pool object.
+
+  @return
+  #QURT_EOK -- Success.
+ + @dependencies + None. +*/ +int qurt_mem_pool_create(char *name, unsigned base, unsigned size, qurt_mem_pool_t *pool); + +/**@ingroup func_qurt_mem_pool_add_pages + Adds a physical address range to the specified memory pool object.\n + + @note1hang Call this operation only with root privileges (guest OS mode). + + @datatypes + #qurt_mem_pool_t + + @param[in] pool Memory pool object. + @param[in] first_pageno First page number of the physical address range (equivalent to address >> 12) + @param[in] size_in_pages Number of pages in the physical address range (equivalent to size >> 12) + + @return + #QURT_EOK -- Pages successfully added. + + @dependencies + None. +*/ +int qurt_mem_pool_add_pages(qurt_mem_pool_t pool, + unsigned first_pageno, + unsigned size_in_pages); + +/**@ingroup func_qurt_mem_pool_remove_pages + Removes a physical address range from the specified memory pool object. + + If any part of the address range is in use, this operation returns an + error without changing the state. + + @note1hang Call this operation only with root privileges (guest-OS mode). + + @note1cont In the future, this operation will support (via the flags parameter) the + removal of a physical address range when part of the range is in use. + + @datatypes + #qurt_mem_pool_t + + @param[in] pool Memory pool object. + @param[in] first_pageno First page number of the physical address range (equivalent to address >> 12) + @param[in] size_in_pages Number of pages in the physical address range (equivalent to size >> 12) + @param[in] flags Remove options. Values: \n + - 0 -- Skip holes in the range that are not part of the pool (default) \n + - #QURT_POOL_REMOVE_ALL_OR_NONE -- Pages are removed only if the specified + physical address range is entirely contained (with no holes) in the + pool free space. @tablebulletend + @param[in] callback Callback procedure called when pages were successfully removed. + Not called if the operation failed. Passing 0 as the parameter + value causes the callback to not be called. + @param[in] arg Value passed as an argument to the callback procedure. + + @return + #QURT_EOK -- Pages successfully removed. + + @dependencies + None. +*/ +int qurt_mem_pool_remove_pages(qurt_mem_pool_t pool, + unsigned first_pageno, + unsigned size_in_pages, + unsigned flags, + void (*callback)(void *), + void *arg); +/**@ingroup memory_management_types*/ +#define QURT_POOL_REMOVE_ALL_OR_NONE 1 /**< */ + +/**@ingroup func_qurt_mem_pool_attr_get + Gets the memory pool attributes. \n + Retrieves pool configurations based on the pool handle, and fills in + the attribute structure with configuration values. + + @datatypes + #qurt_mem_pool_t \n + #qurt_mem_pool_attr_t + + @param[in] pool Pool handle obtained from qurt_mem_pool_attach(). + @param[out] attr Pointer to the memory region attribute structure. + + @return + 0 -- Success. \n + #QURT_EINVALID -- Corrupt handle; pool handle is invalid. +*/ +int qurt_mem_pool_attr_get (qurt_mem_pool_t pool, qurt_mem_pool_attr_t *attr); + +/**@ingroup func_qurt_mem_pool_attr_get_size + Gets the size of the specified memory pool range. + + @datatypes + #qurt_mem_pool_attr_t \n + #qurt_size_t + + @param[in] attr Pointer to the memory pool attribute structure. + @param[in] range_id Memory pool range key. + @param[out] size Pointer to the destination variable for the range size. + + @return + 0 -- Success. \n + #QURT_EINVALID -- Range is invalid. + + @dependencies + None. 
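+
+  For illustration, a hedged sketch that walks every range of an attached
+  pool (the pool name is a placeholder, not part of the SDK):
+  @code
+  qurt_mem_pool_t pool;
+  qurt_mem_pool_attr_t pattr;
+  qurt_size_t range_size;
+  if (qurt_mem_pool_attach("DEFAULT_PHYSPOOL", &pool) == QURT_EOK &&
+      qurt_mem_pool_attr_get(pool, &pattr) == 0) {
+      for (int id = 0; id < MAX_POOL_RANGES; id++) {
+          if (qurt_mem_pool_attr_get_size(&pattr, id, &range_size) == QURT_EOK) {
+              // range_size holds the size (in bytes) of range 'id'
+          }
+      }
+  }
+  @endcode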
+*/
+static inline int qurt_mem_pool_attr_get_size (qurt_mem_pool_attr_t *attr, int range_id, qurt_size_t *size){
+    if ((range_id >= MAX_POOL_RANGES) || (range_id < 0)){
+        (*size) = 0;
+        return QURT_EINVALID;
+    }
+    else {
+        (*size) = attr->ranges[range_id].size;
+    }
+    return QURT_EOK;
+}
+
+/**@ingroup func_qurt_mem_pool_attr_get_addr
+  Gets the start address of the specified memory pool range.
+
+  @datatypes
+  #qurt_mem_pool_attr_t \n
+  #qurt_addr_t
+
+  @param[in] attr Pointer to the memory pool attribute structure.
+  @param[in] range_id Memory pool range key.
+  @param[out] addr Pointer to the destination variable for the range start address.
+
+  @return
+  0 -- Success. \n
+  #QURT_EINVALID -- Range is invalid.
+
+  @dependencies
+  None.
+*/
+static inline int qurt_mem_pool_attr_get_addr (qurt_mem_pool_attr_t *attr, int range_id, qurt_addr_t *addr){
+    if ((range_id >= MAX_POOL_RANGES) || (range_id < 0)){
+        (*addr) = 0;
+        return QURT_EINVALID;
+    }
+    else {
+        (*addr) = (attr->ranges[range_id].start)<<12;
+    }
+    return QURT_EOK;
+}
+
+/**@ingroup func_qurt_mem_pool_attr_get_addr_64
+  Gets the 64-bit start address of the specified memory pool range.
+
+  @datatypes
+  #qurt_mem_pool_attr_t \n
+  #qurt_addr_64_t
+
+  @param[in] attr Pointer to the memory pool attribute structure.
+  @param[in] range_id Memory pool range key.
+  @param[out] addr Pointer to the destination variable for the range start address.
+
+  @return
+  0 -- Success. \n
+  #QURT_EINVALID -- Range is invalid.
+
+  @dependencies
+  None.
+*/
+static inline int qurt_mem_pool_attr_get_addr_64 (qurt_mem_pool_attr_t *attr, int range_id, qurt_addr_64_t *addr){
+    if ((range_id >= MAX_POOL_RANGES) || (range_id < 0)){
+        (*addr) = 0;
+        return QURT_EINVALID;
+    }
+    else {
+        (*addr) = ((qurt_addr_64_t)attr->ranges[range_id].start)<<12;
+    }
+    return QURT_EOK;
+}
+
+
+/**@ingroup func_qurt_mem_pool_status_get
+  Gets the memory pool status. \n
+  Based on the pool handle, retrieves the largest contiguous free memory,
+  total free memory, and total memory declared for the pool in bytes. Fills in
+  the memory status structure with the values.
+
+  @datatypes
+  #qurt_mem_pool_t \n
+  #qurt_mem_pool_status_t
+
+  @param[in] pool Pool handle.
+  @param[out] status Pointer to the memory pool status structure.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EINVALID -- Corrupt handle; pool handle is invalid.
+*/
+int qurt_mem_pool_status_get (qurt_mem_pool_t pool, qurt_mem_pool_status_t *status);
+
+
+/**@ingroup func_qurt_mem_pool_is_available
+  Checks whether the number of pages that the page_count argument indicates
+  can be allocated from the specified pool.
+
+  @datatypes
+  #qurt_mem_pool_t \n
+  #qurt_mem_mapping_t \n
+
+  @param[in] pool Pool handle obtained from qurt_mem_pool_attach().
+  @param[in] page_count Number of 4K pages.
+  @param[in] mapping_type Variable of type qurt_mem_mapping_t.
+
+  @return
+  0 -- Success. \n
+  #QURT_EINVALID -- Mapping_type is invalid. \n
+  #QURT_EMEM -- Specified pages cannot be allocated from the pool.
+
+  @dependencies
+  None.
+*/
+int qurt_mem_pool_is_available(qurt_mem_pool_t pool, int page_count, qurt_mem_mapping_t mapping_type);
+
+
+/**@ingroup func_qurt_mem_region_create
+  @xreflabel{sec:mem_region_create}
+  Creates a memory region with the specified attributes.
+
+  The application initializes the memory region attribute structure with
+  qurt_mem_region_attr_init() and qurt_mem_region_attr_set_bus_attr().
+
+  If the virtual address attribute is set to its default value
+  (Section @xref{sec:qurt_mem_region_attr_init}), the virtual address of the memory region is
+  automatically assigned any available virtual address value.
+
+  If the memory mapping attribute is set to virtual mapping, the physical address of the memory region
+  is also automatically assigned.\n
+
+  @note1hang The physical address attribute is explicitly set in the attribute structure only
+             for memory regions with physical-contiguous-mapped mapping.
+
+  Memory regions are always assigned to memory pools. The pool value specifies the memory pool
+  that the memory region is assigned to.
+
+  @note1hang If attr is specified as NULL, the memory region is created with default
+             attribute values (Section @xref{sec:qurt_mem_region_attr_init}).
+             QuRT predefines the memory pool object #qurt_mem_default_pool
+             (Section @xref{dox:mem_management}), which allocates memory regions in SMI memory.
+
+  @datatypes
+  #qurt_mem_region_t \n
+  #qurt_size_t \n
+  #qurt_mem_pool_t \n
+  #qurt_mem_region_attr_t
+
+  @param[out] region Pointer to the memory region object.
+  @param[in] size Memory region size (in bytes). If size is not an integral multiple of 4K,
+                  it is rounded up to a 4K boundary.
+  @param[in] pool Memory pool of the region.
+  @param[in] attr Pointer to the memory region attribute structure.
+
+  @return
+  #QURT_EOK -- Memory region successfully created.\n
+  #QURT_EMEM -- Not enough memory to create region. \n
+  #QURT_EINVALID -- Invalid cache attributes / permissions provided in attribute.
+
+  @dependencies
+  None.
+*/
+int qurt_mem_region_create(qurt_mem_region_t *region, qurt_size_t size, qurt_mem_pool_t pool, qurt_mem_region_attr_t *attr);
+
+/**@ingroup func_qurt_mem_region_delete
+  Deletes the specified memory region.
+
+  If the caller application created the memory region, it is removed and the system reclaims its
+  assigned memory.
+
+  If a different application created the memory region (and shared it with the caller
+  application), only the local memory mapping to the region is removed; the system does
+  not reclaim the memory.
+
+  @datatypes
+  #qurt_mem_region_t
+
+  @param[in] region Memory region object.
+
+  @return
+  #QURT_EOK -- Region successfully deleted. \n
+  #QURT_ELOCKED -- Buffer is locked; mapping delete failed.
+
+  @dependencies
+  None.
+*/
+int qurt_mem_region_delete(qurt_mem_region_t region);
+
+
+/**@ingroup func_qurt_mem_region_attr_get
+  @xreflabel{sec:mem_region_attr_get}
+  Gets the memory attributes of the specified memory region.
+  After a memory region is created, its attributes cannot be changed.
+
+  @datatypes
+  #qurt_mem_region_t \n
+  #qurt_mem_region_attr_t
+
+  @param[in] region Memory region object.
+  @param[out] attr Pointer to the destination structure for memory region attributes.
+
+  @return
+  #QURT_EOK -- Operation successfully performed. \n
+  Error code -- Failure.
+
+  @dependencies
+  None.
+*/
+int qurt_mem_region_attr_get(qurt_mem_region_t region, qurt_mem_region_attr_t *attr);
+
+
+/**@ingroup func_qurt_mem_region_attr_set_type
+  Sets the memory type in the specified memory region attribute structure.
+
+  The type indicates whether the memory region is local to an application or shared between
+  applications.
+  @cond rest_dist For more information, see @xhyperref{80VB41992,80-VB419-92}. @endcond
+
+  @datatypes
+  #qurt_mem_region_attr_t \n
+  #qurt_mem_region_type_t
+
+  @param[in,out] attr Pointer to memory region attribute structure.
+  @param[in] type Memory type.
Values: \n + - #QURT_MEM_REGION_LOCAL \n + - #QURT_MEM_REGION_SHARED @tablebulletend + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_type(qurt_mem_region_attr_t *attr, qurt_mem_region_type_t type){ + attr->type = type; +} + +/**@ingroup func_qurt_mem_region_attr_get_size + Gets the memory region size from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_size_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] size Pointer to the destination variable for memory region size. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_size(qurt_mem_region_attr_t *attr, qurt_size_t *size){ + (*size) = attr->size; +} + +/**@ingroup func_qurt_mem_region_attr_get_type + Gets the memory type from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_region_type_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] type Pointer to the destination variable for the memory type. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_type(qurt_mem_region_attr_t *attr, qurt_mem_region_type_t *type){ + (*type) = attr->type; +} + +/**@ingroup func_qurt_mem_region_attr_set_physaddr + Sets the memory region 32-bit physical address in the specified memory attribute structure. + + @note1hang The physical address attribute is explicitly set only for memory regions with + physical contiguous mapping. Otherwise QuRT automatically sets it + when the memory region is created. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_paddr_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] addr Memory region physical address. + + @return + None. + */ +static inline void qurt_mem_region_attr_set_physaddr(qurt_mem_region_attr_t *attr, qurt_paddr_t addr){ + attr->ppn = (unsigned)(((unsigned)(addr))>>12); +} + +/**@ingroup func_qurt_mem_region_attr_get_physaddr + Gets the memory region physical address from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] addr Pointer to the destination variable for memory region physical address. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_physaddr(qurt_mem_region_attr_t *attr, unsigned int *addr){ + (*addr) = (unsigned)(((unsigned) (attr->ppn))<<12); +} + +/**@ingroup func_qurt_mem_region_attr_set_virtaddr + Sets the memory region virtual address in the specified memory attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_addr_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] addr Memory region virtual address. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_virtaddr(qurt_mem_region_attr_t *attr, qurt_addr_t addr){ + attr->virtaddr = addr; +} + +/**@ingroup func_qurt_mem_region_attr_get_virtaddr + Gets the memory region virtual address from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] addr Pointer to the destination variable for the memory region virtual address. + + @return + None. + + @dependencies + None. 
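+
+  A short hedged usage sketch (the 4 KB size is a placeholder value):
+  @code
+  qurt_mem_region_t region;
+  qurt_mem_region_attr_t rattr;
+  unsigned int va;
+  qurt_mem_region_attr_init(&rattr);
+  if (qurt_mem_region_create(&region, 0x1000, qurt_mem_default_pool, &rattr) == QURT_EOK) {
+      qurt_mem_region_attr_get(region, &rattr);
+      qurt_mem_region_attr_get_virtaddr(&rattr, &va);
+      // va now holds the virtual address that was assigned to the region
+  }
+  @endcode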
+ */ +static inline void qurt_mem_region_attr_get_virtaddr(qurt_mem_region_attr_t *attr, unsigned int *addr){ + (*addr) = (unsigned int)(attr->virtaddr); +} + +/**@ingroup func_qurt_mem_region_attr_set_mapping + Sets the memory mapping in the specified memory region attribute structure. + + The mapping value indicates how the memory region is mapped in virtual memory. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_mapping_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] mapping Mapping. Values: + - #QURT_MEM_MAPPING_VIRTUAL + - #QURT_MEM_MAPPING_PHYS_CONTIGUOUS + - #QURT_MEM_MAPPING_IDEMPOTENT + - #QURT_MEM_MAPPING_VIRTUAL_FIXED + - #QURT_MEM_MAPPING_NONE + - #QURT_MEM_MAPPING_VIRTUAL_RANDOM + - #QURT_MEM_MAPPING_INVALID @tablebulletend + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_mapping(qurt_mem_region_attr_t *attr, qurt_mem_mapping_t mapping){ + attr->mapping_type = mapping; +} + +/**@ingroup func_qurt_mem_region_attr_get_mapping + Gets the memory mapping from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_mapping_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] mapping Pointer to the destination variable for memory mapping. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_mapping(qurt_mem_region_attr_t *attr, qurt_mem_mapping_t *mapping){ + (*mapping) = attr->mapping_type; +} + +/**@ingroup func_qurt_mem_region_attr_set_cache_mode + Sets the cache operation mode in the specified memory region attribute structure. + + @cond rest_dist For more information on the cache, see @xhyperref{80VB41992,80-VB419-92}.@endcond + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_cache_mode_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] mode Cache mode. Values: \n + - #QURT_MEM_CACHE_WRITEBACK \n + - #QURT_MEM_CACHE_WRITETHROUGH\n + - #QURT_MEM_CACHE_WRITEBACK_NONL2CACHEABLE\n + - #QURT_MEM_CACHE_WRITETHROUGH_NONL2CACHEABLE\n + - #QURT_MEM_CACHE_WRITEBACK_L2CACHEABLE\n + - #QURT_MEM_CACHE_WRITETHROUGH_L2CACHEABLE\n + - #QURT_MEM_CACHE_NONE @tablebulletend + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_cache_mode(qurt_mem_region_attr_t *attr, qurt_mem_cache_mode_t mode){ + QURT_PGATTR_C_SET(attr->pga, (unsigned)mode); +} + +/**@ingroup func_qurt_mem_region_attr_get_cache_mode + Gets the cache operation mode from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_cache_mode_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] mode Pointer to the destination variable for cache mode. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_cache_mode(qurt_mem_region_attr_t *attr, qurt_mem_cache_mode_t *mode){ + unsigned int mode_temp = QURT_PGATTR_C_GET(attr->pga); + (*mode) = (qurt_mem_cache_mode_t)mode_temp; +} + +/**@ingroup func_qurt_mem_region_attr_set_bus_attr + Sets the (A1, A0) bus attribute bits in the specified memory region attribute structure. + + @cond rest_dist For more information on the bus attribute bits, see the @xhyperref{80VB41992,80-VB419-92}. @endcond + + @datatypes + #qurt_mem_region_attr_t + + @param[in,out] attr Pointer to the memory region attribute structure. 
+ @param[in] abits The (A1, A0) bits to use with the memory region, expressed as a 2-bit binary number. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_bus_attr(qurt_mem_region_attr_t *attr, unsigned abits){ + QURT_PGATTR_A_SET(attr->pga, abits); +} + +/**@ingroup func_qurt_mem_region_attr_get_bus_attr + Gets the (A1, A0) bus attribute bits from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] pbits Pointer to an unsigned integer that is filled in with + the (A1, A0) bits from the memory region attribute structure, expressed as a 2-bit binary number. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_bus_attr(qurt_mem_region_attr_t *attr, unsigned *pbits){ + (*pbits) = QURT_PGATTR_A_GET(attr->pga); +} + +void qurt_mem_region_attr_set_owner(qurt_mem_region_attr_t *attr, int handle); +void qurt_mem_region_attr_get_owner(qurt_mem_region_attr_t *attr, int *p_handle); +void qurt_mem_region_attr_set_perms(qurt_mem_region_attr_t *attr, unsigned perms); +void qurt_mem_region_attr_get_perms(qurt_mem_region_attr_t *attr, unsigned *p_perms); + +/**@ingroup func_qurt_mem_map_static_query + Determines whether a memory page is statically mapped. + Pages are specified by the following attributes: physical address, page size, cache mode, + and memory permissions. \n + - If the specified page is statically mapped, vaddr returns the virtual + address of the page. \n + - If the page is not statically mapped (or if it does not exist as specified), vaddr + returns -1 as the virtual address value.\n + The system configuration file defines QuRT memory maps. + + @datatypes + #qurt_addr_t \n + #qurt_mem_cache_mode_t \n + #qurt_perm_t + + @param[out] vaddr Virtual address corresponding to paddr. + @param[in] paddr Physical address. + @param[in] page_size Size of the mapped memory page. + @param[in] cache_attribs Cache mode (writeback, and so on). + @param[in] perm Access permissions. + + @return + #QURT_EOK -- Specified page is statically mapped, vaddr returns the virtual address. \n + #QURT_EMEM -- Specified page is not statically mapped, vaddr returns -1. \n + #QURT_EVAL -- Specified page does not exist. + + @dependencies + None. + */ +int qurt_mem_map_static_query(qurt_addr_t *vaddr, qurt_addr_t paddr, unsigned int page_size, qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perm); + + +/**@ingroup func_qurt_mem_region_query + Queries a memory region. \n + This function determines whether a dynamically-created memory region (Section @xref{sec:mem_region_create}) exists for the + specified virtual or physical address. + When a memory region has been determined to exist, its attributes are + accessible (Section @xref{sec:mem_region_attr_get}). + + @note1hang This function returns #QURT_EFATAL if #QURT_EINVALID is passed to both + vaddr and paddr (or to neither). + + @datatypes + #qurt_mem_region_t \n + #qurt_paddr_t + + @param[out] region_handle Pointer to the memory region object (if it exists). + @param[in] vaddr Virtual address to query; if vaddr is specified, paddr must be set to + the value #QURT_EINVALID. + @param[in] paddr Physical address to query; if paddr is specified, vaddr must be set to + the value #QURT_EINVALID. + + @return + #QURT_EOK -- Query successfully performed. \n + #QURT_EMEM -- Region not found for the specified address. \n + #QURT_EFATAL -- Invalid input parameters. 
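+
+  For illustration, a hedged lookup-by-virtual-address sketch (the address is
+  a placeholder for one obtained elsewhere):
+  @code
+  qurt_addr_t vaddr = 0x20000000;    // placeholder: an address to look up
+  qurt_mem_region_t handle;
+  if (qurt_mem_region_query(&handle, vaddr, QURT_EINVALID) == QURT_EOK) {
+      // handle refers to the region containing vaddr; its attributes can
+      // now be read with qurt_mem_region_attr_get().
+  }
+  @endcode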
+
+  @dependencies
+  None.
+ */
+int qurt_mem_region_query(qurt_mem_region_t *region_handle, qurt_addr_t vaddr, qurt_paddr_t paddr);
+
+
+/**@ingroup func_qurt_mapping_create
+  @xreflabel{hdr:qurt_mapping_create}
+  Creates a memory mapping in the page table.
+  Not supported when called from a user process; it always returns #QURT_EMEM.
+
+  @datatypes
+  #qurt_addr_t \n
+  #qurt_size_t \n
+  #qurt_mem_cache_mode_t \n
+  #qurt_perm_t
+
+  @param[in] vaddr Virtual address.
+  @param[in] paddr Physical address.
+  @param[in] size Size (4K-aligned) of the mapped memory page.
+  @param[in] cache_attribs Cache mode (writeback, and so on).
+  @param[in] perm Access permissions.
+
+  @return
+  #QURT_EOK -- Mapping created. \n
+  #QURT_EMEM -- Failed to create mapping. \n
+  #QURT_EINVALID -- Invalid cache attributes / permissions provided.
+
+  @dependencies
+  None.
+*/
+int qurt_mapping_create(qurt_addr_t vaddr, qurt_addr_t paddr, qurt_size_t size,
+                        qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perm);
+
+/**@ingroup func_qurt_mapping_remove
+  @xreflabel{hdr:qurt_mapping_remove}
+  Deletes the specified memory mapping from the page table.
+
+  @datatypes
+  #qurt_addr_t \n
+  #qurt_size_t
+
+  @param[in] vaddr Virtual address.
+  @param[in] paddr Physical address.
+  @param[in] size Size of the mapped memory page (4K-aligned).
+
+  @return
+  #QURT_EOK -- Mapping removed. \n
+  #QURT_ELOCKED -- Buffer is locked; mapping delete failed.
+
+  @dependencies
+  None.
+
+ */
+int qurt_mapping_remove(qurt_addr_t vaddr, qurt_addr_t paddr, qurt_size_t size);
+
+/**@ingroup func_qurt_lookup_physaddr
+  Translates a virtual memory address to the physical memory address to which it maps. \n
+  The lookup occurs in the process of the caller. Use qurt_lookup_physaddr2() to look up the
+  physical address of another process.
+
+
+  @datatypes
+  #qurt_addr_t \n
+  #qurt_paddr_t
+
+  @param[in] vaddr Virtual address.
+
+  @return
+  Nonzero -- Physical address to which the virtual address is mapped.\n
+  0 -- Virtual address not mapped.
+
+  @dependencies
+  None.
+*/
+qurt_paddr_t qurt_lookup_physaddr (qurt_addr_t vaddr);
+
+/**@ingroup func_qurt_mem_region_attr_set_physaddr_64
+  Sets the memory region 64-bit physical address in the specified memory attribute structure.
+
+  @note1hang The physical address attribute is explicitly set only for memory regions with
+             physical contiguous mapping. Otherwise it is automatically set by
+             QuRT when the memory region is created.
+
+  @datatypes
+  #qurt_mem_region_attr_t \n
+  #qurt_paddr_64_t
+
+  @param[in,out] attr Pointer to the memory region attribute structure.
+  @param[in] addr_64 Memory region 64-bit physical address.
+
+  @return
+  None.
+ */
+static inline void qurt_mem_region_attr_set_physaddr_64(qurt_mem_region_attr_t *attr, qurt_paddr_64_t addr_64){
+    attr->ppn = (unsigned)(((unsigned long long)(addr_64))>>12);
+}
+
+/**@ingroup func_qurt_mem_region_attr_get_physaddr_64
+  Gets the memory region 64-bit physical address from the specified memory region attribute structure.
+
+  @datatypes
+  #qurt_mem_region_attr_t \n
+  #qurt_paddr_64_t
+
+  @param[in] attr Pointer to the memory region attribute structure.
+  @param[out] addr_64 Pointer to the destination variable for the memory region 64-bit physical address.
+
+  @return
+  None.
+
+  @dependencies
+  None.
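+
+  A minimal hedged sketch of the set/get pair (the 64-bit address is a
+  placeholder value):
+  @code
+  qurt_mem_region_attr_t rattr;
+  qurt_paddr_64_t pa64;
+  qurt_mem_region_attr_init(&rattr);
+  qurt_mem_region_attr_set_physaddr_64(&rattr, 0x100000000ULL);
+  qurt_mem_region_attr_get_physaddr_64(&rattr, &pa64);  // pa64 == 0x100000000ULL
+  @endcode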
+ */
+static inline void qurt_mem_region_attr_get_physaddr_64(qurt_mem_region_attr_t *attr, qurt_paddr_64_t *addr_64){
+    (*addr_64) = (unsigned long long)(((unsigned long long)(attr->ppn))<<12);
+}
+
+/**@ingroup func_qurt_mem_map_static_query_64
+  Determines whether a memory page is statically mapped.
+  The following attributes specify pages: 64-bit physical address, page size, cache mode,
+  and memory permissions. \n
+  If the specified page is statically mapped, vaddr returns the virtual
+  address of the page.
+  If the page is not statically mapped (or if it does not exist as specified), vaddr
+  returns -1 as the virtual address value.\n
+  QuRT memory maps are defined in the system configuration file.
+
+  @datatypes
+  #qurt_addr_t \n
+  #qurt_paddr_64_t \n
+  #qurt_mem_cache_mode_t \n
+  #qurt_perm_t
+
+  @param[out] vaddr Virtual address corresponding to paddr.
+  @param[in] paddr_64 64-bit physical address.
+  @param[in] page_size Size of the mapped memory page.
+  @param[in] cache_attribs Cache mode (writeback, and so on).
+  @param[in] perm Access permissions.
+
+  @return
+  #QURT_EOK -- Specified page is statically mapped; a virtual address is returned in vaddr. \n
+  #QURT_EMEM -- Specified page is not statically mapped; -1 is returned in vaddr. \n
+  #QURT_EVAL -- Specified page does not exist.
+
+  @dependencies
+  None.
+ */
+int qurt_mem_map_static_query_64(qurt_addr_t *vaddr, qurt_paddr_64_t paddr_64, unsigned int page_size, qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perm);
+
+/**@ingroup func_qurt_mem_region_query_64
+  Determines whether a dynamically created memory region (Section @xref{sec:mem_region_create}) exists for the
+  specified virtual or physical address. When a memory region has been determined to exist, its attributes are
+  accessible (Section @xref{sec:mem_region_attr_get}).
+
+  @note1hang This function returns #QURT_EFATAL if #QURT_EINVALID is passed to both
+             vaddr and paddr (or to neither).
+
+  @datatypes
+  #qurt_mem_region_t \n
+  #qurt_addr_t \n
+  #qurt_paddr_64_t
+
+  @param[out] region_handle Pointer to the memory region object (if it exists).
+  @param[in] vaddr Virtual address to query; if vaddr is specified, paddr must be set to
+                   the value #QURT_EINVALID.
+  @param[in] paddr_64 64-bit physical address to query; if paddr is specified, vaddr must be set to
+                      the value #QURT_EINVALID.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EMEM -- Region not found for the specified address. \n
+  #QURT_EFATAL -- Invalid input parameters.
+
+  @dependencies
+  None.
+ */
+int qurt_mem_region_query_64(qurt_mem_region_t *region_handle, qurt_addr_t vaddr, qurt_paddr_64_t paddr_64);
+
+/**@ingroup func_qurt_mapping_create_64
+  @xreflabel{hdr:qurt_mapping_create_64}
+  Creates a memory mapping in the page table.
+  Not supported when called from a user process; it always returns #QURT_EMEM.
+
+  @datatypes
+  #qurt_addr_t \n
+  #qurt_paddr_64_t \n
+  #qurt_size_t \n
+  #qurt_mem_cache_mode_t \n
+  #qurt_perm_t
+
+  @param[in] vaddr Virtual address.
+  @param[in] paddr_64 64-bit physical address.
+  @param[in] size Size (4K-aligned) of the mapped memory page.
+  @param[in] cache_attribs Cache mode (writeback, and so on).
+  @param[in] perm Access permissions.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EMEM -- Failure. \n
+  #QURT_EINVALID -- Invalid cache attributes / permissions provided.
+
+  @dependencies
+  None.
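+
+  For illustration, a hedged sketch pairing creation with removal (all
+  addresses and the size are placeholder values):
+  @code
+  qurt_addr_t va = 0x20000000;
+  qurt_paddr_64_t pa64 = 0x100000000ULL;
+  if (qurt_mapping_create_64(va, pa64, 0x1000,
+                             QURT_MEM_CACHE_WRITEBACK, QURT_PERM_READ) == QURT_EOK) {
+      // ... use the mapping ...
+      qurt_mapping_remove_64(va, pa64, 0x1000);
+  }
+  @endcode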
+*/ +int qurt_mapping_create_64(qurt_addr_t vaddr, qurt_paddr_64_t paddr_64, qurt_size_t size, + qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perm); + +/**@ingroup func_qurt_mapping_remove_64 + @xreflabel{hdr:qurt_mapping_remove_64} + Deletes the specified memory mapping from the page table. + + @datatypes + #qurt_addr_t \n + #qurt_paddr_64_t \n + #qurt_size_t + + @param[in] vaddr Virtual address. + @param[in] paddr_64 64-bit physical address. + @param[in] size Size of the mapped memory page (4K-aligned). + + @return + #QURT_EOK -- Success. + #QURT_ELOCKED -- Buffer is locked. Mapping delete failed. + + @dependencies + None. + + */ +int qurt_mapping_remove_64(qurt_addr_t vaddr, qurt_paddr_64_t paddr_64, qurt_size_t size); + +/**@ingroup func_qurt_lookup_physaddr_64 + Translates a virtual memory address to the 64-bit physical memory address it is mapped to. \n + The lookup occurs in the process of the caller. Use qurt_lookup_physaddr2() to lookup the physical + address of another process. + + @datatypes + #qurt_paddr_64_t \n + #qurt_addr_t + + @param[in] vaddr Virtual address. + + @return + Nonzero -- 64-bit physical address to which the virtual address is mapped. \n + 0 -- Virtual address has not been mapped. + + @dependencies + None. +*/ +qurt_paddr_64_t qurt_lookup_physaddr_64 (qurt_addr_t vaddr); +/** @endcond */ + +/** @cond internal_only */ +/**@ingroup func_qurt_mapping_reclaim + Deallocates all QuRT resources associated with the specified virtual + memory area, making it available for user memory management:\n + - The associated physical memory areas are freed and added to the + specified physical pool.\n + - The associated TLB entries are deleted and made available for TLB + management.\n + - The virtual memory area is not freed -- it is left in + place as allocated, but unmapped virtual memory. Access to this + memory area generates an exception.\n + + The virtual memory area must be statically allocated. + If no pool is specified, the freed physical memory is not added to any pool. + + @note1hang The virtual memory area is restricted to being filled with locked + TLB entries that are contiguous within the memory area, and contained by it. + + @datatypes + #qurt_addr_t \n + #qurt_size_t \n + #qurt_mem_pool_t + + @param[in] vaddr Virtual address of the memory area to free. + @param[in] vsize Size (in bytes) of the memory area to free. + @param[in] pool Handle to the physical pool where freed physical memory is added. + If set to 0, freed physical memory is not added to any pool. + + @return + 0 -- Success. \n + Nonzero -- Failure that indicates a partial success, or that the request was malformed. \n @note1hang The expected behavior is that + QuRT logs messages related to the failure, and callers are free to ignore the return value. + + @dependencies + None. +*/ +int qurt_mapping_reclaim(qurt_addr_t vaddr, qurt_size_t vsize, qurt_mem_pool_t pool); +/** @endcond */ +/** @cond rest_reg_dist */ +/**@ingroup func_qurt_mem_configure_cache_partition + Configures the Hexagon cache partition at the system level. + + A partition size value of #SEVEN_EIGHTHS_SIZE is applicable only to the L2 cache. + + The L1 cache partition is not supported in Hexagon processor version V60 or greater. + + @note1hang Call this operation only with QuRT OS privilege. + + @datatypes + #qurt_cache_type_t \n + #qurt_cache_partition_size_t + + @param[in] cache_type Cache type for partition configuration. 
Values: \n
+             - #HEXAGON_L1_I_CACHE \n
+             - #HEXAGON_L1_D_CACHE \n
+             - #HEXAGON_L2_CACHE @tablebulletend
+
+  @param[in] partition_size Cache partition size. Values: \n
+             - #FULL_SIZE \n
+             - #HALF_SIZE \n
+             - #THREE_QUARTER_SIZE \n
+             - #SEVEN_EIGHTHS_SIZE @tablebulletend
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EVAL -- Error.
+
+  @dependencies
+  None.
+ */
+int qurt_mem_configure_cache_partition(qurt_cache_type_t cache_type, qurt_cache_partition_size_t partition_size);
+
+
+/**@ingroup func_qurt_mem_syncht
+  @xreflabel{hdr:qurt_mem_syncht}
+  Performs heavy-weight synchronization of memory transactions.
+
+  This operation does not return until all previous memory transactions (cached and uncached load/store,
+  mem_locked, and so on) that originated from the current thread are complete and globally observable.
+
+  @note1hang This operation is implemented as a wrapper for the Hexagon syncht instruction.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+static inline void qurt_mem_syncht(void){
+    #ifdef __HEXAGON_ARCH__
+    __asm__ __volatile__ (" SYNCHT \n");
+    #endif
+}
+
+/**@ingroup func_qurt_mem_barrier
+  @xreflabel{hdr:qurt_mem_barrier}
+  Creates a barrier for memory transactions.
+
+  This operation ensures that all previous memory transactions are globally observable before any
+  future memory transactions are globally observable.
+
+  @note1hang This operation is implemented as a wrapper for the Hexagon barrier instruction.
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+static inline void qurt_mem_barrier(void){
+    #ifdef __HEXAGON_ARCH__
+    __asm__ __volatile__ (" BARRIER \n");
+    #endif
+}
+/** @endcond */
+
+/** @cond internal_only */
+/**@ingroup func_qurt_system_mem_alloc
+  Requests that the kernel allocate memory from the kernel-owned pool.
+
+  @param[in] size Size in bytes (aligned to 4K) to allocate.
+  @param[in] align Any alignment that must be considered for the allocation.
+  @param[in] flags Supports the #QURT_SYSTEM_ALLOC_VIRTUAL flag; allocates
+                   available virtual memory in the address space of all processes.
+
+  @return
+  #QURT_EFATAL -- Allocation failed. \n
+  Otherwise -- Start address of the successful allocation.
+
+  @dependencies
+  None.
+*/
+unsigned qurt_system_mem_alloc(unsigned size, unsigned align, unsigned flags);
+/** @endcond */
+/** @cond rest_reg_dist*/
+/**@ingroup func_qurt_lookup_physaddr2
+  Translates the virtual memory address of the specified process to the 64-bit
+  physical memory address to which it is mapped.
+
+  @datatypes
+  #qurt_addr_t \n
+  #qurt_paddr_64_t
+
+  @param[in] vaddr Virtual address.
+  @param[in] pid PID.
+
+  @return
+  Nonzero -- 64-bit physical address to which the virtual address is mapped. \n
+  0 -- Virtual address is not mapped.
+
+  @dependencies
+  None.
+*/
+qurt_paddr_64_t qurt_lookup_physaddr2(qurt_addr_t vaddr, unsigned int pid);
+/** @endcond */
+
+/**@ingroup func_qurt_mapping_attr_get
+  Gets the mapping attributes for a given virtual address and PID.
+
+  @datatypes
+  #qurt_addr_t \n
+  #qurt_mapping_attr_t
+
+  @param[in] vaddr Virtual address for which the attributes are required.
+  @param[in] pid Process ID of the target process.
+  @param[out] attr Pointer to the mapping attribute structure.
+
+  @return
+  0 -- Success. \n
+  #QURT_EINVALID -- Incorrect virtual address or PID.
+*/
+int qurt_mapping_attr_get(qurt_addr_t vaddr, unsigned int pid, qurt_mapping_attr_t *attr);
+
+
+/**@ingroup func_qurt_mapping_attr_get_cache_mode
+  Gets the cache operation mode in the specified memory mapping attribute structure.
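+
+  For illustration, a hedged sketch (the address and process ID are
+  placeholder values):
+  @code
+  qurt_addr_t va_q = 0x20000000;   // placeholder: a mapped virtual address
+  unsigned int pid = 0;            // placeholder: target process ID
+  qurt_mapping_attr_t mattr;
+  qurt_mem_cache_mode_t cmode;
+  if (qurt_mapping_attr_get(va_q, pid, &mattr) == 0) {
+      qurt_mapping_attr_get_cache_mode(&mattr, &cmode);
+  }
+  @endcode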
+
+
+  @datatypes
+  #qurt_mapping_attr_t \n
+  #qurt_mem_cache_mode_t
+
+  @param[in] attr Pointer to the memory mapping attribute structure.
+  @param[out] cache_mode Pointer to the destination variable for cache mode.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+static inline void qurt_mapping_attr_get_cache_mode(qurt_mapping_attr_t *attr, qurt_mem_cache_mode_t *cache_mode)
+{
+    (*cache_mode) = attr->cache_mode;
+}
+
+/**@ingroup func_qurt_mapping_attr_get_physaddr
+  Gets the physical memory address in the specified memory mapping attribute structure.
+
+
+  @datatypes
+  #qurt_mapping_attr_t \n
+  #qurt_paddr_64_t
+
+  @param[in] attr Pointer to the memory mapping attribute structure.
+  @param[out] physaddr Pointer to the destination variable for physical address.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+static inline void qurt_mapping_attr_get_physaddr(qurt_mapping_attr_t *attr, qurt_paddr_64_t *physaddr)
+{
+    (*physaddr) = attr->paddr;
+}
+
+/**@ingroup func_qurt_mapping_attr_get_perms
+  Gets the permissions in the specified memory mapping attribute structure.
+
+
+  @datatypes
+  #qurt_mapping_attr_t \n
+  #qurt_perm_t
+
+  @param[in] attr Pointer to the memory mapping attribute structure.
+  @param[out] perms Pointer to the destination variable for permissions.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+static inline void qurt_mapping_attr_get_perms(qurt_mapping_attr_t *attr, qurt_perm_t *perms)
+{
+    (*perms) = attr->perms;
+}
+
+/**@ingroup func_qurt_mapping_attr_get_size
+  Gets the size in the specified memory mapping attribute structure. This represents the size of the
+  TLB entry that covers the virtual address.
+
+
+  @datatypes
+  #qurt_mapping_attr_t \n
+  #unsigned int
+
+  @param[in] attr Pointer to the memory mapping attribute structure.
+  @param[out] size Pointer to the destination variable for size.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_mapping_attr_get_size(qurt_mapping_attr_t *attr, unsigned int *size)
+{
+    (*size) = attr->size;
+}
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_MEMORY_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_mmap.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_mmap.h
new file mode 100755
index 0000000000000..c3bd875910af7
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_mmap.h
@@ -0,0 +1,359 @@
+#ifndef QURT_MMAP_H
+#define QURT_MMAP_H
+/**
+  @file qurt_mmap.h
+  @brief Prototypes of memory mapping/unmapping APIs.
+         The APIs allow the user to map, unmap, and change permissions
+         on memory regions.
+
+  EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2018-2021, 2022, 2023 Qualcomm Technologies, Inc.
+All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup func_qurt_mem_mmap
+  Creates a memory mapping with the specified attributes.
+  This API allows the root process caller to create a mapping on behalf of a user
+  process. If the client_handle belongs to a valid user process, the resulting
+  mapping is created for that process.
+  If -1 is passed in place of client_handle, the API creates the mapping
+  for the underlying process of the caller.
+
+  @note1hang If the specified attributes are not valid, an error result is returned.
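+
+  For illustration only, a hedged sketch of an anonymous 4 KB allocation in
+  the caller's own process (the pool, fd, offset, and size arguments are
+  placeholder values following the defaults described below):
+  @code
+  void *va = qurt_mem_mmap(-1,                    // caller's own process
+                           (qurt_mem_pool_t)0,    // default pool (NULL default)
+                           NULL, NULL, 0x1000,
+                           QURT_PROT_READ | QURT_PROT_WRITE,
+                           QURT_MAP_ANON, -1, 0ULL);
+  if (va == QURT_MAP_FAILED) {
+      // mapping creation failed
+  }
+  @endcode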
+
+  @param[in] client_handle Client handle to use for this mapping (optional).
+  @param[in] pool Optional argument that specifies a pool handle
+                  if the user wants to allocate memory from a specific pool.
+                  The default value for this argument is NULL.
+  @param[in] pRegion Map region. This argument is unused, and the default value is NULL.
+  @param[in] addr Virtual memory address.
+  @param[in] length Size of mapping in bytes.
+  @param[in] prot Mapping access permissions (R/W/X).
+  @param[in] flags Mapping modes.\n
+             - #QURT_MAP_NAMED_MEMSECTION
+             - #QURT_MAP_FIXED \n
+             - #QURT_MAP_NONPROCESS_VPOOL \n
+             - #QURT_MAP_TRYFIXED \n
+             - #QURT_MAP_ANON \n
+             - #QURT_MAP_PHYSADDR \n
+             - #QURT_MAP_VA_ONLY @tablebulletend
+  @param[in] fd File designator.
+  @param[in] offset Offset in file.
+
+  @return
+  Valid virtual address -- Success.\n
+  #QURT_MAP_FAILED -- Mapping creation failed.
+ */
+void *qurt_mem_mmap(int client_handle,
+                    qurt_mem_pool_t pool,
+                    qurt_mem_region_t *pRegion,
+                    void *addr,
+                    size_t length,
+                    int prot,
+                    int flags,
+                    int fd,
+                    unsigned long long offset);
+
+/**@ingroup func_qurt_mem_mmap2
+  Creates a memory mapping with the specified attributes. Returns a more descriptive
+  error code in case of failure.
+  This API allows the root process caller to create a mapping on behalf of a user
+  process. If the client_handle belongs to a valid user process, the resulting
+  mapping is created for that process.
+  If -1 is passed in place of client_handle, the API creates the mapping
+  for the underlying process of the caller.
+
+  @note1hang If the specified attributes are not valid, an error result is returned.
+
+  @param[in] client_handle Client handle to use for this mapping (optional).
+  @param[in] pool Optional argument that allows the user to specify a pool handle
+                  when the user wants to allocate memory from a specific pool.
+                  The default value for this argument is NULL.
+  @param[in] pRegion Map region (unused argument); default value is NULL.
+  @param[in] addr Virtual memory address.
+  @param[in] length Size of mapping in bytes.
+  @param[in] prot Mapping access permissions (R/W/X),
+                  cache attributes, bus attributes, and user mode.
+  @param[in] flags Mapping modes:
+                   Shared, Private, or Anonymous.
+  @param[in] fd File designator.
+  @param[in] offset Offset in file.
+
+  @return
+  Valid virtual address -- Success.\n
+  #QURT_EMEM -- Physical address is not available. \n
+  #QURT_EFAILED -- VA is not available or mapping failed.\n
+  #QURT_EINVALID -- Invalid argument was passed (for example, an unaligned VA/PA).
+ */
+void *qurt_mem_mmap2(int client_handle,
+                     qurt_mem_pool_t pool,
+                     qurt_mem_region_t *pRegion,
+                     void *addr,
+                     size_t length,
+                     int prot,
+                     int flags,
+                     int fd,
+                     unsigned long long offset);
+
+/**@ingroup func_qurt_mem_mmap_by_name
+  Creates a memory mapping for a named memsection using the specified attributes.
+  The named memsection should be specified in cust_config.xml.
+
+  @note1hang If the specified attributes are not valid or the named memsection is not found,
+             an error result is returned.
+
+  @param[in] name Name of the memsection in cust_config.xml that specifies
+                  this mapping. Should be less than 25 characters.
+  @param[in] addr Virtual memory address.
+  @param[in] length Size of mapping in bytes.
+  @param[in] prot Mapping access permissions (R/W/X),
+                  cache attributes, bus attributes, and user mode.
+  @param[in] flags Mapping modes, such as
+                   Shared, Private, or Anonymous.
+  @param[in] offset Offset relative to the physical address range specified in the memsection.
+                    If offset + length exceeds the size of the memsection, failure is
+                    returned.
+  @return
+  Valid virtual address -- Success.\n
+  #QURT_MAP_FAILED -- Mapping creation failed.
+ */
+void *qurt_mem_mmap_by_name(const char* name,
+                            void *addr,
+                            size_t length,
+                            int prot,
+                            int flags,
+                            unsigned long long offset);
+
+/**@ingroup func_qurt_mem_mprotect2
+  Changes access permissions and attributes on an existing mapping based on the client_handle argument.
+
+  @note1hang If the specified virtual address is not found or invalid attributes are passed,
+             an error code is returned.
+
+  @note2 When an error is returned, it is possible that attributes/permissions were changed for some part of the
+         mapping while they are unchanged for the remainder. Clients should not use these mappings further.
+
+  @param[in] client_handle Obtained from the current invocation function (Section 3.4.1).
+  @param[in] addr Virtual memory address.
+  @param[in] length Size of mapping in bytes.
+  @param[in] prot Mapping access permissions (R/W/X),
+                  cache attributes, bus attributes, and user mode.
+  @return
+  #QURT_EOK -- Successfully changed permissions on the mapping.\n
+  #QURT_EFATAL -- Failed to change permissions on the mapping. \n
+  #QURT_EINVALID -- Attributes / permissions requested are invalid.
+ */
+int qurt_mem_mprotect2(int client_handle, const void *addr,
+                       size_t length,
+                       int prot);
+
+/**@ingroup func_qurt_mem_mprotect
+  Changes access permissions and attributes on an existing mapping.
+
+  @note1hang If the specified virtual address is not found or invalid attributes are passed,
+             an error code is returned.\n
+
+  @note2 When an error is returned, it is possible that attributes/permissions were changed for some part of the
+         mapping while they are unchanged for the remainder. Clients should not use these mappings further.
+
+  @param[in] addr Virtual memory address.
+  @param[in] length Size of mapping in bytes.
+  @param[in] prot Mapping access permissions (R/W/X),
+                  cache attributes, bus attributes, and user mode.
+  @return
+  #QURT_EOK -- Successfully changed permissions on the mapping. \n
+  #QURT_EFATAL -- Failed to change permissions on the mapping. \n
+  #QURT_EINVALID -- Attributes / permissions requested are invalid.
+ */
+int qurt_mem_mprotect(const void *addr,
+                      size_t length,
+                      int prot);
+
+/**@ingroup func_qurt_mem_munmap
+  Removes an existing mapping.
+
+  @note1hang If the specified mapping is not found in the context of the caller process
+             or invalid attributes are passed, an error code is returned.
+
+  @param[in] addr Virtual memory address.
+  @param[in] length Size of mapping in bytes.
+
+  @return
+  #QURT_EOK -- Successfully removed the mapping. \n
+  #QURT_EFATAL -- Failed to remove the mapping. \n
+  #QURT_ELOCKED -- Buffer is locked; mapping delete failed.
+ */
+int qurt_mem_munmap(void *addr,
+                    size_t length);
+
+/**@ingroup func_qurt_mem_munmap2
+  Removes an existing mapping for a specified process.
+
+  @note1hang This API allows a root process entity, such as a driver, to remove a mapping
+             that was created for a user process. If the specified mapping is not found in the context
+             of the client handle or invalid attributes are passed, an error code is returned.
+
+  @param[in] client_handle Client handle of the user process that owns this mapping.
+  @param[in] addr Virtual memory address.
+  @param[in] length Size of mapping in bytes.
+
+  @return
+  #QURT_EOK -- Successfully removed the mapping. \n
+  #QURT_EFATAL -- Failed to remove the mapping. \n
+  #QURT_ELOCKED -- Buffer is locked; mapping delete failed.
+ */
+int qurt_mem_munmap2(int client_handle,
+                     void *addr,
+                     size_t length);
+
+/**@ingroup func_qurt_mem_munmap3
+  Removes an existing mapping or reservation for a specified process.
+
+  @param[in] client_handle Client handle of the user process that owns this mapping.
+  @param[in] addr Pointer to a virtual memory address.
+  @param[in] length Size of mapping in bytes.
+  @param[in] flags Specifies the unmap flags.
+
+  @return
+  #QURT_EOK -- Successfully removed the mapping. \n
+  #QURT_EFATAL -- Failed to remove the mapping. \n
+  #QURT_ELOCKED -- Buffer is locked; mapping delete failed.
+ */
+int qurt_mem_munmap3(int client_handle,
+                     void *addr,
+                     size_t length,
+                     int flags);
+
+/*
+|| The macros here follow the style of the standard mmap() macros, but with
+|| QURT_ prepended to avoid name conflicts, and to avoid having a dependency
+|| on sys/mman.h.
+||
+|| Wherever possible, any values here that are also present in sys/mman.h
+|| should have the same value in both places so that we can accept "mmap"
+|| calls without having to remap parameters to new values.
+||
+|| In the future, it would be desirable to have a regression test that
+|| checks, for instance, that these macros match. Example:
+||
+||    assert(QURT_MAP_FAILED == MAP_FAILED);
+||    ... repeat as needed ...
+*/
+
+/** @addtogroup memory_mapping_macros
+@{ */
+/** @cond */
+#define QURT_PROT_NONE   0x00U  /**< */
+#define QURT_PROT_READ   0x01U  /**< */
+#define QURT_PROT_WRITE  0x02U  /**< */
+#define QURT_PROT_EXEC   0x04U  /**< */
+#define QURT_PROT_NODUMP 0x08U  /**< Skip dumping the mapping. During PD dump, must skip
+                                     some mappings on host memory to avoid a race condition
+                                     where the memory is removed from the host and the DSP process
+                                     crashes before the mapping is removed.*/
+#define QURT_PROT_ISLAND 0x10U  /**< Island mapping. */
+
+#define QURT_MAP_SHARED  0x0001U /**< Shared. */
+#define QURT_MAP_PRIVATE 0x0002U /**< Private. */
+/** @endcond */
+#define QURT_MAP_NAMED_MEMSECTION 0x0004U /**< Named memsection. */
+#define QURT_MAP_FIXED     0x0010U /**< Fixed virtual address. */
+#define QURT_MAP_RENAME    0x0020U /**< Rename. */
+#define QURT_MAP_NORESERVE 0x0040U /**< No reserve. */
+#define QURT_MAP_INHERIT   0x0080U /**< Inherit. */
+#define QURT_MAP_NONPROCESS_VPOOL 0x0100U /**< Use a virtual address outside of the default range of the
+                                               processes. This option is only supported in the root process
+                                               and only when virtual memory split is enabled in the XML.
+                                               The root process can use this flag to create mapping for a
+                                               user process, for example, if the virtual address is configured
+                                               for a 3G/1G split, the root process can use this flag to create
+                                               mapping in the top 1 GB area for the user process or the
+                                               lower 3 GB area for the root process. This is useful for
+                                               shared buffer use cases. */
+#define QURT_MAP_HASSEMAPHORE 0x0200U /**< Has semaphore. */
+#define QURT_MAP_TRYFIXED  0x0400U /**< Try to create a mapping for a virtual address that was passed.
+                                        If the passed virtual address fails, use a random virtual address. */
+#define QURT_MAP_WIRED     0x0800U /**< Wired. */
+#define QURT_MAP_FILE      0x0000U /**< File. */
+#define QURT_MAP_ANON      0x1000U /**< Allocate physical memory from the pool that was passed.
+                                        By default, memory is allocated from the default physpool. */
+#define QURT_MAP_VA_ONLY   0x2000U /**< Reserve a virtual address without
+                                        mapping it.
*/ + +/** @cond */ +#define QURT_MAP_ALIGNED(n) ((n) << QURT_MAP_ALIGNMENT_SHIFT) +#define QURT_MAP_ALIGNMENT_SHIFT 24 + + +#define QURT_MAP_ALIGNMENT_MASK QURT_MAP_ALIGNED(0xff) /**< */ +#define QURT_MAP_ALIGNMENT_64KB QURT_MAP_ALIGNED(16) /**< */ +#define QURT_MAP_ALIGNMENT_16MB QURT_MAP_ALIGNED(24) /**< */ +#define QURT_MAP_ALIGNMENT_4GB QURT_MAP_ALIGNED(32) /**< */ +#define QURT_MAP_ALIGNMENT_1TB QURT_MAP_ALIGNED(40) /**< */ +#define QURT_MAP_ALIGNMENT_256TB QURT_MAP_ALIGNED(48) /**< */ +#define QURT_MAP_ALIGNMENT_64PB QURT_MAP_ALIGNED(56) /**< */ +/** @endcond */ +#define QURT_MAP_FAILED ((void *) -1) /**< Mapping creation failed. */ + +/* +|| The macros below are extensions beyond the standard mmap flags, but follow +|| the style of the mmap flags. +*/ +/** @cond */ +// Describe bitfields in (prot) +#define QURT_PROT_CACHE_BOUNDS 16U,19U,7U /**< Bits 16 through 19 are cache attribute, default is 0. */ +#define QURT_PROT_BUS_BOUNDS 20U,21U,0U /**< Bits 20 through 21 are bus attributes, default is 0. */ +#define QURT_PROT_USER_BOUNDS 22U,23U,3U /**< Bits 22 through 23 are user mode, default is 3; + default of 3 means to derive user mode setting from the + default mode of the client. */ + +// Describe bitfields in (flags) +#define QURT_MAP_PHYSADDR_BOUNDS 15U,15U,0U /**< Bits 15 through 15 are physaddr, default is 0. */ +#define QURT_MAP_TYPE_BOUNDS 16U,19U,0U /**< Bits 16 through 19 are mapping type, default is 0. */ +#define QURT_MAP_REGION_BOUNDS 20U,23U,0U /**< Bits 20 through 23 are region type, default is 0. */ +/** @endcond */ + +// These macros get OR'ed into (prot) +#define QURT_PROT_CACHE_MODE(n) QURT_MMAP_BUILD(QURT_PROT_CACHE_BOUNDS,(n)) /**< */ +#define QURT_PROT_BUS_ATTR(n) QURT_MMAP_BUILD(QURT_PROT_BUS_BOUNDS,(n)) /**< */ +#define QURT_PROT_USER_MODE(n) QURT_MMAP_BUILD(QURT_PROT_USER_BOUNDS,(n)) /**< */ +// These macros get OR'ed into (flags) + +#define QURT_MAP_PHYSADDR QURT_MMAP_BUILD(QURT_MAP_PHYSADDR_BOUNDS,1U) /**< Use the physical address that was passed in offset field. + This is allowed only for root process. */ +#define QURT_MAP_TYPE(n) QURT_MMAP_BUILD(QURT_MAP_TYPE_BOUNDS,(n)) /**< */ +#define QURT_MAP_REGION(n) QURT_MMAP_BUILD(QURT_MAP_REGION_BOUNDS,(n)) /**< */ +/** @} */ /* end_addtogroup memory_mapping_macros */ +/** @cond */ +// These macros extract fields from (prot) +#define QURT_PROT_GET_CACHE_MODE(n) QURT_MMAP_EXTRACT(QURT_PROT_CACHE_BOUNDS,(n)) /**< */ +#define QURT_PROT_GET_BUS_ATTR(n) QURT_MMAP_EXTRACT(QURT_PROT_BUS_BOUNDS,(n)) /**< */ +#define QURT_PROT_GET_USER_MODE(n) QURT_MMAP_EXTRACT(QURT_PROT_USER_BOUNDS,(n)) /**< */ + +// These macros extract fields from (flags) +#define QURT_MAP_GET_TYPE(n) QURT_MMAP_EXTRACT(QURT_MAP_TYPE_BOUNDS,(n)) /**< */ +#define QURT_MAP_GET_REGION(n) QURT_MMAP_EXTRACT(QURT_MAP_REGION_BOUNDS,(n)) /**< */ + +// Macros for bitfield insertion and extraction +#define QURT_MMAP_MASK(lo,hi) (~((~0u) << ((hi)-(lo)+1U))) /**< Mask of same size as [lo..hi]. 
*/ +#define QURT_MMAP_BUILD_(lo,hi,def,n) ((((n)^(def))&QURT_MMAP_MASK((lo),(hi)))<<(lo)) /**< */ +#define QURT_MMAP_EXTRACT_(lo,hi,def,n) ((((n)>>(lo))&QURT_MMAP_MASK((lo),(hi)))^(def)) /**< */ +#define QURT_MMAP_BUILD(a,b) QURT_MMAP_BUILD_(a,b) /**< */ +#define QURT_MMAP_EXTRACT(a,b) QURT_MMAP_EXTRACT_(a,b) /**< */ +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_mq.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_mq.h new file mode 100755 index 0000000000000..580c83d3de41a --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_mq.h @@ -0,0 +1,458 @@ +#ifndef QURT_MQ_H +#define QURT_MQ_H +/** + @file qurt_mq.h + + @brief Prototypes of secure message queues API functions. + + EXTERNALIZED FUNCTIONS + None + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None + + Copyright (c) 2019-2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. +======================================================================*/ +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +#define QURT_MQ_NAME_MAXLEN 16U /**< Maximum name length. */ + + +/*============================================================================= + FORWARD DECLARATIONS & TYPEDEFS +=============================================================================*/ +/* This enum must be generated in accordance to process class class numbers. + For now it is made to match generated version, do not change this unless + there is a corresponding change in the process_class.py, indicies start from 0 + basically: QURT_MQ_SECURITY_SCOPE_ = (1 << QURTK_process_class_index_) +*/ +typedef enum { + QURT_MQ_SECURITY_SCOPE_KERNEL = ( 1U << 0 ), + QURT_MQ_SECURITY_SCOPE_SRM = ( 1U << 1 ), + QURT_MQ_SECURITY_SCOPE_SECURE = ( 1U << 2 ), + QURT_MQ_SECURITY_SCOPE_CPZ = ( 1U << 3 ), + QURT_MQ_SECURITY_SCOPE_ROOT = ( 1U << 4 ), + QURT_MQ_SECURITY_SCOPE_SIGNED = ( 1U << 5 ), + QURT_MQ_SECURITY_SCOPE_UNSIGNED = ( 1U << 6 ), + QURT_MQ_SECURITY_SCOPE_SECURE_ROOT = ( 1U << 7 ) +} qurt_mq_security_scope_t; + +typedef enum { + QURT_MQ_CARDINALITY_PTP = (1U << 0), + QURT_MQ_CARDINALITY_MTO = (1U << 1) +}qurt_mq_cardinality_t; + +typedef unsigned int qurt_mqd_t; + +typedef union{ + struct { + unsigned int perms:2; + unsigned int cardinality:1; + unsigned int blocking:1; + + qurt_mq_security_scope_t creator_scope: 8; + qurt_mq_security_scope_t allowed_scope: 8; //can be a bitmask in case of MTO + unsigned int queue_closed: 1; + unsigned int reserved: 11; + }; //try to do anonymous struct + unsigned int raw; +} qurt_mq_flags_t; + + +/* permissions are from qurt_types.h , block X though */ +#if 0 +/** Memory access permission. */ +typedef enum { + QURT_PERM_READ=0x1U, /**< */ + QURT_PERM_WRITE=0x2U, /**< */ + QURT_PERM_EXECUTE=0x4U, /**< */ + QURT_PERM_FULL=QURT_PERM_READ|QURT_PERM_WRITE|QURT_PERM_EXECUTE, /**< */ +} qurt_perm_t; +#endif + +struct qurt_mq_attr { + unsigned flags; /**< Configured flags. Only meaningful with get_attr(), only used for qurt_mq_flags_t.perms. */ + unsigned mq_maxmsg; /**< Maximum number of messages. Used with create() and get_attr. 
 */
+  unsigned short mq_send_msgsize;    /**< Maximum size (bytes) of a message in the receiver-facing queue,
+                                          from sender to receiver. */
+  unsigned short mq_recv_msgsize;    /**< Maximum size (bytes) of a message in the sender-facing queue,
+                                          from receiver to sender. */
+  unsigned client_pid;               /**< Process ID of the client that is allowed to open the message queue
+                                          that was created using qurt_mq_create(). */
+  qurt_mq_cardinality_t cardinality; /**< Cardinality of the message queue connection, see below. */
+  qurt_mq_security_scope_t scope;    /**< Security scope of the senders to the queue. */
+};
+
+
+/*=============================================================================
+  EXTERNS & FUNCTIONS
+=============================================================================*/
+/**@ingroup func_qurt_mq_attr_init
+  Initializes attributes to the default values used for creating the queue.
+
+  The initialize operation sets the following default attribute values: \n
+  - flags - QURT_PERM_READ | QURT_PERM_WRITE \n
+  - maxmsg - 1 \n
+  - mq_send_msgsize - 8 \n
+  - mq_recv_msgsize - 8 \n
+  - client_pid - -1 \n
+  - cardinality - QURT_MQ_CARDINALITY_PTP \n
+  - scope - QURT_MQ_SECURITY_SCOPE_SIGNED \n
+
+  @datatypes
+  #qurt_mq_attr
+
+  @param[in,out] attr  Pointer to the message queue attribute object to initialize.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_mq_attr_init(struct qurt_mq_attr * attr);
+
+/**@ingroup func_qurt_mq_attr_set_send_msgsize
+  Sets the message size in bytes that the sender can send.
+  The maximum message length is configurable using the XML configuration; however, it is
+  limited to a maximum value of 62 bytes.
+
+  @datatypes
+  #qurt_mq_attr
+
+  @param[in,out] attr  Pointer to the message queue object.
+  @param[in]     len   Length of message in bytes.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_mq_attr_set_send_msgsize (struct qurt_mq_attr *attr, size_t len);
+
+/**@ingroup func_qurt_mq_attr_set_recv_msgsize
+  Sets the message size in bytes that the receiver can read.
+  The maximum message length is configurable using the XML configuration; however, it is
+  limited to a maximum value of 62 bytes.
+
+  @datatypes
+  #qurt_mq_attr
+
+  @param[in,out] attr  Pointer to the message queue object.
+  @param[in]     len   Length of message in bytes.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_mq_attr_set_recv_msgsize (struct qurt_mq_attr *attr, size_t len);
+
+/**@ingroup func_qurt_mq_attr_set_maxmsg
+  Sets the maximum number of messages that can be queued in the message queue.
+  The message depth is configurable using the XML configuration.
+
+  @datatypes
+  #qurt_mq_attr
+
+  @param[in,out] attr   Pointer to the message queue object.
+  @param[in]     depth  Maximum number of messages that can be queued.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_mq_attr_set_maxmsg (struct qurt_mq_attr *attr, unsigned int depth);
+
+/**@ingroup func_qurt_mq_attr_set_scope
+  Sets the scope of the message queue. A message queue created with a security
+  scope allows only a process class of that scope to open the message queue.
+
+  @datatypes
+  #qurt_mq_attr \n
+  #qurt_mq_security_scope_t
+
+  @param[in,out] attr   Pointer to the message queue object.
+  @param[in]     scope  Scope of the message queue: \n
+                        #QURT_MQ_SECURITY_SCOPE_KERNEL \n
+                        #QURT_MQ_SECURITY_SCOPE_SRM \n
+                        #QURT_MQ_SECURITY_SCOPE_SECURE \n
+                        #QURT_MQ_SECURITY_SCOPE_CPZ \n
+                        #QURT_MQ_SECURITY_SCOPE_ROOT \n
+                        #QURT_MQ_SECURITY_SCOPE_SIGNED \n
+                        #QURT_MQ_SECURITY_SCOPE_UNSIGNED
+
+  @return
+  None.
+
+  @dependencies
+  None.
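+
+  A hedged setup sketch (not from the SDK docs; the sizes, depth, and
+  scope value are arbitrary assumptions):
+  @code
+  struct qurt_mq_attr attr;
+  qurt_mq_attr_init(&attr);
+  qurt_mq_attr_set_send_msgsize(&attr, 32);  // within the 62-byte limit
+  qurt_mq_attr_set_recv_msgsize(&attr, 32);
+  qurt_mq_attr_set_maxmsg(&attr, 4);
+  qurt_mq_attr_set_scope(&attr, QURT_MQ_SECURITY_SCOPE_SIGNED);
+  @endcode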
+*/
+void qurt_mq_attr_set_scope (struct qurt_mq_attr *attr, qurt_mq_security_scope_t scope);
+
+
+/**@ingroup func_qurt_mq_attr_set_client_pid
+  Sets the client PID that can open this message queue.
+  If client_pid is set, the allowed scope is not considered when the message queue is opened.
+
+  @datatypes
+  #qurt_mq_attr
+
+  @param[in,out] attr        Pointer to the message queue object.
+  @param[in]     client_pid  Valid PID of the client process.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_mq_attr_set_client_pid (struct qurt_mq_attr *attr, unsigned client_pid);
+
+/**@ingroup func_qurt_mq_attr_set_flags
+  Sets the properties of the message queue.
+  The current implementation uses the flags attribute only to set the permissions of the message queue.
+  The default is #QURT_PERM_READ | #QURT_PERM_WRITE; explicit permissions are not implemented.
+
+  @datatypes
+  #qurt_mq_attr
+
+  @param[in,out] attr   Pointer to the message queue object.
+  @param[in]     flags  Permissions for the message queue.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_mq_attr_set_flags (struct qurt_mq_attr *attr, unsigned int flags);
+
+/**@ingroup func_qurt_mq_create
+  Creates a message queue with the provided name and attributes.
+  The calling process becomes the owner of the queue.
+  The name of the message queue is limited to 16 characters, including the NULL terminator.
+
+  @datatypes
+  #qurt_mq_attr \n
+  #qurt_mqd_t
+
+  @param[out] mqd   Returns a pointer to the message queue identifier if
+                    the message queue was successfully created.
+  @param[in]  name  String identifier of the message queue.
+  @param[in]  attr  Pointer to the initialized message queue attribute
+                    structure that specifies the attributes of the created message queue.
+
+  @return
+  #QURT_EOK -- Message queue created. \n
+  #QURT_EINVALID -- Invalid arguments. \n
+  #QURT_ENOSPC -- Maximum number of queues in the system is exceeded.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_create(qurt_mqd_t *mqd, const char *name, struct qurt_mq_attr *attr);
+
+/**@ingroup func_qurt_mq_open
+  Opens a message queue connection between a process and a created message queue.
+
+  @datatypes
+  #qurt_mq_attr \n
+  #qurt_mqd_t
+
+  @param[out] mqd    Returns a pointer to the message queue
+                     identifier if the message queue was successfully opened.
+  @param[in]  name   String identifier of the message queue.
+  @param[in]  flags  Flags that define the behavior of the message queue connection.
+                     Permissions:\n
+                     #QURT_PERM_READ \n
+                     #QURT_PERM_WRITE \n
+                     #QURT_PERM_READ | QURT_PERM_WRITE @tablebulletend
+                     The default is QURT_PERM_READ | QURT_PERM_WRITE; explicit permissions are not implemented. \n
+                     Cardinality: \n
+                     #QURT_MQ_CARDINALITY_PTP (default) \n
+                     #QURT_MQ_CARDINALITY_MTO (not implemented) \n
+                     Blocking: suspend the thread until the message queue with the specified name is created. \n
+                     Scope: security boundary to which the message queue and its users are constrained;
+                     it is coupled with the process privilege level/scope.\n
+                     #QURT_MQ_SECURITY_SCOPE_KERNEL \n
+                     #QURT_MQ_SECURITY_SCOPE_SRM \n
+                     #QURT_MQ_SECURITY_SCOPE_SECURE \n
+                     #QURT_MQ_SECURITY_SCOPE_CPZ \n
+                     #QURT_MQ_SECURITY_SCOPE_ROOT \n
+                     #QURT_MQ_SECURITY_SCOPE_SIGNED \n
+                     #QURT_MQ_SECURITY_SCOPE_UNSIGNED @tablebulletend
+
+  @return
+  #QURT_EOK -- Message queue connection successfully opened. \n
+  #QURT_EFAILED -- Message queue connection failed (non-blocking message queue). \n
+  #QURT_ENOTALLOWED -- Open failed due to security scope mismatch.
+
+  @dependencies
+  None.
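+
+  A hedged open sketch (the queue name and the raw-flags initialization
+  pattern are assumptions for illustration):
+  @code
+  qurt_mqd_t mqd;
+  qurt_mq_flags_t flags;
+  flags.raw = 0;                                   // start from a zeroed flag set
+  flags.perms = QURT_PERM_READ | QURT_PERM_WRITE;  // default permissions
+  if (qurt_mq_open(&mqd, "myq", flags) == QURT_EOK) {
+      /* ... mqd is ready for qurt_mq_send()/qurt_mq_recv() ... */
+  }
+  @endcode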
+*/
+int qurt_mq_open (qurt_mqd_t *mqd, const char *name, qurt_mq_flags_t flags);
+
+/**@ingroup func_qurt_mq_send
+  Sends a message over a message queue.\n
+  - If the message queue is full, the calling thread shall be
+    suspended until space becomes available to enqueue the message. \n
+  - If there exists a thread suspended on an empty queue
+    to receive a message, qurt_mq_send shall resume that thread.
+
+  @datatypes
+  #qurt_mqd_t
+
+  @param[in] mqd      Message queue descriptor.
+  @param[in] msg_ptr  Pointer to the message buffer.
+  @param[in] msg_len  Length of the message buffer in bytes.
+
+  @return
+  #QURT_EOK -- Message queue send was successful.\n
+  #QURT_EMSGSIZE -- Message size in the msg_len field is greater than the max_message_len specified during queue creation.\n
+  #QURT_ENOTALLOWED -- Send failed due to security scope mismatch.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_send(qurt_mqd_t mqd, const char *msg_ptr, size_t msg_len);
+
+/**@ingroup func_qurt_mq_send_timed
+  Sends a message over a message queue.\n
+  - If the message queue is full, the calling thread shall be
+    suspended until space becomes available to enqueue the message or until the timeout is reached. \n
+  - If there exists a thread suspended on an empty queue
+    to receive a message, qurt_mq_send_timed shall resume that thread.\n
+  - If the timeout is reached, qurt_mq_send_timed shall return #QURT_ETIMEDOUT.
+
+  @datatypes
+  #qurt_mqd_t
+
+  @param[in] mqd       Message queue descriptor.
+  @param[in] msg_ptr   Pointer to the message buffer.
+  @param[in] duration  Interval (in microseconds); the duration value must be
+                       between #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+  @param[in] msg_len   Length of the message buffer in bytes.
+
+  @return
+  #QURT_EOK -- Message queue send was successful. \n
+  #QURT_EMSGSIZE -- Message size in the msg_len field is greater than the max_message_len specified during queue creation.\n
+  #QURT_ENOTALLOWED -- Send failed due to security scope mismatch. \n
+  #QURT_ETIMEDOUT -- Timeout.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_send_timed(qurt_mqd_t mqd, const char *msg_ptr, unsigned long long int duration, size_t msg_len);
+
+/**@ingroup func_qurt_mq_recv
+  Receives a message from the message queue. \n
+  - If the message queue is empty, the calling thread shall be
+    suspended until a message is enqueued in the message queue. \n
+  - If there exists a thread suspended on a full queue to
+    send a message, qurt_mq_recv shall resume that thread.
+
+  @datatypes
+  #qurt_mqd_t
+
+  @param[in]     mqd      Message queue descriptor.
+  @param[out]    msg_ptr  Pointer to the message buffer.
+  @param[in,out] msg_len  Pointer to the length of the message buffer.
+
+  @return
+  #QURT_EOK -- Message successfully received.\n
+  #QURT_EINVALID -- Message pointer or msg_len pointer is NULL. \n
+  #QURT_EBADR -- Message queue descriptor (mqd) is invalid. \n
+  #QURT_EBADF -- Sender closed the message queue.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_recv(qurt_mqd_t mqd, unsigned char *msg_ptr, size_t *msg_len);
+
+/**@ingroup func_qurt_mq_recv_timed
+  Receives a message from the message queue. \n
+  - If the message queue is empty, the calling thread shall be
+    suspended until a message is enqueued in the message queue or until the timeout is reached.\n
+  - If there exists a thread suspended on a full queue to
+    send a message, qurt_mq_recv_timed shall resume that thread.\n
+  - If the timeout is reached, qurt_mq_recv_timed shall return #QURT_ETIMEDOUT.
+
+  @datatypes
+  #qurt_mqd_t
+
+  @param[in]     mqd       Message queue descriptor.
+  @param[out]    msg_ptr   Pointer to the message buffer.
+  @param[in]     duration  Interval (in microseconds); the duration value must be
+                           between #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+  @param[in,out] msg_len   Pointer to the length of the message buffer.
+
+  @return
+  #QURT_EOK -- Message successfully received.\n
+  #QURT_EINVALID -- Message pointer or msg_len pointer is NULL. \n
+  #QURT_EBADR -- Message queue descriptor (mqd) is invalid.\n
+  #QURT_EBADF -- Sender closed the message queue. \n
+  #QURT_ETIMEDOUT -- Timeout.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_recv_timed(qurt_mqd_t mqd, unsigned char *msg_ptr, unsigned long long int duration, size_t *msg_len);
+
+/**@ingroup func_qurt_mq_close
+  Closes the message queue and disassociates the calling process (client) from the message queue
+  under this descriptor. Marks the queue as closed for the receiver.
+  This function is expected to be called from the client side. If called
+  from the server side, the function reduces to a no-op and returns success.
+
+  @datatypes
+  #qurt_mqd_t
+
+  @param[in] mqd  Message queue descriptor.
+
+  @return
+  #QURT_EOK -- Message queue closed successfully.\n
+  #QURT_EBADR -- Invalid descriptor.\n
+  #QURT_ENOTALLOWED -- Message queue close was not called from the client side.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_close(qurt_mqd_t mqd);
+
+/**@ingroup func_qurt_mq_destroy
+  Destroys the message queue. This function must be
+  called from the process that called qurt_mq_create().
+
+  @datatypes
+  #qurt_mqd_t
+
+  @param[in] mqd  Message queue descriptor.
+
+  @return
+  #QURT_EOK -- Message queue destroyed successfully.\n
+  #QURT_EBADR -- Invalid descriptor.\n
+  #QURT_ENOTALLOWED -- Message queue destroy was not called from the creating process.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_destroy(qurt_mqd_t mqd);
+
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+#endif //QURT_MQ_H
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_mutex.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_mutex.h
new file mode 100755
index 0000000000000..4ad6b270cdde6
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_mutex.h
@@ -0,0 +1,211 @@
+#ifndef QURT_MUTEX_H
+#define QURT_MUTEX_H
+/**
+  @file qurt_mutex.h
+  @brief Prototypes of the mutex API.
+  This is mostly a user-space mutex, but it calls the
+  kernel to block if the mutex is taken.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup mutex_types
+@{ */
+/*=============================================================================
+  TYPEDEFS
+=============================================================================*/
+
+/** QuRT mutex type.
+
+    Both non-recursive mutex lock and unlock, and recursive
+    mutex lock and unlock can be applied to this type.
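+
+    A minimal hedged lifecycle sketch (assumed usage pattern, not from
+    the SDK docs):
+    @code
+    qurt_mutex_t m = QURT_MUTEX_INIT;  // or qurt_mutex_init(&m);
+    qurt_mutex_lock(&m);
+    /* ... critical section ... */
+    qurt_mutex_unlock(&m);
+    qurt_mutex_destroy(&m);            // prevents kernel resource leaks
+    @endcode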
+ */ +typedef union qurt_mutex_aligned8{ + /** @cond */ + struct { + unsigned int holder; + unsigned int count; + unsigned int queue; + unsigned int wait_count; + }; + unsigned long long int raw; + /** @endcond */ +} qurt_mutex_t; +/** @} */ /* end_addtogroup mutex_types */ +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/* @addtogroup mutex_const_macros +@{ */ +#define MUTEX_MAGIC 0xfe /**< */ +#define QURTK_FUTEX_FREE_MAGIC 0x1F // 11111 /**< */ +#define QURT_MUTEX_INIT {{MUTEX_MAGIC, 0, QURTK_FUTEX_FREE_MAGIC,0}} /**< Suitable as an initializer for a + variable of type qurt_mutex_t. */ +/* @} */ /* end_addtogroup mutex_const_macros */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ +/**@ingroup func_qurt_mutex_init + Initializes a mutex object. + The mutex is initially unlocked. + + @note1hang Each mutex-based object has one or more kernel resources associated with it; + to prevent resource leaks, call qurt_mutex_destroy() + when this object is not used anymore + @datatypes + #qurt_mutex_t + + @param[out] lock Pointer to the mutex object. Returns the initialized object. + + @return + None. + + @dependencies + None. + + */ +void qurt_mutex_init(qurt_mutex_t *lock); + +/**@ingroup func_qurt_mutex_destroy + Destroys the specified mutex. + + @note1hang Mutexes must be destroyed when they are no longer in use. Failure to do this + causes resource leaks in the QuRT kernel.\n + @note1cont Mutexes must not be destroyed while they are still in use. If this occurs, the + behavior of QuRT is undefined. + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the mutex object to destroy. + + @return + None. + + @dependencies + None. + + */ +void qurt_mutex_destroy(qurt_mutex_t *lock); + +/**@ingroup func_qurt_mutex_lock + Locks the specified mutex. + If a thread performs a lock operation on a mutex that is not in use, the thread gains + access to the shared resource that is protected by the mutex, and continues executing. + + If a thread performs a lock operation on a mutex that is already in use by another + thread, the thread is suspended. When the mutex becomes available again (because the + other thread has unlocked it), the thread is awakened and given access to the shared + resource. + + @note1hang A thread is suspended indefinitely if it locks a mutex that it has already + locked. Avoid this by using recursive mutexes (Section @xref{dox:recursive_mutexes}). + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the mutex object. Specifies the mutex to lock. + + @return + None. + + @dependencies + None. + */ +void qurt_mutex_lock(qurt_mutex_t *lock); /* blocking */ + +/**@ingroup func_qurt_mutex_lock_timed + Locks the specified mutex. + When a thread performs a lock operation on a mutex that is not in use, the thread gains + access to the shared resource that is protected by the mutex, and continues executing. + + When a thread performs a lock operation on a mutex that is already in use by another + thread, the thread is suspended. When the mutex becomes available again (because the + other thread has unlocked it), the thread is awakened and given access to the shared + resource. If the duration of suspension exceeds the timeout duration, wait is + terminated and no access to mutex is granted. 
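+
+   A hedged sketch (the 1000-microsecond timeout is an arbitrary value,
+   assumed to lie within the allowed timer range):
+   @code
+   if (qurt_mutex_lock_timed(&m, 1000uLL) == QURT_EOK) {
+       /* ... critical section ... */
+       qurt_mutex_unlock(&m);
+   } /* else: QURT_ETIMEDOUT, the lock was not acquired */
+   @endcode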
+ + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the mutex object; specifies the mutex to lock. + @param[in] duration Interval (in microseconds) that the duration value must be between #QURT_TIMER_MIN_DURATION and + #QURT_TIMER_MAX_DURATION + + @return + #QURT_EOK -- Success \n + #QURT_ETIMEDOUT -- Timeout + + @dependencies + None. + */ +int qurt_mutex_lock_timed (qurt_mutex_t * lock, unsigned long long int duration); + +/**@ingroup func_qurt_mutex_unlock + Unlocks the specified mutex. \n + More than one thread can be suspended on a mutex. When the mutex is unlocked, only the + highest-priority thread waiting on the mutex is awakened. If the awakened thread has + higher priority than the current thread, a context switch occurs. + + @note1hang The behavior of QuRT is undefined if a thread unlocks a mutex it did not first + lock. + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the mutex object. Specifies the mutex to unlock. + + @return + None. + + @dependencies + None. + */ +void qurt_mutex_unlock(qurt_mutex_t *lock); /* unlock */ + +/**@ingroup func_qurt_mutex_try_lock + @xreflabel{hdr:qurt_mutex_try_lock} + Attempts to lock the specified mutex. + If a thread performs a try_lock operation on a mutex that is not in use, the thread gains + access to the shared resource that is protected by the mutex, and continues executing. + + @note1hang If a thread performs a try_lock operation on a mutex that it has already locked + or is in use by another thread, qurt_mutex_try_lock immediately returns with a + nonzero result value. + + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the mutex object. Specifies the mutex to lock. + + @return + 0 -- Success. \n + Nonzero -- Failure. + + @dependencies + None. + */ +int qurt_mutex_try_lock(qurt_mutex_t *lock); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_MUTEX_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_os_services.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_os_services.h new file mode 100755 index 0000000000000..cbc4c239e9620 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_os_services.h @@ -0,0 +1,24 @@ +/*============================================================================= + + qurt_os_services.c + +GENERAL DESCRIPTION + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2023 by Qualcomm Technologies, Inc. All Rights Reserved. 
+=============================================================================*/
+
+#define QURT_OS_SERVICE_THREAD     "/os/thread"     /**< Thread service */
+#define QURT_OS_SERVICE_FS_HUB     "/os/fs_hub"     /**< File-system hub */
+#define QURT_OS_SERVICE_CALLBACK   "/os/callback"   /**< QDI callback service */
+#define QURT_OS_SERVICE_INTERRUPTS "/os/interrupt"  /**< Interrupt service */
+#define QURT_OS_SERVICE_PROXY      "/os/proxy"      /**< QDI proxy service */
+#define QURT_OS_SERVICE_MEMORY     "/os/memory"     /**< Memory management service */
+#define QURT_OS_SERVICE_MEMPOOL    "/os/mempool"    /**< Pool management service */
+#define QURT_OS_SERVICE_PROCESS    "/os/process"    /**< Process management service */
+#define QURT_OS_SERVICE_MMAP       "/os/mem_mapper" /**< Memory mapper service */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_pimutex.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_pimutex.h
new file mode 100755
index 0000000000000..61aee5cba7ce8
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_pimutex.h
@@ -0,0 +1,200 @@
+#ifndef QURT_PIMUTEX_H
+#define QURT_PIMUTEX_H 1
+/**
+  @file qurt_pimutex.h
+  @brief Prototypes of the qurt_pimutex API.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+  FUNCTIONS
+=============================================================================*/
+/**@ingroup func_qurt_pimutex_init
+  Initializes a priority inheritance mutex object.
+  The priority inheritance mutex is initially unlocked.
+
+  This function works the same as qurt_mutex_init().
+
+  @note1hang Each pimutex-based object has one or more kernel resources associated with it;
+             to prevent resource leaks, call qurt_pimutex_destroy()
+             when this object is no longer in use.
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[out] lock  Pointer to the priority inheritance mutex object.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_pimutex_init(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_pimutex_destroy
+  Destroys the specified priority inheritance mutex.
+
+  @note1hang Priority inheritance mutexes must be destroyed when they are no longer in
+             use. Failure to do this causes resource leaks in the QuRT kernel.\n
+  @note1cont Priority inheritance mutexes must not be destroyed while they are still in use.
+             If this occurs, the behavior of QuRT is undefined.
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[in] lock  Pointer to the priority inheritance mutex object to destroy.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_pimutex_destroy(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_pimutex_lock
+  Requests access to a shared resource. If a thread performs a lock operation on a mutex
+  that is not in use, the thread gains access to the shared resource that the mutex protects,
+  and continues executing.
+
+  If a thread performs a lock operation on a mutex that is already in use by another
+  thread, the thread is suspended. When the mutex becomes available again (because the
+  other thread has unlocked it), the thread is awakened and given access to the shared resource.
+ + If a thread is suspended on a priority inheritance mutex, and the priority of the suspended + thread is higher than the priority of the thread that has locked the mutex, the thread + with the mutex acquires the higher priority of the suspended thread. The locker thread blocks + until the lock is available. + + @note1hang A thread is not suspended if it locks a priority inheritance mutex that it has + already locked . However, the mutex does not become available to other + threads until the thread performs a balanced number of unlocks on the mutex.\n + @note1cont When multiple threads compete for a mutex, the lock operation for a priority + inheritance mutex is slower than it is for a recursive mutex. + In particular, it is about 10 times slower when the mutex is available for locking, + and slower (with greatly varying times) when the mutex is already locked. + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the priority inheritance mutex object to lock. + + @return + None. + + @dependencies + None. + + */ +void qurt_pimutex_lock(qurt_mutex_t *lock); + + +/**@ingroup func_qurt_pimutex_lock_timed + Locks a priority inheritance mutex with timeout. + + A thread can lock a priority inheritance mutex for multiple times. The mutex is not + available to other threads until the thread performs the same number of mutex unlock + operations. + + If a thread performs a lock operation on a mutex that is already locked by another thread, + the thread is moved to waiting state. When the mutex becomes available again (because the + other thread has unlocked the mutex), the thread is awakened and tries to lock the mutex. + + If a thread is waiting on a priority inheritance mutex, and the priority of the waiting thread + is higher than the priority of the thread that has locked the mutex, the priority of the thread + that has locked the mutex is raised to the same priority of the waiting thread. + + If the duration of waiting exceeds the timeout duration, the waiting is terminated, and + the function returns QURT_ETIMEDOUT as a failure of the mutex lock. + + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the mutex object to lock. + @param[in] duration Duration (in microseconds) to wait. The duration value must be between + #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION. + + @return + #QURT_EOK -- Success \n + #QURT_ETIMEDOUT -- Timeout + #QURT_EINVALID -- Duration is out of range + + @dependencies + None. + + */ +int qurt_pimutex_lock_timed(qurt_mutex_t *lock, unsigned long long int duration); + + +/**@ingroup func_qurt_pimutex_unlock + Releases access to a shared resource; unlocks the specified priority inheritance mutex. \n + More than one thread can be suspended on a priority inheritance mutex. When the mutex + is unlocked, only the highest-priority thread waiting on the mutex is awakened. If the + awakened thread has higher priority than the current thread, a context switch occurs. + + When a thread unlocks a priority inheritance mutex, its thread priority is restored to its + original value from any higher priority value that it acquired from another thread + suspended on the mutex. + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the priority inheritance mutex object to unlock. + + @return + None. + + @dependencies + None. + + */ +void qurt_pimutex_unlock(qurt_mutex_t *lock); + +/**@ingroup func_qurt_pimutex_try_lock + Request access to a shared resource (without suspend). 
Attempts to lock the specified priority inheritance mutex.\n + If a thread performs a try_lock operation on a priority inheritance mutex that is not in + use, the thread gains access to the shared resource that is protected by the mutex, and + continues executing. + If a thread performs a try_lock operation on a priority inheritance mutex that is already + in use by another thread, qurt_pimutex_try_lock immediately returns with a + nonzero result value. + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the priority inheritance mutex object to lock. + + @return + 0 -- Success. \n + Nonzero -- Failure. + + @dependencies + None. + */ +int qurt_pimutex_try_lock(qurt_mutex_t *lock); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_PIMUTEX_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_pimutex2.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_pimutex2.h new file mode 100755 index 0000000000000..b809f163cbfd2 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_pimutex2.h @@ -0,0 +1,162 @@ +#ifndef QURT_PIMUTEX2_H +#define QURT_PIMUTEX2_H +/** + @file qurt_pimutex2.h + @brief Prototypes of pimutex2 API + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2013, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + + +#include +#include +#include + +/*============================================================================= + FUNCTIONS +=============================================================================*/ +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup func_qurt_pimutex2_init + Initializes a recursive mutex object. + + @deprecated use #qurt_pimutex_init instead. + + The recursive mutex is initially unlocked. + + Objects of type pimutex2 solve a potential race condition between + unlock() and destroy() operations. + + @datatypes + #qurt_rmutex2_t + + @param[out] lock Pointer to the recursive mutex object. + + @return + None. + + @dependencies + None. + + */ +void qurt_pimutex2_init(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_pimutex2_destroy + + @deprecated use #qurt_pimutex_destroy instead. + + Destroys the specified recursive mutex. \n + @note1cont Recursive mutexes must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + @note1cont In general, application code should destroy an pimutex2 object prior to + deallocating it; calling qurt_pimutex2_destroy() before deallocating it ensures + that all qurt_pimutex2_unlock() calls complete. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to destroy. + + @return + None. + + @dependencies + None. + + */ +void qurt_pimutex2_destroy(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_pimutex2_lock + + @deprecated use #qurt_pimutex_lock instead. + + Locks the specified recursive mutex. \n + + If a thread performs a lock operation on a recursive mutex that is not being used, the + thread gains access to the shared resource that is protected by the mutex, and continues + executing. + + If a thread performs a lock operation on a recursive mutex that is already being used by + another thread, the thread is suspended. 
When the mutex becomes available again
+  (because the other thread has unlocked it), the thread is awakened and given access to the
+  shared resource.
+
+  @note1hang A thread is not suspended if it locks a recursive mutex that it has already
+             locked, but the mutex does not become available until the thread performs a
+             balanced number of unlocks on the mutex.
+
+  @datatypes
+  #qurt_rmutex2_t
+
+  @param[in] lock  Pointer to the recursive mutex object to lock.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_pimutex2_lock(qurt_rmutex2_t *lock);
+
+/**@ingroup func_qurt_pimutex2_unlock
+
+  @deprecated use #qurt_pimutex_unlock instead.
+
+  Unlocks the specified recursive mutex. \n
+  More than one thread can be suspended on a recursive mutex. When the mutex is
+  unlocked, only the highest-priority thread waiting on the mutex is awakened. If the
+  awakened thread has higher priority than the current thread, a context switch occurs.
+
+  @datatypes
+  #qurt_rmutex2_t
+
+  @param[in] lock  Pointer to the recursive mutex object to unlock.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_pimutex2_unlock(qurt_rmutex2_t *lock);
+
+/**@ingroup func_qurt_pimutex2_try_lock
+
+  @deprecated use #qurt_pimutex_try_lock instead.
+
+  Attempts to lock the specified recursive mutex.\n
+
+  Non-blocking version of qurt_pimutex2_lock(). If a call to qurt_pimutex2_lock() would
+  succeed immediately, this function behaves similarly, and returns 0 for success.
+  If a call to qurt_pimutex2_lock() would not succeed immediately, this function has
+  no effect and returns non-zero for failure.
+
+  @datatypes
+  #qurt_rmutex2_t
+
+  @param[in] lock  Pointer to the recursive mutex object to lock.
+
+  @return
+  0 -- Success. \n
+  Nonzero -- Failure.
+
+ */
+int qurt_pimutex2_try_lock(qurt_rmutex2_t *lock);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PIMUTEX2_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_pipe.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_pipe.h
new file mode 100755
index 0000000000000..6bdaa044f8640
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_pipe.h
@@ -0,0 +1,479 @@
+#ifndef QURT_PIPE_H
+#define QURT_PIPE_H
+/**
+  @file qurt_pipe.h
+  @brief Prototypes of the pipe interface API.
+  This is a pipe or message queue.
+  It blocks when too full (send) or empty (receive).
+  Unless using a nonblocking option, all datagrams are 64 bits.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+  Copyright (c) 2021,2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#include
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup pipe_types
+@{ */
+/*=============================================================================
+  CONSTANTS AND MACROS
+=============================================================================*/
+#define QURT_PIPE_MAGIC 0xF1FEF1FE /**< Magic. */
+#define QURT_PIPE_ATTR_MEM_PARTITION_RAM 0 /**< RAM. */
+#define QURT_PIPE_ATTR_MEM_PARTITION_TCM 1 /**< TCM. */
+
+/*=============================================================================
+  TYPEDEFS
+=============================================================================*/
+/** QuRT pipe data values type.
*/ +typedef unsigned long long int qurt_pipe_data_t; + +/** QuRT pipe type.*/ +typedef struct { + /** @cond */ + qurt_mutex_t pipe_lock; + qurt_sem_t senders; + qurt_sem_t receiver; + unsigned int size; + unsigned int sendidx; + unsigned int recvidx; + void (*lock_func)(qurt_mutex_t *); + void (*unlock_func)(qurt_mutex_t *); + int (*try_lock_func)(qurt_mutex_t *); + void (*destroy_lock_func)(qurt_mutex_t *); + unsigned int magic; + qurt_pipe_data_t *data; + /** @endcond */ +} qurt_pipe_t; + +/** QuRT pipe attributes type. */ +typedef struct { + /** @cond */ + qurt_pipe_data_t *buffer; + unsigned int elements; + unsigned char mem_partition; + /** @endcond */ +} qurt_pipe_attr_t; + +/** @} */ /* end_addtogroup pipe_types */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ +/**@ingroup func_qurt_pipe_attr_init + @xreflabel{hdr:qurt_pipe_attr_init} + Initializes the structure that sets the pipe attributes when a pipe is created. + + After an attribute structure is initialized, the individual attributes in the structure are + explicitly set using the pipe attribute operations. + + The attribute structure is assigned the following default values: \n + - buffer -- 0 \n + - elements -- 0 \n + - mem_partition -- #QURT_PIPE_ATTR_MEM_PARTITION_RAM + + @datatypes + #qurt_pipe_attr_t + + @param[in,out] attr Pointer to the pipe attribute structure. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_pipe_attr_init(qurt_pipe_attr_t *attr) +{ + attr->buffer = NULL; + attr->elements = 0; + attr->mem_partition = QURT_PIPE_ATTR_MEM_PARTITION_RAM; +} + +/**@ingroup func_qurt_pipe_attr_set_buffer + @xreflabel{sec:qurt_pipe_attr_set_buffer} + Sets the pipe buffer address attribute.\n + Specifies the base address of the memory area to use for the data buffer of a pipe. + + The base address and size (Section @xref{sec:qurt_pipe_attr_set_elements}) specify the + memory area used as a pipe data buffer. The user is responsible for allocating the + memory area used for the buffer. + + @datatypes + #qurt_pipe_attr_t \n + #qurt_pipe_data_t + + @param[in,out] attr Pointer to the pipe attribute structure. + @param[in] buffer Pointer to the buffer base address. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_pipe_attr_set_buffer(qurt_pipe_attr_t *attr, qurt_pipe_data_t *buffer) +{ + attr->buffer = buffer; +} + +/**@ingroup func_qurt_pipe_attr_set_elements + @xreflabel{sec:qurt_pipe_attr_set_elements} + Specifies the length of the memory area to use for the data buffer of a pipe. + + The length is expressed in terms of the number of 64-bit data elements that + can be stored in the buffer. + + The base address (Section @xref{sec:qurt_pipe_attr_set_buffer}) and size specify + the memory area used as a pipe data buffer. The user is responsible for + allocating the memory area used for the buffer. + + @datatypes + #qurt_pipe_attr_t + + @param[in,out] attr Pointer to the pipe attribute structure. + @param[in] elements Pipe length (64-bit elements). + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_pipe_attr_set_elements(qurt_pipe_attr_t *attr, unsigned int elements) +{ + attr->elements = elements; +} + +/**@ingroup func_qurt_pipe_attr_set_buffer_partition + @xreflabel{sec:qurt_pipe_attr_set_buffer_partition} + Specifies the memory type where a pipe's buffer is allocated. + Allocate pipes in RAM or TCM/LPM. 
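+
+   A hedged configuration sketch (the buffer size and element count are
+   arbitrary assumptions):
+   @code
+   static qurt_pipe_data_t buf[16];
+   qurt_pipe_attr_t attr;
+   qurt_pipe_attr_init(&attr);
+   qurt_pipe_attr_set_buffer(&attr, buf);
+   qurt_pipe_attr_set_elements(&attr, 16);
+   qurt_pipe_attr_set_buffer_partition(&attr, QURT_PIPE_ATTR_MEM_PARTITION_RAM);
+   @endcode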
+
+   @note1hang If a pipe is specified as allocated in TCM/LPM, it must be created
+              with the qurt_pipe_init() operation. The qurt_pipe_create() operation results in an error.
+
+   @datatypes
+   #qurt_pipe_attr_t
+
+   @param[in,out] attr           Pointer to the pipe attribute structure.
+   @param[in]     mem_partition  Pipe memory partition. Values: \n
+                  - #QURT_PIPE_ATTR_MEM_PARTITION_RAM -- Pipe resides in RAM \n
+                  - #QURT_PIPE_ATTR_MEM_PARTITION_TCM -- Pipe resides in TCM/LPM @tablebulletend
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline void qurt_pipe_attr_set_buffer_partition(qurt_pipe_attr_t *attr, unsigned char mem_partition)
+{
+   attr->mem_partition = mem_partition;
+}
+
+/**@ingroup func_qurt_pipe_create
+   Creates a pipe.\n
+   Allocates a pipe object and its associated data buffer, and initializes the pipe object.
+
+   @note1hang The buffer address and size stored in the attribute structure specify how the
+              pipe data buffer is allocated.
+
+   @note1cont If a pipe is specified as allocated in TCM/LPM, it must be created
+              using the qurt_pipe_init() operation. The qurt_pipe_create() operation results in an error.
+
+   @datatypes
+   #qurt_pipe_t \n
+   #qurt_pipe_attr_t
+
+   @param[out] pipe  Pointer to the created pipe object.
+   @param[in]  attr  Pointer to the attribute structure used to create the pipe.
+
+   @return
+   #QURT_EOK -- Pipe created. \n
+   #QURT_EFAILED -- Pipe not created. \n
+   #QURT_ENOTALLOWED -- Pipe cannot be created in TCM/LPM.
+
+   @dependencies
+   None.
+  */
+int qurt_pipe_create(qurt_pipe_t **pipe, qurt_pipe_attr_t *attr);
+
+/**@ingroup func_qurt_pipe_init
+   Initializes a pipe object using an existing data buffer.
+
+   @note1hang The buffer address and size stored in the attribute structure must
+              specify a data buffer that the user has already allocated.
+
+   @datatypes
+   #qurt_pipe_t \n
+   #qurt_pipe_attr_t
+
+   @param[out] pipe  Pointer to the pipe object to initialize.
+   @param[in]  attr  Pointer to the pipe attribute structure used to initialize the pipe.
+
+   @return
+   #QURT_EOK -- Success. \n
+   #QURT_EFAILED -- Failure.
+
+   @dependencies
+   None.
+  */
+int qurt_pipe_init(qurt_pipe_t *pipe, qurt_pipe_attr_t *attr);
+
+/**@ingroup func_qurt_pipe_destroy
+   @xreflabel{sec:qurt_pipe_destroy}
+   Destroys the specified pipe.
+
+   @note1hang Pipes must be destroyed when they are no longer in use. Failure
+              to do this causes resource leaks in the QuRT kernel.
+              Pipes must not be destroyed while they are still in use. If this
+              occurs, the behavior of QuRT is undefined.
+
+   @datatypes
+   #qurt_pipe_t
+
+   @param[in] pipe  Pointer to the pipe object to destroy.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+  */
+void qurt_pipe_destroy(qurt_pipe_t *pipe);
+
+/**@ingroup func_qurt_pipe_delete
+   Deletes the pipe.\n
+   Destroys the specified pipe (Section @xref{sec:qurt_pipe_destroy}) and deallocates the pipe object and its
+   associated data buffer.
+
+   @note1hang Delete pipes only if they were created using qurt_pipe_create()
+              (and not qurt_pipe_init()). Otherwise, the behavior of QuRT is undefined. \n
+   @note1cont Pipes must be deleted when they are no longer in use. Failure to do this
+              causes resource leaks in the QuRT kernel.\n
+   @note1cont Pipes must not be deleted while they are still in use. If this occurs, the
+              behavior of QuRT is undefined.
+
+   @datatypes
+   #qurt_pipe_t
+
+   @param[in] pipe  Pointer to the pipe object to delete.
+
+   @return
+   None.
+
+   @dependencies
+   None.
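+
+   A hedged lifecycle sketch (attr is assumed to have been initialized
+   with the attribute calls shown earlier):
+   @code
+   qurt_pipe_t *p;
+   if (qurt_pipe_create(&p, &attr) == QURT_EOK) {
+       /* ... send/receive on p ... */
+       qurt_pipe_delete(p);  // created with qurt_pipe_create(), so delete, not destroy
+   }
+   @endcode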
+ */
+void qurt_pipe_delete(qurt_pipe_t *pipe);
+
+/**@ingroup func_qurt_pipe_send
+   Writes a data item to the specified pipe. \n
+   If a thread writes to a full pipe, it is suspended on the pipe. When another thread reads
+   from the pipe, the suspended thread is awakened and can then write data to the pipe.
+
+   Pipe data items are defined as 64-bit values. Pipe writes are limited to transferring a single
+   64-bit data item per operation.
+
+   @note1hang Transfer data items larger than 64 bits by reading and writing
+              pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+   @datatypes
+   #qurt_pipe_t \n
+   #qurt_pipe_data_t
+
+   @param[in] pipe  Pointer to the pipe object to write to.
+   @param[in] data  Data item to write.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+void qurt_pipe_send(qurt_pipe_t *pipe, qurt_pipe_data_t data);
+
+/**@ingroup func_qurt_pipe_receive
+   Reads a data item from the specified pipe.
+
+   If a thread reads from an empty pipe, it is suspended on the pipe. When another thread
+   writes to the pipe, the suspended thread is awakened and can then read data from the pipe.
+   Pipe data items are defined as 64-bit values. Pipe reads are limited to transferring a single
+   64-bit data item per operation.
+
+   @note1hang Transfer data items larger than 64 bits by reading and writing
+              pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+   @datatypes
+   #qurt_pipe_t
+
+   @param[in] pipe  Pointer to the pipe object to read from.
+
+   @return
+   Integer containing the 64-bit data item from the pipe.
+
+   @dependencies
+   None.
+*/
+qurt_pipe_data_t qurt_pipe_receive(qurt_pipe_t *pipe);
+
+/**@ingroup func_qurt_pipe_try_send
+   Writes a data item to the specified pipe (without suspending the thread if the pipe is full).\n
+
+   If a thread writes to a full pipe, the operation returns -1 immediately.
+   Otherwise, it returns 0 to indicate a successful write operation.
+
+   Pipe data items are defined as 64-bit values. Pipe writes are limited to transferring a single
+   64-bit data item per operation.
+
+   @note1hang Transfer data items larger than 64 bits by reading and writing
+              pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+   @datatypes
+   #qurt_pipe_t \n
+   #qurt_pipe_data_t
+
+   @param[in] pipe  Pointer to the pipe object to write to.
+   @param[in] data  Data item to write.
+
+   @return
+   0 -- Success. \n
+   -1 -- Failure (pipe full).
+
+   @dependencies
+   None.
+*/
+int qurt_pipe_try_send(qurt_pipe_t *pipe, qurt_pipe_data_t data);
+
+/**@ingroup func_qurt_pipe_try_receive
+   Reads a data item from the specified pipe (without suspending the thread if the pipe is
+   empty).\n
+   If a thread reads from an empty pipe, the operation returns immediately with success set
+   to -1. Otherwise, success is always set to 0 to indicate a successful read operation.\n
+
+   Pipe data items are defined as 64-bit values. Pipe reads are limited to transferring a single
+   64-bit data item per operation.
+
+   @note1hang Transfer data items larger than 64 bits by reading and writing
+              pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+   @datatypes
+   #qurt_pipe_t
+
+   @param[in]  pipe     Pointer to the pipe object to read from.
+   @param[out] success  Pointer to the operation status result.
+
+   @return
+   Integer containing a 64-bit data item from the pipe.
+
+   @dependencies
+   None.
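+
+   A hedged usage sketch:
+   @code
+   int ok;
+   qurt_pipe_data_t d = qurt_pipe_try_receive(p, &ok);
+   if (ok == 0) {
+       /* d holds a valid 64-bit item */
+   } /* else: pipe was empty, nothing was read */
+   @endcode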
+*/
+qurt_pipe_data_t qurt_pipe_try_receive(qurt_pipe_t *pipe, int *success);
+
+/**@ingroup func_qurt_pipe_receive_cancellable
+   Reads a data item from the specified pipe (with suspend), cancellable.
+
+   If a thread reads from an empty pipe, it is suspended on the pipe. When another thread
+   writes to the pipe, the suspended thread is awakened and can then read data from the pipe.
+   The operation is canceled if the user process of the calling thread is killed,
+   or if the calling thread must finish its current QDI invocation and return to user space.
+   A root PD thread can use this API to wait on the pipe for receiving; it is resumed with
+   #QURT_EDESTROY if the pipe is destroyed.
+   Pipe data items are defined as 64-bit values. Pipe reads are limited to transferring a single
+   64-bit data item per operation.
+
+   @note1hang Transfer data items larger than 64 bits by reading and writing
+              pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+   @datatypes
+   #qurt_pipe_t \n
+   #qurt_pipe_data_t
+
+   @param[in]  pipe    Pointer to the pipe object to read from.
+   @param[out] result  Pointer to the integer that receives the 64-bit data item from the pipe.
+
+   @return
+   #QURT_EOK -- Receive completed. \n
+   #QURT_ECANCEL -- Receive canceled. \n
+   #QURT_EDESTROY -- Pipe was destroyed. \n
+   #QURT_ENOTALLOWED -- Pipe is not initialized.
+
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+int qurt_pipe_receive_cancellable(qurt_pipe_t *pipe, qurt_pipe_data_t *result);
+
+/**@ingroup func_qurt_pipe_send_cancellable
+   @xreflabel{hdr:qurt_pipe_send_cancellable}
+   Writes a data item to the specified pipe (with suspend), cancellable. \n
+   If a thread writes to a full pipe, it is suspended on the pipe. When another thread reads
+   from the pipe, the suspended thread is awakened and can then write data to the pipe.
+   The operation is canceled if the user process of the calling thread is killed, or if the
+   calling thread must finish its current QDI invocation and return to user space.
+   A root PD thread can use this API to wait on the pipe for sending; it is resumed with
+   #QURT_EDESTROY if the pipe is destroyed.
+
+   Pipe data items are defined as 64-bit values. Pipe writes are limited to transferring a single
+   64-bit data item per operation.
+
+   @note1hang Transfer data items larger than 64 bits by reading and writing
+              pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+   @datatypes
+   #qurt_pipe_t \n
+   #qurt_pipe_data_t
+
+   @param[in] pipe  Pointer to the pipe object to write to.
+   @param[in] data  Data item to write.
+
+   @return
+   #QURT_EOK -- Send completed. \n
+   #QURT_ECANCEL -- Send canceled. \n
+   #QURT_EDESTROY -- Pipe was destroyed. \n
+   #QURT_ENOTALLOWED -- Pipe is not initialized.
+
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+int qurt_pipe_send_cancellable(qurt_pipe_t *pipe, qurt_pipe_data_t data);
+
+/**@ingroup func_qurt_pipe_is_empty
+   Returns a value indicating whether the specified pipe contains any data.
+
+   @datatypes
+   #qurt_pipe_t
+
+   @param[in] pipe  Pointer to the pipe object to check.
+
+   @return
+   1 -- Pipe contains no data. \n
+   0 -- Pipe contains data.
+
+   @dependencies
+   None.
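+
+   A hedged polling sketch combining qurt_pipe_is_empty() with
+   qurt_pipe_try_receive() (the race between the two calls is why the
+   try-receive status is still checked):
+   @code
+   while (!qurt_pipe_is_empty(p)) {
+       int ok;
+       qurt_pipe_data_t d = qurt_pipe_try_receive(p, &ok);
+       if (ok != 0) break;  // another reader drained the pipe first
+       /* ... consume d ... */
+   }
+   @endcode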
+*/
+int qurt_pipe_is_empty(qurt_pipe_t *pipe);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PIPE_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_pmem_manager.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_pmem_manager.h
new file mode 100755
index 0000000000000..8c8da985228b9
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_pmem_manager.h
@@ -0,0 +1,82 @@
+#ifndef QURT_PMEM_MANAGER_H
+#define QURT_PMEM_MANAGER_H
+/**
+  @file qurt_pmem_manager.h
+  Prototypes of the kernel physical memory manager APIs.
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=====================================================================
+ Constants and macros
+ ======================================================================*/
+
+/* Physical memory API return error codes */
+#define QURT_PMEM_SUCCESS             0
+#define QURT_PMEM_NO_PRIV             1
+#define QURT_PMEM_RETRY               2
+#define QURT_PMEM_OVERLAP             3
+#define QURT_PMEM_NOT_EXIST           4
+#define QURT_PMEM_INIT_FAILURE        5
+#define QURT_PMEM_OUTSTANDING_MAPPING 6
+#define QURT_PMEM_GENERIC_FAILURE     7
+#define QURT_PMEM_ENTRY_FOUND         8
+#define QURT_PMEM_REACH_END           9
+#define QURT_PMEM_UNCLAIMED           10
+#define QURT_PMEM_ALREADY_CLAIMED     11
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+/**@ingroup func_qurt_pmem_acquire
+  Acquires the ownership of a specific physical memory region.
+
+  @note1hang The caller becomes the owner.
+
+  @param[in] ppage  Starting physical page number.
+  @param[in] pnum   Number of physical pages.
+
+  @return
+  #QURT_PMEM_NO_PRIV -- No privilege to claim the ownership. \n
+  #QURT_PMEM_OVERLAP -- All or part of the range is already owned. \n
+  #QURT_PMEM_SUCCESS -- Successfully claimed ownership.
+
+  @dependencies
+  None.
+*/
+int qurt_pmem_acquire(unsigned int ppage, unsigned int pnum);
+
+/**@ingroup func_qurt_pmem_release
+  Releases the ownership of a specific physical memory region.
+
+  @param[in] ppage  Starting physical page number.
+  @param[in] pnum   Number of physical pages.
+
+  @return
+  #QURT_PMEM_NO_PRIV -- No privilege to release the ownership. \n
+  #QURT_PMEM_NOT_EXIST -- The physical memory range is not usable. \n
+  #QURT_PMEM_OUTSTANDING_MAPPING -- There is an outstanding mapping in this range. \n
+  #QURT_PMEM_SUCCESS -- Successfully released ownership.
+
+  @dependencies
+  None.
+ */
+int qurt_pmem_release(unsigned int ppage, unsigned int pnum);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PMEM_MANAGER_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_pmu.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_pmu.h
new file mode 100755
index 0000000000000..73ea8eba04abf
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_pmu.h
@@ -0,0 +1,121 @@
+#ifndef QURT_PMU_H
+#define QURT_PMU_H
+/**
+  @file qurt_pmu.h
+  Prototypes of the PMU (performance monitor unit) API.
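+
+  A hedged profiling sketch (the event configuration value evtcfg is
+  hypothetical; consult the Hexagon PMU event list for real values):
+  @code
+  qurt_pmu_set(QURT_PMUEVTCFG, evtcfg);  // also clears PMUCNT0..PMUCNT3
+  qurt_pmu_enable(1);                    // start counting
+  /* ... workload under measurement ... */
+  qurt_pmu_enable(0);                    // stop counting
+  unsigned count = qurt_pmu_get(QURT_PMUCNT0);
+  @endcode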
+
+  EXTERNAL FUNCTIONS
+  None.
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+  Copyright (c) 2021 Qualcomm Technologies, Inc.
+  All rights reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+  FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_pmu_set
+  Sets the value of the specified PMU register.
+
+  @note1hang Setting PMUEVTCFG automatically clears the PMU registers PMUCNT0
+             through PMUCNT3.
+
+  @param[in] reg_id  PMU register. Values:
+                     - #QURT_PMUCNT0
+                     - #QURT_PMUCNT1
+                     - #QURT_PMUCNT2
+                     - #QURT_PMUCNT3
+                     - #QURT_PMUCFG
+                     - #QURT_PMUEVTCFG
+                     - #QURT_PMUCNT4
+                     - #QURT_PMUCNT5
+                     - #QURT_PMUCNT6
+                     - #QURT_PMUCNT7
+                     - #QURT_PMUEVTCFG1 @tablebulletend
+
+  @param[in] reg_value  Register value.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_pmu_set (int reg_id, unsigned int reg_value);
+
+/**@ingroup func_qurt_pmu_get
+  Gets the PMU register.\n
+  Returns the current value of the specified PMU register.
+
+  @param[in] reg_id  PMU register. Values:
+                     - #QURT_PMUCNT0
+                     - #QURT_PMUCNT1
+                     - #QURT_PMUCNT2
+                     - #QURT_PMUCNT3
+                     - #QURT_PMUCFG
+                     - #QURT_PMUEVTCFG
+                     - #QURT_PMUCNT4
+                     - #QURT_PMUCNT5
+                     - #QURT_PMUCNT6
+                     - #QURT_PMUCNT7
+                     - #QURT_PMUEVTCFG1 @tablebulletend
+
+  @return
+  Integer -- Current value of the specified PMU register.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_pmu_get (int reg_id);
+
+/**@ingroup func_qurt_pmu_enable
+  Enables or disables the Hexagon processor PMU.
+  Profiling is disabled by default.
+
+  @note1hang Enabling profiling does not automatically reset the count registers -- this must
+             be done explicitly before starting event counting.
+
+  @param[in] enable  Performance monitor. Values: \n
+                     - 0 -- Disable performance monitor \n
+                     - 1 -- Enable performance monitor @tablebulletend
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_pmu_enable (int enable);
+
+/**@ingroup func_qurt_pmu_get_pmucnt
+  Reads the PMU counters in a single trap.
+
+  @param[out] buf  Pointer to a buffer to save the values read from the PMU counters.
+                   Buffer size must be at least 32 bytes to read all eight PMU counters.
+
+  @return
+  #QURT_EOK -- Successful read.\n
+  #QURT_EFATAL -- Failure.
+
+  @dependencies
+  None.
+ */
+int qurt_pmu_get_pmucnt (void * buf);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PMU_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_power.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_power.h
new file mode 100755
index 0000000000000..2ee4d29a73976
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_power.h
@@ -0,0 +1,140 @@
+#ifndef QURT_POWER_H
+#define QURT_POWER_H
+/**
+  @file qurt_power.h
+  @brief Prototypes of the power API.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2018-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_power.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_power.h
new file mode 100755
index 0000000000000..2ee4d29a73976
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_power.h
@@ -0,0 +1,140 @@
+#ifndef QURT_POWER_H
+#define QURT_POWER_H
+/**
+ @file qurt_power.h
+ @brief Prototypes of power API
+
+EXTERNAL FUNCTIONS
+ None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None.
+
+Copyright (c) 2018-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+/*=============================================================================
+
+ EDIT HISTORY FOR MODULE
+
+ This section contains comments describing changes made to the module.
+ Notice that changes are listed in reverse chronological order.
+
+
+when who what, where, why
+-------- --- ------------------------------------------------------------
+03/03/11 op Add header file
+12/12/12 cm (Tech Pubs) Edited/added Doxygen comments and markup.
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @cond */
+/**@ingroup func_qurt_power_shutdown_fail_exit
+ Returns from Power Collapse mode when power collapse cannot proceed.
+
+ This function unmasks the global interrupt. This operation is used only when the thread is
+ recovering from a failed power collapse operation (Section @xref{sec:powerShutdownEnter}).
+
+ @return
+ #QURT_EOK -- Operation was successfully performed.
+
+ @dependencies
+ None.
+ */
+#define qurt_power_shutdown_fail_exit qurt_power_exit
+
+/**@ingroup func_qurt_power_shutdown_exit
+ Undoes the state changes made in preparation for power collapse.\n
+ This function unmasks the global interrupts.
+
+ @return
+ #QURT_EOK -- Operation was successfully performed.
+
+ @dependencies
+ None.
+ */
+#define qurt_power_shutdown_exit qurt_power_exit
+/**@endcond */
+
+/**@ingroup func_qurt_system_ipend_get
+ Gets the IPEND register.\n
+
+ @note1hang Returns the current value of the Hexagon processor IPEND register. The return value
+ is a mask value that identifies the individual interrupts that are pending. \n
+
+ @note1hang The bit order of the mask value is identical to the order defined for the IPEND register. A
+ mask bit value of 1 indicates that the corresponding interrupt is pending, and 0 indicates that the
+ corresponding interrupt is not pending. \n
+
+ @return
+ Returns the IPEND register value.
+
+ @dependencies
+ None.
+ */
+unsigned int qurt_system_ipend_get (void);
+
+
+/**@ingroup func_qurt_system_vid_get
+ Gets the VID register. \n
+
+ @note1hang Returns the current value of the Hexagon processor VID register. The return value is
+ the vector number of a second-level interrupt that has been accepted by the Hexagon
+ processor core.\n
+
+ @return
+ Returns the VID register value, that is, the L2 VIC interrupt number accepted by the processor.
+ The valid range is 0 to 1023.
+
+ @dependencies
+ None.
+ */
+unsigned int qurt_system_vid_get(void);
+
+/**@ingroup func_qurt_power_shutdown_get_pcycles
+ Gets the number of power collapses and the processor cycles for entering and exiting the most
+ recent power collapse.
+
+ @note1hang If no power collapse has occurred yet, the processor cycle numbers are zero.
+
+ @param[out] enter_pcycles Number of processor cycles for entering the most
+ recent power collapse.
+ @param[out] exit_pcycles Number of processor cycles for exiting the most
+ recent power collapse.
+ @return
+ Zero -- No power collapses have occurred. \n
+ Nonzero -- Number of power collapses that have occurred since
+ the processor was reset.
+
+ @dependencies
+ None.
+ */
+int qurt_power_shutdown_get_pcycles( unsigned long long *enter_pcycles, unsigned long long *exit_pcycles );
+
+/**@ingroup func_qurt_system_tcm_set_size
+ Sets the size of TCM to save during full power collapse.
+
+ @note1hang The size aligns to 32 bytes. If the size passed is greater than the maximum size defined in
+ XML, the size is truncated to the size defined in XML.
+
+ @param[in] new_size Size of TCM to save.
+
+ @return
+ Zero -- Size successfully set \n
+ -1 -- Size of 0 passed
+
+ @dependencies
+ None.
+ */
+int qurt_system_tcm_set_size(unsigned int new_size);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_POWER_H */
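An illustrative reader for the power-collapse statistics API above; purely a sketch (the report format is made up):

    #include "qurt_power.h"
    #include "qurt_printf.h"

    static void report_power_collapse_stats(void)
    {
        unsigned long long enter_pcycles = 0, exit_pcycles = 0;
        int collapses;

        collapses = qurt_power_shutdown_get_pcycles(&enter_pcycles, &exit_pcycles);
        if (collapses == 0) {
            qurt_printf("no power collapse has occurred yet\n");
            return;
        }
        qurt_printf("collapses=%d enter=%llu exit=%llu pcycles\n",
                    collapses, enter_pcycles, exit_pcycles);
    }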
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_printf.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_printf.h
new file mode 100755
index 0000000000000..a775d8a815918
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_printf.h
@@ -0,0 +1,44 @@
+#ifndef QURT_PRINTF_H
+#define QURT_PRINTF_H
+
+#include <stdarg.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ @file qurt_printf.h
+ Prototypes of printf API.
+
+EXTERNAL FUNCTIONS
+ None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None.
+
+Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+
+/*=============================================================================
+ CONSTANTS AND MACROS
+=============================================================================*/
+/** @addtogroup chapter_function_tracing
+@{ */
+
+int qurt_printf(const char* format, ...);
+
+int qurt_vprintf(const char* format, va_list args);
+
+/** @} */ /* end_addtogroup chapter_function_tracing */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PRINTF_H */
+
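A sketch of the usual va_list forwarding pattern with the printf API above; the log_msg wrapper and its "[app]" tag are hypothetical:

    #include <stdarg.h>
    #include "qurt_printf.h"

    /* Prefix each message with a hypothetical tag, then forward the
       caller's variadic arguments to qurt_vprintf. */
    static int log_msg(const char *format, ...)
    {
        va_list args;
        int n;

        qurt_printf("[app] ");
        va_start(args, format);
        n = qurt_vprintf(format, args);
        va_end(args);
        return n;
    }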
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_process.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_process.h
new file mode 100755
index 0000000000000..0df9ddc2d4a70
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_process.h
@@ -0,0 +1,995 @@
+#ifndef QURT_PROCESS_H
+#define QURT_PROCESS_H
+/**
+ @file qurt_process.h
+ @brief Prototypes of QuRT process control APIs.
+
+ EXTERNALIZED FUNCTIONS
+ None
+
+ INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None
+
+ Copyright (c) 2009-2013, 2021-2023 Qualcomm Technologies, Inc.
+ All rights reserved.
+ Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+#include "qurt_callback.h"
+#include "qurt_consts.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup process_types
+@{ */
+#define QURT_PROCESS_ATTR_NAME_MAXLEN QURT_MAX_NAME_LEN /**< Maximum length of the process name. */
+#define QURT_PROCESS_ATTR_BIN_PATH_MAXLEN 128 /**< Maximum length of the path of the binary/ELF for this process. */
+#define QURT_PROCESS_ATTR_CAP_MAXLEN 128 /**< Maximum length of a resource name. */
+
+/** QuRT process capability wildcard strings */
+#define QURT_PROCESS_ATTR_CAP_ALLOW_ALL "ALLOW_ALL" /**< Capability wildcard for full access */
+#define QURT_PROCESS_ATTR_CAP_ALLOW_NONE "ALLOW_NONE" /**< Capability wildcard for no access */
+
+/** QuRT process capability states */
+#define QURT_PROCESS_ATTR_CAP_ENABLED 0x1 /**< Capability enabled */
+#define QURT_PROCESS_ATTR_CAP_DISABLED 0x0 /**< Capability disabled */
+
+/* QuRT process thread attributes. */
+#define QURT_PROCESS_DEFAULT_CEILING_PRIO 0 /**< Default ceiling priority of the threads in the new process. */
+#define QURT_PROCESS_DEFAULT_MAX_THREADS -1 /**< Default number of threads in the new process.
+ -1 indicates that the limit is set to the maximum supported by the system. */
+
+/* QuRT process flags. */
+#define QURT_PROCESS_SUSPEND_ON_STARTUP (1U) /**< Suspends the new process just before calling main(). */
+#define QURT_PROCESS_NON_SYSTEM_CRITICAL (1u << 1) /**< Starts the new process as non-system-critical. */
+#define QURT_PROCESS_ISLAND_RESIDENT (1u << 2) /**< Process is island resident. */
+#define QURT_PROCESS_RESTARTABLE (1u << 3) /**< Indicates that the process is restartable. */
+#define QURT_PROCESS_UNTRUSTED (1u << 7) /**< Starts the new process as an unsigned process. */
+
+/* QuRT process debugging session status.*/
+#define QURT_DEBUG_NOT_START 0 /**< Debug is not started. */
+#define QURT_DEBUG_START 1 /**< Debug has started. */
+
+/** Process suspend options */
+#define QURT_PROCESS_SUSPEND_DEFAULT 0
+
+/** Process resume options */
+#define QURT_PROCESS_RESUME_DEFAULT 0
+
+
+/* QuRT process types. */
+typedef enum {
+ QURT_PROCESS_TYPE_RESERVED, /**< Process type is reserved. \n */
+ QURT_PROCESS_TYPE_KERNEL, /**< Kernel process. \n*/
+ QURT_PROCESS_TYPE_SRM, /**< SRM process. \n*/
+ QURT_PROCESS_TYPE_SECURE, /**< Secure process. \n*/
+ QURT_PROCESS_TYPE_ROOT, /**< Root process. \n*/
+ QURT_PROCESS_TYPE_USER, /**< User process. */
+}qurt_process_type_t;
+
+/** QuRT process callback types. */
+typedef enum {
+ QURT_PROCESS_DUMP_CB_ROOT, /**< Register the callback that executes in the
+ root process context. \n */
+ QURT_PROCESS_DUMP_CB_ERROR, /**< Register the user process callback that is
+ called after threads in the process are frozen. \n */
+ QURT_PROCESS_DUMP_CB_PRESTM, /**< Register the user process callback that is
+ called before threads in the process are frozen. \n*/
+ QURT_PROCESS_DUMP_CB_MAX /**< Reserved for error checking. */
+}qurt_process_dump_cb_type_t;
+
+/** QuRT process dump attributes. */
+typedef struct _qurt_pd_dump_attr{
+ /** @cond */
+ unsigned int enabled; /**< Process dump is enabled. */
+ const char *path; /**< Process dump path. */
+ unsigned int path_len; /**< Length of the process dump path. */
+ /** @endcond */
+}qurt_pd_dump_attr_t;
+
+/** QuRT process capability resource type */
+enum qurt_process_cap_type_t {
+ QURT_PROCESS_CAP_TYPE_NUM_ENTRIES=0, /**< Number of entries in the capability structure */
+ QURT_PROCESS_CAP_TYPE_DRIVER=1, /**< Driver resource */
+ QURT_PROCESS_CAP_TYPE_MAX /**< Maximum identifier */
+};
+
+/** QuRT process capability structure */
+typedef struct _qurt_capability {
+ enum qurt_process_cap_type_t type; /**< Resource type */
+ char name[QURT_PROCESS_ATTR_CAP_MAXLEN]; /**< Resource name */
+ unsigned long long cap; /**< Capabilities allowed for this resource */
+}qurt_capability_t;
+
+/** QuRT process attributes. */
+typedef struct _qurt_process_attr {
+ /** @cond */
+ char name[QURT_PROCESS_ATTR_NAME_MAXLEN]; /**< Name of the new process. */
+ char path[QURT_PROCESS_ATTR_BIN_PATH_MAXLEN]; /**< Path of the binary for the new process. */
+ char dtb_path[QURT_PROCESS_ATTR_BIN_PATH_MAXLEN]; /**< Path of the DTB ELF for the new process. */
+ int flags; /**< Flags as indicated by the QuRT process flags. */
+ unsigned int sw_id; /**< Software ID of the process to be loaded. */
+ unsigned sid; /**< Stream ID of the process being spawned. */
+ unsigned max_threads; /**< Maximum number of threads that the new process can create. */
+ unsigned short ceiling_prio; /**< Maximum priority at which threads can be
+ created by the new process. */
+ qurt_process_type_t type; /**< Process type as indicated by
+ #qurt_process_type_t. */
+ qurt_pd_dump_attr_t dump_attr; /**< Process dump attributes for the new process
+ as indicated by #qurt_pd_dump_attr_t. */
+ qurt_capability_t *capabilities; /**< Pointer to an array of structures of type
+ qurt_capability_t */
+ /** @endcond */
+} qurt_process_attr_t;
+
+/** @} */ /* end_addtogroup process_types */
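+
+/*
+ Usage sketch for the process-creation APIs declared below; the ELF name
+ "myapp.elf" and the thread limit are hypothetical.
+
+   qurt_process_attr_t attr;
+   int pid;
+
+   qurt_process_attr_init(&attr);
+   qurt_process_attr_set_executable(&attr, "myapp.elf");
+   qurt_process_attr_set_max_threads(&attr, 16);
+   pid = qurt_process_create(&attr);
+   if (pid < 0) {
+       // A negative return value is one of the QuRT error codes.
+   }
+*/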
+
+/*=============================================================================
+FUNCTIONS
+=============================================================================*/
+ /** @cond rest_reg_dist */
+/**@ingroup func_qurt_process_create
+ Creates a process with the specified attributes, and starts the process.
+
+ The process executes the code in the specified executable ELF file.
+
+ @datatypes
+ #qurt_process_attr_t
+
+ @param[in] attr Accepts an initialized process attribute structure, which specifies
+ the attributes of the created process.
+
+ @return
+ Positive return value -- Process ID. \n
+ Negative return value -- Indicates one of the following errors: \n
+ #-QURT_EPRIVILEGE -- Caller does not have privilege for this operation \n
+ #-QURT_EMEM -- Not enough memory to perform the operation \n
+ #-QURT_EFAILED -- Operation failed \n
+ #-QURT_ENOTALLOWED -- Operation not allowed \n
+ #-QURT_ENOREGISTERED -- Not registered \n
+ #-QURT_ENORESOURCE -- Resource exhaustion \n
+ #-QURT_EINVALID -- Invalid argument value \n
+ #QURT_EFATAL -- attr is NULL
+
+ @dependencies
+ None.
+*/
+int qurt_process_create (qurt_process_attr_t *attr);
+
+/**@ingroup func_qurt_process_get_id
+ Returns the process identifier for the current thread.
+
+ @return
+ Process identifier for the current thread.
+
+ @dependencies
+ None.
+*/
+int qurt_process_get_id (void);
+/** @endcond */
+
+/** @cond internal_only*/
+/**@ingroup func_qurt_process_get_uid
+ Returns the user identifier for the current thread.
+
+ @return
+ User identifier for the current thread.
+
+ @dependencies
+ None.
+*/
+int qurt_process_get_uid (void);
+/** @endcond */
+/** @cond rest_reg_dist */
+/**@ingroup func_qurt_process_attr_init
+ Initializes the structure that sets the process attributes when a process is created.
+
+ After an attribute structure is initialized, the individual attributes in the structure can
+ be explicitly set using the process attribute operations.
+
+ Table @xref{tbl:processAttrDefaults} lists the default attribute values set by the initialize
+ operation.
+
+ @inputov{table_process_attribute_defaults}
+
+ @datatypes
+ #qurt_process_attr_t
+
+ @param[out] attr Pointer to the structure to initialize.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline void qurt_process_attr_init (qurt_process_attr_t *attr)
+{
+ attr->name[0] = '\0';
+ attr->path[0] = '\0';
+ attr->dtb_path[0] = '\0';
+ attr->flags = 0;
+ attr->sw_id = 0;
+ attr->sid = 0;
+ attr->max_threads = (unsigned)QURT_PROCESS_DEFAULT_MAX_THREADS;
+ attr->ceiling_prio = QURT_PROCESS_DEFAULT_CEILING_PRIO;
+ attr->type = QURT_PROCESS_TYPE_RESERVED;
+ attr->dump_attr.enabled = 0;
+ attr->dump_attr.path = NULL;
+ attr->dump_attr.path_len = 0;
+ attr->capabilities = NULL;
+}
+
+/**@ingroup func_qurt_process_attr_set_executable
+ Sets the process name in the specified process attribute structure.
+
+ Process names identify process objects that are already
+ loaded in memory as part of the QuRT system.
+
+ @note1hang Process objects are incorporated into the QuRT system at build time.
+
+ @note1hang The maximum length of the name string is limited to QURT_PROCESS_ATTR_NAME_MAXLEN - 1.
+ + @datatypes + #qurt_process_attr_t + + @param[in] attr Pointer to the process attribute structure. + @param[in] name Pointer to the process name. + + @return + None. + + @dependencies + None. +*/ +void qurt_process_attr_set_executable (qurt_process_attr_t *attr, const char *name); + +/**@ingroup func_qurt_process_attr_set_binary_path + Sets the binary path for the process loading in the specified process attribute structure. + + Path specifies the binary to load for this process. + + @note1hang Max length of path string is limited to QURT_PROCESS_ATTR_BIN_PATH_MAXLEN-1. + + @datatypes + #qurt_process_attr_t + + @param[in] attr Pointer to the process attribute structure. + @param[in] path Pointer to the binary path. + + @return + None. + + @dependencies + None. +*/ +void qurt_process_attr_set_binary_path(qurt_process_attr_t *attr, char *path); + +/**@ingroup func_qurt_process_attr_set_dtb_path + Sets the DTB binary path for the process loading in the specified process attribute structure. + + Path specifies the DTB binary to load for this process. + + @note1hang Max length of path string is limited to QURT_PROCESS_ATTR_BIN_PATH_MAXLEN-1. + + @datatypes + #qurt_process_attr_t + + @param[in] attr Pointer to the process attribute structure. + @param[in] path Pointer to the binary path. + + @return + None. + + @dependencies + None. +*/ +void qurt_process_attr_set_dtb_path(qurt_process_attr_t *attr, char *path); + +/**@ingroup func_qurt_process_attr_set_flags +Sets the process properties in the specified process attribute structure. +Process properties are represented as defined symbols that map into bits +0 through 31 of the 32-bit flag value. Multiple properties are specified by OR'ing +together the individual property symbols. + +@datatypes +#qurt_process_attr_t + +@param[in] attr Pointer to the process attribute structure. +@param[in] flags QURT_PROCESS_NON_SYSTEM_CRITICAL Process is considered as non system-critical. + This attribute will be used by error services, + to decide whether to kill user pd or whole subsystem. + QURT_PROCESS_ISLAND_RESIDENT Process will be marked as island resident. + QURT_PROCESS_RESTARTABLE Process will be marked as restartable. + QURT_PROCESS_UNTRUSTED Process will be marked as unsigned process. +@return +None. + +@dependencies +None. +*/ +static inline void qurt_process_attr_set_flags (qurt_process_attr_t *attr, int flags) +{ + attr->flags = flags; +} +/** @endcond */ +/** @cond internal_only*/ +/**@ingroup func_qurt_process_attr_set_sid +Sets the process streamID in the specified process attribute structure. + +@datatypes +#qurt_process_attr_t + +@param[in] attr Pointer to the process attribute structure. +@param[in] sid streamID to set for this process. + +@return +None. + +@dependencies +None. +*/ +static inline void qurt_process_attr_set_sid (qurt_process_attr_t *attr, unsigned sid) +{ + attr->sid = sid; +} +/** @endcond */ +/** @cond rest_reg_dist */ +/**@ingroup func_qurt_process_attr_set_max_threads +Sets the maximum number of threads allowed in the specified process attribute structure. + +@datatypes +#qurt_process_attr_t + +@param[in] attr Pointer to the process attribute structure. +@param[in] max_threads Maximum number of threads allowed for this process. + +@return +None. + +@dependencies +None. 
+*/ +static inline void qurt_process_attr_set_max_threads (qurt_process_attr_t *attr, unsigned max_threads) +{ + attr->max_threads = max_threads; +} + +/**@ingroup func_qurt_process_attr_set_sw_id +Sets the software ID of the process to load in the specified process attribute structure. + +@datatypes +#qurt_process_attr_t + +@param[in] attr Pointer to the process attribute structure. +@param[in] sw_id Software ID of the process, used in authentication. + +@return +None. + +@dependencies +None. +*/ +static inline void qurt_process_attr_set_sw_id(qurt_process_attr_t *attr, unsigned int sw_id) +{ + attr->sw_id = sw_id; +} + +/**@ingroup func_qurt_process_attr_set_ceiling_prio +Sets the highest thread priority allowed in the specified process attribute structure. +Refer qurt_thread.h for priority ranges. + +@datatypes +#qurt_process_attr_t + +@param[in] attr Pointer to the process attribute structure. +@param[in] prio Priority. + +@return +None. + +@dependencies +None. +*/ +static inline void qurt_process_attr_set_ceiling_prio (qurt_process_attr_t *attr, unsigned short prio) +{ + attr->ceiling_prio = prio; +} +/** @endcond */ + +/** @cond internal_only*/ +/**@ingroup func_qurt_process_attr_set_dump_status +Sets the process domain dump-enabled field in the process domain dump attributes. + +@datatypes +#qurt_process_attr_t + +@param[in] attr Pointer to the process attribute structure. +@param[in] enabled 1 -- Process domain dump is collected \n + 0 -- Process domain dump is not collected + +@return +None. + +@dependencies +None. +*/ +static inline void qurt_process_attr_set_dump_status(qurt_process_attr_t *attr, unsigned int enabled) +{ + attr->dump_attr.enabled = enabled; +} + +/**@ingroup func_qurt_process_attr_set_dump_path +Sets the process domain dump path and type. + +@datatypes +#qurt_process_attr_t + +@param[in] attr Pointer to the process attribute structure. +@param[in] path Path where the process domain dumps must be saved. +@param[in] path_len Length of the path string. + +@return +None. + +@dependencies +None. +*/ +static inline void qurt_process_attr_set_dump_path(qurt_process_attr_t *attr, const char *path, int path_len) +{ + attr->dump_attr.path = path; + attr->dump_attr.path_len = (unsigned int)path_len; +} + +/**@ingroup func_qurt_process_attr_set_capabilities +Sets list of capabilities available to this process. + +@datatypes +#qurt_process_attr_t + +@param[in] attr Pointer to the process attribute structure. +@param[in] capabilities Pointer to array of structures of type qurt_capability_t defining + resources and capabilites + +@return +None. + +@dependencies +None. +*/ +static inline void qurt_process_attr_set_capabilities(qurt_process_attr_t *attr, qurt_capability_t *capabilities) +{ + attr->capabilities = capabilities; +} + +/** @endcond */ +/** @cond rest_reg_dist */ +/**@ingroup func_qurt_process_cmdline_get +Gets the command line string associated with the current process. +The Hexagon simulator command line arguments are retrieved using +this function as long as the call is made +in the process of the QuRT installation, and with the +requirement that the program runs in a simulation environment. + +If the function modifies the provided buffer, it zero-terminates +the string. It is possible that the function does not modify the +provided buffer, so the caller must set buf[0] to a NULL +byte before making the call. A truncated command line is returned when +the command line is longer than the provided buffer. 
+ +@param[in] buf Pointer to a character buffer that must be filled in. +@param[in] buf_siz Size (in bytes) of the buffer pointed to by the buf argument. + +@return +None. + +@dependencies +None. +*/ +void qurt_process_cmdline_get(char *buf, unsigned buf_siz); + +/**@ingroup func_qurt_process_get_thread_count +Gets the number of threads present in the process indicated by the PID. + +@param[in] pid PID of the process for which the information is required. + +@return +Number of threads in the process indicated by PID, if positive value is obtained +Negative error code if failed include: + QURT_EFATAL - Invalid PID + -QURT_ENOTALLOWED - Current process doesnt have access to target process indicated by PID + +@dependencies +None. +*/ +int qurt_process_get_thread_count(unsigned int pid); + +/**@ingroup func_qurt_process_get_thread_ids +Gets the thread IDs for a process indicated by PID. + +@param[in] pid PID of the process for which the information is required. +@param[in] ptr Pointer to a user passed buffer that must be filled in with thread IDs. +@param[in] thread_num Number of thread IDs requested. + +@return +#QURT_EOK - Success +#QURT_EFATAL - Failed, ptr is NULL + +@dependencies +None. + */ +int qurt_process_get_thread_ids(unsigned int pid, unsigned int *ptr, unsigned thread_num); +/** @endcond */ +/** @cond internal_only*/ +/**@ingroup func_qurt_process_dump_get_mem_mappings_count +Gets the number of mappings present in the process indicated by the PID. + +@param[in] pid PID of the process for which the information is required. + +@return +Number of mappings for the process indicated by the PID. + +@dependencies +None. +*/ +int qurt_process_dump_get_mem_mappings_count(unsigned int pid); + +/**@ingroup func_qurt_process_dump_get_mappings +Gets the mappings for a specified PID. + +@note1hang This API skips device type mappings or mappings created by setting the #QURT_PERM_NODUMP attribute. + +@param[in] pid PID of the process for which the information is required. +@param[in] ptr Pointer to a buffer that must be filled in with mappings. +@param[in] count Count of mappings requested. + +@return +Number of mappings filled in the buffer passed by the user. + +@dependencies +None. +*/ +int qurt_process_dump_get_mappings(unsigned int pid, unsigned int *ptr, unsigned count); +/** @endcond */ +/** @cond rest_reg_dist */ +/**@ingroup func_qurt_process_attr_get +Gets the attributes of the process with which it was created. + +@datatypes +#qurt_process_attr_t + +@param[in] pid PID of the process for which the information is required. +@param[in,out] attr Pointer to the user allocated attribute structure. + +@return +#QURT_EOK - Success +#QURT_INVALID - Invalid PID +#QURT_EFATAL - attr is NULL + +@dependencies +None. +*/ +int qurt_process_attr_get(unsigned int pid, qurt_process_attr_t *attr); + +/**@ingroup func_qurt_process_dump_register_cb +Registers the process domain dump callback. + +@datatypes +#qurt_cb_data_t \n +#qurt_process_dump_cb_type_t + +@param[in] cb_data Pointer to the callback information. +@param[in] type Callback type; these callbacks are called in the context of the user process domain: \n + #QURT_PROCESS_DUMP_CB_PRESTM -- Before threads of the exiting process are frozen. \n + #QURT_PROCESS_DUMP_CB_ERROR -- After threads are frozen and captured. \n + #QURT_PROCESS_DUMP_CB_ROOT -- After threads are frozen and captured, and CB_ERROR type of callbacks + are called. +@param[in] priority Priority. 
+ +@return +#QURT_EOK -- Success \n +Other values -- Failure + QURT_EFATAL if cb_data is NULL + QURT_EINVALID If invalid cb_type + QURT_EFAILED If invalid cb_data + +@dependencies +None. +*/ +int qurt_process_dump_register_cb(qurt_cb_data_t *cb_data, qurt_process_dump_cb_type_t type, unsigned short priority); + +/**@ingroup func_qurt_process_dump_deregister_cb +Deregisters the process domain dump callback. + +@datatypes +#qurt_cb_data_t \n +#qurt_process_dump_cb_type_t + +@param[in] cb_data Pointer to the callback information to deregister. +@param[in] type Callback type. + +@return +#QURT_EOK -- Success.\n +Other values -- Failure. + QURT_EFATAL if cb_data is NULL + QURT_EINVALID If invalid cb_type + QURT_EFAILED If invalid cb_data + +@dependencies +None. +*/ +int qurt_process_dump_deregister_cb(qurt_cb_data_t *cb_data,qurt_process_dump_cb_type_t type); + +/** @endcond */ +/** @cond internal_only*/ +/**@ingroup func_qurt_process_set_rtld_debug +Sets rtld_debug for a process. + +@param[in] pid PID of the process for which rtld_debug must be set. +@param[in] address rtld_debug address. + +@return +#QURT_EOK - Success +#QURT_EINVALID - Invalid PID +#QURT_EFATAL - Invalid address + +@dependencies +None. +*/ +int qurt_process_set_rtld_debug(unsigned int pid,unsigned int address); + +/**@ingroup func_qurt_process_get_rtld_debug +Gets rtld_debug for a process. + +@param[in] pid PID of the process for which rtld_debug must be set. +@param[in,out] address Pointer to the user passed address in which the rtld_debug address must be returned. + +@return +#QURT_EOK - Success +#QURT_EINVALID - Invalid PID +#QURT_EFATAL - Invalid address + +@dependencies +None. +*/ +int qurt_process_get_rtld_debug(unsigned int pid,unsigned int *address); +/** @endcond */ +/**@ingroup func_qurt_process_exit +Exits the current user process with an exit code. + +@param[in] exitcode Exit code. + +@return +#QURT_EFATAL -- No client found with the specified PID value \n +#QURT_EINVALID -- Invalid client \n +#QURT_ENOTALLOWED -- User does not have permission to perform this operation \n +#QURT_EOK -- Success + +@dependencies +None. +*/ +int qurt_process_exit(int exitcode); + +/**@ingroup func_qurt_process_kill +Kills the process represented by the PID with the exit code. + +@param[in] pid PID of the process to kill. +@param[in] exitcode Exit code. + +@return +#QURT_EFATAL -- No client found with the specified PID value \n +#QURT_EINVALID -- Invalid client \n +#QURT_ENOTALLOWED -- User does not have permission to perform this operation \n +#QURT_EOK -- Success + +@dependencies +None. +*/ +int qurt_process_kill(int pid, int exitcode); + + +/**@ingroup func_qurt_debugger_register_process +Registers the process indicated by the PID with the debug monitor. + +@param[in] pid PID of the process. +@param[in] adr Address. + +@return +#QURT_EOK -- Success + +@dependencies +None. +*/ +int qurt_debugger_register_process(int pid, unsigned int adr); + + +/**@ingroup func_qurt_debugger_deregister_process +Deregister the process indicated by the PID with the debug monitor. + +@param[in] pid PID of the process. + +@return +#QURT_EOK -- Success + +@dependencies +None. +*/ +int qurt_debugger_deregister_process(int pid); + +/**@ingroup func_qurt_process_exec_callback +Executes callbacks in the user process as indicated by the client_handle argument. + +@param[in] client_handle Client handle obtained from the current invocation function (Section 3.4.1). +@param[in] callback_fn Callback function to execute. 
+@param[in] stack_base Stack address to use.
+@param[in] stack_size Stack size.
+
+@return
+#QURT_EOK -- Success
+
+@dependencies
+None.
+*/
+int qurt_process_exec_callback(int client_handle,
+ unsigned callback_fn,
+ unsigned stack_base,
+ unsigned stack_size);
+
+/**@ingroup func_qurt_process_get_pid
+Gets the process ID of the process that the client_handle argument represents.
+
+@note1hang This API is not supported for an unsigned PD; for an unsigned PD, use qurt_process_get_id().
+
+@param[in] client_handle Client handle obtained from the current invocation function (Section 3.4.1).
+@param[in] pid Pointer to the address to store the PID.
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EFATAL -- pid pointer passed as NULL
+
+@dependencies
+None.
+*/
+int qurt_process_get_pid(int client_handle, int * pid);
+
+/**@ingroup func_qurt_process_get_dm_status
+Gets the debugging session status on the process represented by the pid argument.
+
+@param[in] pid Process ID
+@param[in,out] status Address to store the status: \n
+ #QURT_DEBUG_NOT_START \n
+ #QURT_DEBUG_START
+
+@return
+#QURT_EOK - Success \n
+#QURT_EINVALID - Error
+
+@dependencies
+None.
+*/
+int qurt_process_get_dm_status( unsigned int pid, unsigned int *status);
+
+
+/**@ingroup func_qurt_process_suspend_threads
+ Suspends the user threads in a user process, identified by its process identifier.
+ The target user process can be a signed user process or an unsigned user process.
+ The caller is a thread in the guest OS/root process.
+ After the user threads in the target user process are suspended, they cannot be scheduled to run by the kernel
+ until they are resumed later.
+
+ This function has one optional argument with one default option.
+ #QURT_PROCESS_SUSPEND_DEFAULT suspends the user threads in the target user process.
+
+ This function call is a synchronous call; it returns after the relevant threads are
+ completely suspended.
+
+ If some user threads in the target user process are set as non-suspendable, this function call does
+ not suspend those threads.
+
+ If the target user process is already suspended, this function call returns success as
+ confirmation that the user process is suspended.
+
+ QuRT debugger monitor threads in the target user process are non-suspendable; this function call does
+ not suspend those threads.
+
+ If the target user process is a secure user process or a CPZ process, this function call returns an error
+ without suspending the target user process.
+
+ If a user thread in the target user process runs in the guest OS/root process via a QDI call, this function call
+ does not suspend the thread in the guest OS, but instead marks the thread as pending-suspend. The thread is suspended
+ when it exits the guest OS, before executing the first instruction in the user process.
+ In this case, the function returns success while the user thread can be running in the guest OS, and is suspended
+ when exiting the guest OS.
+
+ @param[in] process_id Process identifier.
+ @param[in] option Default option #QURT_PROCESS_SUSPEND_DEFAULT suspends the user threads in the target user process.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EINVALID -- Failure because of invalid process_id input \n
+ #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, on a secure process/CPZ process.
+
+ @dependencies
+ None.
+ */
+int qurt_process_suspend_threads (unsigned int process_id, unsigned int option);
+
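+
+/*
+ Usage sketch, assuming a root-process caller and a pid obtained elsewhere:
+ freeze the target process, inspect it, then thaw it with the resume call
+ declared below.
+
+   if (qurt_process_suspend_threads(pid, QURT_PROCESS_SUSPEND_DEFAULT) == QURT_EOK) {
+       // ... inspect the frozen process ...
+       (void)qurt_process_resume_threads(pid, QURT_PROCESS_RESUME_DEFAULT);
+   }
+*/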
+/**@ingroup func_qurt_process_resume_threads
+ Resumes a user process, identified by its process identifier.
+ The target user process can be a signed user process or an unsigned user process.
+ The caller is a thread in the guest OS/root process.
+ After the user threads in the target user process resume, the kernel scheduler
+ can schedule the user threads to run based on their thread priorities.
+
+ This function has an optional argument, #QURT_PROCESS_RESUME_DEFAULT, which
+ resumes the user threads in the target user process.
+
+ This is an asynchronous function; it returns after the kernel moves the user threads from
+ the suspended state to the runnable state. The threads are scheduled to run based on their thread priorities.
+
+ This function call does not resume threads in the target user process that have been set as non-resumable.
+
+ If the target user process has already resumed, this function call returns success as
+ confirmation that the user process is resumed.
+
+ If the target user process is a secure user process or a CPZ process, this function call returns an error without
+ performing the resume operation.
+
+ If user threads in the target user process run in the guest OS/root process via a QDI call, this function
+ call clears the suspend-pending mark on these threads, so that the threads are not suspended when they exit
+ the guest OS.
+
+ @param[in] process_id Process identifier.
+ @param[in] option Default option #QURT_PROCESS_RESUME_DEFAULT resumes the user threads in the target user process.
+
+ @return
+ #QURT_EOK -- Success
+ #QURT_EINVALID -- Failure because of invalid process_id input.
+ #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, on a secure process/CPZ process.
+
+ @dependencies
+ None.
+ */
+int qurt_process_resume_threads (unsigned int process_id, unsigned int option);
+
+/**@ingroup func_qurt_process_vtcm_window_set
+ Sets a VTCM access window for a process.
+ The caller thread must be in the SRM process.
+
+ This is a synchronous function; it ensures that all running threads of the process have the requested
+ window in effect. The requested view for all non-running threads takes effect when they are
+ scheduled.
+
+ @param[in] pid Process identifier.
+ @param[in] enable QURT_VTCM_WINDOW_ENABLE enforces the VTCM access window defined by the high and low offsets.
+ QURT_VTCM_WINDOW_DISABLE ignores the high and low offsets; VTCM access is fully
+ disabled for the process.
+ @param[in] high_offset Specifies the high window offset, in 4K increments, from the base address of the VTCM.
+ QURT_VTCM_WINDOW_HI_OFFSET_DEFAULT restores the high offset to its reset value.
+ @param[in] low_offset Specifies the low window offset, in 4K increments, from the base address of the VTCM.
+ QURT_VTCM_WINDOW_LO_OFFSET_DEFAULT restores the low offset to its reset value.
+
+ @note1hang
+ When high_offset is set to QURT_VTCM_WINDOW_HI_OFFSET_DEFAULT and low_offset is set to
+ QURT_VTCM_WINDOW_LO_OFFSET_DEFAULT, the full VTCM range is accessible. Access to VTCM is controlled
+ via the MMU mapping for the process.
+
+ @return
+ #QURT_EOK -- Success
+ #QURT_EVAL -- Failure because of invalid inputs.
+ #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation.
+ #QURT_ENOTSUPPORTED -- Failure because the operation is not supported due to limitations in hardware capabilities
+
+ @dependencies
+ None.
+ */
+int qurt_process_vtcm_window_set(int pid, unsigned int enable, unsigned int high_offset, unsigned int low_offset);
+
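+
+/*
+ Usage sketch, called from the SRM process: restrict a process to a
+ hypothetical 64 KB window at the base of VTCM. Offsets are in 4K
+ increments, so the values below are illustrative only.
+
+   int rc = qurt_process_vtcm_window_set(pid, QURT_VTCM_WINDOW_ENABLE,
+                                         16 /* high: 16 * 4K */,
+                                         0  /* low: VTCM base */);
+*/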
+/**@ingroup func_qurt_process_vtcm_window_get
+ Gets the VTCM access window for a process.
+ The caller thread must be in the SRM process.
+
+
+ @param[in] pid Process identifier.
+ @param[out] enable Address to store the enable status, if set.
+ @param[out] high_offset Address to return the high window offset, in 4K increments, from the base address of the VTCM.
+ @param[out] low_offset Address to return the low window offset, in 4K increments, from the base address of the VTCM.
+
+ @note1hang
+ The user must first check the returned enable value before checking the high and low offsets.
+
+ @return
+ #QURT_EOK -- Success
+ #QURT_EVAL -- Failure because of invalid inputs.
+ #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation.
+ #QURT_ENOTSUPPORTED -- Failure because the operation is not supported due to limitations in hardware capabilities
+
+ @dependencies
+ None.
+ */
+int qurt_process_vtcm_window_get(int pid, unsigned int *enable, unsigned int *high_offset, unsigned int *low_offset);
+
+/**@ingroup func_qurt_process_set_group_config
+ Enables thread groups in the process, with the specified ceiling priorities.
+
+ @param[in] process_id Process identifier.
+ @param[in] group_bitmask 64-bit mask of active thread groups.
+ @param[in] ceiling_priorities Array of ceiling priorities, one per thread group.
+
+ @note1hang
+ This API can only be called by the root PD and can only be called once for each process; otherwise it is
+ rejected. Group 0 must be enabled in group_bitmask, otherwise QuRT returns an error. After this API, all
+ existing threads are moved to group 0, and if any thread's priority is higher than the ceiling
+ priority of group 0, it is lowered to the ceiling value.
+ Example 1:
+ group_bitmask = 0xD7; //'b11010111
+ ceiling_priorities[] = {100, 128, 200, 0, 196, 0, 240, 20}; // 0 - do not care
+ Example 2:
+ group_mask = 0x5; //'b101
+ ceiling_priorities[] = {240, 0, 20}; // 0 - do not care
+
+
+ @return
+ #QURT_EOK -- Success.
+ #QURT_EVAL -- Failure because of invalid inputs.
+ #QURT_ENOTALLOWED -- The group has been configured already.
+
+ @dependencies
+ None.
+ */
+int qurt_process_set_group_config(unsigned int process_id, unsigned long long group_bitmask,
+ unsigned char *ceiling_priorities);
+
+
+/**@ingroup func_qurt_process_stid_set
+ Sets the specified stid for a process or for a thread group within a process.
+
+ @param[in] pid Process identifier.
+ @param[in] group_id Group identifier.
+ @param[in] stid stid to set.
+
+ @note1hang
+ The user can pass the default group_id (QURT_THREAD_DEFAULT_GROUP_ID) if the stid must be set at the process level.
+ All threads within a process that have the default stid (QURT_STID_DEFAULT) inherit the stid set for the process.
+ When a non-default group_id is specified, the stid is set only for that thread group.
+
+ @return
+ #QURT_EOK -- Success
+ #QURT_EFATAL -- Invalid PID
+ #QURT_EVAL -- Failure because of invalid inputs.
+ #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation.
+
+ @dependencies
+ None.
+ */
+int qurt_process_stid_set(unsigned int pid, unsigned int group_id , unsigned int stid);
+
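+
+/*
+ Usage sketch for the stid set/get pair in this header; the pid and the
+ stid value of 3 are hypothetical.
+
+   if (qurt_process_stid_set(pid, QURT_THREAD_DEFAULT_GROUP_ID, 3) == QURT_EOK) {
+       unsigned int stid;
+       (void)qurt_process_stid_get(pid, QURT_THREAD_DEFAULT_GROUP_ID, &stid);
+   }
+*/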
+/**@ingroup func_qurt_process_stid_get
+ Gets the stid for a process or for a thread group within a process.
+
+ @param[in] pid Process identifier.
+ @param[in] group_id Group identifier.
+ @param[out] stid Pointer to a variable in which to return the stid.
+
+ @note1hang
+ The user can pass the default group_id (QURT_THREAD_DEFAULT_GROUP_ID) to return the process-level stid.
+ When a non-default group_id is specified, the stid is returned only for that thread group.
+
+ @return
+ #QURT_EOK -- Success
+ #QURT_EFATAL -- Invalid PID
+ #QURT_EVAL -- Failure because of invalid inputs.
+ #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation.
+
+ @dependencies
+ None.
+ */
+int qurt_process_stid_get(unsigned int pid, unsigned int group_id , unsigned int *stid);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_profile.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_profile.h
new file mode 100755
index 0000000000000..2a50c461440f6
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_profile.h
@@ -0,0 +1,98 @@
+#ifndef QURT_PROFILE_H
+#define QURT_PROFILE_H
+/**
+ @file qurt_profile.h
+ QuRT profiling support.
+
+EXTERNAL FUNCTIONS
+ None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None.
+
+Copyright (c) 2018, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+==============================================================================*/
+#include "qurt_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup profiling_macros
+@{ */
+#define QURT_PROFILE_DISABLE 0 /**< Disable profiling. */
+#define QURT_PROFILE_ENABLE 1 /**< Enable profiling. */
+
+typedef unsigned int qurt_profile_param_t;
+
+#define QURT_PROFILE_PARAM_THREAD_READY_TIME 0U /**< Profile thread ready time. */
+
+/** @} */ /* end_addtogroup profiling_macros */
+
+/** @addtogroup profiling_types
+ @{ */
+/** Profiling results. */
+typedef union
+{
+ /** Result associated with #QURT_PROFILE_PARAM_THREAD_READY_TIME. */
+ struct
+ {
+ unsigned int ticks; /**< Cumulative ticks the thread was ready. */
+ } thread_ready_time;
+
+} qurt_profile_result_t;
+/** @} */ /* end_addtogroup profiling_types */
+
+/**@ingroup func_qurt_profile_enable2
+ * Starts profiling of a specific parameter on a specific thread (as applicable).
+ *
+ * @param[in] param Profiling parameter.
+ * @param[in] thread_id ID of the thread (if applicable) for which the specified
+ * parameter must be profiled.
+ * @param[in] enable #QURT_PROFILE_DISABLE -- disable \n #QURT_PROFILE_ENABLE --
+ * enable
+ *
+ * @return
+ * #QURT_EOK -- Success \n
+ * #QURT_EALREADY -- Measurement already in progress or already stopped \n
+ * #QURT_ENOTHREAD -- Thread does not exist \n
+ * #QURT_EINVALID -- Invalid profiling parameter \n
+ *
+ * @dependencies
+ * None.
+ */
+extern int qurt_profile_enable2 (
+ qurt_profile_param_t param,
+ qurt_thread_t thread_id,
+ int enable
+);
+
+/**@ingroup func_qurt_profile_get
+ * Gets the value of the profiling parameter that was previously enabled.
+ *
+ * @param[in] param Profiling parameter.
+ * @param[in] thread_id ID of the thread (if applicable) for which the specified
+ * profiling parameter must be retrieved.
+ * @param [out] result Profiling result associated with the parameter for the specified
+ * thread (if applicable).
+ *
+ * @return
+ * #QURT_EOK -- Success \n
+ * #QURT_EFAILED -- Operation failed; profiling was not enabled \n
+ * #QURT_ENOTHREAD -- Thread does not exist \n
+ * #QURT_EINVALID -- Invalid profiling parameter \n
+ *
+ * @dependencies
+ * None.
+ */ +extern int qurt_profile_get ( + qurt_profile_param_t param, + qurt_thread_t thread_id, + qurt_profile_result_t * result +); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_ptrace.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_ptrace.h new file mode 100755 index 0000000000000..622304dd92865 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_ptrace.h @@ -0,0 +1,37 @@ +/*============================================================================= + + qurt_ptrace.h + +GENERAL DESCRIPTION + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2013 by Qualcomm Technologies, Inc. All Rights Reserved. +=============================================================================*/ +#ifndef __SYS_PTRACE_H__ +#define __SYS_PTRACE_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +enum __ptrace_request +{ + /** + Indicates that the process making this request is requesting to be traced. + */ + PTRACE_TRACEME = 0, + PTRACE_EXT_IS_DEBUG_PERMITTED = 500 +}; + +long ptrace(enum __ptrace_request request, unsigned int pid, void*addr, void *data); + +#ifdef __cplusplus +} +#endif + +#endif //__SYS_PTRACE_H__ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_qdi.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_qdi.h new file mode 100755 index 0000000000000..705408e5cfc6f --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_qdi.h @@ -0,0 +1,185 @@ +#ifndef QDI_H +#define QDI_H + +/** + @file qurt_qdi.h + @brief Prototypes of QuRT Driver Invocation API functions + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013, 2021, 2023 Qualcomm Technologies, Inc. + All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + + +#include "qurt_qdi_constants.h" +#include "qurt_qdi_imacros.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup func_qurt_qdi_open + Opens the specified driver for subsequent operations. + qurt_qdi_open() is the primary mechanism by which a driver user can + obtain a QDI handle. The user provides the name of the driver to the + qurt_qdi_open call, and gets back a handle referencing + the named driver. \n + @note1hang For reasons related to the Hexagon standard for varargs functions, the + qurt_qdi_open function prototype is not actually defined as a varargs. + + + @param[in] p Driver name. + @param[in] ... Up to nine additional device-specific arguments can be passed as parameters, + and should follow the POSIX open() convention. \n + - flags -- Optional second parameter (POSIX flags), the handle + access requested (read-only, write-only, or read-write, + for instance) and other flags such as whether the call + should create a new device or only open an existing + device. \n + - mode -- Optional third parameter (POSIX mode); permissions to + configure when a new device is created. @tablebulletend + + @return + Negative value -- Error. \n + Non-negative value -- Success, this result value serves as a handle to the + opened driver. + @dependencies + None. + */ +// int qurt_qdi_open(); +#define qurt_qdi_open(p,...) 
\ + qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC,QDI_OPEN,(p),##__VA_ARGS__) + +#define qurt_qdi_open_dt(p,q,...) \ + qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC,QDI_OPEN_FROM_DT,(p),(q),##__VA_ARGS__) + +/**@ingroup func_qurt_qdi_handle_invoke + Performs a generic driver operation, which (depending on the specified operation) can be + either be one of the predefined operations listed in @xhyperref{tbl:functionMapping,QDI function mapping} + or a driver-specific operation. + The user provides a QDI handle and an integer + method number, along with 0 to 8 optional 32-bit arguments. + The device driver invocation function is invoked with the + same method number and 0 to 8 optional arguments. The + return value from the invocation function is passed back to + the user as the return value of qurt_qdi_handle_invoke. + + @note1hang For reasons related to the Hexagon standard for varargs functions, the + qurt_qdi_handle_invoke() function prototype is not actually defined as a + varargs function (and would break if it were defined this way). + + @param[in] h Driver handle. + @param[in] m Integer number for the operation to perform. + @param[in] ... Up to eight optional arguments can be passed to the device driver as operation-specific parameters: \n + arg1 -- First parameter \n + arg2 -- Second parameter \n + arg3 -- Third parameter \n + arg4 -- Fourth parameter \n + arg5 -- Fifth parameter \n + arg6 -- Sixth parameter \n + arg7 -- Seventh parameter \n + arg8 -- Eighth parameter + + @return + Integer value defined by the device driver. \n + -1 -- Error. + + @dependencies + None. + */ +// int qurt_qdi_handle_invoke(); +#define qurt_qdi_handle_invoke(h,m,...) \ + _QDMPASTE(_QDMHI,_QDMCNT(QDI_HANDLE_LOCAL_CLIENT,h,m,##__VA_ARGS__))(QDI_HANDLE_LOCAL_CLIENT,h,m,##__VA_ARGS__) +#define _QDMHI3(a,b,c) qurt_qdi_qhi3(0,b,c) +#define _QDMHI4(a,b,c,d) qurt_qdi_qhi4(0,b,c,(int)(d)) +#define _QDMHI5(a,b,c,d,e) qurt_qdi_qhi5(0,b,c,(int)(d),(int)(e)) +#define _QDMHI6(a,b,c,d,e,f) qurt_qdi_qhi6(0,b,c,(int)(d),(int)(e),(int)(f)) +#define _QDMHI7(a,b,c,d,e,f,g) qurt_qdi_qhi7(8,b,c,(int)(d),(int)(e),(int)(f),(int)(g)) +#define _QDMHI8(a,b,c,d,e,f,g,h) qurt_qdi_qhi8(8,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h)) +#define _QDMHI9(a,b,c,d,e,f,g,h,i) qurt_qdi_qhi9(16,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i)) +#define _QDMHI10(a,b,c,d,e,f,g,h,i,j) qurt_qdi_qhi10(16,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j)) +#define _QDMHI11(a,b,c,d,e,f,g,h,i,j,k) qurt_qdi_qhi11(24,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j),(int)(k)) +#define _QDMHI12(a,b,c,d,e,f,g,h,i,j,k,l) qurt_qdi_qhi12(24,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j),(int)(k),(int)(l)) +int qurt_qdi_qhi3(int,int,int); +int qurt_qdi_qhi4(int,int,int,int); +int qurt_qdi_qhi5(int,int,int,int,int); +int qurt_qdi_qhi6(int,int,int,int,int,int); +int qurt_qdi_qhi7(int,int,int,int,int,int,int); +int qurt_qdi_qhi8(int,int,int,int,int,int,int,int); +int qurt_qdi_qhi9(int,int,int,int,int,int,int,int,int); +int qurt_qdi_qhi10(int,int,int,int,int,int,int,int,int,int); +int qurt_qdi_qhi11(int,int,int,int,int,int,int,int,int,int,int); +int qurt_qdi_qhi12(int,int,int,int,int,int,int,int,int,int,int,int); + +/**@ingroup func_qurt_qdi_write + Writes data to the specified driver. + A predefined invocation routine for drivers that + support a POSIX-like write functionality. 
+ qurt_qdi_write(handle, buf, len) is equivalent to:
+ qurt_qdi_handle_invoke(handle, QDI_WRITE, handle, buf, len);
+
+ @param[in] handle Driver handle.
+ @param[in] buf Pointer to the memory address where the data to write is stored.
+ @param[in] len Number of bytes of data to write.
+
+ @return
+ Non-negative integer -- Number of bytes written. \n
+ Negative error code -- Write could not take place.
+
+ @dependencies
+ None.
+ */
+int qurt_qdi_write(int handle, const void *buf, unsigned len);
+
+/**@ingroup func_qurt_qdi_read
+ User-visible API to read data from a QDI handle.
+ A predefined invocation routine for drivers that
+ support a POSIX-like read functionality.
+ qurt_qdi_read(handle, buf, len) is equivalent to:
+ qurt_qdi_handle_invoke(handle, QDI_READ, handle, buf, len);
+
+ @param[in] handle Driver handle.
+ @param[in] buf Pointer to the memory address where the data read is stored.
+ @param[in] len Number of bytes of data to read.
+
+ @return
+ Non-negative integer number -- Bytes read. \n
+ Negative error code -- Read could not take place.
+
+ @dependencies
+ None.
+ */
+int qurt_qdi_read(int handle, void *buf, unsigned len);
+
+/**@ingroup func_qurt_qdi_close
+ Closes the specified driver, releasing any resources associated with the open driver.
+ User-visible API to close a QDI handle.
+
+ This API should be called when the user is done using a
+ QDI-based handle. When this function is called, the driver can release
+ any resources held and perform other necessary cleanup
+ operations. qurt_qdi_close(handle) is equivalent to:
+ qurt_qdi_handle_invoke(handle, QDI_CLOSE, handle)
+
+ @param[in] handle Driver handle.
+
+ @return
+ 0 -- Success.\n
+ Negative error code -- Failure.
+
+ @dependencies
+ None.
+ */
+int qurt_qdi_close(int handle);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_qdi_constants.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_qdi_constants.h
new file mode 100755
index 0000000000000..4866fada067f0
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_qdi_constants.h
@@ -0,0 +1,193 @@
+#ifndef QDI_CONSTANTS_H
+#define QDI_CONSTANTS_H
+
+/**
+ @file qurt_qdi_constants.h
+ @brief Predefined invocation methods for drivers.
+
+ EXTERNALIZED FUNCTIONS
+ None
+
+ INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None
+
+ Copyright (c) 2013-2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+ Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+|| Method numbers used for QDI.
+||
+|| Intended grouping of method numbers for QDI,
+|| including future usage:
+||
+|| Method 0 should always be unused and not responded to by
+|| any driver.
+|| Methods 1 and 2 are reserved for name registration and
+|| name lookup.
+|| Methods 3 through 31 are reserved for POSIX-type operations
+|| on open handles.
+|| Methods 32 through 127 are reserved for the QDI infrastructure
+|| and may be extended in the future to provide standard
+|| driver debug services, management services, and system
+|| notifications.
+|| Methods 128 through 255 are reserved for the use of automatically
+|| generated methods such as might be generated by an IDL (interface
+|| definition language).
The infrastructure may be extended to +|| perform services on these methods based on information provided +|| by the IDL, such as automatic buffer validation, etc. These +|| method numbers should not be used for any "ad hoc" methods. +|| Methods with number >= 256 are "private" method numbers that are +|| outside the scope of the QDI infrastructure. Drivers that want +|| to generate and consume their own "ad hoc" methods are free to +|| use these method numbers as they wish. The infrastructure does +|| not generate these method numbers or respond to them, but +|| passes them on unmolested. +|| +|| All driver implementations *should* return a value of +|| -1 when called with an unsupported method. The standard error +|| return value for POSIX APIs is -1, so we emulate that behavior +|| here. +*/ +/** @cond */ +#define QDI_UNUSED 0 +#define QDI_DEVNAME_REGISTER 1 +#define QDI_OPEN 2 +#define QDI_CLOSE 3 +#define QDI_READ 4 +#define QDI_WRITE 5 +#define QDI_IOCTL 6 +#define QDI_MMAP 7 +#define QDI_OS_FILEOPEN 8 +#define QDI_FLEN 9 +#define QDI_UNLINK 10 +#define QDI_FTELL 22 +#define QDI_SEEK 23 +#define QDI_FSTAT 24 + +#define QDI_FSNAME_REGISTER 150 +#define QDI_FS_OPEN 151 +#define QDI_MMAP2 153 +#define QDI_MPROTECT2 154 +#define QDI_MUNMAP2 155 + +#define QDI_CLIENT_HANDLE_OBJREF_GET 10 + +#define QDI_OS_PROCESS_LOAD 12 +#define QDI_OS_PROCESS_CHOOSE_ASID 13 + +#define QDI_OS_SET_GP 26 +#define QDI_CLIENT_HANDLE_CALLBACK 27 + +#define QDI_CLIENT_HANDLE_ISLAND_HANDLE_CREATE_FROM_OBJ_T 19 //reused +#define QDI_CLIENT_HANDLE_HANDLE_CREATE_FROM_OBJ_T 80 +#define QDI_CLIENT_HANDLE_HANDLE_RELEASE 81 +#define QDI_CLIENT_HANDLE_COPY_FROM_USER 82 +#define QDI_CLIENT_HANDLE_COPY_TO_USER 83 +#define QDI_CLIENT_HANDLE_SIGNAL_GROUP_CREATE 86 +#define QDI_CLIENT_HANDLE_SAFE_CACHE_OPS 87 + +#define QDI_CLIENT_HANDLE_BUFFER_LOCK 41 +#define QDI_CLIENT_HLOSPOOL_INFO_GET 90 +#define QDI_CLIENT_HLOSPOOL2_INFO_GET 96 + +#define QDI_CLIENT_PID 44 +#define QDI_CLIENT_ASID QDI_CLIENT_PID + +#define QDI_OS_CLIENT_INFO_GET 48 + +#define QDI_OS_MEM_LOOKUP_PHYSADDR 57 + +#define QDI_OS_THREAD_ITERATOR_CREATE 68 +#define QDI_OS_THREAD_ITERATOR_NEXT 69 + +#define QDI_OS_SYSENV 78 + +#define QDI_REGION_USERMALLOC_INIT 180 // This method is for generic handle + + +#define QDI_CLIENT_HANDLE_USER_MALLOC 84 +#define QDI_CLIENT_HANDLE_USER_FREE 85 + +#define QDI_SIGNAL_GROUP_SIGNAL_CREATE 96 +#define QDI_SIGNAL_GROUP_WAIT 98 +#define QDI_SIGNAL_GROUP_POLL 99 +#define QDI_SIGNAL_SET 96 +#define QDI_SIGNAL_CLEAR 97 +#define QDI_SIGNAL_WAIT 98 +#define QDI_SIGNAL_POLL 99 + +#define QDI_OS_WAIT_FOR_MAIN_REAPER 104 + +#define QDI_CLIENT_HANDLE_REFPROXY_INSTALL 105 +#define QDI_CLIENT_HANDLE_REFPROXY_ADD 106 +#define QDI_CLIENT_HANDLE_REFPROXY_REMOVE 107 + +#define QDI_CLIENT_HANDLE_DETACH 116 + +#define QDI_OS_RESERVED1 139 + +#define QDI_CLIENT_HANDLE_BUFFER_LOCK2 142 + +#define QDI_DT_REGISTER 158 +#define QDI_OPEN_DEVICE 159 +#define QDI_OPEN_FROM_DT 160 + +#define QDI_PRIVATE 256 /* Method numbers beginning at 256 + are private method numbers, which + are device-specific and available + for use by device implementors. */ +/* +|| Permission bitmasks for use with qurt_qdi_lock_buffer(). +|| +|| Make sure these match with permission values from qurt_perm_t. +*/ +/** @endcond */ + +/** @addtogroup driver_support_constants +@{ */ +#define QDI_PERM_W 2 /**< Write access. */ +#define QDI_PERM_R 1 /**< Read access. */ +#define QDI_PERM_RW (QDI_PERM_R | QDI_PERM_W) /**< Read/write access. 
*/ + +#define QDI_HANDLE_LOCAL_CLIENT 3 /**< Local client. */ +#define QDI_HANDLE_GENERIC 4 /**< Generic. */ + +#define QDI_REFCNT_BASE 0x510000 /**< */ +#define QDI_REFCNT_MAXED 0x51FFFD /**< */ +#define QDI_REFCNT_INIT 0x51FFFE /**< Driver object is temporary and is eventually deleted.*/ +#define QDI_REFCNT_PERM 0x51FFFF /**< Driver object is permanent and is never deleted. */ +/** @} */ /* end_addtogroup driver_support_constants */ + +/** @cond */ +/* +|| Flags used by process loaders. +*/ + +#define QDI_OS_PROCESS_FLAGS_ISLAND_RESIDENT 0x1 /* Set this flag to request the loaded process + to have island residency. */ +#define QDI_OS_PROCESS_FLAGS_ROOT_RESIDENT 0x2 /* Set this flag to request the loaded process + to have root residency, for example, DL Pager. */ +/* +|| Constants used for qurt_event register API, type field. +*/ + +#define QURT_PROCESS_EXIT 1 + +/* +|| Constants used by QDI extensions. +*/ + +#define QURT_QDI_SINGLETON_TYPE_TRUE 0 +#define QURT_QDI_SINGLETON_TYPE_FALSE 1 +#define QURT_QDI_SINGLETON_TYPE_PER_PROCESS 2 +/** @endcond */ +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QDI_CONSTANTS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_qdi_driver.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_qdi_driver.h new file mode 100755 index 0000000000000..e044e25f1bb72 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_qdi_driver.h @@ -0,0 +1,868 @@ +#ifndef QURT_QDI_DRIVER_H +#define QURT_QDI_DRIVER_H + +/** + @file qurt_qdi_driver.h + @brief Definitions, macros, and prototypes used when writing a + QDI driver. + + EXTERNALIZED FUNCTIONS + None + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None + + Copyright (c) 2018, 2019-2021, 2023 Qualcomm Technologies, Inc. + All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#include "stddef.h" +#include "qurt_qdi.h" +#include "qurt_types.h" +#include "qurt_callback.h" +#include "qurt_qdi_constants.h" +#include "qurt_qdi_imacros.h" +#include "qurt_mutex.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* +|| This gives the canonical form for the arguments to a QDI +|| driver invocation function. The arguments are as follows: +|| +|| int client_handle (R0) QDI handle that represents the client +|| that made this QDI request. If the +|| client is remote, this is a +|| variable handle; if the client is local +|| (same thread and process), this is +|| set to QDI_HANDLE_LOCAL_CLIENT. +|| +|| qurt_qdi_obj_t *obj (R1) Points at the qdi_object_t structure +|| on which this QDI request is being made. +|| The qdi_object_t structure is usually +|| the first element of a larger structure +|| that contains state associated with the +|| object; because it is usually the first +|| element, the object pointers can be freely +|| interchanged through casts. +|| +|| int method (R2) Integer QDI method that represents +|| the request type. +|| +|| qurt_qdi_arg_t arg1 (R3) First three general purpose arguments +|| qurt_qdi_arg_t arg2 (R4) to the invocation function are passed in +|| qurt_qdi_arg_t arg3 (R5) these slots. +|| +|| qurt_qdi_arg_t arg4 (SP+0) Arguments beyond the first three are +|| qurt_qdi_arg_t arg5 (SP+4) passed on the stack. 
+|| qurt_qdi_arg_t arg6 (SP+8) +|| qurt_qdi_arg_t arg7 (SP+12) +|| qurt_qdi_arg_t arg8 (SP+16) +|| qurt_qdi_arg_t arg9 (SP+20) +|| +|| The canonical form of the invocation function takes a +|| total of 12 arguments, but not all of them are used. In general, +|| the QDI infrastructure only passes those arguments provided by +|| the caller; if the invocation function accesses additional +|| arguments beyond those provided by the caller, the values are not +|| useful. +*/ +/** @cond */ +#define QDI_INVOKE_ARGS \ + int, struct qdiobj *, int, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t + +#define QDI_EXT_INVOKE_ARGS \ + int, qurt_qdi_man_obj_t*, int, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t + +#define BUFFER_LOCK 1 +#define BUFFER_UNLOCK 0 + +struct qdiobj; +/** @endcond */ +/** @addtogroup driver_support_types +@{ */ +typedef union { + void *ptr; /**< Pointer to the driver handle. */ + int num; /**< Method number. */ +} qurt_qdi_arg_t; +/** @} */ /* end_addtogroup driver_support_types */ +/** @cond */ +/** QuRT QDI driver version */ +typedef union { + int num; + struct { + short major; /** Driver major version number. */ + short minor; /** Driver minor version number. */ + }; +} qurt_qdi_version_t; + +typedef int (*qurt_qdi_pfn_invoke_t)(QDI_INVOKE_ARGS); +typedef void (*qurt_qdi_pfn_release_t)(struct qdiobj *); +/** @endcond */ +/** @addtogroup driver_support_types +@{ */ +typedef struct qdiobj { + qurt_qdi_pfn_invoke_t invoke; /**< Invocation function that implements the driver methods.*/ + int refcnt; /**< Reference count, an integer value maintained by the QDI infrastructure that tracks the number of + references to a driver instance. 
*/ + qurt_qdi_pfn_release_t release; /**< Release function that performs details associated with deleting an instance + of the driver object.*/ +} qurt_qdi_obj_t; +/** @} */ /* end_addtogroup driver_support_types */ +/** @cond */ +/** QuRT QDI managed object */ +typedef struct qurt_qdi_man_obj +{ + qurt_qdi_obj_t qdi_obj; + union + { + struct qurt_qdi_ext_driver * opener_obj; + struct qurt_qdi_ext_device * device_obj; + }; +}qurt_qdi_man_obj_t; + +typedef int (*qurt_qdi_ext_pfn_create_t)(int client_id, const char *name, qurt_qdi_version_t version, qurt_qdi_man_obj_t **qdi_obj); +typedef int (*qurt_qdi_ext_pfn_create_device_t)(int client_id, const char *name, qurt_qdi_version_t version, struct qurt_qdi_ext_device * device, qurt_qdi_man_obj_t **qdi_obj); +typedef int (*qurt_qdi_ext_pfn_invoke_t)(QDI_EXT_INVOKE_ARGS); +typedef void (*qurt_qdi_ext_pfn_destroy_t)(qurt_qdi_man_obj_t *qdi_obj); +typedef int (*qurt_qdi_ext_pfn_probe_t)(void *handle, struct qurt_qdi_ext_device **device); + +typedef struct qurt_qdi_ext_obj_info{ + qurt_qdi_man_obj_t *obj; + int qdi_client_id; + struct qurt_qdi_ext_obj_info *next; +}qurt_qdi_ext_obj_info_t; +typedef struct qurt_qdi_ext_obj_info *qurt_qdi_ext_obj_info_ptr; + +/** QuRT QDI device */ +//temporarily add this back while there are still drivers who statically define this structure +struct qurt_qdi_device { + qurt_qdi_obj_t opener_obj; + const char* name; + char island_resident; + unsigned char singleton; + qurt_qdi_ext_pfn_create_t create; + qurt_qdi_ext_pfn_invoke_t invoke; + qurt_qdi_ext_pfn_destroy_t destroy; + qurt_mutex_t qurt_qdi_ext_list_lock; + qurt_qdi_ext_obj_info_ptr qurt_qdi_ext_obj_info_head; +}; +typedef struct qurt_qdi_device qurt_qdi_man_device; + +struct qurt_qdi_ext_driver { + qurt_qdi_obj_t opener_obj; + const char* name; + char island_resident; + unsigned char singleton; + qurt_qdi_ext_pfn_create_t create; + qurt_qdi_ext_pfn_invoke_t invoke; + qurt_qdi_ext_pfn_destroy_t destroy; + qurt_mutex_t qurt_qdi_ext_list_lock; + qurt_qdi_ext_obj_info_ptr qurt_qdi_ext_obj_info_head; + qurt_qdi_ext_pfn_create_device_t create_device; + qurt_qdi_version_t version; + qurt_qdi_ext_pfn_probe_t probe; + const char* compatible; + struct qurt_qdi_ext_device * device_list; + //qurt_qdi_ext_device_ptr device_list; +}; +typedef struct qurt_qdi_ext_driver qurt_qdi_ext_driver_t; +//above replaces qurt_qdi_man_device + +extern int qurt_qdi_obj_ref_inc(qurt_qdi_obj_t *); +extern int qurt_qdi_obj_ref_dec(qurt_qdi_obj_t *); + +extern int qurt_qdi_ext_opener (QDI_INVOKE_ARGS); +/** @endcond */ +/**@ingroup func_qurt_qdi_method_default + Processes a method that is unrecognized or unsupported in the driver invocation function. + All arguments passed to the current invocation function (Section @xref{sec:invocationFunction}) must be forwarded + to this function. + + @note1hang Invocation functions must process all unrecognized or unsupported methods + by calling this function. + + @return + None. + + @dependencies + None. +*/ +extern int qurt_qdi_method_default(QDI_INVOKE_ARGS); + +/**@ingroup func_qurt_qdi_handle_create_from_obj_t + Allocates a new device handle for use with the specified driver object. + + @param[in] client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param[out] obj Pointer to the driver object. + + @return + Non-negative integer -- Success; this value is the new handle. \n + Negative value -- Error. + + @dependencies + None. 
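+
+   A minimal sketch of the usual pattern (hypothetical driver code; my_device_t,
+   my_invoke(), and my_release() are placeholders, not part of this API):
+   @code
+   typedef struct {
+      qurt_qdi_obj_t qdiobj;  // First member, so object pointers can be cast freely.
+      int state;
+   } my_device_t;
+
+   // In the driver's QDI_OPEN handling, after allocating dev:
+   dev->qdiobj.invoke  = my_invoke;        // Method dispatch function.
+   dev->qdiobj.refcnt  = QDI_REFCNT_INIT;  // Temporary object, deleted when released.
+   dev->qdiobj.release = my_release;       // Cleanup when the last reference goes away.
+   return qurt_qdi_handle_create_from_obj_t(client_handle, &dev->qdiobj);
+   @endcode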
+*/ +static __inline int qurt_qdi_handle_create_from_obj_t(int client_handle, qurt_qdi_obj_t *obj) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_HANDLE_CREATE_FROM_OBJ_T, + obj); +} + +/**@ingroup func_qurt_qdi_handle_invoke + Allocates a new island device handle for use with the specified driver object. + + @param[in] client_handle Client handle obtained from the current invocation function (Section 3.4.1). + @param[in] obj Pointer. + + @return + Non-negative integer value that is the new handle -- Success. \n + Negative return value -- Error. + + @dependencies + None. +*/ +static __inline int qurt_qdi_island_handle_create_from_obj_t(int client_handle, qurt_qdi_obj_t *obj) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_ISLAND_HANDLE_CREATE_FROM_OBJ_T, + obj); +} + +/**@ingroup func_qurt_qdi_handle_release + Deallocates the specified device handle. + + @param[in] client_handle Obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param[in] handle_to_release Handle to release. + + @return + 0 -- Success. \n + Negative value -- Error. + + @dependencies + None. +*/ +static __inline int qurt_qdi_handle_release(int client_handle, int handle_to_release) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_HANDLE_RELEASE, + handle_to_release); +} + +static __inline qurt_qdi_obj_t * +qurt_qdi_objref_get_from_handle(int client_handle, int object_handle) +{ + qurt_qdi_obj_t *ret; + + ret = NULL; + + qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_OBJREF_GET, + object_handle, + &ret); + + return ret; +} + +/**@ingroup func_qurt_client_add_memory + Adds a physical address range to the HLOS physpool of the caller user PD. + + @param[in] client_handle Obtained from the current invocation function (Section 3.4.1). + @param[in] phys_addr Starting address of the physical address range. + @param[in] size Size. + + @return + #QURT_EOK -- Pages successfully added. + + @dependencies + None. +*/ +int qurt_client_add_memory(int client_handle, qurt_addr_t phys_addr, qurt_size_t size); + +/**@ingroup func_qurt_client_add_memory2 + Adds a physical address range to the HLOS physpool of the caller user PD. + + @param[in] client_handle Obtained from the current invocation function (Section 3.4.1). + @param[in] phys_addr Starting 36-bit address of the physical address range. + @param[in] size Size. + + @return + #QURT_EOK -- Pages successfully added. + + @dependencies + None. +*/ +int qurt_client_add_memory2(int user_client_handle, qurt_paddr_64_t phys_addr, qurt_size_t size); + +static __inline qurt_qdi_obj_t * +qurt_qdi_objref_get_from_pointer(qurt_qdi_obj_t *objptr) +{ + qurt_qdi_obj_t * ret = NULL; + + if (qurt_qdi_obj_ref_inc(objptr) < 0) { + ret = NULL; + } else { + ret = objptr; + } + + return ret; +} + +static __inline void +qurt_qdi_objref_release(qurt_qdi_obj_t *objptr) +{ + if (qurt_qdi_obj_ref_dec(objptr) == 1) { + (*objptr->release)(objptr); + } +} + +/**@ingroup func_qurt_qdi_copy_from_user + Copies the contents of a user memory buffer into the current driver. + + @note1hang User buffer addresses are valid only for the duration of the current driver + invocation. + + @param[in] client_handle Obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param[in] dest Base address of the driver buffer. + @param[in] src Base address of the user buffer. + @param[in] len Number of bytes to copy. 
+ + @return + Negative value -- Indicates a privilege or security violation, the copy operation + has crossed a privilege boundary. + + @dependencies + None. +*/ +static __inline int qurt_qdi_copy_from_user(int client_handle, void *dest, const void *src, unsigned len) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_COPY_FROM_USER, + dest, src, len); +} + +/**@ingroup qurt_qdi_copy_string_from_user + Copies the contents of a user memory buffer into the current driver. + + @note1hang User buffer addresses are valid only for the duration of the current driver + invocation. + + @param client_handle Obtained from the current invocation function (Section 3.4.1). + @param dest Base address of the driver buffer. + @param src Base address of the user buffer. + @param len Number of bytes to copy. NOTE: This is the destination buffer length. + + @return + Negative error result -- privilege or security violation, the copy operation + has crossed a privilege boundary. + + @dependencies + None. +*/ +int qurt_qdi_copy_string_from_user(int client_handle, char *dest, const char *src, unsigned len); + +/**@ingroup func_qurt_qdi_copy_to_user + Copies the contents of a driver memory buffer to user memory. + + @note1hang User buffer addresses are valid only for the duration of the current driver + invocation. + + @param[in] client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param[in] dest Base address of the user buffer. + @param[in] src Base address of the driver buffer. + @param[in] len Number of bytes to copy. + + @return + Negative value -- Indicates a privilege or security violation, the copy operation has crossed a + privilege boundary + + @dependencies + None. +*/ +static __inline int qurt_qdi_copy_to_user(int client_handle, void *dest, const void *src, unsigned len) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_COPY_TO_USER, + dest, src, len); +} + +/**@ingroup func_qurt_qdi_safe_cache_ops + Do cache operations on user memory + + @note1hang User buffer addresses are valid only for the duration of the current driver + invocation. + + @param[in] client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param[in] addr Base address of the user memory. + @param[in] size Size of the user memory. + @param[in] opcode Cache operations (QURT_MEM_CACHE_FLUSH, QURT_MEM_CACHE_INVALIDATE...) + @param[in] type Cache type (QURT_MEM_ICACHE, QURT_MEM_DCACHE) + + @return + Negative value -- Indicates a privilege or security violation, the copy operation has crossed a + privilege boundary + + @dependencies + None. +*/ +static __inline int qurt_qdi_safe_cache_ops(int client_handle, qurt_addr_t addr, qurt_size_t size, + qurt_mem_cache_op_t opcode, qurt_mem_cache_type_t type) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_SAFE_CACHE_OPS, + addr, size, opcode, type); +} + + +/**@ingroup func_qurt_qdi_buffer_lock + Prepares for the direct manipulation of a potentially untrusted buffer provided by a QDI + client. + + This function is used to permit a trusted driver to safely access memory that is + provided by a potentially untrusted client. A driver calls this function to obtain a safe buffer + pointer for accessing the memory. + + This function performs the following security checks: \n + - Verifies that the entire buffer is accessible to the client. 
\n + - Ensures that the pointer remains valid for the remainder of the QDI driver + operation. \n + + @note1hang User buffer addresses are valid only for the duration of the current driver + invocation. + + @param[in] client_handle Obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param[in] buf Pointer to the base address of the client buffer address. + @param[in] len Buffer length (in bytes). + @param[in] perms Bitmask value that specifies the read or write access to perform on the + client buffer: \n + - #QDI_PERM_R -- Read access \n + - #QDI_PERM_W -- Write access \n + - #QDI_PERM_RW -- Read/write access @tablebulletend + @param[out] obuf Pointer to the buffer address that the driver must use to access the buffer. + + @return + Negative value -- Error; the operation crosses a privilege boundary, indicating a privilege or security violation. \n + Nonzero value -- User passed a buffer that does not fulfill the requested read/write access permission. + In this case the QDI driver call must be terminated cleanly, with an appropriate error code + returned to the client. \n + Zero -- Success; when this occurs the QDI driver must use the pointer at *obuf to access memory, and not the + pointer passed in as buf -- even if the user process changes the mapping of memory at buf, + the mapping of memory at *obuf remains valid until the driver invocation completes. + + @dependencies + None. +*/ +static __inline int qurt_qdi_buffer_lock(int client_handle, void *buf, unsigned len, + unsigned perms, void **obuf) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_BUFFER_LOCK, + buf, len, perms, obuf); +} + +/**@ingroup func_qurt_qdi_buffer_lock2 + Prepares for the direct manipulation of a possibly-untrusted buffer provided by a QDI + client. + This API permits a trusted driver to safely access memory + provided by a possibly-untrusted client. A driver calls this function to obtain a safe buffer + pointer for accessing the memory. + This function performs the following security checks: \n + -- Entire buffer is accessible to the client. \n + -- Entire buffer is mapped with permissions passed in perms field \n + -- Entire buffer is physically contiguous \n + In addition to the security checks, the API also locks the client mapping such that the client + cannot remove the mapping while the physical memory is used by the trusted + driver. \n + + @note1 Drivers are responsible for calling qurt_qdi_buffer_unlock() at appropriate time. Not + pairing qurt_qdi_buffer_unlock() with this API leads to resource leakages and + process exit failures. Drivers can keep track of which buffers are locked for + a particular client. If the client exits abruptly, the buffers can be + unlocked on driver release invocation for the exiting client. + + @note2 This API is supported in limited capacity when called from Island mode. Safe buffer + unmapping or user buffer unlock is not supported in Island mode. + + @param client_handle Obtained from the current invocation function (Section 3.4.1). + @param buf Pointer to the base address of the client buffer address. + @param len Buffer length (in bytes). + @param perms Bitmask value that specifies the read or write access to perform on the + client buffer: \n + -- #QDI_PERM_R -- Read access \n + -- #QDI_PERM_W -- Write access \n + -- #QDI_PERM_RW -- Read/write access \n + @param obuf Optional parameter that returns a pointer to the buffer address that + the driver must use to access the buffer. 
If NULL is passed, the API
+                  only performs security checks and does not create a mapping to access the user buffer in
+                  a safe way.
+
+   @return
+   QURT_EINVALID -- Arguments passed to the API are invalid; the user buffer pointer is NULL or the
+                    length of the buffer is 0. \n
+   QURT_EPRIVILEGE -- One of the security checks on the user buffer failed. \n
+   QURT_EFAILED -- Mapping cannot be created for the trusted driver. \n
+   QURT_EOK -- Lock operation was successful. When this occurs, the QDI driver must use the
+               pointer at *obuf to perform its memory accesses, and not the
+               pointer passed in as buf.
+
+   @dependencies
+   None.
+*/
+static __inline int qurt_qdi_buffer_lock2(int client_handle, void *buf, unsigned len,
+                                          unsigned perms, void **obuf)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_BUFFER_LOCK2,
+                                 BUFFER_LOCK, buf, len, perms, obuf);
+}
+
+/**@ingroup func_qurt_qdi_buffer_unlock
+   Pairs with qurt_qdi_buffer_lock2(): removes the temporary overlapping mapping
+   created for the driver and unlocks the client mapping for the user buffer.
+
+   @note1 Drivers are responsible for pairing this with qurt_qdi_buffer_lock2(). Not
+          pairing qurt_qdi_buffer_lock2() with this API leads to resource leakages and
+          process exit failures. Drivers can keep track of which buffers are locked for
+          a particular client, and if the client exits abruptly, all the buffers can be
+          unlocked on driver release invocation for the exiting client.
+
+   @note2 This API is supported in limited capacity when called from Island mode. Actual
+          unmapping of driver accessible memory or unlocking of the buffer is not
+          supported in Island mode.
+
+   @param client_handle Obtained from the current invocation function (Section 3.4.1).
+   @param buf           Pointer to the base address of the client buffer.
+   @param len           Buffer length (in bytes).
+   @param obuf          Safe buffer address that was returned in the obuf field after calling
+                        qurt_qdi_buffer_lock2().
+
+   @return
+   QURT_EINVALID -- Arguments passed to the API are invalid; the user buffer pointer is NULL or the
+                    length of the buffer is 0. \n
+   QURT_EOK -- Unlock operation was successful; the safe buffer mapping at *obuf is no
+               longer valid and must not be used by the driver. \n
+   Other values -- Safe buffer unmapping or unlocking of the user buffer failed. \n
+
+   @dependencies
+   None.
+*/
+static __inline int qurt_qdi_buffer_unlock(int client_handle, void *buf, unsigned len,
+                                           void *obuf)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_BUFFER_LOCK2,
+                                 BUFFER_UNLOCK, buf, len, obuf);
+}
+
+/**@ingroup func_qurt_qdi_user_malloc
+   Allocates a memory area in the QDI heap that is read/write accessible to both the driver and
+   the client. \n
+   @note1hang The QDI heap has a limited amount of memory available, and only the
+              device driver can free the allocated memory.
+
+   @param client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+   @param size          Size (in bytes) of the area to allocate.
+
+   @return
+   Non-zero -- Success; this returned value points to the allocated memory area. \n
+   Zero -- Error.
+
+   @dependencies
+   None.
+
+*/
+void *qurt_qdi_user_malloc(int client_handle, unsigned size);
+
+/**@ingroup func_qurt_qdi_user_free
+   Deallocates a memory area in the QDI heap.
+
+   @param client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+   @param ptr           Pointer to the memory area to deallocate, as returned by qurt_qdi_user_malloc().
+
+   @dependencies
+   None.
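+
+   A minimal pairing sketch (illustrative only; the 256-byte size and the use
+   inside a driver invocation function are assumptions, not requirements):
+   @code
+   char *area = (char *)qurt_qdi_user_malloc(client_handle, 256);
+   if (area != NULL) {
+      // ... share the area with the client while the object lives ...
+      qurt_qdi_user_free(client_handle, area);
+   }
+   @endcode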
+*/ +void qurt_qdi_user_free(int client_handle, void *ptr); + +/**@ingroup funct_qurt_qdi_client_detach + Detaches a client (a process), indicating that the client does not + participate in the qurt_wait() mechanism. This behavior + is opt-in and irrevocable. When a client is detached, it can + not be un-detached. + + @param client_handle Handle of the client to detach. + + @return + Zero -- Success. Detachable clients always return success. + Nonzero value -- client_handle did not refer to a + detachable user client. + + @dependencies + None. +*/ +static __inline int qurt_qdi_client_detach(int client_handle) +{ + return qurt_qdi_handle_invoke(client_handle, QDI_CLIENT_HANDLE_DETACH); +} + +/**@ingroup func_qurt_qdi_signal_group_create + Creates a new signal group for use in a device driver. + A QDI signal group contains up to 32 signals, which can be operated on either + individually (using the qurt_qdi_signal_* functions) or as a group (using the + qurt_qdi_signal_group_* functions). \n + @note1hang Driver implementation is responsible for using the proper signal group + handle in any given situation. \n + For more information on signals, see the Hexagon QuRT RTOS User Guide (80-VB419-78). + + @param client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param p_signal_group_handle_local Returns a handle intended for use by code that + resides in the same context and process as the created signal group + (for example, the device driver implementation that allocated the + signal group). + @param p_signal_group_handle_remote Returns a handle intended for use by code + that resides in a different context and process than the created signal group + (for example, the user-mode client of an OS driver). + + @return + Zero return value indicates success.\n + Negative return value indicates could not create signal group. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_group_create(int client_handle, + int *p_signal_group_handle_local, + int *p_signal_group_handle_remote) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_SIGNAL_GROUP_CREATE, + p_signal_group_handle_local, + p_signal_group_handle_remote); +} + +/**@ingroup func_qurt_qdi_signal_group_wait + Suspends the current thread until any of the signals are set in the specified signal group. + + If a signal is set in a signal group object, and a thread waits on the signal group object, + the thread is awakened. If the awakened thread has higher priority than the current + thread, a context switch can occur. + + @param signal_group_handle Handle of the signal group. + + @return + If the client is remote: + QURT_EOK -- Wait complete \n + QURT_ECANCEL -- Wait cancelled.\n + If the client is local, returns a 32-bit word with current signals. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_group_wait(int signal_group_handle) +{ + return qurt_qdi_handle_invoke(signal_group_handle, + QDI_SIGNAL_GROUP_WAIT); +} + +/**@ingroup func_qurt_qdi_signal_group_poll + Returns a value that indicates if any of the signals are set in the specified signal group. + + @param signal_group_handle Handle of the signal group. + + @return + 1 -- Indicates whether any of the signals are set in the signal group.\n + 0 -- Indicates that none of the signals are set. + + @dependencies + None. 
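+
+   A short sketch of the group APIs used together (illustrative only; error
+   checking is omitted, and client_handle comes from the invocation function):
+   @code
+   int grp_local, grp_remote;
+   int sig_local, sig_remote;
+
+   qurt_qdi_signal_group_create(client_handle, &grp_local, &grp_remote);
+   qurt_qdi_signal_create(grp_local, &sig_local, &sig_remote);
+
+   qurt_qdi_signal_set(sig_local);              // Driver-side set...
+   if (qurt_qdi_signal_group_poll(grp_local)) { // ...observed through the group.
+      qurt_qdi_signal_clear(sig_local);
+   }
+   @endcode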
+*/ +static __inline int qurt_qdi_signal_group_poll(int signal_group_handle) +{ + return qurt_qdi_handle_invoke(signal_group_handle, + QDI_SIGNAL_GROUP_POLL); +} + + +/**@ingroup func_qurt_qdi_signal_create + Creates a new signal in the specified signal group. + For more information on signals, see the Hexagon QuRT RTOS User Guide (80-VB419-78). + + @note1hang Driver implementation is responsible for using the proper signal handle in + any given situation. + + @param signal_group_handle Handle of an existing signal group. + @param p_signal_handle_local Returns a handle intended for use by code that resides in + the same context and process as the created signal (for example, + the device driver implementation that allocated the signal). + @param p_signal_handle_remote Returns a handle intended for use by code that resides in + a different context and process than the created signal (for + example, the user-mode client of an OS driver). + + @return + Nonzero value -- No more signals can be created in the specified + signal group. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_create(int signal_group_handle, + int *p_signal_handle_local, + int *p_signal_handle_remote) +{ + return qurt_qdi_handle_invoke(signal_group_handle, + QDI_SIGNAL_GROUP_SIGNAL_CREATE, + p_signal_handle_local, + p_signal_handle_remote); +} + +/**@ingroup func_qurt_qdi_signal_set + Sets the signal in the specified signal object. + + @param signal_handle Handle of the signal. + + @return + Always returns 0. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_set(int signal_handle) +{ + return qurt_qdi_handle_invoke(signal_handle, + QDI_SIGNAL_SET); +} + +/**@ingroup func_qurt_qdi_signal_clear + Clears the signal in the specified signal object. + + @param signal_handle Handle of the signal. + + @return + Always returns 0. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_clear(int signal_handle) +{ + return qurt_qdi_handle_invoke(signal_handle, + QDI_SIGNAL_CLEAR); +} + +/**@ingroup func_qurt_qdi_signal_wait + Suspends the current thread until the specified signal is set. + If a signal is set in a signal object, and a thread waits on the signal object, the + thread is awakened. If the awakened thread has higher priority than the current thread, a + context switch may occur. + + @param signal_handle Handle of the signal. + + @return + If client is remote: + QURT_EOK -- Wait complete. \n + QURT_ECANCEL -- Wait cancelled.\n + If client is local, return a 32-bit word with current signals. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_wait(int signal_handle) +{ + return qurt_qdi_handle_invoke(signal_handle, + QDI_SIGNAL_WAIT); +} + +/**@ingroup func_qurt_qdi_signal_poll + Returns a value that indicates if the specified signal is set. + + @param signal_handle Handle of the signal. + + @return + 1 -- Signal is set. \n + 0 -- Signal is not set. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_poll(int signal_handle) +{ + return qurt_qdi_handle_invoke(signal_handle, + QDI_SIGNAL_POLL); +} + +/**@ingroup func_qurt_qdi_devname_register + Registers a QDI device with the generic QDI object in the + current QDI context. + + This function registers an exact name or a directory prefix with a QDI opener object. + Future invocations of qurt_qdi_open() in the context of the caller invokes the + opener object if a match is detected. + + Directory prefix names are specified by ending the name with a forward slash character. 
+
+   Example of an exact name:
+   @code qurt_qdi_devname_register("/dev/foobar", foobar_opener);@endcode
+
+   Example of a directory prefix:
+   @code qurt_qdi_devname_register("/pipedev/", pipedev_opener);@endcode
+
+   Given the two registrations shown above, the only qurt_qdi_open() requests
+   directed to the foobar_opener object are those for the exact name
+   "/dev/foobar". Any request beginning with "/pipedev/" is directed to the
+   pipedev_opener object.
+
+   The pipedev invocation function presumably examines the name argument to
+   determine exactly how to handle the request. The name is passed to the invocation
+   function in the a1.ptr argument (Section @xref{sec:invocationFunction}).
+
+   @param name   Device name or device name prefix.
+   @param opener Pointer to the opener object for the device.
+
+   @return
+   0 -- Device was successfully registered. \n
+   Negative error code -- Device was not registered.
+
+   @dependencies
+   None.
+ */
+static __inline int qurt_qdi_devname_register(const char *name,
+                                              qurt_qdi_obj_t *opener)
+{
+   return qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC,
+                                 QDI_DEVNAME_REGISTER,
+                                 name,
+                                 opener);
+}
+
+// Macros for backward compatibility with deprecated APIs
+// (These will go away soon)
+
+#define qurt_qdi_register_devname(name, opener) \
+   qurt_qdi_devname_register((name), (void *)(opener))
+#define qurt_qdi_new_handle_from_obj_t(handle, obj) \
+   qurt_qdi_handle_create_from_obj_t((handle), (obj))
+#define qurt_qdi_release_handle(client_handle, handle) \
+   qurt_qdi_handle_release((client_handle), (handle))
+#define qurt_qdi_lock_buffer(handle, buf, len, perms, obuf) \
+   qurt_qdi_buffer_lock((handle), (buf), (len), (perms), (obuf))
+#define qurt_qdi_usermalloc(handle, size) \
+   qurt_qdi_user_malloc((handle), (size))
+#define qurt_qdi_userfree(handle, ptr) \
+   qurt_qdi_user_free((handle), (ptr))
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_qdi_ext.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_qdi_ext.h
new file mode 100755
index 0000000000000..383e1799a15d6
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_qdi_ext.h
@@ -0,0 +1,58 @@
+#ifndef QURT_QDI_EXT_H
+#define QURT_QDI_EXT_H
+
+/**
+  @file qurt_qdi_ext.h
+  @brief Definitions, macros, and prototypes used when writing a
+  QDI driver
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2018, 2019-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+#include "qurt_qdi_driver.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct qurt_qdi_ext_device {
+   qurt_qdi_ext_obj_info_ptr qurt_qdi_ext_obj_info_head;
+   struct qurt_qdi_ext_device * next;
+   char * instance;
+   fdt_node_handle context;
+};
+typedef struct qurt_qdi_ext_device *qurt_qdi_ext_device_ptr;
+
+/**@ingroup func_qurt_qdi_dt_register
+   Registers a QDI device with the generic QDI object in the current QDI context,
+   if and only if a compatible device node is found in the device tree. This
+   function serves as a device-tree-aware wrapper for qurt_qdi_devname_register().
+
+   @param name   Device name or device name prefix.
+   @param opener Pointer to the QDI ext specialized opener object for the driver.
+
+   @return
+   0 -- Device was successfully registered.
\n + Negative error code -- Device was not registered. +*/ +static __inline int qurt_qdi_dt_register(const char *name, qurt_qdi_obj_t *opener) +{ + return qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC, QDI_DT_REGISTER, name, opener); +} + +static inline void qurt_qdi_ext_deviceobj_set_name (struct qurt_qdi_ext_device * device, char * name) +{ + device->instance = name; +} + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_qdi_imacros.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_qdi_imacros.h new file mode 100755 index 0000000000000..c0a8448ac87f8 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_qdi_imacros.h @@ -0,0 +1,34 @@ +#ifndef QURT_QDI_IMACROS_H +#define QURT_QDI_IMACROS_H + +/** + @file qurt_qdi_imacros.h + @brief Internal macros used for QDI. Mostly consists of tricky (and ugly) + preprocessor hacks that permit us to do varargs function invocations + where we pass optional arguments in registers and where we can do + type casting and checking automatically. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +#define _QDMPASTE(a,b) _QDMPASTE_(a,b) +#define _QDMPASTE_(a,b) a##b +#define _QDMCNT(...) _QDMCNT_(__VA_ARGS__,12,11,10,9,8,7,6,5,4,3,2,1,0) +#define _QDMCNT_(a,b,c,d,e,f,g,h,i,j,k,l,cnt,...) cnt + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_qdi_proxy.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_qdi_proxy.h new file mode 100755 index 0000000000000..f1d8992ea8811 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_qdi_proxy.h @@ -0,0 +1,55 @@ +/*============================================================================= + + qurt_qdi_proxy.h + +GENERAL DESCRIPTION + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved. 
+=============================================================================*/
+#ifndef _QURT_QDI_PROXY_H
+#define _QURT_QDI_PROXY_H
+
+#include "qurt_qdi_driver.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* APIs that operate on the proxy object directly */
+int qurt_qdi_proxy_ref_create(void);
+
+/* APIs that operate on a proxy given a known proxy handle
+ * 1) using the QDI handle of the object
+ * successful return: QURT_EOK, anything else -- failure
+ */
+int qurt_qdi_proxy_ref_add_by_handle(int proxy_handle, int qdi_handle);
+int qurt_qdi_proxy_ref_sub_by_handle(int proxy_handle, int qdi_handle);
+
+/* 2) using an object reference
+ * successful return: QURT_EOK, anything else -- failure
+ */
+int qurt_qdi_proxy_ref_add_by_object(int proxy_handle, qurt_qdi_obj_t *obj_ptr);
+int qurt_qdi_proxy_ref_sub_by_object(int proxy_handle, qurt_qdi_obj_t *obj_ptr);
+
+/* API that associates a proxy object with a particular client given a client handle
+ * successful return: QURT_EOK, anything else -- failure
+ */
+int qurt_client_proxy_ref_install (int client_handle, int proxy_handle);
+
+/* APIs that operate on the proxy object from a user client
+ * successful return: QURT_EOK, anything else -- failure
+ */
+int qurt_client_proxy_ref_add(int qdi_handle);
+int qurt_client_proxy_ref_remove(int qdi_handle);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_QDI_PROXY_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_rmutex.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_rmutex.h
new file mode 100755
index 0000000000000..a013a0bbddb1d
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_rmutex.h
@@ -0,0 +1,200 @@
+#ifndef QURT_RMUTEX_H
+#define QURT_RMUTEX_H
+/**
+  @file qurt_rmutex.h
+  Prototypes of rmutex API.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2013 - 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+#include
+#include
+
+/*=============================================================================
+                                 FUNCTIONS
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup func_qurt_rmutex_init
+   Initializes a recursive mutex object.
+   The recursive mutex is initialized in the unlocked state.
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[out] lock Pointer to the recursive mutex object.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+void qurt_rmutex_init(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_rmutex_destroy
+   Destroys the specified recursive mutex. \n
+   @note1hang Recursive mutexes must not be destroyed while they are still in use. If this
+              occurs, the behavior of QuRT is undefined.
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[in] lock Pointer to the recursive mutex object to destroy.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+
+ */
+void qurt_rmutex_destroy(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_rmutex_lock
+   Locks the specified recursive mutex. \n
+
+   If a thread performs a lock operation on a mutex that is not in use, the thread
+   gains access to the shared resource that the mutex protects, and continues executing.
+
+   If a thread performs a lock operation on a mutex that is already in use by another
+   thread, the thread is suspended.
When the mutex becomes available again (because the + other thread has unlocked it), the thread is awakened and given access to the shared resource. + + @note1hang A thread is not suspended if it locks a recursive mutex that it has already + locked. However, the mutex does not become available to other threads until the + thread performs a balanced number of unlocks on the mutex. + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the recursive mutex object to lock. + + @return + None. + + @dependencies + None. + + */ +void qurt_rmutex_lock(qurt_mutex_t *lock); + +/**@ingroup func_qurt_rmutex_lock_timed + Locks the specified recursive mutex. The wait must be terminated when the specified timeout expires.\n + + If a thread performs a lock operation on a mutex that is not in use, the thread + gains access to the shared resource that the mutex is protecting, and continues executing. + + If a thread performs a lock operation on a mutex that is already in use by another + thread, the thread is suspended. When the mutex becomes available again (because the + other thread has unlocked it), the thread is awakened and given access to the shared resource. + + @note1hang A thread is not suspended if it locks a recursive mutex that it has already + locked by itself. However, the mutex does not become available to other threads until the + thread performs a balanced number of unlocks on the mutex. + If timeout expires, this wait must be terminated and no access to the mutex is granted. + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the recursive mutex object to lock. + @param[in] duration Interval (in microseconds) duration value must be between #QURT_TIMER_MIN_DURATION and + #QURT_TIMER_MAX_DURATION + + @return + #QURT_EOK -- Success \n + #QURT_ETIMEDOUT -- Timeout + + @dependencies + None. + + */ +int qurt_rmutex_lock_timed(qurt_mutex_t *lock, unsigned long long int duration); + +/**@ingroup func_qurt_rmutex_unlock + Unlocks the specified recursive mutex. \n + More than one thread can be suspended on a mutex. When the mutex is + unlocked, the thread waiting on the mutex awakens. If the awakened + thread has higher priority than the current thread, a context switch occurs. + + @note1hang When a thread unlocks a recursive mutex, the mutex is not available until + the balanced number of locks and unlocks has been performed on the mutex. + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the recursive mutex object to unlock. + + @return + None. + + @dependencies + None. + + */ +void qurt_rmutex_unlock(qurt_mutex_t *lock); + +/**@ingroup func_qurt_rmutex_try_lock + Attempts to lock the specified recursive mutex.\n + + If a thread performs a try_lock operation on a recursive mutex that is not in use, the + thread gains access to the shared resource that is protected by the mutex, and continues + executing.\n + If a thread performs a try_lock operation on a recursive mutex that another thread has + already locked, qurt_rmutex_try_lock immediately returns with a nonzero result + value. + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the recursive mutex object to lock. + + @return + 0 -- Success. \n + Nonzero -- Failure. + + */ +int qurt_rmutex_try_lock(qurt_mutex_t *lock); + +/**@ingroup func_qurt_rmutex_try_lock_block_once + Attempts to lock a mutex object recursively. If the mutex is available, + it locks the mutex. If the mutex is held by the current thread, + it increases the internal counter and returns 0. If not, it returns a + nonzero value. 
+ If the mutex is already locked by another thread, the caller thread is + suspended. When the mutex becomes available again (because the other + thread has unlocked it), the caller thread is awakened and tries to lock + the mutex; and if it fails, this function returns failure with a nonzero + value. If it succeeds, this function returns success with zero. + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the qurt_mutex_t object. + + @return + 0 -- Success. \n + Nonzero -- Failure. + + @dependencies + None. + */ +int qurt_rmutex_try_lock_block_once(qurt_mutex_t *lock); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_RMUTEX_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_rmutex2.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_rmutex2.h new file mode 100755 index 0000000000000..a37e7e4458c4b --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_rmutex2.h @@ -0,0 +1,183 @@ +#ifndef QURT_RMUTEX2_H +#define QURT_RMUTEX2_H +/** + @file qurt_rmutex2.h + @brief Prototypes of rmutex2 API + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2013, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup mutex_types +@{ */ +/*============================================================================= + TYPEDEFS +=============================================================================*/ + +/** QuRT rmutex2 type. + Mutex type used with rmutex2 APIs. + */ +typedef struct { + /** @cond */ + unsigned int holder __attribute__((aligned(8))); /* UGP value of the mutex holder. */ + unsigned short waiters; /* Number of waiting threads. */ + unsigned short refs; /* Number of references to this mutex. */ + unsigned int queue; /* Kernel-maintained futex queuevalue. */ + unsigned int excess_locks; /* Number of excess times the holder has locked the mutex. */ + /** @endcond */ +} qurt_rmutex2_t; +/** @} */ /* end_addtogroup mutex_types */ +/** @cond internal_only*/ +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_rmutex2_init + + @deprecated use #qurt_rmutex_init instead. + + Initializes a recursive mutex object. + + The recursive mutex is initially unlocked. + + Objects of type rmutex2 solve a potential race condition between + unlock() and destroy() operations. + + @datatypes + #qurt_rmutex2_t + + @param[out] lock Pointer to the recursive mutex object. + + @return + None. + + @dependencies + None. + */ +void qurt_rmutex2_init(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_rmutex2_destroy + + @deprecated use #qurt_rmutex_destroy instead. + + Destroys the specified recursive mutex. \n + @note1hang Recursive mutexes must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + @note1cont In general, application code must destroy an rmutex2 object prior to + deallocating it; calling qurt_rmutex2_destroy() before deallocating it ensures + that all qurt_rmutex2_unlock() calls complete. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to destroy. 
+ + @return + None. + + @dependencies + None. + + */ +void qurt_rmutex2_destroy(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_rmutex2_lock + + @deprecated use #qurt_rmutex_lock instead. + + Locks the specified recursive mutex. \n + + If a thread performs a lock operation on a recursive mutex that is not in use, the + thread gains access to the shared resource that the mutex protects, and continues + to execute. + + If a thread performs a lock operation on a recursive mutex that another thread is using, + the thread is suspended. When the mutex becomes available again + (because the other thread has unlocked it), the thread is awakened and given access to the + shared resource. + + @note1hang A thread is not suspended if it locks a recursive mutex that it has already + locked, but the mutex does not become available until the thread performs a + balanced number of unlocks on the mutex. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to lock. + + @return + None. + + @dependencies + None. + + */ +void qurt_rmutex2_lock(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_rmutex2_unlock + + @deprecated use #qurt_rmutex_unlock instead. + + Unlocks the specified recursive mutex. \n + More than one thread can be suspended on a recursive mutex. When the mutex is + unlocked, only the highest-priority thread waiting on the mutex awakens. If the + awakened thread has higher priority than the current thread, a context switch occurs. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to unlock. + + @return + None. + + @dependencies + None. + + */ +void qurt_rmutex2_unlock(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_rmutex2_try_lock + + @deprecated use #qurt_rmutex_try_lock instead. + + Attempts to lock the specified recursive mutex.\n + + Non-blocking version of qurt_rmutex2_lock(). When a call to qurt_rmutex2_lock() + succeeds immediately, this function behaves similarly, returning 0 for success. + When a call to qurt_rmutex2_lock() does not succeed immediately, this function has + no effect and returns nonzero for failure. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to lock. + + @return + 0 -- Success. \n + Nonzero -- Failure. + + */ +int qurt_rmutex2_try_lock(qurt_rmutex2_t *lock); +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_RMUTEX2_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_sclk.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_sclk.h new file mode 100755 index 0000000000000..a83cf5f1db889 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_sclk.h @@ -0,0 +1,145 @@ +#ifndef QURT_SCLK_H +#define QURT_SCLK_H +/** + @file qurt_sclk.h + @brief Header file describing the APIs supported by QuRT system SCLK + feature. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2018-2021, 2023 Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. 
+ +=============================================================================*/ + + + + +/*============================================================================= + + INCLUDE FILES + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + + +/** + Conversion from microseconds to sleep ticks. + */ +#define QURT_SYSCLOCK_TIMETICK_FROM_US(us) ((us) * 192ULL / 10UL) +#define qurt_sysclock_timetick_from_us(us) QURT_SYSCLOCK_TIMETICK_FROM_US(us) + + +/** + Conversion from timer ticks to microseconds at the nominal frequency. +*/ +#define QURT_SYSCLOCK_TIMETICK_TO_US(ticks) qurt_timer_timetick_to_us(ticks) + +/** + Maximum microseconds value for Qtimer is 1,042,499 hours. +*/ +#define QURT_SYSCLOCK_MAX_DURATION (1042499uLL * 3600uLL * 1000uLL * 1000uLL) +#define qurt_sysclock_max_duration() QURT_SYSCLOCK_MAX_DURATION +/** + Timer clock for Qtimer is 19.2 MHz. +*/ +#define QURT_SYSCLOCK_MAX_DURATION_TICKS (1042499uLL * 3600uLL * 19200000uLL) +#define qurt_sysclock_max_duration_ticks() QURT_SYSCLOCK_MAX_DURATION_TICKS +/** + Sleep timer error margin for Qtimer is 192 ticks ~10 us. +*/ +#define QURT_SYSCLOCK_ERROR_MARGIN 192U //QURT_TIMER_MIN_DURATION*timer_freq; +#define qurt_sysclock_error_margin() QURT_SYSCLOCK_ERROR_MARGIN + +/*============================================================================= + + DATA DECLARATIONS + +=============================================================================*/ + +/**@ingroup func_qurt_sysclock_get_hw_ticks + @xreflabel{sec:qurt_sysclock_get_hw_ticks} + Gets the hardware tick count.\n + Returns the current value of a 64-bit hardware counter. The value wraps around to zero + when it exceeds the maximum value. + + @note1hang This operation must be used with care because of the wrap-around behavior. + + @return + Integer -- Current value of 64-bit hardware counter. + + @dependencies + None. + */ +unsigned long long qurt_sysclock_get_hw_ticks (void); + + +/**@ingroup func_qurt_sysclock_get_hw_ticks_32 + @xreflabel{sec:qurt_sysclock_get_hw_ticks_32} + Gets the hardware tick count in 32 bits.\n + Returns the current value of a 32-bit hardware counter. The value wraps around to zero + when it exceeds the maximum value. + + @note1hang This operation is implemented as an inline C function, and should be called from a C/C++ program. + The returned 32 bits are the lower 32 bits of the Qtimer counter. + + @return + Integer -- Current value of the 32-bit timer counter. + + @dependencies + None. + */ +static inline unsigned long qurt_sysclock_get_hw_ticks_32 (void) +{ + //Beginning with v61 there is a HW register that can be read directly. + unsigned long count; + __asm__ __volatile__ (" %0 = c30 " : "=r"(count)); + return count; +} + + +/**@ingroup func_qurt_sysclock_get_hw_ticks_16 + @xreflabel{sec:qurt_sysclock_get_hw_ticks_16} + Gets the hardware tick count in 16 bits.\n + Returns the current value of a 16-bit timer counter. The value wraps around to zero + when it exceeds the maximum value. + + @note1hang This operation is implemented as an inline C function, and should be called from a C/C++ program. + The returned 16 bits are based on the value of the lower 32 bits in Qtimer + counter, right shifted by 16 bits. + + @return + Integer -- Current value of the 16-bit timer counter, calculated from the lower 32 bits in the + Qtimer counter, right shifted by 16 bits. + + @dependencies + None. 
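+
+   For example, an elapsed interval can be measured with the 64-bit counter and
+   converted to microseconds (illustrative sketch; do_work() is a placeholder):
+   @code
+   unsigned long long t0 = qurt_sysclock_get_hw_ticks();
+   do_work();
+   unsigned long long ticks = qurt_sysclock_get_hw_ticks() - t0;
+   unsigned long long us = QURT_SYSCLOCK_TIMETICK_TO_US(ticks);
+   @endcode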
+ */ + + +static inline unsigned short qurt_sysclock_get_hw_ticks_16 (void) +{ + unsigned long ticks; + + //Beginning with v61 there is a HW register that can be read directly. + __asm__ __volatile__ (" %0 = c30 " : "=r"(ticks)); + __asm__ __volatile__ ( "%0 = lsr(%0, #16) \n" :"+r"(ticks)); + + return (unsigned short)ticks; +} +unsigned long long qurt_timer_timetick_to_us(unsigned long long ticks); +#define qurt_sysclock_timetick_to_us(ticks) qurt_timer_timetick_to_us(ticks) + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif /* __cplusplus */ + +#endif /* QURT_SCLK_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_secure_proc.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_secure_proc.h new file mode 100755 index 0000000000000..f40c7deb9bca1 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_secure_proc.h @@ -0,0 +1,53 @@ +#ifndef QURT_SECURE_PROC_H +#define QURT_SECURE_PROC_H + +/** + @file qurt_secure_proc.h + @brief Definitions, macros, and prototypes used for handling secure process + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2015, 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup qurt_process_migrate_secure_process + Migrate the user process to Qurt secure process + + @param secure_phy_address Physical starting address of secure memory + @param secure_memory_size Size of secure memory + @param entry Entry function to secure process + + @return + EOK + Negative return value -- Error. + + @dependencies + None. +*/ +int qurt_process_migrate_secure_process(unsigned long long secure_phy_address, unsigned int secure_memory_size, void entry(unsigned)); + +/**@ingroup qurt_process_get_migration_mem_size + get the size of all writable memory regions in a user PD. This is for preparation on secure process migration. + + @return + size of all writable memory regions in a user PD. + + @dependencies + None. +*/ +int qurt_process_get_migration_mem_size(void); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_sem.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_sem.h new file mode 100755 index 0000000000000..ee5ce4b2d94ab --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_sem.h @@ -0,0 +1,252 @@ +#ifndef QURT_SEM_H +#define QURT_SEM_H +/** + @file qurt_sem.h + Prototypes of semaphore API. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + TYPEDEFS +=============================================================================*/ +/** @addtogroup semaphore_types +@{ */ + +/** QuRT semaphore type. 
*/ +typedef union { + /** @cond */ + unsigned int raw[2] __attribute__((aligned(8))); + struct { + unsigned short val; /**< */ + unsigned short n_waiting; /**< */ + unsigned int reserved1; /**< */ + unsigned int queue; /**< */ + unsigned int reserved2; /**< */ + }X; /** @endcond */ +} qurt_sem_t; +/** @} */ /* end_addtogroup semaphore_types */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_sem_add + Releases access to a shared resource (the specified amount increments the semaphore count value).\n + When a thread performs an add operation on a semaphore, the specified value increments the semaphore count. + The result depends on the number of threads waiting + on the semaphore: \n + - When no threads are waiting, the current thread releases access to the shared resource + and continues executing. \n + - When one or more threads are waiting and the semaphore count value is nonzero, + the kernel repeatedly awakens the highest-priority waiting thread and decrements + the semaphore count value until either no waiting threads remain or the + semaphore count value is zero. If any of the awakened threads has higher priority + than the current thread, a context switch can occur. + + @datatypes + #qurt_sem_t + + @param[in] sem Pointer to the semaphore object to access. + @param[in] amt Amount to increment the semaphore count value. + + @return + Unused integer value. + + @dependencies + None. + + */ +int qurt_sem_add(qurt_sem_t *sem, unsigned int amt); + +/**@ingroup func_qurt_sem_up + Releases access to a shared resource. When a thread performs an up operation on a semaphore, + the semaphore count value increments. The result depends on the number of threads waiting + on the semaphore: \n + - When no threads are waiting, the current thread releases access to the shared resource + and continues executing.\n + - When one or more threads are waiting and the semaphore count value is nonzero, + the kernel awakens the highest-priority waiting thread and decrements the + semaphore count value. If the awakened thread has higher priority than the current + thread, a context switch can occur. + + @datatypes + #qurt_sem_t + + @param[in] sem Pointer to the semaphore object to access. + + @return + Unused integer value. + + @dependencies + None. + */ +static inline int qurt_sem_up(qurt_sem_t *sem) { return qurt_sem_add(sem,1); } + +/**@ingroup func_qurt_sem_down + Requests access to a shared resource. When a thread performs a down operation on a + semaphore, the result depends on the semaphore count value: \n + - When the count value is nonzero, it is decremented, and the thread gains access to the + shared resource and continues executing.\n + - When the count value is zero, it is not decremented, and the thread is suspended on the + semaphore. When the count value becomes nonzero (because another thread + released the semaphore) it is decremented, and the suspended thread is awakened + and gains access to the shared resource. + + @datatypes + #qurt_sem_t + + @param[in] sem Pointer to the semaphore object to access. + + @return + Unused integer value. + + @dependencies + None. 
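+
+   A typical down/up pairing (illustrative sketch; the semaphore guards one
+   unit of a hypothetical shared resource):
+   @code
+   qurt_sem_t sem;
+   qurt_sem_init_val(&sem, 1);  // One unit available.
+
+   qurt_sem_down(&sem);         // Acquire: count 1 -> 0.
+   // ... use the shared resource ...
+   qurt_sem_up(&sem);           // Release: count 0 -> 1.
+   @endcode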
+ */ +int qurt_sem_down(qurt_sem_t *sem); + +/**@ingroup func_qurt_sem_down_timed + When a thread performs a down operation on a semaphore, the result depends on the + semaphore count value: \n + - When the count value is nonzero, it is decremented, and the thread gains access to the + shared resource and continues executing.\n + - When the count value is zero, it is not decremented, and the thread is suspended on the + semaphore. When the count value becomes nonzero (because another thread + released the semaphore) it is decremented, and the suspended thread is awakened + and gains access to the shared resource. Terminate the wait when the specified timeout expires. + If timeout expires, terminate this wait and grant no access to the shared resource. + + @datatypes + #qurt_sem_t + + @param[in] sem Pointer to the semaphore object to access. + @param[in] duration Interval (in microseconds) duration value must be between #QURT_TIMER_MIN_DURATION and + #QURT_TIMER_MAX_DURATION + + @return + #QURT_EOK -- Success \n + #QURT_ETIMEDOUT -- Timeout + + @dependencies + None. + */ +int qurt_sem_down_timed(qurt_sem_t *sem, unsigned long long int duration); + +/**@ingroup func_qurt_sem_try_down + @xreflabel{hdr:qurt_sem_try_down} + Requests access to a shared resource (without suspend). When a thread performs a try down + operation on a semaphore, the result depends on the semaphore count value: \n + - The count value is decremented when it is nonzero. The down operation returns 0 as + the function result, and the thread gains access to the shared resource and is free to + continue executing.\n + - The count value is not decremented when it is zero. The down operation returns -1 + as the function result, and the thread does not gain access to the shared resource + and should not continue executing. + + @datatypes + #qurt_sem_t + + @param[in] sem Pointer to the semaphore object to access. + + @return + 0 -- Success. \n + -1 -- Failure. + + @dependencies + None. + + */ +int qurt_sem_try_down(qurt_sem_t *sem); + +/**@ingroup func_qurt_sem_init + Initializes a semaphore object. + The default initial value of the semaphore count value is 1. + + @param[out] sem Pointer to the initialized semaphore object. + + @return + None. + + @dependencies + None. + + */ +void qurt_sem_init(qurt_sem_t *sem); + +/**@ingroup func_qurt_sem_destroy + Destroys the specified semaphore.\n + @note1hang Semaphores must be destroyed when they are no longer in use. Failure to do + this causes resource leaks in the QuRT kernel.\n + @note1cont Semaphores must not be destroyed while they are still in use. If this occur, + the behavior of QuRT is undefined. + + @datatypes + #qurt_sem_t + + @param[in] sem Pointer to the semaphore object to destroy. + + @return + None. + + @dependencies + None. + */ +void qurt_sem_destroy(qurt_sem_t *sem); + +/**@ingroup func_qurt_sem_init_val + Initializes a semaphore object with the specified value. + + @datatypes + #qurt_sem_t + + @param[out] sem Pointer to the initialized semaphore object. + @param[in] val Initial value of the semaphore count value. + + @return + None. + + @dependencies + None. + + */ +void qurt_sem_init_val(qurt_sem_t *sem, unsigned short val); + +/**@ingroup func_qurt_sem_get_val + Gets the semaphore count value.\n + Returns the current count value of the specified semaphore. + + @datatypes + #qurt_sem_t + + @param[in] sem Pointer to the semaphore object to access. + + @return + Integer semaphore count value + + @dependencies + None. 
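+
+   For example (illustrative sketch, assuming an initialized qurt_sem_t sem):
+   @code
+   if (qurt_sem_try_down(&sem) == 0) {
+      // Acquired without blocking.
+      qurt_sem_up(&sem);
+   } else {
+      // No unit was available; qurt_sem_get_val(&sem) was 0 at the attempt.
+   }
+   @endcode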
+ */ +static inline unsigned short qurt_sem_get_val(qurt_sem_t *sem){ return sem->X.val; } +int qurt_sem_down_cancellable(qurt_sem_t *sem); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_SEM_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_shmem.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_shmem.h new file mode 100755 index 0000000000000..980557323708a --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_shmem.h @@ -0,0 +1,89 @@ +#ifndef QURT_SHMEM_H +#define QURT_SHMEM_H + +/** + @file qurt_shmem.h + + @brief + Prototypes of QuRT inter-process shared memory APIs + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef MODE_T +#define MODE_T +typedef unsigned int mode_t; +#endif //MODE_T + +/** + * The shm_open() function establishes a connection between a shared memory object and a file descriptor. + * The file descriptor is used by other functions such as mmap() to refer to that shared memory object. + * + * + * @param name Pointer to a string naming a shared memory object. The name must start with "/shm/". + * @param oflag File status flags and file access modes of the open file description. The following + * flags are supported: + * O_RDONLY: Open for read access only + * O_RDWR: Open for read or write access + * O_CREAT: If the shared memory object does not exist, create one. + * @param mode Permission flags (currently ignored) + * + * @return File descriptor (positive number) if the operation is successful. + * Negative error code if failed + * +*/ + +int shm_open(const char * name, int oflag, mode_t mode); + +/** + * The shm_mmap() function creates a shared memory mapping in the virtual address space of + * the calling process. + * + * @param addr The starting address for the new mapping is specified in addr. + * @param len Specifies the length of the shared memory region. + * @param prot Describes the desired memory protection of the mapping. Same as the one in mmap of POSIX. + * @param flags Determines whether updates to the mapping are visible to other processes. Same as + * the one in mmap of POSIX. + * @param fd File descriptor of the shared memory object, as returned by shm_open(). + * @param offset Unused. + * + * @return The starting address for the new mapping is returned. + * Negative error code if failed + * +*/ + +void *shm_mmap(void *addr, unsigned int len, int prot, int flags, int fd, unsigned int offset); + +/** + * The shm_close() function removes a connection between a shared memory object and a file descriptor. + * When no file descriptor remains connected to the shared memory object, the shared memory object is + * deleted automatically. A shared memory object has the same virtual address in every process; this is + * a restriction of the single virtual address space. + * + * + * @param fd File descriptor of the shared memory object + * + * @return 0 if operation successful.
+ * negative error code if failed + * +*/ + + +int shm_close(int fd); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_signal.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_signal.h new file mode 100755 index 0000000000000..3a89c53394ad5 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_signal.h @@ -0,0 +1,518 @@ +#ifndef QURT_SIGNAL_H +#define QURT_SIGNAL_H + +/** + @file qurt_signal.h + @brief Prototypes of kernel signal API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup signals_types +@{ */ +#define QURT_SIGNAL_ATTR_WAIT_ANY 0x00000000 /**< Wait any. */ +#define QURT_SIGNAL_ATTR_WAIT_ALL 0x00000001 /**< Wait all. */ + +/*===================================================================== + Typedefs + ======================================================================*/ + + +/** QuRT signal type. + */ +typedef union { + /** @cond */ + unsigned long long int raw; + struct { + unsigned int signals; + unsigned int waiting; + unsigned int queue; + unsigned int attribute; + }X; + /** @endcond */ +} qurt_signal_t; + + +/** QuRT 64-bit signal type. + */ +typedef struct { + /** @cond */ + qurt_signal_t signal_sum; + unsigned long long signals; + unsigned long long waiting; + /** @endcond */ +} qurt_signal_64_t; +/** @} */ /* end_addtogroup signals_types */ +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_signal_init + Initializes a signal object. + Signal returns the initialized object. + The signal object is initially cleared. + + @note1hang Each signal-based object has one or more kernel resources associated with it; + to prevent resource leaks, call qurt_signal_destroy() + when this object is not used anymore + @datatypes + #qurt_signal_t + + @param[in] *signal Pointer to the initialized object. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_init(qurt_signal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_destroy + Destroys the specified signal object. + + @note1hang Signal objects must be destroyed when they are no longer in use. Failure + to do this causes resource leaks in the QuRT kernel.\n + @note1cont Signal objects must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + + @datatypes + #qurt_signal_t + + @param[in] *signal Pointer to the signal object to destroy. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_destroy(qurt_signal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_wait + @xreflabel{hdr:qurt_signal_wait} + Suspends the current thread until the specified signals are set. 
+ + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates + waiting on a signal, and 0 indicates not waiting on the signal. + + If a thread is waiting on a signal object for any of the specified set of signals to be set, + and one or more of those signals is set in the signal object, the thread is awakened. + + If a thread is waiting on a signal object for all of the specified set of signals to be set, + and all of those signals are set in the signal object, the thread is awakened. + + The specified set of signals can be cleared when the signal is set. + + @note1hang At most, one thread can wait on a signal object at any given time. + + @datatypes + #qurt_signal_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to + wait on. + @param[in] attribute Indicates whether the thread waits for any of the signals to be set, or for all of + them to be set. \n + @note1hang The wait-any and wait-all types are mutually exclusive.\n Values:\n + - #QURT_SIGNAL_ATTR_WAIT_ANY \n + - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend + + @return + A 32-bit word with current signals. + + @dependencies + None. +*/ +/* ======================================================================*/ +unsigned int qurt_signal_wait(qurt_signal_t *signal, unsigned int mask, + unsigned int attribute); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_wait_timed + @xreflabel{hdr:qurt_signal_wait_timed} + Suspends the current thread until the specified signals are set or until the timeout expires. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates + waiting on a signal, and 0 indicates not waiting. + + If a thread is waiting on a signal object for any of the specified set of signals to be set, + and one or more of those signals is set in the signal object, the thread is awakened. + + If a thread is waiting on a signal object for all of the specified set of signals to be set, + and all of those signals are set in the signal object, the thread is awakened. + + The specified set of signals can be cleared after the signal is set. + + @note1hang At most, one thread can wait on a signal object at any given time. + + @datatypes + #qurt_signal_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value that identifies the individual signals in the signal object to wait on. + @param[in] attribute Indicates whether the thread must wait until any of the signals are set, or until all of + them are set. \n + @note1hang The wait-any and wait-all types are mutually exclusive.\n Values:\n + - #QURT_SIGNAL_ATTR_WAIT_ANY \n + - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend + @param[out] signals Bitmask of the signals that are set. + @param[in] duration Duration (microseconds) to wait. Must be in the range + [#QURT_TIMER_MIN_DURATION ... #QURT_TIMER_MAX_DURATION]. + + @return + #QURT_EOK -- Success; one or more signals were set \n + #QURT_ETIMEDOUT -- Timed out \n + #QURT_EINVALID -- Duration out of range + + @dependencies + Timed-waiting support in the kernel.
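A sketch of the timed wait: block up to 10 ms for either of two event bits, then distinguish success from timeout (the bit assignments and the microsecond budget are illustrative):

#include "qurt_signal.h"

#define EVT_RX   (1u << 0)
#define EVT_STOP (1u << 1)

int wait_for_rx_or_stop(qurt_signal_t *sig) {
    unsigned int got = 0;
    int rc = qurt_signal_wait_timed(sig, EVT_RX | EVT_STOP,
                                    QURT_SIGNAL_ATTR_WAIT_ANY, &got, 10000ull);
    if (rc == QURT_EOK) {
        qurt_signal_clear(sig, got);   /* the wait does not auto-clear the bits */
        return (got & EVT_STOP) ? 1 : 0;
    }
    return -1;                         /* QURT_ETIMEDOUT (or QURT_EINVALID for a bad duration) */
}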
+*/ +/* ======================================================================*/ +int qurt_signal_wait_timed(qurt_signal_t *signal, unsigned int mask, + unsigned int attribute, unsigned int *signals, unsigned long long int duration); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_wait_any + Suspends the current thread until any of the specified signals are set. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates + to wait on a signal, and 0 indicates not to wait on it. + + If a thread is waiting on a signal object for any of the specified set of signals to be set, + and one or more of those signals is set in the signal object, the thread is awakened. + + @note1hang At most, one thread can wait on a signal object at any given time. + + @datatypes + #qurt_signal_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to + wait on. + + @return + 32-bit word with current signals. + + @dependencies + None. +*/ +/* ======================================================================*/ +static inline unsigned int qurt_signal_wait_any(qurt_signal_t *signal, unsigned int mask) +{ + return qurt_signal_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ANY); +} + +/*======================================================================*/ +/**@ingroup func_qurt_signal_wait_all + Suspends the current thread until all of the specified signals are set. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates + to wait on a signal, and 0 indicates not to wait on it. + + If a thread is waiting on a signal object for all of the specified set of signals to be set, + and all of those signals are set in the signal object, the thread is awakened. + + @note1hang At most, one thread can wait on a signal object at any given time. + + @datatypes + #qurt_signal_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to + wait on. + + @return + A 32-bit word with current signals. + + @dependencies + None. +*/ +/* ======================================================================*/ +static inline unsigned int qurt_signal_wait_all(qurt_signal_t *signal, unsigned int mask) +{ + return qurt_signal_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ALL); +} + +/*======================================================================*/ +/**@ingroup func_qurt_signal_set + Sets signals in the specified signal object. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates + to set the signal, and 0 indicates not to set it. + + @datatypes + #qurt_signal_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to set in the signal + object. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_signal_set(qurt_signal_t *signal, unsigned int mask); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_get + Gets a signal from a signal object. + + Returns the current signal values of the specified signal object. + + @datatypes + #qurt_signal_t + + @param[in] *signal Pointer to the signal object to access.
+ + @return + A 32-bit word with current signals + + @dependencies + None. +*/ +/* ======================================================================*/ +unsigned int qurt_signal_get(qurt_signal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_clear + Clear signals in the specified signal object. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be cleared, and 0 indicates not to clear it. + + @note1hang Signals must be explicitly cleared by a thread when it is awakened -- the wait + operations do not automatically clear them. + + @datatypes + #qurt_signal_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to clear in the signal object. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_clear(qurt_signal_t *signal, unsigned int mask); + +/**@ingroup func_qurt_signal_wait_cancellable + @xreflabel{hdr:qurt_signal_wait_cancellable} + Suspends the current thread until either the specified signals are set or the wait operation is cancelled. + The operation is cancelled if the user process of the calling thread is killed, or if the calling thread + must finish its current QDI invocation and return to user space. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates + that a signal must be waited on, and 0 indicates not to wait on it. + + If a thread is waiting on a signal object for any of the specified set of signals to be set, and one or + more of those signals is set in the signal object, the thread is awakened. + + If a thread is waiting on a signal object for all of the specified set of signals to be set, and all of + those signals are set in the signal object, the thread is awakened. + + @note1hang At most, one thread can wait on a signal object at any given time. + + @note1cont When the operation is cancelled, the caller must assume that the signal is never set. + + @datatypes + #qurt_signal_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to + wait on. + @param[in] attribute Indicates whether the thread must wait until any of the signals are set, or until all of + them are set. Values:\n + - #QURT_SIGNAL_ATTR_WAIT_ANY \n + - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend + @param[out] return_mask Pointer to the 32-bit mask value that was originally passed to the function. + + + @return + #QURT_EOK -- Wait completed. \n + #QURT_ECANCEL -- Wait cancelled. + + + @dependencies + None. +*/ +/* ======================================================================*/ +int qurt_signal_wait_cancellable(qurt_signal_t *signal, unsigned int mask, + unsigned int attribute, + unsigned int *return_mask); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_64_init + Initializes a 64-bit signal object.\n + The signal argument returns the initialized object. + The signal object is initially cleared. + + @note1hang Each signal-based object has one or more kernel resources associated with it; + to prevent resource leaks, call qurt_signal_destroy() + when this object is not used anymore. + @datatypes + #qurt_signal_64_t + + @param[in] signal Pointer to the initialized object. 
+ + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_64_init(qurt_signal_64_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_64_destroy + Destroys the specified signal object. + + @note1hang 64-bit signal objects must be destroyed when they are no longer in use. Failure + to do this causes resource leaks in the QuRT kernel.\n + @note1cont Signal objects must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + + @datatypes + #qurt_signal_64_t + + @param[in] signal Pointer to the signal object to destroy. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_64_destroy(qurt_signal_64_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_64_wait + Suspends the current thread until the specified signals are set. + + Signals are represented as bits 0 through 63 in the 64-bit mask value. A mask bit value of 1 indicates + that a signal must be waited on, and 0 indicates not to wait on it. + + If a thread is waiting on a signal object for all of the specified set of signals to be set, + and all of those signals are set in the signal object, the thread is awakened. + + @note1hang At most, one thread can wait on a signal object at any given time. + + @datatypes + #qurt_signal_64_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value, which identifies the individual signals in the signal object to + wait on. + @param[in] attribute Indicates whether the thread must wait until any of the signals are set, or until all of + them are set. \n + @note1hang The wait-any and wait-all types are mutually exclusive.\n Values:\n + - #QURT_SIGNAL_ATTR_WAIT_ANY \n + - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend + + @return + A 64-bit word with current signals. + + @dependencies + None. +*/ +/* ======================================================================*/ +unsigned long long qurt_signal_64_wait(qurt_signal_64_t *signal, unsigned long long mask, + unsigned int attribute); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_64_set + Sets signals in the specified signal object. + + Signals are represented as bits 0 through 63 in the 64-bit mask value. A mask bit value of 1 indicates + that a signal must be set, and 0 indicates not to set it. + + @datatypes + #qurt_signal_64_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to set in the signal + object. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_signal_64_set(qurt_signal_64_t *signal, unsigned long long mask); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_64_get + Gets a signal from a signal object. + + Returns the current signal values of the specified signal object. + + @datatypes + #qurt_signal_64_t + + @param[in] *signal Pointer to the signal object to access. + + @return + A 64-bit double word with current signals. + + @dependencies + None.
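The 64-bit variant follows the same set/wait/clear pattern with wider masks; a sketch (the bit choice is illustrative, and in practice the set would come from another thread):

#include "qurt_signal.h"

#define EVT_CH42 (1ull << 42)   /* bits 32..63 are only reachable with the 64-bit API */

void signal64_example(void) {
    qurt_signal_64_t sig;
    qurt_signal_64_init(&sig);

    qurt_signal_64_set(&sig, EVT_CH42);   /* typically done by a peer thread */
    unsigned long long got =
        qurt_signal_64_wait(&sig, EVT_CH42, QURT_SIGNAL_ATTR_WAIT_ANY);
    qurt_signal_64_clear(&sig, got);      /* the wait does not auto-clear the bits */

    qurt_signal_64_destroy(&sig);
}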
+*/ +/* ======================================================================*/ +unsigned long long qurt_signal_64_get(qurt_signal_64_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_64_clear + Clears signals in the specified signal object. + + Signals are represented as bits 0 through 63 in the 64-bit mask value. A mask bit value of 1 + indicates that a signal must be cleared, and 0 indicates not to clear it. + + @note1hang Signals must be explicitly cleared by a thread when it is awakened -- the wait + operations do not automatically clear them. + + @datatypes + #qurt_signal_64_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to clear in the signal object. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_64_clear(qurt_signal_64_t *signal, unsigned long long mask); + +#ifdef __cplusplus +} +#endif + +#endif /* QURT_SIGNAL_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_signal2.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_signal2.h new file mode 100755 index 0000000000000..43975100cbf75 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_signal2.h @@ -0,0 +1,340 @@ +#ifndef QURT_SIGNAL2_H +#define QURT_SIGNAL2_H + +/** + @file qurt_signal2.h + @brief Prototypes of kernel signal2 API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +#define QURT_SIGNAL_ATTR_WAIT_ANY 0x00000000 +#define QURT_SIGNAL_ATTR_WAIT_ALL 0x00000001 + +/*===================================================================== + Typedefs + ======================================================================*/ + +/** @addtogroup signals2_types +@{ */ +/** qurt_signal2 type. + */ +typedef union { + /** @cond */ + struct{ + unsigned int cur_mask; /* Current set of signal bits that are set. */ + unsigned int sig_state; /* Current state. */ + /* Bit 0 -- in anysignal wait. */ + /* Bit 1 -- in allsignal wait. */ + /* Bit 2 -- in interrupt wait. */ + /* Bits 31-3 -- reference count field. */ + unsigned int queue; /* Kernel-maintained futex queue value. */ + unsigned int wait_mask; /* When sig_state indicates a waiter is present, this is the wait mask. */ + }; + unsigned long long int raw; + /** @endcond */ +} qurt_signal2_t; +/* @} */ /* end_addtogroup signals2_types */ + +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_init + + @deprecated use #qurt_signal_init instead. + + Initializes a signal2 object. + Signal returns the initialized object. + The signal object is initially cleared. + + Objects of type signal2 solve a potential race condition between + set() and destroy() operations. + + @datatypes + #qurt_signal2_t + + @param[in] *signal Pointer to the initialized object. + + @return + None. 
+ + @dependencies + Each signal-based object has one or more associated + kernel resources; therefore, users must call qurt_signal2_destroy() + when this object is no longer in use. + */ +/* ======================================================================*/ +void qurt_signal2_init(qurt_signal2_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_destroy + + @deprecated use #qurt_signal_destroy instead. + + Destroys the specified signal object. + + @note1cont Signal objects must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + @note1cont Application code should destroy a signal2 object prior to deallocating it. + Calling qurt_signal2_destroy() before deallocating a + signal2 object ensures completion of all qurt_signal2_set() calls. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to destroy. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal2_destroy(qurt_signal2_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_wait + + @deprecated use #qurt_signal_wait instead. + + Suspends the current thread until the specified signals are set. + + Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 indicates + a signal to wait on. + + If a thread calls this API with QURT_SIGNAL_ATTR_WAIT_ANY, the thread is awakened when + any of the signals specified in the mask are set. + + If a thread calls this API with QURT_SIGNAL_ATTR_WAIT_ALL, the thread is awakened only + when all the signals specified in the mask are set. + + @note1hang At most, one thread can wait on a signal object at any given time. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to wait on. + @param[in] attribute Specifies whether the thread waits for any of the signals to be set, or for all of + them to be set. Values:\n + - QURT_SIGNAL_ATTR_WAIT_ANY \n + - QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend + @return + A 32-bit word with current signals. + + @dependencies + None. +*/ +/* ======================================================================*/ +unsigned int qurt_signal2_wait(qurt_signal2_t *signal, unsigned int mask, + unsigned int attribute); + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_wait_any + + @deprecated use #qurt_signal_wait_any instead. + + Suspends the current thread until any of the specified signals are set. + + Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 indicates + a signal to wait on. + + The thread is awakened when any of the signals specified in the mask are set. + + @note1hang At most, one thread can wait on a signal object at any given time. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to + wait on. + + @return + 32-bit word with current signals. + + @dependencies + None.
+*/ +/* ======================================================================*/ +static inline unsigned int qurt_signal2_wait_any(qurt_signal2_t *signal, unsigned int mask) +{ + return qurt_signal2_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ANY); +} + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_wait_all + + @deprecated use #qurt_signal_wait_all instead. + + Suspends the current thread until all of the specified signals are set. + + Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 indicates + a signal to wait on. + + The thread will be awakened only when all the signals specified in the mask are set. + + @note1hang At most one thread can wait on a signal object at any given time. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to + wait on. + + @return + 32-bit word with current signals. + + @dependencies + None. +*/ +/* ======================================================================*/ +static inline unsigned int qurt_signal2_wait_all(qurt_signal2_t *signal, unsigned int mask) +{ + return qurt_signal2_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ALL); +} + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_set + + @deprecated use #qurt_signal_set instead. + + Sets signals in the specified signal object. + + Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 indicates + that a signal must be set, and 0 indicates not to set the signal. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to set in the signal + object. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_signal2_set(qurt_signal2_t *signal, unsigned int mask); + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_get + + @deprecated use #qurt_signal_get instead. + + Gets a signal from a signal object. + + Returns the current signal values of the specified signal object. + + @datatypes + #qurt_signal2_t + + @param[in] *signal Pointer to the signal object to access. + + @return + 32-bit word with current signals. + + @dependencies + None. +*/ +/* ======================================================================*/ +unsigned int qurt_signal2_get(qurt_signal2_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_clear + + @deprecated use #qurt_signal_clear instead. + + Clear signals in the specified signal object. + + Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be cleared, and 0 indicates not to clear the signal. + + @note1hang Signals must be explicitly cleared by a thread when it is awakened -- the wait + operations do not automatically clear them. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to clear in the signal object. + + @return + None. + + @dependencies + None. 
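Every qurt_signal2_* entry point in this header is tagged deprecated in favor of a qurt_signal_* counterpart, so migration is essentially a type-and-prefix rename; a sketch (both headers come from this same diff):

#include "qurt_signal.h"
#include "qurt_signal2.h"

void old_style(void) {                      /* deprecated path */
    qurt_signal2_t s2;
    qurt_signal2_init(&s2);
    qurt_signal2_set(&s2, 0x1u);
    (void)qurt_signal2_wait_any(&s2, 0x1u);
    qurt_signal2_destroy(&s2);
}

void new_style(void) {                      /* same sequence on qurt_signal_t */
    qurt_signal_t s;
    qurt_signal_init(&s);
    qurt_signal_set(&s, 0x1u);
    (void)qurt_signal_wait_any(&s, 0x1u);
    qurt_signal_destroy(&s);
}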
+ */ +/* ======================================================================*/ +void qurt_signal2_clear(qurt_signal2_t *signal, unsigned int mask); + +/**@ingroup func_qurt_signal2_wait_cancellable + + @deprecated use #qurt_signal_wait_cancellable instead. + + Suspends the current thread until either the specified signals are set or the wait operation is cancelled. + The operation is cancelled if the user process of the calling thread is killed, or if the calling thread + must finish its current QDI invocation and return to user space. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates + that a signal must be waited on, and 0 indicates not to wait on it. + + If a thread is waiting on a signal object for any of the specified set of signals to be set, and one or + more of those signals is set in the signal object, the thread is awakened. + + If a thread is waiting on a signal object for all of the specified set of signals to be set, and all of + those signals are set in the signal object, the thread is awakened. + + @note1hang At most, one thread can wait on a signal object at any given time. + + @note1cont When the operation is cancelled, the caller must assume that the signal is never set. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to + wait on. + @param[in] attribute Indicates whether the thread must wait until any of the signals are set, or until all of + them are set. Values:\n + - #QURT_SIGNAL_ATTR_WAIT_ANY \n + - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend + @param[out] p_returnmask Pointer to the 32-bit mask value that was originally passed to the function. + + + @return + #QURT_EOK -- Wait completed. \n + #QURT_ECANCEL -- Wait cancelled. + + + @dependencies + None. +*/ +int qurt_signal2_wait_cancellable(qurt_signal2_t *signal, + unsigned int mask, + unsigned int attribute, + unsigned int *p_returnmask); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_SIGNAL2_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_space.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_space.h new file mode 100755 index 0000000000000..2c3f9e4496697 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_space.h @@ -0,0 +1,230 @@ +#ifndef QURT_SPACE_H +#define QURT_SPACE_H +/** + @file qurt_space.h + @brief Prototypes of QuRT process control APIs + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** This flag is a request to the OS to suspend the process just before calling main(). +It is deprecated and replaced by QURT_PROCESS_SUSPEND_ON_STARTUP */ +#define SPAWNN_FLAG_SUSPEND_ON_STARTUP QURT_PROCESS_SUSPEND_ON_STARTUP + +/** + * Creates and starts a process from an ELF of the specified name. The slash symbols + * "/" or "\" are ignored. Do not include the directory name in the input. This function + * accepts the SPAWN flags. Multiple SPAWN flags can be specified by OR'ing the flags. + * + * @param name ELF name of the executable.
The name must not contain directories; + * use "dsp2.elf" instead of "/prj/qct/.../dsp2.elf". + * + * @return + Process ID -- Success \n + Negative error code -- Failure\n + #QURT_EPRIVILEGE -- Caller does not have enough privilege for this operation\n + #QURT_EMEM -- Not enough memory to perform the operation \n + #QURT_EFAILED -- Operation failed \n + #QURT_ENOTALLOWED -- Operation not allowed \n + #QURT_ENOREGISTERED -- Not registered \n + #QURT_ENORESOURCE -- Resource exhaustion \n + #QURT_EINVALID -- Invalid argument value +*/ + +int qurt_spawn_flags(const char * name, int flags); + +/** + Creates and starts a process from an ELF of the specified name. The slash symbols + "/" or "\" are ignored. Do not include the directory name in the input. + + @param name ELF name of the executable. The name must not contain directories; + use "dsp2.elf" instead of "/prj/qct/.../dsp2.elf". + + @return + Process ID -- Success. \n + Negative error code -- Failure. + +*/ +static inline int qurt_spawn(const char *name) +{ + return qurt_spawn_flags(name,0); +} + +/** + * Returns the process ID of the current process. + * + * @return + * Process ID + * +*/ +#define qurt_getpid qurt_process_get_id + +/** + * The qurt_wait() function waits for a status change in a child process. A parent process can + * use it to block until any child process terminates. + * + * This API returns an error if there are no user processes, or if all user processes are detached. + * + * @param status Pointer to the status variable. The variable receives the status value of the child + * process; the value comes from the exit() system call made by the child process. + * + * @return + Process ID of the child process that changed status -- Success \n + * Negative error code -- Failure + * +*/ + +int qurt_wait(int *status); + + +/** @cond */ +/* APIs that allow registering callbacks on spawn of user pd */ +typedef void (*QURT_SPAWN_PFN)(int client_handle, void *data_ptr); //no return, since we won't be error checking it in spawn +typedef int (*QURT_CB_PFN)(int client_handle, void *user_data, void *info); +typedef union { + QURT_SPAWN_PFN spawn_pfn; + QURT_CB_PFN cb_pfn; +} qurt_process_callback_pfn_t; +/** @endcond */ + +/** @cond internal_only */ + +/**@ingroup func_qurt_event_register +Sets the specified bits by mask in the signal passed by the caller. The signal is set +when the client handle indicated by value goes away (at process exit). Multiple clients can register for the signal +to be set. + +@datatypes + +@param[in] type QURT_PROCESS_EXIT is the only event that can be registered for. +@param[in] value Indicates the client handle of the process for which the event is registered. +@param[in] psig Pointer to the signal object to set when the event occurs. +@param[in] mask Mask bits to set in the signal. +@param[out] data Pointer to the variable that receives the exit code of the exiting process. +@param[in] data_size Size of the data variable. + +@return +#QURT_EOK -- Success \n +#QURT_EMEM -- Not enough memory to allocate resources \n +#QURT_EVAL -- Invalid values passed to the API + +@dependencies +None. +*/ +int qurt_event_register(int type, int value, qurt_signal_t *psig, unsigned int mask, void *data, unsigned int data_size); + +/**@ingroup func_qurt_callback_register_onspawn +Allows registering for a callback on spawn of any user process. + +@datatypes +#QURT_SPAWN_PFN + +@param[in] pFn Callback function to call when any user process is spawned. +@param[in] user_data Pointer to the argument that the callback must be called with.
+ + +@return A positive value is a handle to use when deregistering the callback. + Multiple clients can register for a callback on spawn, and some clients might choose to deregister. + + On failure, QURT_EFATAL is returned. + +@dependencies +None. +*/ +int qurt_callback_register_onspawn(QURT_SPAWN_PFN pFn, void *user_data); + +/**@ingroup func_qurt_callback_deregister_onspawn +Allows deregistering a callback on spawn. + +@param[in] callback_handle Handle returned by qurt_callback_register_onspawn. + +@return +#QURT_EOK -- Deregistering was successful + +@dependencies +None. +*/ +int qurt_callback_deregister_onspawn(int callback_handle); + +/**@ingroup func_qurt_process_callback_register +Allows registering for a callback during or after image loading. +Generic callback types: + Functions similarly to qurt_callback_register_onspawn(). The callback is called after the process is + loaded, before the process thread starts. The callback has no return value and no info provided + from the OS. + pFn - QURT_SPAWN_PFN + type - QURT_PROCESS_CB_GENERIC + arg1 - not used + arg2 - not used + arg3 - not used +Note callback types: + The callback is called during process loading: before segment loading (QURT_PROCESS_NOTE_CB_PRE_MAP), + or after segment loading (QURT_PROCESS_NOTE_CB_POST_MAP). The OS provides info to the callback: the info + argument is populated with a pointer to the mapped note corresponding to the callback. + The callback has a return value; the loader fails if the callback returns a value other than QURT_EOK. + pFn - QURT_CB_PFN + type - QURT_PROCESS_NOTE_CB_PRE_MAP or QURT_PROCESS_NOTE_CB_POST_MAP + arg1 - note type (ex: NOTE_TYPE_POOL_INFO, NOTE_TYPE_SEGMENT_INFO, NOTE_TYPE_ARB_INFO) + arg2 - note name + arg3 - not used + +@datatypes + +@param[in] pFn Callback function to call. +@param[in] type Callback type. +@param[in] user_data Pointer to the argument that the callback must be called with. +@param[in] arg1 Argument interpreted by the OS based on the callback type. +@param[in] arg2 Argument interpreted by the OS based on the callback type. +@param[in] arg3 Argument interpreted by the OS based on the callback type (currently not used). + + +@return A positive value is a handle to use when deregistering the callback. + Multiple clients can register for a callback on spawn, and some clients might choose to deregister. + + On failure, QURT_EFATAL is returned. + +@dependencies +None. +*/ +int qurt_process_callback_register(qurt_process_callback_pfn_t pFn, + qurt_process_cb_type_t type, + void *user_data, + qurt_process_callback_arg_t arg1, + qurt_process_callback_arg_t arg2, + qurt_process_callback_arg_t arg3); + + + +/**@ingroup func_qurt_process_callback_deregister +Allows deregistering a callback for image loading. +@param[in] callback_handle Handle returned by qurt_process_callback_register. + +@return +#QURT_EOK -- Deregistering was successful + +@dependencies +None.
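A sketch of the basic process-control flow above: spawn a child ELF suspended, then reap any child with qurt_wait() (the ELF name is illustrative, and QURT_PROCESS_SUSPEND_ON_STARTUP is assumed to come from the process headers included at the top of this file, per the SPAWNN_FLAG alias):

#include "qurt_space.h"

void spawn_and_reap(void) {
    int pid = qurt_spawn_flags("dsp2.elf", QURT_PROCESS_SUSPEND_ON_STARTUP);
    if (pid < 0) {
        return;                        /* negative value is an error code */
    }
    int status = 0;
    int exited = qurt_wait(&status);   /* blocks until any child changes status */
    (void)exited;
    (void)status;                      /* status carries the child's exit() value */
}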
+*/ +int qurt_process_callback_deregister(int callback_handle); +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_SPACE_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_srm_consts.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_srm_consts.h new file mode 100755 index 0000000000000..48a8b6a38c402 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_srm_consts.h @@ -0,0 +1,32 @@ +#ifndef QURT_SRM_CONSTS_H +#define QURT_SRM_CONSTS_H +/** + @file qurt_srm_consts.h + @brief Type definitions for srm + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2020-2021, 2022 by Qualcomm Technologies, Inc. All Rights Reserved +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** @cond */ +#define QURT_SRM_WAKEUP_REQUEST 1U << 0 /**< Value = 1: Send wakeup request to the SRM server. */ +#define QURT_SRM_SET_HANDLE 1U << 1 /**< Value = 2: Set the client handle for a new SRM client. */ +#define QURT_SRM_ALLOC_KERNEL_PAGES 1U << 2 /**< Value = 4: Allocate pages from the kernel VA space. */ +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_SRM_CONSTS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_srm_driver.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_srm_driver.h new file mode 100755 index 0000000000000..5489e3dddbcca --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_srm_driver.h @@ -0,0 +1,140 @@ +#ifndef QURT_SRM_DRIVER_H +#define QURT_SRM_DRIVER_H +/** + @file qurt_srm_driver.h + @brief Definitions, macros, and prototypes used by SRM drivers. + + EXTERNAL FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2021-2023 by Qualcomm Technologies, Inc. All Rights Reserved. + + =============================================================================*/ +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* +|| Define qurt_srm_driver_t structure, which represents +|| the "registration" object for an SRM driver. +*/ +/** @cond internal_only */ +struct _qurt_srm_driver { + const char *name; + qurt_qdi_obj_t *obj; +}; + +typedef struct _qurt_srm_driver qurt_srm_driver_t; + +/* +|| qurt_srm_object_invoke() is an internal equivalent to qurt_qdi_handle_invoke(). +|| It behaves the same, but it takes a QDI object pointer instead of a handle. +*/ + +#define qurt_srm_object_invoke(o,m,...) 
\ + _QDMPASTE(_QDMSOI,_QDMCNT(QDI_HANDLE_LOCAL_CLIENT,o,m,##__VA_ARGS__))(QDI_HANDLE_LOCAL_CLIENT,o,m,##__VA_ARGS__) +#define _QDMSOI3(a,b,c) qurt_srm_oi3(a,b,c) +#define _QDMSOI4(a,b,c,d) qurt_srm_oi4(a,b,c,(int)(d)) +#define _QDMSOI5(a,b,c,d,e) qurt_srm_oi5(a,b,c,(int)(d),(int)(e)) +#define _QDMSOI6(a,b,c,d,e,f) qurt_srm_oi6(a,b,c,(int)(d),(int)(e),(int)(f)) +#define _QDMSOI7(a,b,c,d,e,f,g) qurt_srm_oi7(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g)) +#define _QDMSOI8(a,b,c,d,e,f,g,h) qurt_srm_oi8(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h)) +#define _QDMSOI9(a,b,c,d,e,f,g,h,i) qurt_srm_oi9(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i)) +#define _QDMSOI10(a,b,c,d,e,f,g,h,i,j) qurt_srm_oi10(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j)) +#define _QDMSOI11(a,b,c,d,e,f,g,h,i,j,k) qurt_srm_oi11(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j),(int)(k)) +#define _QDMSOI12(a,b,c,d,e,f,g,h,i,j,k,l) qurt_srm_oi12(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j),(int)(k),(int)(l)) + +int qurt_srm_oi3(int, qurt_qdi_obj_t *, int); +int qurt_srm_oi4(int, qurt_qdi_obj_t *, int, int); +int qurt_srm_oi5(int, qurt_qdi_obj_t *, int, int, int); +int qurt_srm_oi6(int, qurt_qdi_obj_t *, int, int, int, int); +int qurt_srm_oi7(int, qurt_qdi_obj_t *, int, int, int, int, int); +int qurt_srm_oi8(int, qurt_qdi_obj_t *, int, int, int, int, int, int); +int qurt_srm_oi9(int, qurt_qdi_obj_t *, int, int, int, int, int, int, int); +int qurt_srm_oi10(int, qurt_qdi_obj_t *, int, int, int, int, int, int, int, int); +int qurt_srm_oi11(int, qurt_qdi_obj_t *, int, int, int, int, int, int, int, int, int); +int qurt_srm_oi12(int, qurt_qdi_obj_t *, int, int, int, int, int, int, int, int, int, int); + +#define QDI_SRM_INIT 192 + +/* +|| QURT_SRM_DECLARE_DRIVER() declares an SRM driver to the SRM infrastructure. +|| +|| The three arguments are: +|| unique_id -- Unique C identifier, unused but must be a unique global symbol. +|| name -- Name of the driver by which an SRM client attempts to open it. +|| obj -- Pointer to the singleton object of the driver, which handles things such as +|| initialization and QDI_OPEN requests. +*/ + +#define QURT_SRM_DECLARE_DRIVER(unique_id, xname, xobj) \ + __attribute__((section(".srm.rodata.user.main.DECL"))) const qurt_srm_driver_t unique_id = \ + { .name = xname, .obj = xobj } + + +/**@ingroup func_qurt_srm_mapping_create + Creates a memory mapping in the page table with the specified attributes. + + @param[in] client_handle Client handle representing the process for which + the mapping is created. + @param[in] pageno_virt Pointer to the virtual page number. NULL indicates that SRM + chooses the virtual memory. + @param[in] pageno_phys Physical page to use for the mapping. + @param[in] page_count Number of 4 KB pages to map. + @param[in] cache_attr Cache attributes for the mapping. + @param[in] perm Permissions to use for the mapping. + + @return A value greater than 0 is a handle that can be passed to + qdi_close() to remove the mapping. A negative value indicates + an error. + + @dependencies + None. +*/ +int qurt_srm_mapping_create(int client_handle, + unsigned *pageno_virt, + unsigned pageno_phys, + unsigned page_count, + qurt_mem_cache_mode_t cache_attr, + qurt_perm_t perm); + + +/**@ingroup func_qurt_srm_get_pid + Gets the PID for the client_handle that is passed. + + @param[in] client_handle Client handle for which the PID is required.
+ + @return PID of the client. + A negative PID value of -1 is returned in case of error. + + @dependencies + None. +*/ +unsigned qurt_srm_get_pid(int client_handle); + + +/**@ingroup func_qurt_srm_get_thread_id + Gets the thread ID of the client requesting a service from SRM. + + @param[in] None. + + @return Thread ID of the client thread. + + @dependencies + None. +*/ +qurt_thread_t qurt_srm_get_client_thread_id(void); + +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_SRM_DRIVER_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_stid.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_stid.h new file mode 100755 index 0000000000000..379f46aaa4b80 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_stid.h @@ -0,0 +1,73 @@ +#ifndef QURT_STID_H +#define QURT_STID_H +/** + @file qurt_stid.h + Prototypes of software thread identifier (stid) interface APIs. + An stid is an 8-bit identifier that can be assigned to a software thread. + The performance monitor logic uses the stid as a counting match criterion + for maskable events. The stid is also used by the hardware debugger + (ISDB) to match breakpoints. + + EXTERNAL FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2024 Qualcomm Technologies, Inc. + All rights reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + + +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_stid_alloc + Allocates a unique stid. + + @param[in] pid Process identifier + @param[out] stid Pointer to a variable in which to return the stid + + @return + QURT_EOK - Allocation success + QURT_ENORESOURCE - No stid available for allocation + QURT_EINVALID - Invalid input + + @dependencies + None. + */ +int qurt_stid_alloc(unsigned int pid, unsigned int *stid); + +/**@ingroup func_qurt_stid_release + Releases the stid. + + + @param[in] pid Process identifier + @param[in] stid STID to release + + @note1hang + The user must reset the released stid in the process or thread(s) + to the default value (QURT_STID_DEFAULT) before releasing that stid. + + @return + QURT_EOK - Release success + QURT_ENOTALLOWED - Operation not allowed for the pid + QURT_EINVALID - Invalid stid + + @dependencies + None. + */ +int qurt_stid_release(unsigned int pid, unsigned int stid); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_STID_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_thread.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_thread.h new file mode 100755 index 0000000000000..499699e7c72e2 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_thread.h @@ -0,0 +1,1260 @@ +#ifndef QURT_THREAD_H +#define QURT_THREAD_H +/** + @file qurt_thread.h + @brief Prototypes of Thread API + + EXTERNAL FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2018, 2020-2023 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc.
+ +=============================================================================*/ + + +/* The following is for C code only */ +#ifndef __ASSEMBLER__ +#include +#include "qurt_pmu.h" +#include "qurt_api_version.h" +#endif /* __ASSEMBLER__ */ +#include "qurt_consts.h" +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ + + +/* + Bitmask configuration to select DSP hardware threads. + To select all the hardware threads, use #QURT_THREAD_CFG_BITMASK_ALL; + the following then applies: \n + - For QDSP6 V2/V3, all six hardware threads are selected \n + - For QDSP6 V3L, all four hardware threads are selected \n + - For QDSP6 V4, all three hardware threads are selected + */ + +#define QURT_THREAD_CFG_BITMASK_HT0 0x00000001 /**< HT0. */ +#define QURT_THREAD_CFG_BITMASK_HT1 0x00000002 /**< HT1. */ +#define QURT_THREAD_CFG_BITMASK_HT2 0x00000004 /**< HT2. */ +#define QURT_THREAD_CFG_BITMASK_HT3 0x00000008 /**< HT3. */ +#define QURT_THREAD_CFG_BITMASK_HT4 0x00000010 /**< HT4. */ +#define QURT_THREAD_CFG_BITMASK_HT5 0x00000020 /**< HT5. */ +/** @cond rest_reg_dist */ +/** @addtogroup thread_macros +@{ */ +/** @xreflabel{sec:qurt_thread_cfg} */ + +#define QURT_THREAD_CFG_BITMASK_ALL 0x000000ffU /**< Select all the hardware threads. */ +/** @} */ /* end_addtogroup thread_macros */ +/** @endcond */ + +#define QURT_THREAD_CFG_USE_RAM 0x00000000 /**< Use RAM. */ +#define QURT_THREAD_CFG_USE_TCM 0x00000100 /**< Use TCM. */ +/** @cond rest_reg_dist */ +/** @addtogroup thread_macros +@{ */ +#define QURT_THREAD_BUS_PRIO_DISABLED 0 /**< Thread internal bus priority disabled. */ +#define QURT_THREAD_BUS_PRIO_ENABLED 1 /**< Thread internal bus priority enabled. */ +/** @} */ /* end_addtogroup thread_macros */ +/** @endcond */ + +#define QURT_THREAD_AUTOSTACK_DISABLED 0 /**< Thread has autostack v2 feature disabled. */ +#define QURT_THREAD_AUTOSTACK_ENABLED 1 /**< Thread has autostack v2 feature enabled. */ + +/* + Macros for QuRT thread attributes. + */ +#define QURT_HTHREAD_L1I_PREFETCH 0x1 /**< Enables hardware L1 instruction cache prefetching. */ +#define QURT_HTHREAD_L1D_PREFETCH 0x2 /**< Enables hardware L1 data cache prefetching. */ +#define QURT_HTHREAD_L2I_PREFETCH 0x4 /**< Enables hardware L2 instruction cache prefetching. */ +#define QURT_HTHREAD_L2D_PREFETCH 0x8 /**< Enables hardware L2 data cache prefetching. */ +#define QURT_HTHREAD_DCFETCH 0x10 /**< Enables DC fetch to the provided virtual address. + DC fetch indicates to the hardware that a data memory access is likely. + Instructions are dropped when there is high bus utilization. */ +/** @addtogroup thread_macros +@{ */ +/** @xreflabel{hdr:partition_tcm} */ +/* + The value below is used to create legacy QuRT threads by default. + If a thread has this as the detach_state, the thread can be joined + on until it exits. When the default behavior of all QuRT threads can be + changed to JOINABLE (the POSIX default), this legacy behavior can be + removed. +*/ +#define QURT_THREAD_ATTR_CREATE_LEGACY 0U /**< Create a legacy QuRT thread by default. If a thread has this as a detach state, the thread can be joined on until it exits. */ +#define QURT_THREAD_ATTR_CREATE_JOINABLE 1U /**< Create a joinable thread. */ +#define QURT_THREAD_ATTR_CREATE_DETACHED 2U /**< Create a detached thread.
*/ +/** @} */ /* end_addtogroup thread_macros */ + + +#define QURT_THREAD_ATTR_NAME_MAXLEN 16 /**< Maximum name length. */ +#define QURT_THREAD_ATTR_TCB_PARTITION_RAM 0 /**< Creates threads in RAM/DDR. */ +#define QURT_THREAD_ATTR_TCB_PARTITION_TCM 1 /**< Creates threads in TCM. */ +/** @cond rest_reg_dist */ +/** @addtogroup thread_macros +@{ */ +#define QURT_THREAD_ATTR_TCB_PARTITION_DEFAULT QURT_THREAD_ATTR_TCB_PARTITION_RAM /**< Backward compatibility. */ +#define QURT_THREAD_ATTR_PRIORITY_DEFAULT 254 /**< Priority.*/ +#define QURT_THREAD_ATTR_ASID_DEFAULT 0 /**< ASID. */ +#define QURT_THREAD_ATTR_AFFINITY_DEFAULT (-1) /**< Affinity. */ +#define QURT_THREAD_ATTR_BUS_PRIO_DEFAULT 255 /**< Bus priority. */ +#define QURT_THREAD_ATTR_AUTOSTACK_DEFAULT 0 /**< Default autostack v2 disabled thread. */ +#define QURT_THREAD_ATTR_TIMETEST_ID_DEFAULT (-2) /**< Timetest ID. */ +#define QURT_THREAD_ATTR_STID_DEFAULT QURT_STID_DEFAULT /**< STID. */ +#define QURT_THREAD_ATTR_STID_ENABLE 1 /**< Indicate to allocate STID during thread creation. */ + +#define QURT_PRIORITY_FLOOR_DEFAULT 255U /**< Default floor. */ +/** @} */ /* end_addtogroup thread_macros */ + +// Option for suspending thread +#define QURT_THREAD_SUSPEND_SYNCHRONOUS 0x0U // bit#0 +#define QURT_THREAD_SUSPEND_ASYNCHRONOUS 0x1U // bit#0 +#define QURT_THREAD_SUSPEND_KEEP_HMX 0x0U // bit#1 +#define QURT_THREAD_SUSPEND_DETACH_HMX 0x2U // bit#1 + +// Option for resuming thread +#define QURT_THREAD_RESUME_DEFAULT 0x0 + +// Thread property IDs +#define QURT_THREAD_PROPERTY_SUSPENDABLE 0x0U +#define QURT_THREAD_PROPERTY_RESUMABLE 0x1 + +// Thread group +#define QURT_THREAD_DEFAULT_GROUP_ID 0x0U +#define QURT_THREAD_GROUP_ID_MASK 0x3FU + +/** @endcond*/ + + +/* The followings are for C code only */ +#ifndef __ASSEMBLER__ +/*============================================================================= + TYPEDEFS +=============================================================================*/ +/** @addtogroup thread_types +@{ */ +/** @cond rest_reg_dist */ +typedef unsigned int qurt_cache_partition_t; /**< QuRT cache partition type. */ + +#define CCCC_PARTITION 0U /**< Use the CCCC page attribute bits to determine the main or auxiliary partition. */ +#define MAIN_PARTITION 1U /**< Use the main partition. */ +#define AUX_PARTITION 2U /**< Use the auxiliary partition. */ +#define MINIMUM_PARTITION 3U /**< Use the minimum. Allocates the least amount of cache (no-allocate policy possible) for this thread. */ +/** @endcond */ + +/** Thread ID type. */ +typedef unsigned int qurt_thread_t; + +/** @cond rest_reg_dist */ +/** Thread attributes. */ +typedef struct _qurt_thread_attr { + + char name[QURT_THREAD_ATTR_NAME_MAXLEN]; /**< Thread name. */ + unsigned char tcb_partition; /**< Indicates whether the thread TCB resides in RAM or + on chip memory (TCM). */ + unsigned char stid; /**< Software thread ID used to configure the stid register + for profiling purposes. */ + unsigned short priority; /**< Thread priority. */ + unsigned char autostack:1; /**< Autostack v2 enabled thread. */ + unsigned char group_id:6; /**< Group ID. */ + unsigned char reserved:1; /**< Reserved bits. */ + unsigned char bus_priority; /**< Internal bus priority. */ + unsigned short timetest_id; /**< Timetest ID. */ + unsigned int stack_size; /**< Thread stack size. */ + void *stack_addr; /**< Pointer to the stack address base. The range of the stack is + (stack_addr, stack_addr+stack_size-1). */ + unsigned short detach_state; /**< Detach state of the thread. 
*/
+
+} qurt_thread_attr_t;
+/** @endcond */
+
+/** @cond rest_reg_dist */
+/** Dynamic TLS attributes. */
+typedef struct qurt_tls_info {
+ unsigned int module_id; /**< Module ID of the loaded dynamically linked library. */
+ unsigned int tls_start; /**< Start address of the TLS data. */
+ unsigned int tls_data_end; /**< End address of the TLS RW data. */
+ unsigned int tls_end; /**< End address of the TLS data. */
+} qurt_tls_info;
+/** @endcond */
+
+/** @} */ /* end_addtogroup thread_types */
+
+/*=============================================================================
+ FUNCTIONS
+=============================================================================*/
+/**@ingroup func_qurt_thread_attr_init
+ Initializes the structure used to set the thread attributes when a thread is created.
+ After an attribute structure is initialized, explicitly set the individual attributes in the structure
+ using the thread attribute operations.
+
+ The initialize operation sets the following default attribute values: \n
+ - Name -- NULL string \n
+ - TCB partition -- QURT_THREAD_ATTR_TCB_PARTITION_DEFAULT \n
+ - Priority -- QURT_THREAD_ATTR_PRIORITY_DEFAULT \n
+ - Autostack -- QURT_THREAD_ATTR_AUTOSTACK_DEFAULT \n
+ - Bus priority -- QURT_THREAD_ATTR_BUS_PRIO_DEFAULT \n
+ - Timetest ID -- QURT_THREAD_ATTR_TIMETEST_ID_DEFAULT \n
+ - stack_size -- 0 \n
+ - stack_addr -- NULL \n
+ - Detach state -- #QURT_THREAD_ATTR_CREATE_LEGACY \n
+ - STID -- #QURT_THREAD_ATTR_STID_DEFAULT
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in,out] attr Pointer to the thread attribute structure.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline void qurt_thread_attr_init (qurt_thread_attr_t *attr)
+{
+
+ attr->name[0] = '\0';
+ attr->tcb_partition = QURT_THREAD_ATTR_TCB_PARTITION_DEFAULT;
+ attr->priority = QURT_THREAD_ATTR_PRIORITY_DEFAULT;
+ attr->autostack = QURT_THREAD_ATTR_AUTOSTACK_DEFAULT; /* Default attribute for autostack v2 */
+ attr->bus_priority = QURT_THREAD_ATTR_BUS_PRIO_DEFAULT;
+ attr->timetest_id = (unsigned short)QURT_THREAD_ATTR_TIMETEST_ID_DEFAULT;
+ attr->stack_size = 0;
+ attr->stack_addr = NULL;
+ attr->detach_state = QURT_THREAD_ATTR_CREATE_LEGACY;
+ attr->stid = QURT_THREAD_ATTR_STID_DEFAULT;
+ attr->group_id = QURT_THREAD_DEFAULT_GROUP_ID;
+}
+
+/**@ingroup func_qurt_thread_attr_set_name
+ Sets the thread name attribute.\n
+ This function specifies the name used by a thread.
+ Thread names identify a thread during debugging or profiling.
+ The maximum name length is 16 characters. \n
+ @note1hang Thread names differ from the kernel-generated thread identifiers used to
+ specify threads in the API thread operations.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in,out] attr Pointer to the thread attribute structure.
+ @param[in] name Pointer to the character string containing the thread name.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline void qurt_thread_attr_set_name (qurt_thread_attr_t *attr, const char *name)
+{
+ strlcpy (attr->name, name, QURT_THREAD_ATTR_NAME_MAXLEN);
+ attr->name[QURT_THREAD_ATTR_NAME_MAXLEN - 1] = '\0';
+}
+
+
+/**@ingroup func_qurt_thread_attr_set_tcb_partition
+ Sets the thread TCB partition attribute.
+ Specifies the memory type where the TCB of a thread is allocated.
+ TCBs can be allocated in RAM or TCM/LPM.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in,out] attr Pointer to the thread attribute structure.
+ @param[in] tcb_partition TCB partition.
Values:\n
+ - 0 -- TCB resides in RAM \n
+ - 1 -- TCB resides in TCM/LPM @tablebulletend
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline void qurt_thread_attr_set_tcb_partition (qurt_thread_attr_t *attr, unsigned char tcb_partition)
+{
+ attr->tcb_partition = tcb_partition;
+}
+
+/**@ingroup func_qurt_thread_attr_set_priority
+ Sets the thread priority to be assigned to a thread.
+ Thread priorities are specified as numeric values in the range 1 to 254, with 1 representing
+ the highest priority.
+ Priorities 0 and 255 are used internally by the kernel for special purposes.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in,out] attr Pointer to the thread attribute structure.
+ @param[in] priority Thread priority.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline void qurt_thread_attr_set_priority (qurt_thread_attr_t *attr, unsigned short priority)
+{
+ attr->priority = priority;
+}
+
+/**@ingroup func_qurt_thread_attr_set_detachstate
+ Sets the detach state with which the thread is created.
+ The detach state is either joinable or detached, specified by the following values:
+ - #QURT_THREAD_ATTR_CREATE_JOINABLE \n
+ - #QURT_THREAD_ATTR_CREATE_DETACHED \n
+
+ When a detached thread is created (QURT_THREAD_ATTR_CREATE_DETACHED), its thread
+ ID and other resources are reclaimed as soon as the thread exits. When a joinable thread
+ is created (QURT_THREAD_ATTR_CREATE_JOINABLE), it is assumed that some
+ thread waits to join on it using a qurt_thread_join() call.
+ By default, the detach state is #QURT_THREAD_ATTR_CREATE_LEGACY. With this state,
+ another thread can join on the thread before it exits, but the exiting thread does not
+ wait for another thread to join.
+
+ @note1hang For a joinable thread (QURT_THREAD_ATTR_CREATE_JOINABLE), it is very
+ important that some thread joins on it after it terminates; otherwise
+ the resources of that thread are not reclaimed, causing memory leaks.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in,out] attr Pointer to the thread attribute structure.
+ @param[in] detachstate Thread detach state.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline void qurt_thread_attr_set_detachstate (qurt_thread_attr_t *attr, unsigned short detachstate)
+{
+ if(detachstate == QURT_THREAD_ATTR_CREATE_JOINABLE || detachstate == QURT_THREAD_ATTR_CREATE_DETACHED){
+ attr->detach_state = detachstate;
+ }
+}
+
+
+/**@ingroup func_qurt_thread_attr_set_timetest_id
+ Sets the thread timetest attribute.\n
+ Specifies the timetest identifier used by a thread.
+
+ Timetest identifiers identify a thread during debugging or profiling. \n
+ @note1hang Timetest identifiers differ from the kernel-generated thread identifiers used to
+ specify threads in the API thread operations.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in,out] attr Pointer to the thread attribute structure.
+ @param[in] timetest_id Timetest identifier value.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+static inline void qurt_thread_attr_set_timetest_id (qurt_thread_attr_t *attr, unsigned short timetest_id)
+{
+ attr->timetest_id = timetest_id;
+}
+
+/**@ingroup func_qurt_thread_attr_set_stack_size
+ @xreflabel{sec:set_stack_size}
+ Sets the thread stack size attribute.\n
+ Specifies the size of the memory area to be used for the call stack of a thread.
+
+ The thread stack address (Section @xref{sec:set_stack_addr}) and stack size specify the memory area used as a
+ call stack for the thread.
The user is responsible for allocating the memory area used for
+ the stack.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in,out] attr Pointer to the thread attribute structure.
+ @param[in] stack_size Size (in bytes) of the thread stack.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+
+static inline void qurt_thread_attr_set_stack_size (qurt_thread_attr_t *attr, unsigned int stack_size)
+{
+ attr->stack_size = stack_size;
+}
+
+/**@ingroup func_qurt_thread_attr_set_stack_size2
+ @xreflabel{sec:set_stack_size2}
+ Sets the thread stack size attribute for island threads that require a larger guest OS stack size than the stack size
+ defined in the configuration XML.\n
+ Specifies the size of the memory area to be used for the call stack of an island thread in User and Guest mode.
+
+ The thread stack address (Section @xref{sec:set_stack_addr}) and stack size specify the memory area used as a
+ call stack for the thread. The user is responsible for allocating the memory area used for
+ the stack.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in,out] attr Pointer to the thread attribute structure.
+ @param[in] user_stack_size Size (in bytes) of the stack usage in User mode.
+ @param[in] root_stack_size Size (in bytes) of the stack usage in Guest mode.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline void qurt_thread_attr_set_stack_size2 (qurt_thread_attr_t *attr, unsigned short user_stack_size, unsigned short root_stack_size)
+{
+ union qurt_thread_stack_info {
+ unsigned int raw_size;
+ struct {
+ unsigned short user_stack;
+ unsigned short root_stack;
+ };
+ } user_root_stack_size;
+
+ user_root_stack_size.user_stack = user_stack_size;
+ user_root_stack_size.root_stack = root_stack_size;
+
+ attr->stack_size = user_root_stack_size.raw_size;
+}
+
+/**@ingroup func_qurt_thread_attr_set_stack_addr
+ @xreflabel{sec:set_stack_addr}
+ Sets the thread stack address attribute. \n
+ Specifies the base address of the memory area to be used for the call stack of a thread.
+
+ stack_addr must contain an address value that is 8-byte aligned.
+
+ The thread stack address and stack size (Section @xref{sec:set_stack_size}) specify the memory area used as a
+ call stack for the thread. \n
+ @note1hang The user is responsible for allocating the memory area used for the thread
+ stack. The memory area must be large enough to contain the stack that the thread
+ creates.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in,out] attr Pointer to the thread attribute structure.
+ @param[in] stack_addr Pointer to the 8-byte aligned address of the thread stack.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline void qurt_thread_attr_set_stack_addr (qurt_thread_attr_t *attr, void *stack_addr)
+{
+ attr->stack_addr = stack_addr;
+}
+
+/**@ingroup func_qurt_thread_attr_set_bus_priority
+ Sets the internal bus priority state in the Hexagon core for this software thread attribute.
+ Memory requests generated by a thread with bus priority enabled are
+ given priority over requests generated by a thread with bus priority disabled.
+ Bus priority is disabled by default.
+
+ @note1hang Sets the internal bus priority for Hexagon processor version V60 or greater.
+ The priority is not propagated to the bus fabric.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in] attr Pointer to the thread attribute structure.
+
+ @param[in] bus_priority Enabling flag. Values: \n
+ - #QURT_THREAD_BUS_PRIO_DISABLED \n
+ - #QURT_THREAD_BUS_PRIO_ENABLED @tablebulletend
+
+ @return
+ None.
+
+ @dependencies
+ None.
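+
+ A minimal usage sketch (illustrative only):
+
+ @code
+ qurt_thread_attr_t attr;
+ qurt_thread_attr_init (&attr);
+ // Give this thread's memory requests internal bus priority.
+ qurt_thread_attr_set_bus_priority (&attr, QURT_THREAD_BUS_PRIO_ENABLED);
+ @endcode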
+*/
+static inline void qurt_thread_attr_set_bus_priority ( qurt_thread_attr_t *attr, unsigned short bus_priority)
+{
+ attr->bus_priority = (unsigned char)bus_priority;
+}
+
+/**@ingroup func_qurt_thread_attr_set_autostack
+ Enables the autostack v2 feature in the thread attributes.
+
+ When autostack is enabled by the subsystem and an autostack-enabled
+ thread takes a framelimit exception, the kernel allocates more stack
+ for the thread and returns to normal execution.
+
+ If autostack is not enabled by the subsystem, or is not enabled
+ for the thread, the framelimit exception is fatal.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in] attr Pointer to the thread attribute structure.
+ @param[in] autostack Autostack enable or disable flag. Values: \n
+ - #QURT_THREAD_AUTOSTACK_DISABLED \n
+ - #QURT_THREAD_AUTOSTACK_ENABLED @tablebulletend
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline void qurt_thread_attr_set_autostack ( qurt_thread_attr_t *attr, unsigned short autostack)
+{
+ attr->autostack = (unsigned char)autostack;
+}
+/**@ingroup qurt_thread_attr_enable_stid
+ Sets the STID in the thread attributes.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in] attr Pointer to the thread attribute structure.
+ @param[in] enable_stid STID to set. Values: \n
+ - #QURT_THREAD_ATTR_STID_DEFAULT (0) -- Default STID. \n
+ - #QURT_THREAD_ATTR_STID_ENABLE (1) -- QuRT assigns an STID that is not already in use. \n
+ - 2 through 255 -- User-provided STID. @tablebulletend
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline void qurt_thread_attr_enable_stid ( qurt_thread_attr_t *attr, char enable_stid)
+{
+ if (enable_stid != '\0') {
+ attr->stid = enable_stid;
+ }
+ else
+ {
+ attr->stid = QURT_THREAD_ATTR_STID_DEFAULT;
+ }
+}
+
+/**@ingroup func_qurt_thread_attr_set_stid
+ Sets the STID thread attribute.
+ The default STID value is #QURT_THREAD_ATTR_STID_DEFAULT.
+
+ @note1hang When a thread is created with a non-default STID,
+ the STID set in the thread attribute is assigned to the thread.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in] attr Pointer to the thread attribute structure.
+ @param[in] stid STID to set for a thread.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline void qurt_thread_attr_set_stid( qurt_thread_attr_t *attr, unsigned int stid){
+ attr->stid = stid;
+}
+
+/**@ingroup func_qurt_thread_attr_set_group_id
+ Sets the group ID in the thread attributes.
+ The primordial (first) thread has group ID 0.
+ If a new thread is created without assigning a group_id, it
+ inherits the group ID of its parent thread.
+
+ @note1hang
+ 1) The group ID can only be set before creating a thread; it cannot be
+ changed after the thread is created.
+ 2) If a non-activated group_id is passed, thread creation fails.
+ 3) Only a thread with group ID 0 can set the group ID of its child threads.
+ 4) If a thread with a nonzero group ID sets the group ID of its child threads,
+ QuRT ignores the parameter and the child threads inherit the parent
+ thread's group ID. However, if the passed group ID is not activated, thread creation
+ still fails.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in] attr Pointer to the thread attribute structure.
+ @param[in] group_id Group identifier. The valid range is 0 to 63.
+
+ @return
+ None.
+
+ @dependencies
+ None.
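+
+ A minimal sketch (illustrative only; assumes group 5 has been activated):
+
+ @code
+ qurt_thread_attr_t attr;
+ qurt_thread_attr_init (&attr);
+ qurt_thread_attr_set_group_id (&attr, 5U);
+ @endcode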
+*/
+static inline void qurt_thread_attr_set_group_id(qurt_thread_attr_t *attr, unsigned int group_id)
+{
+ attr->group_id = group_id & QURT_THREAD_GROUP_ID_MASK;
+}
+
+/**@ingroup func_qurt_thread_set_autostack
+ Sets autostack enable in the TCB.
+
+ @param[in] ugp Pointer to the UGP.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+
+void qurt_thread_set_autostack(void *);
+
+
+/**@ingroup func_qurt_thread_get_name
+ Gets the thread name of the current thread.\n
+ Returns the thread name of the current thread.
+ Thread names are assigned to threads as thread attributes (see qurt_thread_attr_set_name()). Thread names
+ identify a thread during debugging or profiling.
+
+ @param[out] name Pointer to a character string, which specifies the address where the returned thread name is stored.
+ @param[in] max_len Maximum length of the character string that can be returned.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+void qurt_thread_get_name (char *name, unsigned char max_len);
+
+/**@ingroup func_qurt_thread_create
+ @xreflabel{hdr:qurt_thread_create}
+ Creates a thread with the specified attributes, and makes it executable.
+
+ @datatypes
+ #qurt_thread_t \n
+ #qurt_thread_attr_t
+
+ @param[out] thread_id Returns a pointer to the thread identifier if the thread was
+ successfully created.
+ @param[in] attr Pointer to the initialized thread attribute structure that specifies
+ the attributes of the created thread.
+ @param[in] entrypoint C function pointer, which specifies the main function of the thread.
+ @param[in] arg Pointer to a thread-specific argument structure.
+
+ @return
+ #QURT_EOK -- Thread created. \n
+ #QURT_EFAILED -- Thread not created.
+
+ @dependencies
+ None.
+ */
+int qurt_thread_create (qurt_thread_t *thread_id, qurt_thread_attr_t *attr, void (*entrypoint) (void *), void *arg);
+
+/**@ingroup func_qurt_thread_stop
+ Stops the current thread, frees the kernel TCB, and yields to the next highest ready thread.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+void qurt_thread_stop(void);
+
+/** @cond internal_only */
+/**@ingroup func_qurt_thread_resume
+ When a demand-loading paging solution is enabled, this function
+ resumes the execution of a thread that was suspended due to
+ a page miss.
+
+ @param[in] thread_id Thread identifier.
+
+ @return
+ #QURT_EOK -- Thread successfully resumed. \n
+ #QURT_EFATAL -- Resume operation failed.
+
+ @dependencies
+ None.
+ */
+int qurt_thread_resume(unsigned int thread_id);
+/** @endcond */
+
+/**@ingroup func_qurt_thread_get_id
+ Gets the identifier of the current thread.\n
+ Returns the thread identifier of the current thread.
+
+ @return
+ Thread identifier -- Identifier of the current thread.
+
+ @dependencies
+ None.
+ */
+qurt_thread_t qurt_thread_get_id (void);
+
+
+/**@ingroup func_qurt_thread_get_l2cache_partition
+ Returns the current value of the L2 cache partition assigned to the caller thread.\n
+
+ @return
+ Value of the #qurt_cache_partition_t data type.
+
+ @dependencies
+ None.
+ */
+qurt_cache_partition_t qurt_thread_get_l2cache_partition (void);
+
+/**@ingroup func_qurt_thread_set_timetest_id
+ Sets the timetest identifier of the current thread.
+ Timetest identifiers identify a thread during debugging or profiling.\n
+ @note1hang Timetest identifiers differ from the kernel-generated thread identifiers used to
+ specify threads in the API thread operations.
+
+ @param[in] tid Timetest identifier.
+
+ @return
+ None.
+
+ @dependencies
+ None.
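+
+ For illustration, a hedged end-to-end sketch that sets a timetest ID
+ together with the other attributes before creating and joining a thread
+ (the stack size, priority, and ID values below are placeholders):
+
+ @code
+ static char stack[4096] __attribute__((aligned(8)));
+
+ static void worker (void *arg)
+ {
+     (void)arg;            // thread body goes here
+     qurt_thread_exit (0); // wakes any thread joined on this one
+ }
+
+ void spawn_worker (void)
+ {
+     qurt_thread_attr_t attr;
+     qurt_thread_t tid;
+     int status;
+
+     qurt_thread_attr_init (&attr);
+     qurt_thread_attr_set_name (&attr, "worker");
+     qurt_thread_attr_set_stack_addr (&attr, stack);
+     qurt_thread_attr_set_stack_size (&attr, sizeof (stack));
+     qurt_thread_attr_set_priority (&attr, 100);
+     qurt_thread_attr_set_timetest_id (&attr, 42);
+     if (qurt_thread_create (&tid, &attr, worker, NULL) == QURT_EOK) {
+         (void) qurt_thread_join (tid, &status);
+     }
+ }
+ @endcode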
+ */
+void qurt_thread_set_timetest_id (unsigned short tid);
+
+/**@ingroup func_qurt_thread_set_cache_partition
+ Sets the cache partition for the current thread. This function uses the qurt_cache_partition_t type
+ to select the cache partition of the current thread for the L1 Icache, L1 Dcache, and L2 cache.
+
+ @datatypes
+ #qurt_cache_partition_t
+
+ @param[in] l1_icache L1 Icache partition.
+ @param[in] l1_dcache L1 Dcache partition.
+ @param[in] l2_cache L2 cache partition.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+void qurt_thread_set_cache_partition(qurt_cache_partition_t l1_icache, qurt_cache_partition_t l1_dcache, qurt_cache_partition_t l2_cache);
+
+
+/**@ingroup func_qurt_thread_get_timetest_id
+ Gets the timetest identifier of the current thread.\n
+ Returns the timetest identifier of the current thread.\n
+ Timetest identifiers identify a thread during debugging or profiling. \n
+ @note1hang Timetest identifiers differ from the kernel-generated thread identifiers used to
+ specify threads in the API thread operations.
+
+ @return
+ Integer -- Timetest identifier.
+
+ @dependencies
+ None.
+ */
+unsigned short qurt_thread_get_timetest_id (void);
+
+/**@ingroup func_qurt_thread_exit
+ @xreflabel{sec:qurt_thread_exit}
+ Stops the current thread, awakens threads joined to it, then destroys the stopped
+ thread.
+
+ Threads that are suspended on the current thread (by performing a thread join,
+ Section @xref{sec:thread_join}) are awakened and passed a user-defined status value
+ that indicates the status of the stopped thread.
+
+ @note1hang Exit must be called in the context of the thread to be stopped.
+
+ @param[in] status User-defined thread exit status value.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+void qurt_thread_exit(int status);
+
+/**@ingroup func_qurt_thread_join
+ @xreflabel{sec:thread_join}
+ Waits for a specified thread to finish; the specified thread is another thread within
+ the same process.
+ The caller thread is suspended until the specified thread exits. When the specified thread
+ exits, the caller thread is awakened. \n
+ @note1hang If the specified thread has already exited, this function returns immediately
+ with the result value #QURT_ENOTHREAD. \n
+ @note1cont Two threads cannot call qurt_thread_join to wait for the same thread to finish.
+ If this occurs, QuRT generates an exception (see Section @xref{sec:exceptionHandling}).
+
+ @param[in] tid Thread identifier.
+ @param[out] status Destination variable for the thread exit status. Returns an application-defined
+ value that indicates the termination status of the specified thread.
+
+ @return
+ #QURT_ENOTHREAD -- Thread has already exited. \n
+ #QURT_EOK -- Thread successfully joined with valid status value.
+
+ @dependencies
+ None.
+ */
+int qurt_thread_join(unsigned int tid, int *status);
+
+/**@ingroup qurt_thread_detach
+ @xreflabel{sec:thread_detach}
+ Detaches a joinable thread. The specified thread is another thread within the
+ same process. The thread must have been created as a joinable thread; only joinable threads
+ can be detached.
+ If a joinable thread is detached, it finishes execution and exits.
+
+ @param[in] tid Thread identifier.
+
+ @return
+ #QURT_ENOTHREAD -- Thread specified by TID does not exist. \n
+ #QURT_EOK -- Thread successfully detached.
+
+ @dependencies
+ None.
+ */
+int qurt_thread_detach(unsigned int tid);
+
+
+/**@ingroup func_qurt_thread_get_priority
+ Gets the priority of the specified thread. \n
+ Returns the thread priority of the specified thread.\n
+ Thread priorities are specified as numeric values in a range as large as 1 through 254, with lower
+ values representing higher priorities; 1 represents the highest possible thread priority. \n
+ Priorities 0 and 255 are used internally by the kernel for special purposes.
+
+ @note1hang QuRT can be configured to have different priority ranges.
+
+ @datatypes
+ #qurt_thread_t
+
+ @param[in] threadid Thread identifier.
+
+ @return
+ -1 -- Invalid thread identifier. \n
+ 1 through 254 -- Thread priority value.
+
+ @dependencies
+ None.
+ */
+int qurt_thread_get_priority (qurt_thread_t threadid);
+
+/**@ingroup func_qurt_thread_set_priority
+ Sets the priority of the specified thread.\n
+ Thread priorities are specified as numeric values in a range as large as 1 through 254, with lower
+ values representing higher priorities; 1 represents the highest possible thread priority.
+ Priorities 0 and 255 are used internally by the kernel for special purposes.
+
+ @note1hang QuRT can be configured to have different priority ranges. For more
+ information, see Section @xref{sec:AppDev}.
+
+ @datatypes
+ #qurt_thread_t
+
+ @param[in] threadid Thread identifier.
+ @param[in] newprio New thread priority value.
+
+ @return
+ 0 -- Priority successfully set. \n
+ -1 -- Invalid thread identifier.
+
+ @dependencies
+ None.
+ */
+int qurt_thread_set_priority (qurt_thread_t threadid, unsigned short newprio);
+
+
+
+/**@ingroup func_qurt_thread_attr_get
+ Gets the attributes of the specified thread.
+
+ @datatypes
+ #qurt_thread_t \n
+ #qurt_thread_attr_t
+
+ @param[in] thread_id Thread identifier.
+ @param[out] attr Pointer to the destination structure for the thread attributes.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EINVALID -- Invalid argument.
+
+ @dependencies
+ None.
+ */
+int qurt_thread_attr_get (qurt_thread_t thread_id, qurt_thread_attr_t *attr);
+
+
+
+/**@ingroup func_qurt_thread_get_tls_base
+ Gets the base address of the thread local storage (TLS) of a dynamically loaded module
+ for the current thread.
+
+ @datatypes
+ #qurt_tls_info
+
+ @param[in] info Pointer to the TLS information for a module.
+
+ @return
+ Pointer to the TLS object for the dynamically loaded module. \n
+ NULL -- TLS information is invalid.
+
+ @dependencies
+ None.
+ */
+void * qurt_thread_get_tls_base(qurt_tls_info* info);
+
+/**@ingroup func_qurt_thread_pktcount_get
+ Gets the PKTCOUNT of a specified thread.
+
+ @datatypes
+ #qurt_thread_t
+
+ @param[in] thread_id Thread identifier.
+
+ @return
+ PKTCOUNT of the specified thread.
+
+ @dependencies
+ None.
+ */
+
+long long int qurt_thread_pktcount_get (qurt_thread_t thread_id);
+
+/**@ingroup func_qurt_thread_pktcount_set
+ Sets the PKTCOUNT for the current QuRT thread.
+
+ @return
+ Value to which the PKTCOUNT is set.
+
+ @dependencies
+ None.
+ */
+
+long long int qurt_thread_pktcount_set (long long int);
+
+/**@ingroup func_qurt_thread_stid_get
+ Gets the STID of a specified thread.
+
+ @datatypes
+ #qurt_thread_t
+
+ @param[in] thread_id Thread identifier.
+
+ @return
+ STID of the specified thread.
+
+ @dependencies
+ None.
+ */
+
+char qurt_thread_stid_get(qurt_thread_t thread_id);
+
+/**@ingroup func_qurt_thread_stid_get2
+ Returns the STID set for a thread.
+
+ @param[in] thread_id Thread identifier.
+ @param[out] stid Pointer to a variable in which the STID is returned.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_ENOTALLOWED -- Operation not allowed for the thread. \n
+ #QURT_EINVALID -- Invalid input.
+
+ @dependencies
+ None.
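+
+ A minimal sketch (illustrative only):
+
+ @code
+ unsigned int stid;
+ if (qurt_thread_stid_get2 (qurt_thread_get_id (), &stid) == QURT_EOK) {
+     // stid now holds the software thread ID of the current thread
+ }
+ @endcode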
+ */
+int qurt_thread_stid_get2(unsigned int thread_id, unsigned int *stid);
+
+/**@ingroup func_qurt_thread_stid_set
+ Sets the STID for the current thread.
+
+ @param[in] stid STID value to set.
+
+ @return
+ #QURT_EOK -- STID successfully set. \n
+ #QURT_EFAILED -- STID not set.
+
+ @dependencies
+ None.
+ */
+
+int qurt_thread_stid_set(char stid);
+
+/**@ingroup qurt_thread_stid_set2
+ Sets the STID for a specified thread.
+
+ @param[in] thread_id Thread identifier.
+ @param[in] stid STID to set for the thread.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation. \n
+ #QURT_EVAL -- Failure because of invalid inputs.
+
+ @dependencies
+ None.
+*/
+int qurt_thread_stid_set2(unsigned int thread_id, unsigned int stid);
+
+/** @cond internal_only */
+/**@ingroup func_qurt_thread_get_running_ids
+ Returns the thread IDs of the running threads in the system; use only during fatal error handling.
+
+ @datatypes
+ #qurt_thread_t
+
+ @param[in,out] thread_ids Array of thread identifiers of size #QURT_MAX_HTHREAD_LIMIT + 1.
+
+ @return
+ #QURT_EINVALID -- Incorrect argument. \n
+ #QURT_ENOTALLOWED -- API not called during error handling. \n
+ #QURT_EOK -- Success; returns a NULL-terminated array of thread IDs.
+
+ @dependencies
+ None.
+ */
+int qurt_thread_get_running_ids(qurt_thread_t *);
+/** @endcond */
+
+
+/**@ingroup func_qurt_thread_get_thread_id
+ Gets the thread identifier of the thread with the matching name in the same process
+ as the caller.
+
+ @datatypes
+ #qurt_thread_t
+
+ @param[out] thread_id Pointer to the thread identifier.
+ @param[in] name Pointer to the name of the thread.
+
+ @return
+ #QURT_EINVALID -- No thread with a matching name in the process of the caller. \n
+ #QURT_EOK -- Success.
+
+ @dependencies
+ None.
+ */
+int qurt_thread_get_thread_id (qurt_thread_t *thread_id, char *name);
+
+/**@ingroup func_qurt_sleep
+ Suspends the current thread for the specified amount of time.
+
+ @note1hang Because QuRT timers are deferrable, this call is guaranteed to block
+ at least for the specified amount of time. If power collapse is
+ enabled, the maximum amount of time this call can block depends on
+ the earliest wakeup from power collapse past the specified duration.
+
+ @param[in] duration Duration (in microseconds) for which the thread is suspended.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+void qurt_sleep (unsigned long long int duration);
+
+
+/**@ingroup func_qurt_system_set_priority_floor
+ Sets a priority floor to move threads with thread priority lower than the floor out of the running state.
+ Running threads with thread priority lower than the priority floor are moved into the kernel ready queue, and they
+ are not scheduled to run while their thread priority is lower than the floor.
+ The caller should later reset the priority floor back to the default value of #QURT_PRIORITY_FLOOR_DEFAULT.
+ Threads in the kernel ready queue are scheduled to run when their thread priority is higher than the floor.
+
+ The priority floor is set and associated with the user process of the caller. When the caller gets into QuRTOS and
+ sets a new floor, the new floor is associated with its original user process, not the QuRTOS process.
+ The floor associated with the user process is reset when the user process exits or is killed, but not
+ when the user thread of the caller exits.
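+
+ A minimal usage sketch (illustrative only; the floor value is a placeholder
+ and must not be higher than the caller's own priority):
+
+ @code
+ // Raise the floor so that lower-priority threads are not scheduled.
+ if (qurt_system_set_priority_floor (200U) == QURT_EOK) {
+     /* ... latency-sensitive work ... */
+     (void) qurt_system_set_priority_floor (QURT_PRIORITY_FLOOR_DEFAULT);
+ }
+ @endcode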
+
+ The priority floor cannot be set to a priority higher than the thread priority of the caller.
+
+ The priority floor cannot be set to a priority lower than the default #QURT_PRIORITY_FLOOR_DEFAULT system floor.
+
+ This function is not supported in Island mode.
+
+ After the system floor is set above #QURT_PRIORITY_FLOOR_DEFAULT, power collapse is skipped, and the sleep task
+ is not scheduled to run.
+
+ @param[in] priority_floor Priority floor.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_ENOTALLOWED -- Floor setting is not allowed
+
+ @dependencies
+ None.
+ */
+int qurt_system_set_priority_floor (unsigned int priority_floor);
+
+
+/**@ingroup func_qurt_thread_suspend_thread
+ Suspends a QuRT thread given its thread identifier.
+ The target thread can be in a signed user process or an unsigned user process.
+ The caller thread can be a thread from the same user process as the target thread, or from its parent process.
+ After the target thread is suspended, the kernel does not schedule it to run until it is resumed later.
+
+ If the target thread is set as non-suspendable, this function call returns an error without suspending
+ the target thread.
+
+ If the target thread is already suspended, this function call returns success to confirm
+ that the target thread is suspended.
+
+ If the target thread is in a secure user process, or CPZ process, this function call returns an error without
+ suspending the target thread.
+
+ If the target thread is running in the guest OS/root process via a QDI call, this function call does not suspend
+ the target thread in the guest OS, but marks the target thread as suspend-pending. The target thread is
+ suspended when it exits the guest OS, before executing the first instruction in the user process.
+ In this case, the function returns success even with the #QURT_THREAD_SUSPEND_SYNCHRONOUS option, while the target
+ thread can run in the guest OS, and is suspended when exiting the guest OS.
+
+ QuRT debug monitor threads that are in a user process are non-suspendable. This function does not suspend
+ those threads.
+
+ @param[in] thread_id Thread identifier.
+ @param[in] option Optional argument; multiple options can be ORed. \n
+ #QURT_THREAD_SUSPEND_SYNCHRONOUS (default) -- synchronous function call;
+ the function returns after the thread is completely suspended. \n
+ #QURT_THREAD_SUSPEND_ASYNCHRONOUS -- asynchronous function call; the function returns
+ after the kernel acts to suspend the target thread. The target thread
+ might still be running before it is completely suspended. \n
+ #QURT_THREAD_SUSPEND_KEEP_HMX (default) -- keep the HMX attachment on the target thread
+ if it locks the HMX with qurt_hmx_lock(). In this case, the HMX cannot be reused by other threads. \n
+ #QURT_THREAD_SUSPEND_DETACH_HMX -- detach the HMX from the target thread if it locks the HMX with qurt_hmx_lock().
+ Later, when the target thread resumes, the HMX is re-attached to the thread. Note that this option is only
+ supported for a caller from the same user process as the target thread, not for a caller from the parent
+ process of the target thread or other processes. With the HMX detach option, QuRT does not save the HMX
+ context, so the HMX context state is lost. It is the responsibility of the caller to ensure HMX operations
+ and HMX context state saving when calling qurt_thread_suspend_thread() with the HMX detach option.
+ If a thread from another process uses this detach option, #QURT_EHMXNOTDETACHABLE is returned; in this
+ case, if the caller is qualified to suspend the target thread, the target thread is moved to the suspended
+ state without the HMX detached.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EINVALID -- Failure because of an invalid thread_id input \n
+ #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, in a secure process/CPZ process. \n
+ #QURT_EHMXNOTDETACHABLE -- Failure because the HMX is not detachable from the target thread.
+
+ @dependencies
+ None.
+ */
+int qurt_thread_suspend_thread (unsigned int thread_id, unsigned int option);
+
+
+/**@ingroup func_qurt_thread_resume_thread
+ Resumes a QuRT thread given its thread identifier.
+ The target thread can be in a signed user process or an unsigned user process.
+ The caller thread can be a thread from the same user process as the target thread, or from its parent
+ process. After the target thread resumes, the kernel scheduler can schedule the thread to run based on
+ the thread priority.
+
+ This function takes an option argument; currently the only option is
+ #QURT_THREAD_RESUME_DEFAULT, which resumes the target thread in the default way.
+
+ By default, this is an asynchronous function. The function returns after the kernel moves the
+ target thread from the suspended state to the runnable state. The thread is scheduled to run based on its
+ thread priority.
+
+ If the target thread is set as non-resumable, this function call does not resume the target thread.
+
+ If the target thread has already resumed, this function confirms that the target thread is resumed
+ by returning success.
+
+ If the target thread is in a secure user process or CPZ process, this function call returns an error without
+ resuming the target thread.
+
+ If the target thread runs in the guest OS/root process via a QDI call, this function call clears the
+ suspend-pending mark on the target thread, and the target thread is not suspended when it exits the
+ guest OS.
+
+ @param[in] thread_id Thread identifier.
+ @param[in] option Optional argument, #QURT_THREAD_RESUME_DEFAULT, which resumes the target thread.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EINVALID -- Failure because of an invalid thread_id input \n
+ #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, in a secure process/CPZ process. \n
+ #QURT_EHMXNOTAVAIL -- Failure because, when resuming an HMX thread, the HMX is not available/free for the HMX thread to resume.
+
+ @dependencies
+ None.
+ */
+int qurt_thread_resume_thread (unsigned int thread_id, unsigned int option);
+
+
+/**@ingroup func_qurt_thread_set_thread_property
+ Sets a QuRT thread property given the thread identifier.
+ The target thread can be in a signed user process or an unsigned user process.
+ The caller thread can be from the same user process as the target thread, or from its parent process.
+
+ If the target thread is in a secure user process, or CPZ process, this function call returns an error without
+ changing the property of the target thread.
+
+ @param[in] thread_id Thread identifier. \n
+ @param[in] property_id Thread property identifier. \n
+ #QURT_THREAD_PROPERTY_SUSPENDABLE -- thread is suspendable. Default is TRUE. \n
+ #QURT_THREAD_PROPERTY_RESUMABLE -- thread is resumable. Default is TRUE.
+ @param[in] value Property value: \n
+ TRUE (1) -- TRUE for the property \n
+ FALSE (0) -- FALSE for the property
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EINVALID -- Failure because of an invalid thread_id input \n
+ #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, in a secure process/CPZ process.
+
+ @dependencies
+ None.
+ */
+int qurt_thread_set_thread_property( unsigned int thread_id, unsigned int property_id, unsigned int value );
+
+/**@ingroup func_qurt_thread_get_group_id
+ Gets the group ID of the thread specified by thread_id.\n
+
+ @param[in] thread_id Thread identifier.
+ @param[out] group_id Pointer to the variable for the group identifier.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EINVALID -- Thread ID is invalid, or the process has no groups enabled \n
+ #QURT_ENOTALLOWED -- Operation is not allowed \n
+
+ @dependencies
+ None.
+*/
+int qurt_thread_get_group_id(qurt_thread_t thread_id, unsigned int* group_id);
+
+#endif /* __ASSEMBLER__ */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_THREAD_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_thread_context.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_thread_context.h
new file mode 100755
index 0000000000000..bab09deec8889
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_thread_context.h
@@ -0,0 +1,234 @@
+#ifndef QURT_THREAD_CONTEXT_H
+#define QURT_THREAD_CONTEXT_H
+/**
+  @file qurt_thread_context.h
+  @brief Kernel thread context structure
+
+EXTERNAL FUNCTIONS
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2018-2022 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+
+#include "qurt_qdi.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @cond internal_only */
+
+#define THREAD_ITERATOR_END ((qurt_thread_t)(-1)) /**< Thread iterator is complete. */
+
+
+/**@ingroup func_qurt_thread_iterator_create
+Gives the caller the ability to enumerate threads in the system.
+
+@return
+Handle of the newly created iterator, which must be passed to
+subsequent operations on the iterator.
+
+@dependencies
+None.
+*/
+static inline int qurt_thread_iterator_create(void)
+{
+ return qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC, QDI_OS_THREAD_ITERATOR_CREATE);
+}
+
+/**@ingroup func_qurt_thread_iterator_next
+Iterates over the list of threads in the system.
+
+@datatypes
+#qurt_thread_t
+
+@param[in] iter Iterator handle returned by qurt_thread_iterator_create().
+
+@return
+#THREAD_ITERATOR_END -- iterator has reached the end of the thread list. \n
+Other values indicate a valid thread_id.
+
+@dependencies
+None.
+*/
+static inline qurt_thread_t qurt_thread_iterator_next(int iter)
+{
+ return (qurt_thread_t)qurt_qdi_handle_invoke(iter, QDI_OS_THREAD_ITERATOR_NEXT);
+}
+
+/**@ingroup func_qurt_thread_iterator_destroy
+Cleans up thread iterator resources.
+
+@param[in] iter Iterator handle returned by qurt_thread_iterator_create().
+
+@return
+#QURT_EOK -- Successful completion of the operation \n
+#QURT_EFATAL -- Invalid handle passed
+
+@dependencies
+None.
+*/
+static inline int qurt_thread_iterator_destroy(int iter)
+{
+ return qurt_qdi_close(iter);
+}
+
+/**@ingroup func_qurt_thread_context_get_tname
+Gets the name of the thread from the specified thread ID.
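+
+A minimal enumeration sketch (illustrative only) that walks all threads
+and reads each name:
+
+@code
+char name[QURT_THREAD_ATTR_NAME_MAXLEN];
+int it = qurt_thread_iterator_create ();
+qurt_thread_t id;
+while ((id = qurt_thread_iterator_next (it)) != THREAD_ITERATOR_END) {
+    if (qurt_thread_context_get_tname ((unsigned int)id, name, sizeof (name)) == QURT_EOK) {
+        /* use name */
+    }
+}
+(void) qurt_thread_iterator_destroy (it);
+@endcode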
+ +@param[in] thread_id Thread for which name is returned. +@param[in,out] name Pointer to the local buffer where name is copied back. +@param[in] max_len Size of the local buffer. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_tname(unsigned int thread_id, char *name, unsigned char max_len); + +/**@ingroup func_qurt_thread_context_get_prio +Gets the priority for the specified thread. + +@param[in] thread_id Thread for which priority is returned. +@param[in,out] prio Pointer to the local variable where priority is written. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_prio(unsigned int thread_id, unsigned char *prio); + +/**@ingroup func_qurt_thread_context_get_pcycles +Gets pcycles for the specified thread. + +@param[in] thread_id Thread for which processor cycles are returned. +@param[in,out] pcycles Pointer to the local variable where processor cycles are written. + +@return +#QURT_EOK -- Success \n +Failure otherwise. + +@dependencies +None. +*/ +int qurt_thread_context_get_pcycles(unsigned int thread_id, unsigned long long int *pcycles); + +/**@ingroup func_qurt_thread_context_get_stack_base +Gets the stack base address for the specified thread. + +@param[in] thread_id Thread for which stack base address is returned. +@param[in,out] sbase Pointer to the local variable where stack base address is written. + +@return +QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_stack_base(unsigned int thread_id, unsigned int *sbase); + +/**@ingroup func_qurt_thread_context_get_stack_size +Gets the stack size for the specified thread. + +@param[in] thread_id Thread for which stack size is returned. +@param[in,out] ssize Pointer to the local variable where stack size is written. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_stack_size(unsigned int thread_id, unsigned int *ssize); + +/**@ingroup func_qurt_thread_context_get_pid +Gets the process ID for the specified thread. + +@param[in] thread_id Thread for which process ID is returned. +@param[in,out] pid Pointer to the local variable where process id is written. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_pid(unsigned int thread_id, unsigned int *pid); + +/**@ingroup func_qurt_thread_context_get_pname +Gets the process name for the specified thread. + +@param[in] thread_id Represents the thread for which process name is returned. +@param[in, out] name Pointer to the local buffer where process name is copied back. +@param[in] len Length allocated to the local buffer. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_pname(unsigned int thread_id, char *name, unsigned int len); + +/** @addtogroup thread_types +@{ */ +/** Structure that defines how TCB is interpreted to crash dump tools.*/ +/* Keys are defined in consts.h */ +struct qurt_debug_thread_info { +/** @cond */ + char name[QURT_MAX_NAME_LEN]; /**< Name of the thread. */ + struct { + unsigned key; + unsigned val; + } os_info[40]; + unsigned gen_regs[32]; /**< General mode registers. */ + unsigned user_cregs[32]; /**< User mode registers. */ + unsigned guest_cregs[32]; /**< Guest mode registers. */ + unsigned monitor_cregs[64]; /**< Monitor mode registers. 
*/
+/** @endcond */
+}; /* should add up to 1K */
+/** @} */ /* end_addtogroup thread_types */
+
+
+/**@ingroup func_qurt_system_tcb_dump_get
+Gets the debug thread information (TCB dump) contents for the specified thread.
+
+@datatypes
+#qurt_thread_t
+
+@param[in] thread_id Thread on which the operation must be performed.
+@param[in, out] ptr Pointer to the local buffer where the contents are written.
+@param[in] size Size of the debug thread information structure obtained by calling
+ qurt_system_tcb_dump_get_size().
+
+@return
+#QURT_EOK -- Success \n
+Failure otherwise
+
+@dependencies
+None.
+*/
+int qurt_system_tcb_dump_get(qurt_thread_t thread_id, void *ptr, size_t size);
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_THREAD_CONTEXT_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_timer.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_timer.h
new file mode 100755
index 0000000000000..7bdfdb8f3c3df
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_timer.h
@@ -0,0 +1,560 @@
+#ifndef QURT_TIMER_H
+#define QURT_TIMER_H
+/**
+  @file qurt_timer.h
+  @brief Prototypes of the qurt_timer API
+
+EXTERNAL FUNCTIONS
+   None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+
+#include "qurt_anysignal.h"
+#include "qurt_signal2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+ CONSTANTS AND MACROS
+=============================================================================*/
+/**@addtogroup timer_const_macros
+@{ */
+/**
+ Default values.
+*/
+/** @xreflabel{hdr:QURT_TIMER_ONESHOT}*/
+#define QURT_TIMER_DEFAULT_TYPE QURT_TIMER_ONESHOT /**< One-shot. */
+#define QURT_TIMER_DEFAULT_DURATION 1000uL /**< Default duration. */
+#define QURT_TIMER_DEFAULT_EXPIRY 0uL /**< Default expiration. */
+
+/**
+ Conversion from microseconds to timer ticks.
+ */
+#define QURT_TIMER_TIMETICK_FROM_US(us) QURT_SYSCLOCK_TIMETICK_FROM_US(us)
+
+/**
+ Conversion from timer ticks to microseconds at the nominal frequency.
+*/
+#define QURT_TIMER_TIMETICK_TO_US(ticks) qurt_timer_timetick_to_us(ticks)
+
+/** Minimum duration value is 100 microseconds (sleep timer). */
+#define QURT_TIMER_MIN_DURATION 100uL
+
+/**
+ Maximum duration value for the Qtimer is 1,042,499 hours.
+*/
+#define QURT_TIMER_MAX_DURATION QURT_SYSCLOCK_MAX_DURATION
+
+/**
+ Timer clock for the Qtimer is 19.2 MHz.
+*/
+#define QURT_TIMER_MAX_DURATION_TICKS QURT_SYSCLOCK_MAX_DURATION_TICKS
+
+/**
+ Sleep timer error margin for the Qtimer is 1,000 ticks (about 52 us).
+*/
+#define QURT_TIMETICK_ERROR_MARGIN QURT_SYSCLOCK_ERROR_MARGIN
+
+/*
+ qurt_timer group defines.
+*/
+#define QURT_TIMER_MAX_GROUPS 5U /**< Maximum groups. */
+#define QURT_TIMER_DEFAULT_GROUP 0U /**< Default group. */
+/** @} */ /* end_addtogroup timer_const_macros */
+
+/** @addtogroup timer_types
+@{ */
+/**
+ QuRT timer types.
+ */
+typedef enum
+{
+ QURT_TIMER_ONESHOT = 0, /**< One-shot. */
+ /** @xreflabel{hdr:QURT_TIMER_PERIODIC}*/
+ QURT_TIMER_PERIODIC /**< Periodic.
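+ Generates a series of timer events at intervals equal to the timer duration.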
*/ +} qurt_timer_type_t; + + +/*============================================================================= + TYPEDEFS +=============================================================================*/ + +/** QuRT timer type.*/ +typedef unsigned int qurt_timer_t; + +/** QuRT timer duration type. */ +typedef unsigned long long qurt_timer_duration_t; + +/** QuRT timer time type. */ +typedef unsigned long long qurt_timer_time_t; + +typedef void (*pfn_t)(void); +/** QuRT timer attribute type. */ +typedef struct +{ + /** @cond */ + unsigned int magic; /**< Magic number to verify the qmsgq_attr_t pointer. */ + + qurt_timer_duration_t duration; /**< Specifies the duration of the new timer. */ + + qurt_timer_time_t expiry; /**< Specifies the absolute expiry of the new timer. */ + + qurt_timer_duration_t remaining; /**< Specifies the remaining time of an active timer. */ + + qurt_timer_type_t type; /**< Specifies the timer type; only #QURT_TIMER_ONESHOT and + #QURT_TIMER_PERIODIC are supported. */ + + unsigned int group; /**< Group number of the timer; the criterion used to disable or enable the set + of timers. */ + pfn_t pFn; /**< Callback other than the signal set */ + /** @endcond */ +} +qurt_timer_attr_t; + +/** @} */ /* end_addtogroup timer_types */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_timer_stop + @xreflabel{sec:qurt_timer_stop} + Stops a running timer. + The timer must be a one-shot timer. + + @note1hang Restart stopped timers with the timer restart operation, + see Section @xref{sec:qurt_timer_restart}. + + @datatypes + #qurt_timer_t + + @param[in] timer Timer object. + + @return + #QURT_EOK -- Success. \n + #QURT_EINVALID -- Invalid timer ID or duration value. \n + #QURT_ENOTALLOWED -- Timer is not a one shot timer. \n + #QURT_EMEM -- Out of memory error. + + @dependencies + None. + */ +int qurt_timer_stop (qurt_timer_t timer); + +/**@ingroup func_qurt_timer_restart + @xreflabel{sec:qurt_timer_restart} + Restarts a stopped timer with the specified duration. The timer must be a one-shot timer. + Timers stop after they have expired or after they are explicitly stopped with qurt_timer_stop(). + A restarted timer expires after the specified duration, the starting time is when the function is called. + + @note1hang Timers stop after they have expired or after they are explicitly + stopped with the timer stop operation, see Section @xref{sec:qurt_timer_stop}. + + @datatypes + #qurt_timer_t \n + #qurt_timer_duration_t + + @param[in] timer Timer object. + @param[in] duration Timer duration (in microseconds) before the restarted timer + expires again. + The valid range is #QURT_TIMER_MIN_DURATION to + #QURT_TIMER_MAX_DURATION. + + @return + #QURT_EOK -- Success. \n + #QURT_EINVALID -- Invalid timer ID or duration value. \n + #QURT_ENOTALLOWED -- Timer is not a one-shot timer. \n + #QURT_EMEM -- Out-of-memory error. + + @dependencies + None. + */ +int qurt_timer_restart (qurt_timer_t timer, qurt_timer_duration_t duration); + + +/**@ingroup func_qurt_timer_create + Creates a timer.\n + Allocates and initializes a timer object, and starts the timer. + + @note1hang A timer event handler must be defined to wait on the specified signal + to handle the timer event. + + @datatypes + #qurt_timer_t \n + #qurt_timer_attr_t \n + #qurt_anysignal_t + + @param[out] timer Pointer to the created timer object. 
+ @param[in] attr Pointer to the timer attribute structure. + @param[in] signal Pointer to the signal object set when timer expires. + @param[in] mask Signal mask, which specifies the signal to set in the signal object when the + time expires. + + @return + #QURT_EOK -- Success. \n + #QURT_EMEM -- Not enough memory to create the timer. \n + #QURT_EINVALID -- One of the arguments in the attr field is invalid. \n + Other error code -- Operation failed. \n + + @dependencies + None. + */ +int qurt_timer_create (qurt_timer_t *timer, const qurt_timer_attr_t *attr, + const qurt_anysignal_t *signal, unsigned int mask); + +int qurt_timer_create_sig2 (qurt_timer_t *timer, const qurt_timer_attr_t *attr, + const qurt_signal2_t *signal, unsigned int mask); + +/**@ingroup func_qurt_timer_attr_init + Initializes the specified timer attribute structure with default attribute values: \n + - Timer duration -- #QURT_TIMER_DEFAULT_DURATION (Section @xref{dox:timers}) \n + - Timer type -- #QURT_TIMER_ONESHOT \n + - Timer group -- #QURT_TIMER_DEFAULT_GROUP + + @datatypes + #qurt_timer_attr_t + + @param[in,out] attr Pointer to the destination structure for the timer attributes. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_init(qurt_timer_attr_t *attr); + + +/*Tech Comm note: removed qurt_timer_attr_set_pfn from documentation 9/10/2020 +@ingroup func_qurt_timer_attr_set_pfn + + @datatypes + #qurt_timer_attr_t + + @param[in,out] attr Pointer to the destination structure for the timer attributes. + @param[in] pFn pFn. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_pfn(qurt_timer_attr_t *attr, pfn_t pFn); + + +/**@ingroup func_qurt_timer_attr_set_duration + Sets the timer duration in the specified timer attribute structure.\n + + The timer duration specifies the interval (in microseconds) between the creation of the + timer object and the generation of the corresponding timer event. + + The timer duration value must be between #QURT_TIMER_MIN_DURATION and + #QURT_TIMER_MAX_DURATION (Section @xref{dox:timers}). Otherwise, the set operation is ignored. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_duration_t + + @param[in,out] attr Pointer to the timer attribute structure. + @param[in] duration Timer duration (in microseconds). + Valid range is #QURT_TIMER_MIN_DURATION to + #QURT_TIMER_MAX_DURATION. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_duration(qurt_timer_attr_t *attr, qurt_timer_duration_t duration); + +/**@ingroup func_qurt_timer_attr_set_expiry + Sets the absolute expiry time in the specified timer attribute structure.\n + The timer expiry specifies the absolute time (in microseconds) of the generation of the + corresponding timer event.\n + Timer expiries are relative to when the system first began executing. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_time_t + + @param[in,out] attr Pointer to the timer attribute structure. + @param[in] time Timer expiry. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_expiry(qurt_timer_attr_t *attr, qurt_timer_time_t time); + +/**@ingroup func_qurt_timer_attr_get_duration + Gets the timer duration from the specified timer attribute structure. + The value returned is the duration that was originally set for the timer. + + @note1hang This function does not return the remaining time of an active timer; + use qurt_timer_attr_get_remaining() to get the remaining time. 
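+
+ A minimal sketch (illustrative only; my_timer is assumed to be a previously
+ created timer object):
+
+ @code
+ qurt_timer_attr_t attr;
+ qurt_timer_duration_t remaining;
+ if (qurt_timer_get_attr (my_timer, &attr) == QURT_EOK) {
+     qurt_timer_attr_get_remaining (&attr, &remaining);
+ }
+ @endcode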
+ + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_duration_t + + @param[in] attr Pointer to the timer attributes object + @param[out] duration Pointer to the destination variable for timer duration. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_get_duration(qurt_timer_attr_t *attr, qurt_timer_duration_t *duration); + +/**@ingroup func_qurt_timer_attr_get_remaining + Gets the timer remaining duration from the specified timer attribute structure. \n + + The timer remaining duration indicates (in microseconds) how much time remains before + the generation of the next timer event on the corresponding timer. + In most cases this function assumes that the timer attribute structure was obtained by + calling qurt_timer_get_attr(). + + @note1hang This attribute is read-only and thus has no set operation defined for it. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_duration_t + + @param[in] attr Pointer to the timer attribute object. + @param[out] remaining Pointer to the destination variable for remaining time. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_get_remaining(qurt_timer_attr_t *attr, qurt_timer_duration_t *remaining); + +/**@ingroup func_qurt_timer_attr_set_type + Sets the timer type in the specified timer attribute structure. + + The timer type specifies the functional behavior of the timer: \n + - A one-shot timer (#QURT_TIMER_ONESHOT) waits for the specified timer duration + and then generates a single timer event. After this the timer is nonfunctional. \n + - A periodic timer (#QURT_TIMER_PERIODIC) repeatedly waits for the specified + timer duration and then generates a timer event. The result is a series of timer + events with interval equal to the timer duration. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_type_t + + @param[in,out] attr Pointer to the timer attribute structure. + @param[in] type Timer type. Values are: \n + - #QURT_TIMER_ONESHOT -- One-shot timer. \n + - #QURT_TIMER_PERIODIC -- Periodic timer. @tablebulletend + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_type(qurt_timer_attr_t *attr, qurt_timer_type_t type); + +/**@ingroup func_qurt_timer_attr_get_type + Gets the timer type from the specified timer attribute structure. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_type_t + + @param[in] attr Pointer to the timer attribute structure. + @param[out] type Pointer to the destination variable for the timer type. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_get_type(qurt_timer_attr_t *attr, qurt_timer_type_t *type); + +/**@ingroup func_qurt_timer_attr_set_group + Sets the timer group identifier in the specified timer attribute structure.\n + The timer group identifier specifies the group that the timer belongs to. Timer groups are + used to enable or disable one or more timers in a single operation. \n + The timer group identifier value must be between 0 and (#QURT_TIMER_MAX_GROUPS - 1). + See Section @xref{dox:timers}. + + @datatypes + #qurt_timer_attr_t + + @param[in,out] attr Pointer to the timer attribute object. + @param[in] group Timer group identifier; + Valid range is 0 to (#QURT_TIMER_MAX_GROUPS - 1). + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_group(qurt_timer_attr_t *attr, unsigned int group); + +/**@ingroup func_qurt_timer_attr_get_group + Gets the timer group identifier from the specified timer attribute structure. 
+ + @datatypes + #qurt_timer_attr_t + + @param[in] attr Pointer to the timer attribute structure. + @param[out] group Pointer to the destination variable for the timer group identifier. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_get_group(qurt_timer_attr_t *attr, unsigned int *group); + +/**@ingroup func_qurt_timer_get_attr + @xreflabel{hdr:qurt_timer_get_attr} + Gets the timer attributes of the specified timer when it was created. + + @datatypes + #qurt_timer_t \n + #qurt_timer_attr_t + + @param[in] timer Timer object. + @param[out] attr Pointer to the destination structure for timer attributes. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Argument passed is not a valid timer. + + @dependencies + None. + */ +int qurt_timer_get_attr(qurt_timer_t timer, qurt_timer_attr_t *attr); + +/**@ingroup func_qurt_timer_delete + Deletes the timer.\n + Destroys the specified timer and deallocates the timer object. + + @datatypes + #qurt_timer_t + + @param[in] timer Timer object. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Argument passed is not a valid timer. + + @dependencies + None. + */ +int qurt_timer_delete(qurt_timer_t timer); + +/**@ingroup func_qurt_timer_sleep + Suspends the current thread for the specified amount of time. + The sleep duration value must be between #QURT_TIMER_MIN_DURATION and + #QURT_TIMER_MAX_DURATION (Section @xref{dox:timers}). + + @datatypes + #qurt_timer_duration_t + + @param[in] duration Interval (in microseconds) between when the thread is suspended + and when it is re-awakened. + + @return + #QURT_EOK -- Success. \n + #QURT_EMEM -- Not enough memory to perform the operation. + + @dependencies + None. + */ + +int qurt_timer_sleep(qurt_timer_duration_t duration); + +/**@ingroup func_qurt_timer_group_disable + Disables all timers that are assigned to the specified timer group. + If a specified timer is already disabled, ignore it. + If a specified timer is expired, do not process it. + If the specified timer group is empty, do nothing. + + @note1hang When a timer is disabled its remaining time does not change, thus it + cannot generate a timer event. + + @param[in] group Timer group identifier. + + @return + #QURT_EOK -- Success. + + @dependencies + None. + */ +int qurt_timer_group_disable (unsigned int group); + +/**@ingroup func_qurt_timer_group_enable + Enables all timers that are assigned to the specified timer group. + If a specified timer is already enabled, ignore it. + If a specified timer is expired, process it. + If the specified timer group is empty, do nothing. + + @param[in] group Timer group identifier. + + @return + #QURT_EOK -- Success. + + @dependencies + None. + */ +int qurt_timer_group_enable (unsigned int group); + + +/** + Notifies the timer server recovery from power collapse. The server + must account for any missed interrupts during power collapse. + */ +void qurt_timer_recover_pc (void); + +/** + Determines whether the Qtimer is initialized. + + @return + 0 -- Not initialized. \n + Nonzero -- Initialized. + */ +static inline int qurt_timer_is_init (void) {return 1;} + +/**@ingroup func_qurt_timer_get_ticks + Gets current ticks. The ticks are accumulated since the RTOS + has started. Each tick is equal to a single timer clock + cycle, where the frequency is 32 KHz on RGPT or 19.2 MHz on Qtimer. + + @return + Ticks since system started. 
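+
+ A minimal sketch (illustrative only):
+
+ @code
+ unsigned long long ticks = qurt_timer_get_ticks ();
+ unsigned long long us    = QURT_TIMER_TIMETICK_TO_US (ticks);
+ @endcode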
+ */
+unsigned long long qurt_timer_get_ticks (void);
+
+#define qurt_timer_timetick_from_us(us) QURT_SYSCLOCK_TIMETICK_FROM_US(us)
+
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_TIMER_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_tlb.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_tlb.h
new file mode 100755
index 0000000000000..b1b2d261d31c0
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_tlb.h
@@ -0,0 +1,215 @@
+#ifndef QURT_TLB_H
+#define QURT_TLB_H
+
+/**
+  @file qurt_tlb.h
+  @brief Prototypes of TLB API
+  The TLB APIs allow explicit control of the portion of the TLB between TLB_FIRST_REPLACEABLE and TLB_LAST_REPLACEABLE.
+  Both are nonconfigurable for the time being. This portion of the TLB is permanently assigned/locked unless manually removed
+  by qurt_tlb_remove. The implementation does not change depending on the configuration, such as whether CONFIG_STATIC is set.
+  With CONFIG_STATIC=y, TLB_LAST_REPLACEABLE is set to the last TLB index, which indicates that the entire TLB is permanently
+  assigned and is not backed by a page table (the page table does not exist). TLB indices are maintained through a 64-bit bitmask.
+  A new entry is placed in the first available slot.
+
+EXTERNAL FUNCTIONS
+ None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None.
+
+Copyright (c) 2013, 2021, 2023
+All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+=============================================================================*/
+
+#include <qurt_types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/*=============================================================================
+ FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_tlb_entry_create
+ Creates a new TLB entry with the specified mapping attributes in the TLB of the Hexagon processor. \n
+ @note1hang If the specified attributes are not valid (such as if the address is not aligned with the
+            size), the entry is not created and an error result is returned.\n
+ @note1cont To set the G bit in the new TLB entry, set the ASID argument to -1.
+
+ @datatypes
+ #qurt_addr_t \n
+ #qurt_paddr_t \n
+ #qurt_mem_cache_mode_t \n
+ #qurt_perm_t
+
+ @param[out] entry_id       TLB entry identifier.
+ @param[in] vaddr           Virtual memory address.
+ @param[in] paddr           Physical memory address.
+ @param[in] size            Size of memory region to map (in bytes).
+ @param[in] cache_attribs   Cache mode (writeback, and so on).
+ @param[in] perms           Access permissions.
+ @param[in] asid            ASID (space ID).
+
+ @return
+ #QURT_EOK -- TLB entry successfully created.\n
+ #QURT_EFATAL -- Entry is not created; the TLB is full. \n
+ #QURT_ETLBCREATESIZE -- Entry is not created; an incorrect size was specified. \n
+ #QURT_ETLBCREATEUNALIGNED -- Entry is not created; an unaligned address was specified. \n
+ #QURT_EINVALID -- Invalid cache attributes / permissions provided.
+
+ */
+int qurt_tlb_entry_create (unsigned int *entry_id, qurt_addr_t vaddr, qurt_paddr_t paddr, qurt_size_t size, qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perms, int asid);
+
+/**@ingroup func_qurt_tlb_entry_create_64
+ Creates a new TLB entry with the specified mapping attributes in the TLB of the Hexagon processor.
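+
+ A hedged usage sketch (the addresses and size below are placeholders; they must
+ satisfy the TLB alignment and size rules noted below):
+ @code
+ unsigned int id;
+ int err = qurt_tlb_entry_create_64 (&id,
+                                     0x20000000u,               // vaddr
+                                     0x1C0000000ULL,            // 64-bit paddr
+                                     0x1000,                    // 4 KB page
+                                     QURT_MEM_CACHE_WRITEBACK,
+                                     QURT_PERM_READ | QURT_PERM_WRITE,
+                                     -1);                       // -1 sets the G bit
+ // err is QURT_EOK on success; see the return codes listed below
+ @endcode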
\n + @note1hang If the specified attributes are not valid (the address is not aligned with the + size), the entry is not created, and an error result is returned.\n + @note1cont To set the G bit in the new TLB entry, set the asid argument to -1. + + @param[out] entry_id TLB entry identifier. + @param[in] vaddr Virtual memory address. + @param[in] paddr_64 64-bit physical memory address. + @param[in] size Size of memory region to map (in bytes). + @param[in] cache_attribs Cache mode (writeback, and so on). + @param[in] perms Access permissions. + @param[in] asid ASID (space ID). + + @return + #QURT_EOK -- TLB entry successfully created.\n + #QURT_EFATAL -- Entry was not created; the TLB is full. \n + #QURT_ETLBCREATESIZE -- Entry was not created; the incorrect size was specified. \n + #QURT_ETLBCREATEUNALIGNED -- Entry was not created; an unaligned address was specified. \n + #QURT_EINVALID -- Invalid cache attributes / permissions provided. + + */ +int qurt_tlb_entry_create_64 (unsigned int *entry_id, qurt_addr_t vaddr, qurt_paddr_64_t paddr_64, qurt_size_t size, qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perms, int asid); + +/**@ingroup func_qurt_tlb_entry_delete + Deletes the specified TLB entry from the TLB of the Hexagon processor. + If the specified entry does not exist, no deletion occurs and an error result is returned. + + @param[in] entry_id TLB entry identifier. + + @return + #QURT_EOK -- TLB entry successfully deleted. \n + #QURT_EFATAL -- TLB entry does not exist. + + @dependencies + None. + **/ +int qurt_tlb_entry_delete (unsigned int entry_id); + +/**@ingroup func_qurt_tlb_entry_query + Searches for the specified TLB entry in the TLB of the Hexagon processor. + If the TLB entry is found, its entry identifier is returned. + + @datatypes + #qurt_addr_t + + @param[out] entry_id TLB entry identifier. + @param[in] vaddr Virtual memory address. + @param[in] asid ASID (space ID). + + @return + #QURT_EOK -- TLB entry successfully returned. \n + #QURT_EFATAL -- TLB entry does not exist. + + @dependencies + None. + **/ +int qurt_tlb_entry_query (unsigned int *entry_id, qurt_addr_t vaddr, int asid); + +/**@ingroup func_qurt_tlb_entry_set + Sets the TLB entry by storing an entry at the specified location + in the TLB of the Hexagon processor. + + @param[in] entry_id TLB entry identifier. + @param[in] entry 64-bit TLB entry to store. + + @return + #QURT_EOK -- Entry successfully stored in the TLB. \n + #QURT_EFATAL -- Entry not set at the specified location. + + @dependencies + None. + **/ +int qurt_tlb_entry_set (unsigned int entry_id, unsigned long long int entry); + +/**@ingroup func_qurt_tlb_entry_get + Gets the TLB entry. \n + Returns the specified 64-bit TLB entry in the TLB of the Hexagon processor. + + @param[in] entry_id TLB entry identifier. + @param[out] entry 64-bit TLB entry. + + @return + #QURT_EOK -- TLB entry successfully returned. \n + #QURT_EFATAL -- TLB entry does not exist. + + @dependencies + None. + **/ +int qurt_tlb_entry_get (unsigned int entry_id, unsigned long long int *entry); + +/**@ingroup func_qurt_tlb_get_pager_physaddrs + Searches the TLB of the Hexagon processor, and returns all physical addresses that belong to the pager. + Each returned address indicates the starting address of an active page. + +The function return value indicates the number of addresses returned. + + @param[out] pager_phys_addrs Pointer to the return array of pager physical addresses. + + @return + Integer -- Number of addresses returned in array. + + @dependencies + None. 
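+
+ A minimal sketch of walking the returned array (read-only; the array itself
+ is owned by the pager):
+ @code
+ unsigned int *phys;
+ unsigned int n = qurt_tlb_get_pager_physaddr (&phys);
+ for (unsigned int i = 0; i < n; i++) {
+     // phys[i] is the starting physical address of an active page
+ }
+ @endcode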
+*/ + +unsigned int qurt_tlb_get_pager_physaddr(unsigned int** pager_phys_addrs); + +/**@ingroup func_qurt_tlb_get_pager_virtaddr + Searches the TLB of the Hexagon processor, and returns all virtual addresses that belong to the pager. + Each returned address indicates the starting address of an active page. + +The function return value indicates the number of addresses returned. + + @param[out] pager_virt_addrs Pointer to the return array of pager virtual addresses. + + @return + Integer -- Number of addresses returned in the array. + + @dependencies + None. +*/ + +unsigned int qurt_tlb_get_pager_virtaddr(unsigned int** pager_virt_addrs); + + +/**@ingroup func_qurt_tlb_entry_set2 + Sets the TLB entry by storing an entry at the specified location + in the TLB of the Hexagon processor. An additional option can be passed + to lock the TLB entry in the TLB of the Hexagon processor. + + @param[in] id TLB entry identifier. + @param[in] tlb 64-bit TLB entry to store. + @param[in] lock Nonzero value indicates that the TLB entry must be locked in the hardware TLB. + + @return + #QURT_EOK -- Entry successfully stored in the TLB. \n + #QURT_EFATAL -- Entry not set at the specified location. + + @dependencies + None. + **/ +int qurt_tlb_entry_set2(unsigned id, unsigned long long tlb, unsigned lock); + + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_TLB_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_tls.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_tls.h new file mode 100755 index 0000000000000..6ec3b39ff5cb0 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_tls.h @@ -0,0 +1,100 @@ +#ifndef QURT_TLS_H +#define QURT_TLS_H +/** + @file qurt_tls.h + @brief Prototypes of TLS APIs + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + + +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_tls_create_key + @xreflabel{sec:tls_create_key} + Creates a key for accessing a thread local storage data item.\n + Subsequent get and set operations use the key value. + + @note1hang The destructor function performs any clean-up operations needed by a thread + local storage item when its containing thread is deleted (Section @xref{sec:qurt_thread_exit}). + + @param[out] key Pointer to the newly created thread local storage key value. + @param[in] destructor Pointer to the key-specific destructor function. Passing NULL + specifies that no destructor function is defined for the key. + + @return + #QURT_EOK -- Key successfully created. \n + #QURT_ETLSAVAIL -- No free TLS key available. + + @dependencies + None. + */ +int qurt_tls_create_key (int *key, void (*destructor)(void *)); + +/**@ingroup func_qurt_tls_set_specific + Stores a data item to thread local storage along with the specified key. + + @param[in] key Thread local storage key value. + @param[in] value Pointer to user data value to store. + + @return + #QURT_EOK -- Data item successfully stored. \n + #QURT_EINVALID -- Invalid key. \n + #QURT_EFAILED -- Invoked from a non-thread context. 
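+
+ A typical pattern, sketched under the assumption that stdlib.h is available
+ and the destructor simply frees the stored pointer:
+ @code
+ static int log_key;
+
+ static void log_dtor (void *p) { free (p); }   // runs when the thread exits
+
+ void log_init (void)   { qurt_tls_create_key (&log_key, log_dtor); }
+ void log_attach (void) { qurt_tls_set_specific (log_key, malloc (64)); }
+ void *log_get (void)   { return qurt_tls_get_specific (log_key); }
+ @endcode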
+ */ +int qurt_tls_set_specific (int key, const void *value); + +/**@ingroup func_qurt_tls_get_specific + Loads the data item from thread local storage. \n + Returns the data item that is stored in thread local storage with the specified key. + The data item is always a pointer to user data. + + @param[in] key Thread local storage key value. + + @return + Pointer -- Data item indexed by key in thread local storage. \n + 0 (NULL) -- Key out of range. + + @dependencies + None. + */ +void * __attribute__((section(".text.qurt_tls_get_specific "))) qurt_tls_get_specific (int key); + + +/**@ingroup func_qurt_tls_delete_key + Deletes the specified key from thread local storage. + + @note1hang Explicitly deleting a key does not execute any destructor function that is + associated with the key (Section @xref{sec:tls_create_key}). + + @param[in] key Thread local storage key value to delete. + + @return + #QURT_EOK -- Key successfully deleted. \n + #QURT_ETLSENTRY -- Key already free. + + @dependencies + None. + */ +int qurt_tls_delete_key (int key); + + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_TLS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_trace.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_trace.h new file mode 100755 index 0000000000000..541f8f1d34bf6 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_trace.h @@ -0,0 +1,317 @@ +#ifndef QURT_TRACE_H +#define QURT_TRACE_H +/** + @file qurt_trace.h + @brief Prototypes of system call tracing helpers API + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021-2023 by Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + GLOBAL VARIABLES +=============================================================================*/ +/** @cond internal_only */ +/** @addtogroup etm_macros +@{ */ +/* ETM trace types. */ +#define QURT_ETM_TYPE_PC_ADDR (1U<<0) /**< PC address.*/ +#define QURT_ETM_TYPE_MEMORY_ADDR (1U<<1) /**< Memory address. */ +#define QURT_ETM_TYPE_TESTBUS (1U<<2) /**< Test bus. */ +#define QURT_ETM_TYPE_CYCLE_ACCURATE (1U<<3) /**< Cycle accurate. */ +#define QURT_ETM_TYPE_CYCLE_COARSE (1U<<4) /**< Cycle coarse. */ +#define QURT_ETM_TYPE_PC_AND_MEMORY_ADDR (QURT_ETM_TYPE_PC_ADDR|QURT_ETM_TYPE_MEMORY_ADDR) /**< PC and memory address. */ +#define QURT_ETM_TYPE_PC_ADDR_AND_TESTBUS (QURT_ETM_TYPE_PC_ADDR|QURT_ETM_TYPE_TESTBUS) /**< PC address and test bus. */ +#define QURT_ETM_TYPE_MEMORY_ADDR_AND_TESTBUS (QURT_ETM_TYPE_MEMORY_ADDR|QURT_ETM_TYPE_TESTBUS) /**< Memory address and test bus.*/ +#define QURT_ETM_TYPE_PC_AND_MEMORY_ADDR_AND_TESTBUS (QURT_ETM_TYPE_PC_ADDR|QURT_ETM_TYPE_MEMORY_ADDR|QURT_ETM_TYPE_TESTBUS) /**< PC, memory address, and test bus. */ + +/* ETM routes. */ +#define QURT_ETM_ROUTE_TO_QDSS 0U /**< ETM route to QDSS. */ +#define QURT_ETM_ROUTE_TO_Q6ETB 1U /**< ETM route to Q6ETB. */ + +/* ETM filters. */ +#define QURT_ETM_TRACE_FILTER_ALL_DEFAULT 0U /*< Filter all as default. */ +#define QURT_ETM_TRACE_FILTER_HNUM0 (1U<<0) /*< Filter HNUM0. */ +#define QURT_ETM_TRACE_FILTER_HNUM1 (1U<<1) /*< Filter HNUM1. */ +#define QURT_ETM_TRACE_FILTER_HNUM2 (1U<<2) /*< Filter HNUM2. 
*/ +#define QURT_ETM_TRACE_FILTER_HNUM3 (1U<<3) /*< Filter HNUM3. */ +#define QURT_ETM_TRACE_FILTER_HNUM4 (1U<<4) /*< Filter HNUM4. */ +#define QURT_ETM_TRACE_FILTER_HNUM5 (1U<<5) /*< Filter HNUM5. */ +#define QURT_ETM_TRACE_FILTER_HNUM6 (1U<<6) /*< Filter HNUM6. */ +#define QURT_ETM_TRACE_FILTER_HNUM7 (1U<<7) /*< Filter HNUM7. */ +#define QURT_ETM_TRACE_FILTER_HNUM8 (1U<<8) /*< Filter HNUM8. */ +#define QURT_ETM_TRACE_FILTER_HNUM9 (1U<<9) /*< Filter HNUM9. */ +#define QURT_ETM_TRACE_FILTER_HNUM10 (1U<<10) /*< Filter HNUM10. */ +#define QURT_ETM_TRACE_FILTER_HNUM11 (1U<<11) /*< Filter HNUM11. */ +#define QURT_ETM_TRACE_FILTER_HNUM12 (1U<<12) /*< Filter HNUM12. */ +#define QURT_ETM_TRACE_FILTER_HNUM13 (1U<<13) /*< Filter HNUM13. */ +#define QURT_ETM_TRACE_FILTER_HNUM14 (1U<<14) /*< Filter HNUM14. */ +#define QURT_ETM_TRACE_FILTER_HNUM15 (1U<<15) /*< Filter HNUM15. */ +#define QURT_ETM_TRACE_FILTER_ALL QURT_ETM_TRACE_FILTER_ALL_DEFAULT + +#define QURT_ETM_TRACE_FILTER_CLUSTER0 (1<<16) /*< Filter trace cluster0 address. */ +#define QURT_ETM_TRACE_FILTER_CLUSTER1 (1<<17) /*< Filter trace cluster1 address. */ +#define QURT_ETM_TRACE_FILTER_PC_RANGE (1<<19) /*< Filter PC address range. */ + +/* ETM memory source - PC or data access */ +#define QURT_ETM_SOURCE_PC 0U /**< ETM memory source of SAC* is PC. */ +#define QURT_ETM_SOURCE_DATA 1U /**< ETM memory source of SAC* is data. */ + +/* Period between synchronization traces */ +#define QURT_ETM_ASYNC_PERIOD 0 /**< Async.*/ +#define QURT_ETM_ISYNC_PERIOD 1 /**< Isync.*/ +#define QURT_ETM_GSYNC_PERIOD 2 /**< Gsync. */ + +/* ETM enable flags */ +#define QURT_ETM_OFF 0U /**< ETM off. */ +#define QURT_ETM_ON 1U /**< ETM on. */ +/** @endcond */ +/** @} */ /* end_addtogroup etm_macros */ + +/** @addtogroup function_tracing_macro +@{ */ +/* ETM setup return values */ +#define QURT_ETM_SETUP_OK 0 /**< ETM setup OK. */ +#define QURT_ETM_SETUP_ERR 1 /**< ETM setup error. */ +/** @} */ /* end_addtogroup function_tracing_macro */ +/* ETM breakpoint types */ +#define QURT_ETM_READWRITE_BRKPT 0U /**< ETM read/write breakpoint. */ +#define QURT_ETM_READ_BRKPT 1U /**< ETM read breakpoint. */ +#define QURT_ETM_WRITE_BRKPT 2U /**< ETM write breakpoint. */ +#define QURT_ETM_BRKPT_INVALIDATE 3U /**< Invalidate breakpoint. */ +/** @addtogroup function_tracing_macro +@{ */ +/* ATB status flags */ +#define QURT_ATB_OFF 0 /**< ATB off. */ +#define QURT_ATB_ON 1 /**< ATB on. */ +/** @} */ /* end_addtogroup function_tracing_macro */ +/* DTM enable flags */ +#define QURT_DTM_OFF 0 /**< DTM off. */ +#define QURT_DTM_ON 1 /**< DTM on. */ + +/** @addtogroup function_tracing_datatypes +@{ */ +/**STM trace information. */ +typedef struct qurt_stm_trace_info { + /** @cond */ + unsigned int stm_port_addr[6]; /* STM port address to which trace data must be written.*/ + unsigned int thread_event_id; /* Event ID for context switches.*/ + unsigned int interrupt_event_id; /* Event ID for interrupts. */ + unsigned int marker; /* Marker value that must be written at the beginning of the trace. */ + /** @endcond */ +} qurt_stm_trace_info_t; +/** @} */ /* end_addtogroup function_tracing_datatypes */ +/*============================================================================= + GLOBAL FUNCTIONS +=============================================================================*/ + + +/**@ingroup func_qurt_trace_get_marker + Gets the kernel trace marker.\n + Returns the current value of the kernel trace marker. 
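+
+ A minimal sketch of the intended pairing with qurt_trace_changed(), declared
+ below (the 0x3 mask covers the interrupt and context switch events noted there):
+ @code
+ unsigned int marker = qurt_trace_get_marker ();
+ // ... code block under observation ...
+ if (qurt_trace_changed (marker, 0x3)) {
+     // an interrupt or context switch was recorded during the block
+ }
+ @endcode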
+ The marker consists of a hardware thread identifier and an index into the kernel trace
+ buffer. The trace buffer records kernel events.
+
+ @note1hang Using this function with qurt_trace_changed()
+            determines whether certain kernel events occurred in a block of code.
+
+ @return
+ Integer -- Kernel trace marker.
+
+ @dependencies
+ None.
+*/
+unsigned int qurt_trace_get_marker(void);
+
+/**@ingroup func_qurt_trace_changed
+ Determines whether specific kernel events have occurred. \n
+ Returns a value that indicates whether the specified kernel events have been recorded in the
+ kernel trace buffer since the specified kernel trace marker was obtained.
+
+ The prev_trace_marker parameter specifies a kernel trace marker that was obtained by calling
+ qurt_trace_get_marker().
+ @cond rest_dist For more information on the mask value, see the description of the trace_mask element in
+ @xhyperref{80VB41992,80-VB419-92}. \n @endcond
+
+ @note1hang Used with qurt_trace_get_marker(), this function determines whether
+            certain kernel events occurred in a block of code.\n
+ @note1cont This function cannot determine whether a specific kernel event type has
+            occurred unless that event type has been enabled in the trace_mask element
+            of the system configuration file. \n
+ @note1cont QuRT supports the recording of interrupt and context switch events only (such as
+            a trace_mask value of 0x3).
+
+ @param[in] prev_trace_marker Previous kernel trace marker.
+ @param[in] trace_mask Mask value that indicates which kernel events to check for.
+
+ @returns
+ 1 -- Kernel events of the specified type have occurred since the
+      specified trace marker was obtained.\n
+ 0 -- No kernel events of the specified type have occurred since the
+      specified trace marker was obtained.
+
+ @dependencies
+ None.
+*/
+int qurt_trace_changed(unsigned int prev_trace_marker, unsigned int trace_mask);
+
+/*=============================================================================
+ CONSTANTS AND MACROS
+=============================================================================*/
+/** @addtogroup function_tracing_macro
+@{ */
+#ifndef QURT_DEBUG
+#define QURT_TRACE(str, ...) __VA_ARGS__
+    /**< Function tracing is implemented with the QURT_TRACE debug macro, which
+    optionally generates printf statements both before and after every function call that is
+    passed as a macro argument.
+
+    For example, the following macro call in the source code:
+    @code
+    QURT_TRACE(myfunc, my_func(33))
+
+    @endcode
+    generates the following debug output:
+    @code
+    myfile:nnn: myfunc: >>> calling my_func(33)
+    myfile:nnn: myfunc: <<< my_func(33) returned
+    @endcode
+    The debug output includes the source file and line number of the function call, along with
+    the text of the call. Compile the client source file with -D __FILENAME__
+    defined for its file name.
+
+    The library function qurt_printf() generates the debug output.
+    The QURT_DEBUG symbol controls generation of the debug output. If this symbol is
+    not defined, function tracing is not generated.\n
+    @note1hang The debug macro is accessed through the QuRT API header file.
+    */
+#else
+#define QURT_TRACE(str, ...) \
+    do { \
+        qurt_printf("%s:%d: %s: >>> calling %s\n",__FILENAME__,__LINE__,(str),#__VA_ARGS__); \
+        __VA_ARGS__; \
+        qurt_printf("%s:%d: %s: <<< %s returned\n",__FILENAME__,__LINE__,(str),#__VA_ARGS__); \
+    } while (0);
+#endif
+/** @} */ /* end_addtogroup function_tracing_macro */
+
+/**@ingroup func_qurt_etm_set_pc_range
+ Sets the PC address range for ETM filtering.
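+
+ A hedged sketch that limits tracing to one code region (the linker-symbol
+ bounds below are illustrative assumptions, not part of this API):
+ @code
+ extern char __hot_start[], __hot_end[];   // assumed section bounds
+
+ unsigned int rc = qurt_etm_set_pc_range (0,
+                                          (unsigned int) __hot_start,
+                                          (unsigned int) __hot_end);
+ // rc is QURT_ETM_SETUP_OK on success
+ @endcode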
+ Depending on the Hexagon core design, a maximum of four PC ranges are supported.
+
+ @param[in] range_num 0 to 3.
+ @param[in] low_addr  Lower boundary of the PC address range.
+ @param[in] high_addr Upper boundary of the PC address range.
+
+ @returns
+ #QURT_ETM_SETUP_OK -- Success. \n
+ #QURT_ETM_SETUP_ERR -- Failure.
+
+ @dependencies
+ None.
+*/
+unsigned int qurt_etm_set_pc_range(unsigned int range_num, unsigned int low_addr, unsigned int high_addr);
+
+/**@ingroup func_qurt_etm_set_range
+ Sets the address range for ETM filtering.
+ This function lets the caller select the source type of the addresses: #QURT_ETM_SOURCE_PC or #QURT_ETM_SOURCE_DATA.
+
+ @param[in] addr_source_type Type of the address source:\n
+                             - #QURT_ETM_SOURCE_PC \n
+                             - #QURT_ETM_SOURCE_DATA @tablebulletend
+ @param[in] trig_block_num   0 to 3.
+ @param[in] pid              PID of the process: \n
+                             - Any valid PID number enables ASID-based trace filtering. \n
+                             - QURT_ETM_NO_PID disables ASID-based trace filtering. @tablebulletend
+ @param[in] low_addr         Lower boundary of the PC address range.
+ @param[in] high_addr        Upper boundary of the PC address range.
+
+ @returns
+ #QURT_ETM_SETUP_OK -- Success. \n
+ #QURT_ETM_SETUP_ERR -- Failure.
+
+ @dependencies
+ None.
+*/
+unsigned int qurt_etm_set_range(unsigned int addr_source_type, unsigned int trig_block_num, unsigned int pid, unsigned int low_addr, unsigned int high_addr);
+
+/**@ingroup func_qurt_etm_set_atb
+ Sets the advanced trace bus (ATB) state to notify QuRT that the ATB is actively enabled or disabled.
+ QuRT performs the corresponding actions during low-power management.
+
+ @param[in] flag Values: \n
+                 #QURT_ATB_ON \n
+                 #QURT_ATB_OFF
+
+ @returns
+ #QURT_ETM_SETUP_OK -- Success. \n
+ #QURT_ETM_SETUP_ERR -- Failure.
+
+ @dependencies
+ None.
+*/
+unsigned int qurt_etm_set_atb(unsigned int flag);
+
+/**@ingroup func_qurt_etm_set_sync_period
+ Sets the period for each type of synchronization trace packet. \n
+ ASYNC defines the period between alignment synchronization packets.
+ The period is in terms of bytes in the packet stream. \n
+ ISYNC defines the period between instruction synchronization packets.
+ The period is per thread and is defined as the bytes sent out for that thread. \n
+ GSYNC defines the period, in thread cycles, between GSYNC packets.
+
+ @param[in] sync_type Type of synchronization packets: \n
+                      #QURT_ETM_ASYNC_PERIOD \n
+                      #QURT_ETM_ISYNC_PERIOD \n
+                      #QURT_ETM_GSYNC_PERIOD
+ @param[in] period    Period value.
+
+ @return
+ #QURT_ETM_SETUP_OK -- Success. \n
+ #QURT_ETM_SETUP_ERR -- Failure.
+
+ @dependencies
+ None.
+ */
+unsigned int qurt_etm_set_sync_period(unsigned int sync_type, unsigned int period);
+
+/**@ingroup func_qurt_stm_trace_set_config
+ Sets up an STM port for tracing events.
+
+ @datatypes
+ #qurt_stm_trace_info_t
+
+ @param[in] stm_config_info Pointer to the STM trace information used to set up the trace
+                            in the kernel.
+                            The structure must contain the following:\n
+                            - One port address per hardware thread \n
+                            - Event ID for context switches \n
+                            - Event ID for interrupt tracing \n
+                            - Header or marker to identify the beginning of the trace. @tablebulletend
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EINVALID -- Failure; possibly because the passed port address is not in the page table.
+
+ @dependencies
+ None.
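+
+ A minimal configuration sketch (every address, event ID, and marker value
+ below is a placeholder, not a real mapping):
+ @code
+ qurt_stm_trace_info_t cfg = {
+     .stm_port_addr      = { 0x10000000, 0x10000100, 0x10000200,
+                             0x10000300, 0x10000400, 0x10000500 },
+     .thread_event_id    = 1,
+     .interrupt_event_id = 2,
+     .marker             = 0x53544D31,   // "STM1"
+ };
+ unsigned int rc = qurt_stm_trace_set_config (&cfg);
+ // rc is QURT_EOK on success
+ @endcode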
+ */ +unsigned int qurt_stm_trace_set_config(qurt_stm_trace_info_t *stm_config_info); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_TRACE_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_types.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_types.h new file mode 100755 index 0000000000000..bdb83a3fe2fb2 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_types.h @@ -0,0 +1,294 @@ +#ifndef QURT_TYPES_H +#define QURT_TYPES_H +/** + @file qurt_types.h + @brief Contains types common to all configurations + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + + +//#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +#define PGA_BITFIELD_MASK(hi,lo) (((~0u)>>(31U-((hi)-(lo))))<<(lo)) +#define PGA_BITFIELD_GET(x,hi,lo) (((x)&PGA_BITFIELD_MASK((hi),(lo)))>>(lo)) +#define PGA_BITFIELD_INS(hi,lo,v) (((v)<<(lo))&PGA_BITFIELD_MASK((hi),(lo))) +#define PGA_BITFIELD_SET(x,hi,lo,v) ((x)=((x)&~PGA_BITFIELD_MASK((hi),(lo)))|PGA_BITFIELD_INS((hi),(lo),(v))) +#define QURT_PGATTR_C_GET(pga) PGA_BITFIELD_GET((pga).pga_value, 3U, 0U) /* Bits 3-0: cache */ +#define QURT_PGATTR_A_GET(pga) PGA_BITFIELD_GET((pga).pga_value, 5U, 4U) /* Bits 5-4: bus attr */ +#define QURT_PGATTR_C_SET(pga,v) PGA_BITFIELD_SET((pga).pga_value, 3U, 0U, (v)) /* Bits 3-0: cache */ +#define QURT_PGATTR_A_SET(pga,v) PGA_BITFIELD_SET((pga).pga_value, 5U, 4U, (v)) /* Bits 5-4: bus attr */ +#define QURT_PGATTR_MKRAW(v) ((qurt_pgattr_t){.pga_value = (v)}) +#define QURT_PGATTR_MK(c,a) QURT_PGATTR_MKRAW(PGA_BITFIELD_INS(3U,0U,(c))|PGA_BITFIELD_INS(5U,4U,(a))) + +/*return types for qurt_island_get_status2*/ +#define QURT_ISLAND_MODE_NORMAL 0U /**< Normal operating mode */ +#define QURT_ISLAND_MODE_ISLAND 1U /**< Island mode */ +#define QURT_ISLAND_MODE_EXITING 2U /**< In transition from Island mode to Normal mode */ + +/*============================================================================= + FORWARD DECLARATIONS & TYPEDEFS +=============================================================================*/ +/** @addtogroup memory_management_types +@{ */ +typedef unsigned int qurt_addr_t; /**< QuRT address type.*/ +typedef unsigned int qurt_paddr_t; /**< QuRT physical memory address type. */ +/** @cond rest_reg_dist */ +typedef unsigned long long qurt_addr_64_t; /**< QuRT 64-bit memory address type. */ +typedef unsigned long long qurt_paddr_64_t; /**< QuRT 64-bit physical memory address type. */ +typedef unsigned int qurt_mem_region_t; /**< QuRT memory regions type. */ +typedef unsigned int qurt_mem_fs_region_t; /**< QuRT memory FS region type. */ +/**@endcond */ +typedef unsigned int qurt_mem_pool_t; /**< QuRT memory pool type.*/ +typedef unsigned int qurt_size_t; /**< QuRT size type. */ +/** @cond */ +typedef unsigned long long qurt_mmu_entry_t;/**< QuRT MMU entry type. 
*/
+#define QURT_PHYSPOOL_NAME_LEN (32)
+typedef char qurt_physpool_name_t[QURT_PHYSPOOL_NAME_LEN];
+
+
+/*
+ * Mapping type
+ *
+ * QMEM_MAPPING_VIRTUAL is the default mode, in which the system
+ * picks the available range of the virtual address and maps it to
+ * available contiguous physical addresses. Physical-to-virtual
+ * is not guaranteed to be 1:1; both the virtual and physical memory are
+ * contiguous.
+ *
+ * In QMEM_MAPPING_IDEMPOTENT mode, the user provides the physical address;
+ * the kernel allocates 1:1 physical-to-virtual memory. The primary use
+ * of this mapping is to allocate physical-to-virtual memory 1:1.
+ *
+ * In QMEM_MAPPING_PHYS_CONTIGUOUS mode, the virtual address might
+ * not be the same as the physical address. However, the physical address of the
+ * memory region is guaranteed to be contiguous starting at the provided
+ * address, and a fixed physical address must be provided. The primary
+ * use of this mapping is to allocate physical memory from a particular
+ * address, where 1:1 physical-to-virtual is not required.
+ *
+ * QMEM_MAPPING_NONE mode must be used to reserve a virtual memory
+ * area (VMA); no physical memory is reserved or mapped to this virtual
+ * space. All standard qmem_region APIs apply to a VMA; however, the physical
+ * address is always INVALID_ADDR. qmem_region_create() in this mode
+ * returns a handle to the VMA; both virt_addr and phys_addr must
+ * be set to INVALID_ADDR, and the kernel allocates any available virtual
+ * memory of the specified size. Obtain the starting virtual address
+ * of the VMA through qmem_region_attr_getvirtaddr().
+ * The primary purpose of this mapping mode is to provide a mechanism for
+ * delayed binding in QuRT, for example, reserving virtual memory and mapping it at
+ * some later time to possibly discontiguous physical blocks. Thus, a
+ * single VMA can be partitioned among several physical-virtual mappings
+ * created via qmem_region_create() with the QMEM_MAPPING_VIRTUAL_FIXED mapping mode.
+ * Each VMA keeps track of its associated mapped regions.
+ * Deletion of a VMA succeeds only if all associated "virtual_fixed"
+ * regions are freed prior to the VMA deletion.
+ *
+ * Use QMEM_MAPPING_VIRTUAL_FIXED mode to create a region
+ * from virtual space that has been reserved via qmem_region_create()
+ * with the QMEM_MAPPING_NONE mapping. A valid virt_addr is required. If
+ * phys_addr is specified, the kernel attempts to map it accordingly;
+ * if no phys_addr is specified, the kernel maps any available physical
+ * memory. All standard qmem_region APIs apply to such a region. Remapping
+ * a virtual range without prior freeing of the region is not permitted.
+ * When such a region is deleted, its corresponding VMA remains intact.
+ *
+ * QMEM_MAPPING_PHYS_DISCONTIGUOUS mode obtains contiguous
+ * virtual memory, but the physical memory can be discontiguous. This method
+ * tries to combine small physical memory blocks to satisfy the requested
+ * size and is useful when no contiguous full block
+ * of the requested size is available. If the client does not need contiguous physical memory
+ * (for example, if the client does not use physical addressing), this helps
+ * use smaller physical memory blocks rather than consuming contiguous memory.
+ * Note: When memory is allocated through this method, the physical address is
+ * not returned to the caller by the qurt_mem_region_attr_get() API, as there might
+ * not be a single physical address.
+ *
+ */
+/**@endcond */
+/** QuRT memory region mapping type. */
+typedef enum {
+        QURT_MEM_MAPPING_VIRTUAL=0, /**< Default mode.
The region virtual address range maps to an
+                                             available contiguous area of physical memory. For the most
+                                             efficient use of virtual memory, the QuRT system
+                                             chooses the base address in physical memory. This works for most memory
+                                             use cases.*/
+        QURT_MEM_MAPPING_PHYS_CONTIGUOUS = 1, /**< The region virtual address space must be mapped to a
+                                             contiguous area of physical memory. This is necessary when the
+                                             memory region is accessed by external devices that bypass Hexagon
+                                             virtual memory addressing. The base address in physical
+                                             memory must be explicitly specified.*/
+        QURT_MEM_MAPPING_IDEMPOTENT=2, /**< The region virtual address space maps
+                                             to the identical area of physical memory. */
+        QURT_MEM_MAPPING_VIRTUAL_FIXED=3, /**< The virtual address space of the region maps either to the
+                                             specified area of physical memory or (if no area is specified)
+                                             to available physical memory. Use this mapping to create
+                                             regions from virtual space that was reserved by calling
+                                             qurt_mem_region_create() with the #QURT_MEM_MAPPING_NONE mapping. */
+        QURT_MEM_MAPPING_NONE=4, /**< Reserves a virtual memory area (VMA). Remapping a virtual range is not
+                                             permitted without first deleting the memory region. When such a region is
+                                             deleted, its corresponding virtual memory addressing remains intact. */
+        QURT_MEM_MAPPING_VIRTUAL_RANDOM=7, /**< The system chooses a random virtual address and
+                                             maps it to available contiguous physical addresses.*/
+        QURT_MEM_MAPPING_PHYS_DISCONTIGUOUS=8, /**< Virtual memory is contiguous, but the allocation is made from
+                                             discontiguous physical memory blocks. This helps when the available
+                                             contiguous blocks are smaller than the requested size.
+                                             The physical address is not provided as part of the get_attr call. */
+        QURT_MEM_MAPPING_INVALID=10, /**< Reserved as an invalid mapping type. */
+} qurt_mem_mapping_t;
+
+
+/** QuRT cache mode type. */
+typedef enum {
+        QURT_MEM_CACHE_WRITEBACK=7, /**< Write back. */
+        QURT_MEM_CACHE_NONE_SHARED=6, /**< Normal uncached memory that can be shared with other subsystems.*/
+        QURT_MEM_CACHE_WRITETHROUGH=5, /**< Write through. */
+        QURT_MEM_CACHE_WRITEBACK_NONL2CACHEABLE=0, /**< Write back non-L2-cacheable.*/
+        QURT_MEM_CACHE_WRITETHROUGH_NONL2CACHEABLE=1, /**< Write through non-L2-cacheable. */
+        QURT_MEM_CACHE_WRITEBACK_L2CACHEABLE=QURT_MEM_CACHE_WRITEBACK, /**< Write back L2 cacheable. */
+        QURT_MEM_CACHE_WRITETHROUGH_L2CACHEABLE=QURT_MEM_CACHE_WRITETHROUGH, /**< Write through L2 cacheable. */
+        QURT_MEM_CACHE_DEVICE = 4, /**< Volatile memory-mapped device. Access to device memory cannot be cancelled by interrupts, re-ordered, or replayed.*/
+        QURT_MEM_CACHE_NONE = 4, /**< Deprecated -- use #QURT_MEM_CACHE_DEVICE instead. */
+        QURT_MEM_CACHE_DEVICE_SFC = 2, /**< Enables placing limitations on the number of outstanding transactions. */
+        QURT_MEM_CACHE_INVALID=10, /**< Reserved as an invalid cache type. */
+} qurt_mem_cache_mode_t;
+
+/** Memory access permission. */
+#define QURT_PERM_NONE    0x0U /**< No permission. */
+#define QURT_PERM_READ    0x1U /**< Read permission. */
+#define QURT_PERM_WRITE   0x2U /**< Write permission. */
+#define QURT_PERM_EXECUTE 0x4U /**< Execution permission. */
+#define QURT_PERM_NODUMP  0x8U
+                     /**< Skip dumping the mapping. During a process domain dump, some mappings
+                          on host memory must be skipped to avoid a race condition
+                          where the memory is removed from the host and the DSP process
+                          crashes before the mapping is removed. */
+#define QURT_PERM_FULL (QURT_PERM_READ | QURT_PERM_WRITE | QURT_PERM_EXECUTE) /**< Read, write, and execute permission.
*/ + +typedef unsigned char qurt_perm_t; + + +/** @cond rest_reg_dist*/ +/** QuRT cache type; specifies data cache or instruction cache. */ +typedef enum { + QURT_MEM_ICACHE, /**< Instruction cache.*/ + QURT_MEM_DCACHE /**< Data cache.*/ +} qurt_mem_cache_type_t; + +/** QuRT cache operation code type. */ +typedef enum { + QURT_MEM_CACHE_FLUSH, /**< Flush. */ + QURT_MEM_CACHE_INVALIDATE, /**< Invalidate */ + QURT_MEM_CACHE_FLUSH_INVALIDATE, /**< Flush invalidate. */ + QURT_MEM_CACHE_FLUSH_ALL, /**< Flush all. */ + QURT_MEM_CACHE_FLUSH_INVALIDATE_ALL, /**< Flush invalidate all. */ + QURT_MEM_CACHE_TABLE_FLUSH_INVALIDATE, /**< Table flush invalidate. */ + QURT_MEM_CACHE_FLUSH_INVALIDATE_L2, /**< L2 flush invalidate.*/ +} qurt_mem_cache_op_t; + +/** QuRT memory region type. */ +typedef enum { + QURT_MEM_REGION_LOCAL=0, /**< Local. */ + QURT_MEM_REGION_SHARED=1, /**< Shared.*/ + QURT_MEM_REGION_USER_ACCESS=2, /**< User access. */ + QURT_MEM_REGION_FS=4, /**< FS. */ + QURT_MEM_REGION_INVALID=10, /**< Reserved as an invalid region type. */ +} qurt_mem_region_type_t; + +/* Cache and bus attributes are combined into a value of this type for convenience, + and macros for combining and extracting fields are defined here. */ +/** @cond */ +struct qurt_pgattr { + unsigned pga_value; /**< PGA value.*/ +}; +typedef struct qurt_pgattr qurt_pgattr_t; +/** @endcond */ +/** QuRT memory region attributes type.*/ +/* QMEM_MAPPING_IDEMPOTENT and QMEM_MAPPING_PHYS_CONTIGUOUS mode can specify physaddr. + virtaddr cannot be specified for a memory region, it can only be queried by the + qmem_attr_getvirtaddr() function. + */ +typedef struct { + /** @cond */ + qurt_mem_mapping_t mapping_type; + unsigned char perms; + unsigned short owner; + qurt_pgattr_t pga; + unsigned ppn; //physical page number (physical>>12) + qurt_addr_t virtaddr; + qurt_mem_region_type_t type; + qurt_size_t size; + /** @endcond */ +} qurt_mem_region_attr_t; + + +/** QuRT user physical memory pool type. */ +typedef struct { + /** @cond */ + char name[32]; + struct ranges{ + unsigned int start; + unsigned int size; + } ranges[MAX_POOL_RANGES]; + /** @endcond */ +} qurt_mem_pool_attr_t; + +/** QuRT memory pool status type.*/ +typedef struct _qurt_mem_pool_status { + + qurt_size_t contig_size; /**< Largest contiguous free memory in bytes. */ + qurt_size_t free_size; /**< Total free memory in bytes. */ + qurt_size_t total_size; /**< Total declared memory in bytes. */ + +} qurt_mem_pool_status_t; + +typedef enum { + HEXAGON_L1_I_CACHE = 0, /**< Hexagon L1 instruction cache. */ + HEXAGON_L1_D_CACHE = 1, /**< Hexagon L1 data cache. */ + HEXAGON_L2_CACHE = 2 /**< Hexagon L2 cache. */ +} qurt_cache_type_t; + +typedef enum { + FULL_SIZE = 0, /**< Fully shared cache, without partitioning. */ + HALF_SIZE = 1, /**< 1/2 for main, 1/2 for auxiliary. */ + THREE_QUARTER_SIZE = 2, /**< 3/4 for main, 1/4 for auxiliary. */ + SEVEN_EIGHTHS_SIZE = 3 /**< 7/8 for main, 1/8 for auxiliary; for L2 cache only. */ +} qurt_cache_partition_size_t; + +typedef enum { + QURT_PROCESS_CB_GENERIC, /**< generic unconditional cb called after image loading. */ + QURT_PROCESS_NOTE_CB_PRE_MAP, /**< note cb called before segment loading. */ + QURT_PROCESS_NOTE_CB_POST_MAP /**< note cb called after segment loading. 
*/ +} qurt_process_cb_type_t; + +typedef union { + void *ptr; + int num; +} qurt_process_callback_arg_t; + + +/**@endcond*/ + +/** @} */ /* end_addtogroup memory_management_types */ +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_TYPES_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_user_dma.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_user_dma.h new file mode 100755 index 0000000000000..e05a6429fd703 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_user_dma.h @@ -0,0 +1,44 @@ +#ifndef QURT_USER_DMA_H +#define QURT_USER_DMA_H + +/** + @file qurt_user_dma.h + @brief Definitions, macros, and prototypes used for handling user DMA. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup qurt_user_dma_dmsyncht + Sends the DMSyncht command to the user DMA engine. + + Call this function to ensure all posted DMA memory operations are + complete. + + This stalls the current thread until the instruction + is complete and returns. + + @return + QURT_EOK - On dmsyncht completion \n + QURT_ENOTSUPPORTED - User DMA not supported + + @dependencies + None. +*/ +int qurt_user_dma_dmsyncht(void); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_vtlb.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_vtlb.h new file mode 100755 index 0000000000000..e064042e447ac --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_vtlb.h @@ -0,0 +1,76 @@ +/*============================================================================= + + qurt_vtlb.h + +GENERAL DESCRIPTION + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2019, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. +=============================================================================*/ +#ifndef QURT_VTLB_H +#define QURT_VTLB_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* +|| Names starting with "qurt_i_vtlb" are the internal low-level functions. +|| These should be considered subject to change. 
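+||
+|| For illustration only, a hedged sketch of the statistics query (layout per
+|| the comment on qurt_i_vtlb_statistics below):
+||
+||     unsigned stats[3];
+||     (void) qurt_i_vtlb_statistics (stats);
+||     // stats[0] total entries, stats[1] available, stats[2] peak tree size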
+*/ + +int qurt_i_vtlb_entry_create(unsigned *pIndex, + unsigned tlb_lo, + unsigned tlb_hi, + unsigned extension); + +int qurt_i_vtlb_entry_create_with_pid(unsigned *pIndex, + unsigned tlb_lo, + unsigned tlb_hi, + unsigned extension, + unsigned target_pid); + +int qurt_i_vtlb_entry_delete(unsigned index); + +int qurt_i_vtlb_entry_read(unsigned index, unsigned *tlbinfo); + +int qurt_i_vtlb_entry_write(unsigned index, unsigned tlb_lo, unsigned tlb_hi, unsigned extension); + +int qurt_i_vtlb_entry_write_with_pid(unsigned index, unsigned tlb_lo, unsigned tlb_hi, unsigned extension, unsigned target_pid); + +int qurt_i_vtlb_entry_probe(const void *vaddr, unsigned *tlbinfo, unsigned *pIndex); + +int qurt_i_vtlb_entry_probe_with_pid(const void *vaddr, unsigned *tlbinfo, unsigned *pIndex, unsigned target_pid); + + +int qurt_i_vtlb_statistics(unsigned *stats); // Returns stats[0] -- total number of VTLB entries + // stats[1] -- number of available VTLB entries + // stats[2] -- max size of VTLB tree since boot + +//can return index to an entry that was specialed, change it to take addresses instead of pages +int qurt_i_vtlb_set_special(int index, unsigned pageno, unsigned asid, unsigned size); + +int qurt_i_vtlb_queue_ppage(unsigned pageno, unsigned vtlb_index); + +#define QURT_VTLB_EXT_DEFAULT 0U +#define QURT_VTLB_EXT_LOCKED 1U +#define QURT_VTLB_EXT_EXCLUDE_DUMP 2U /* Temporary ability to skip certain mappings in pd dump */ +#define QURT_VTLB_EXT_FREELIST 0x800000u + +#define QURT_VTLB_ERR_OVERLAP -64 +#define QURT_VTLB_ERR_TREE_NO_SPACE -65 +#define QURT_VTLB_ERR_INVALID_SIZE -68 +#define QURT_VTLB_ERR_INVALID_EXT -69 +#define QURT_VTLB_ERR_DEL_PGT_LOCKED -70 +#define QURT_VTLB_ERR_PGT_LOCK_CNT -71 + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif // QURT_VTLB_H diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/libposix.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/libposix.a new file mode 100755 index 0000000000000..ca0bdbacb0604 Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/libposix.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/libqurt.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/libqurt.a new file mode 100755 index 0000000000000..91fc230d94d3b Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/libqurt.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/libqurtcfs.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/libqurtcfs.a new file mode 100755 index 0000000000000..e7a8102d8cb40 Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/libqurtcfs.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/libtimer_island.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/libtimer_island.a new file mode 100755 index 0000000000000..32ce17efe453e Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/libtimer_island.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/libtimer_main.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/libtimer_main.a new file mode 100755 index 0000000000000..a67c32e005b95 Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/libtimer_main.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/pic/libposix.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/pic/libposix.a new file mode 100755 
index 0000000000000..1e0afa4db765b
Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/pic/libposix.a differ
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/pic/libqurt.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/pic/libqurt.a
new file mode 100755
index 0000000000000..fff03b0877eb8
Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/pic/libqurt.a differ
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/pic/libqurtcfs.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/pic/libqurtcfs.a
new file mode 100755
index 0000000000000..e7a8102d8cb40
Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/pic/libqurtcfs.a differ
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/pic/libtimer.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/pic/libtimer.a
new file mode 100755
index 0000000000000..cd856bdb8c5cf
Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/pic/libtimer.a differ
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/bits/confname.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/bits/confname.h
new file mode 100755
index 0000000000000..d9ca3135501e3
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/bits/confname.h
@@ -0,0 +1,528 @@
+#ifndef CONFNAME_H
+#define CONFNAME_H
+/**
+  @file confname.h
+  @brief Named literals for the 'name' argument of sysconf and pathconf
+
+EXTERNAL FUNCTIONS
+  None
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  Do not include this header directly; include unistd.h instead. For now, since the
+  toolchain does not provide a hook for including bits/confname.h, we stick this
+  header in QuRT's sys/types.h
+
+Copyright (c) 2018, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+==============================================================================*/
+/* Values for the NAME argument to `pathconf' and `fpathconf'. */
+enum
+{
+  _PC_LINK_MAX,
+#define _PC_LINK_MAX _PC_LINK_MAX
+  _PC_MAX_CANON,
+#define _PC_MAX_CANON _PC_MAX_CANON
+  _PC_MAX_INPUT,
+#define _PC_MAX_INPUT _PC_MAX_INPUT
+  _PC_NAME_MAX,
+#define _PC_NAME_MAX _PC_NAME_MAX
+  _PC_PATH_MAX,
+#define _PC_PATH_MAX _PC_PATH_MAX
+  _PC_PIPE_BUF,
+#define _PC_PIPE_BUF _PC_PIPE_BUF
+  _PC_CHOWN_RESTRICTED,
+#define _PC_CHOWN_RESTRICTED _PC_CHOWN_RESTRICTED
+  _PC_NO_TRUNC,
+#define _PC_NO_TRUNC _PC_NO_TRUNC
+  _PC_VDISABLE,
+#define _PC_VDISABLE _PC_VDISABLE
+  _PC_SYNC_IO,
+#define _PC_SYNC_IO _PC_SYNC_IO
+  _PC_ASYNC_IO,
+#define _PC_ASYNC_IO _PC_ASYNC_IO
+  _PC_PRIO_IO,
+#define _PC_PRIO_IO _PC_PRIO_IO
+  _PC_SOCK_MAXBUF,
+#define _PC_SOCK_MAXBUF _PC_SOCK_MAXBUF
+  _PC_FILESIZEBITS,
+#define _PC_FILESIZEBITS _PC_FILESIZEBITS
+  _PC_REC_INCR_XFER_SIZE,
+#define _PC_REC_INCR_XFER_SIZE _PC_REC_INCR_XFER_SIZE
+  _PC_REC_MAX_XFER_SIZE,
+#define _PC_REC_MAX_XFER_SIZE _PC_REC_MAX_XFER_SIZE
+  _PC_REC_MIN_XFER_SIZE,
+#define _PC_REC_MIN_XFER_SIZE _PC_REC_MIN_XFER_SIZE
+  _PC_REC_XFER_ALIGN,
+#define _PC_REC_XFER_ALIGN _PC_REC_XFER_ALIGN
+  _PC_ALLOC_SIZE_MIN,
+#define _PC_ALLOC_SIZE_MIN _PC_ALLOC_SIZE_MIN
+  _PC_SYMLINK_MAX,
+#define _PC_SYMLINK_MAX _PC_SYMLINK_MAX
+  _PC_2_SYMLINKS
+#define _PC_2_SYMLINKS _PC_2_SYMLINKS
+};
+
+/* Values for the argument to `sysconf'.
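+   For example (sysconf() itself is declared in unistd.h, not in this header):
+
+       long page_size = sysconf(_SC_PAGESIZE);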
*/ +enum +{ + _SC_ARG_MAX, +#define _SC_ARG_MAX _SC_ARG_MAX + _SC_CHILD_MAX, +#define _SC_CHILD_MAX _SC_CHILD_MAX + _SC_CLK_TCK, +#define _SC_CLK_TCK _SC_CLK_TCK + _SC_NGROUPS_MAX, +#define _SC_NGROUPS_MAX _SC_NGROUPS_MAX + _SC_OPEN_MAX, +#define _SC_OPEN_MAX _SC_OPEN_MAX + _SC_STREAM_MAX, +#define _SC_STREAM_MAX _SC_STREAM_MAX + _SC_TZNAME_MAX, +#define _SC_TZNAME_MAX _SC_TZNAME_MAX + _SC_JOB_CONTROL, +#define _SC_JOB_CONTROL _SC_JOB_CONTROL + _SC_SAVED_IDS, +#define _SC_SAVED_IDS _SC_SAVED_IDS + _SC_REALTIME_SIGNALS, +#define _SC_REALTIME_SIGNALS _SC_REALTIME_SIGNALS + _SC_PRIORITY_SCHEDULING, +#define _SC_PRIORITY_SCHEDULING _SC_PRIORITY_SCHEDULING + _SC_TIMERS, +#define _SC_TIMERS _SC_TIMERS + _SC_ASYNCHRONOUS_IO, +#define _SC_ASYNCHRONOUS_IO _SC_ASYNCHRONOUS_IO + _SC_PRIORITIZED_IO, +#define _SC_PRIORITIZED_IO _SC_PRIORITIZED_IO + _SC_SYNCHRONIZED_IO, +#define _SC_SYNCHRONIZED_IO _SC_SYNCHRONIZED_IO + _SC_FSYNC, +#define _SC_FSYNC _SC_FSYNC + _SC_MAPPED_FILES, +#define _SC_MAPPED_FILES _SC_MAPPED_FILES + _SC_MEMLOCK, +#define _SC_MEMLOCK _SC_MEMLOCK + _SC_MEMLOCK_RANGE, +#define _SC_MEMLOCK_RANGE _SC_MEMLOCK_RANGE + _SC_MEMORY_PROTECTION, +#define _SC_MEMORY_PROTECTION _SC_MEMORY_PROTECTION + _SC_MESSAGE_PASSING, +#define _SC_MESSAGE_PASSING _SC_MESSAGE_PASSING + _SC_SEMAPHORES, +#define _SC_SEMAPHORES _SC_SEMAPHORES + _SC_SHARED_MEMORY_OBJECTS, +#define _SC_SHARED_MEMORY_OBJECTS _SC_SHARED_MEMORY_OBJECTS + _SC_AIO_LISTIO_MAX, +#define _SC_AIO_LISTIO_MAX _SC_AIO_LISTIO_MAX + _SC_AIO_MAX, +#define _SC_AIO_MAX _SC_AIO_MAX + _SC_AIO_PRIO_DELTA_MAX, +#define _SC_AIO_PRIO_DELTA_MAX _SC_AIO_PRIO_DELTA_MAX + _SC_DELAYTIMER_MAX, +#define _SC_DELAYTIMER_MAX _SC_DELAYTIMER_MAX + _SC_MQ_OPEN_MAX, +#define _SC_MQ_OPEN_MAX _SC_MQ_OPEN_MAX + _SC_MQ_PRIO_MAX, +#define _SC_MQ_PRIO_MAX _SC_MQ_PRIO_MAX + _SC_VERSION, +#define _SC_VERSION _SC_VERSION + _SC_PAGESIZE, +#define _SC_PAGESIZE _SC_PAGESIZE +#define _SC_PAGE_SIZE _SC_PAGESIZE + _SC_RTSIG_MAX, +#define _SC_RTSIG_MAX _SC_RTSIG_MAX + _SC_SEM_NSEMS_MAX, +#define _SC_SEM_NSEMS_MAX _SC_SEM_NSEMS_MAX + _SC_SEM_VALUE_MAX, +#define _SC_SEM_VALUE_MAX _SC_SEM_VALUE_MAX + _SC_SIGQUEUE_MAX, +#define _SC_SIGQUEUE_MAX _SC_SIGQUEUE_MAX + _SC_TIMER_MAX, +#define _SC_TIMER_MAX _SC_TIMER_MAX + + /* Values for the argument to `sysconf' + corresponding to _POSIX2_* symbols. 
*/ + _SC_BC_BASE_MAX, +#define _SC_BC_BASE_MAX _SC_BC_BASE_MAX + _SC_BC_DIM_MAX, +#define _SC_BC_DIM_MAX _SC_BC_DIM_MAX + _SC_BC_SCALE_MAX, +#define _SC_BC_SCALE_MAX _SC_BC_SCALE_MAX + _SC_BC_STRING_MAX, +#define _SC_BC_STRING_MAX _SC_BC_STRING_MAX + _SC_COLL_WEIGHTS_MAX, +#define _SC_COLL_WEIGHTS_MAX _SC_COLL_WEIGHTS_MAX + _SC_EQUIV_CLASS_MAX, +#define _SC_EQUIV_CLASS_MAX _SC_EQUIV_CLASS_MAX + _SC_EXPR_NEST_MAX, +#define _SC_EXPR_NEST_MAX _SC_EXPR_NEST_MAX + _SC_LINE_MAX, +#define _SC_LINE_MAX _SC_LINE_MAX + _SC_RE_DUP_MAX, +#define _SC_RE_DUP_MAX _SC_RE_DUP_MAX + _SC_CHARCLASS_NAME_MAX, +#define _SC_CHARCLASS_NAME_MAX _SC_CHARCLASS_NAME_MAX + + _SC_2_VERSION, +#define _SC_2_VERSION _SC_2_VERSION + _SC_2_C_BIND, +#define _SC_2_C_BIND _SC_2_C_BIND + _SC_2_C_DEV, +#define _SC_2_C_DEV _SC_2_C_DEV + _SC_2_FORT_DEV, +#define _SC_2_FORT_DEV _SC_2_FORT_DEV + _SC_2_FORT_RUN, +#define _SC_2_FORT_RUN _SC_2_FORT_RUN + _SC_2_SW_DEV, +#define _SC_2_SW_DEV _SC_2_SW_DEV + _SC_2_LOCALEDEF, +#define _SC_2_LOCALEDEF _SC_2_LOCALEDEF + + _SC_PII, +#define _SC_PII _SC_PII + _SC_PII_XTI, +#define _SC_PII_XTI _SC_PII_XTI + _SC_PII_SOCKET, +#define _SC_PII_SOCKET _SC_PII_SOCKET + _SC_PII_INTERNET, +#define _SC_PII_INTERNET _SC_PII_INTERNET + _SC_PII_OSI, +#define _SC_PII_OSI _SC_PII_OSI + _SC_POLL, +#define _SC_POLL _SC_POLL + _SC_SELECT, +#define _SC_SELECT _SC_SELECT + _SC_UIO_MAXIOV, +#define _SC_UIO_MAXIOV _SC_UIO_MAXIOV + _SC_IOV_MAX = _SC_UIO_MAXIOV, +#define _SC_IOV_MAX _SC_IOV_MAX + _SC_PII_INTERNET_STREAM, +#define _SC_PII_INTERNET_STREAM _SC_PII_INTERNET_STREAM + _SC_PII_INTERNET_DGRAM, +#define _SC_PII_INTERNET_DGRAM _SC_PII_INTERNET_DGRAM + _SC_PII_OSI_COTS, +#define _SC_PII_OSI_COTS _SC_PII_OSI_COTS + _SC_PII_OSI_CLTS, +#define _SC_PII_OSI_CLTS _SC_PII_OSI_CLTS + _SC_PII_OSI_M, +#define _SC_PII_OSI_M _SC_PII_OSI_M + _SC_T_IOV_MAX, +#define _SC_T_IOV_MAX _SC_T_IOV_MAX + + /* Values according to POSIX 1003.1c (POSIX threads). 
*/ + _SC_THREADS, +#define _SC_THREADS _SC_THREADS + _SC_THREAD_SAFE_FUNCTIONS, +#define _SC_THREAD_SAFE_FUNCTIONS _SC_THREAD_SAFE_FUNCTIONS + _SC_GETGR_R_SIZE_MAX, +#define _SC_GETGR_R_SIZE_MAX _SC_GETGR_R_SIZE_MAX + _SC_GETPW_R_SIZE_MAX, +#define _SC_GETPW_R_SIZE_MAX _SC_GETPW_R_SIZE_MAX + _SC_LOGIN_NAME_MAX, +#define _SC_LOGIN_NAME_MAX _SC_LOGIN_NAME_MAX + _SC_TTY_NAME_MAX, +#define _SC_TTY_NAME_MAX _SC_TTY_NAME_MAX + _SC_THREAD_DESTRUCTOR_ITERATIONS, +#define _SC_THREAD_DESTRUCTOR_ITERATIONS _SC_THREAD_DESTRUCTOR_ITERATIONS + _SC_THREAD_KEYS_MAX, +#define _SC_THREAD_KEYS_MAX _SC_THREAD_KEYS_MAX + _SC_THREAD_STACK_MIN, +#define _SC_THREAD_STACK_MIN _SC_THREAD_STACK_MIN + _SC_THREAD_THREADS_MAX, +#define _SC_THREAD_THREADS_MAX _SC_THREAD_THREADS_MAX + _SC_THREAD_ATTR_STACKADDR, +#define _SC_THREAD_ATTR_STACKADDR _SC_THREAD_ATTR_STACKADDR + _SC_THREAD_ATTR_STACKSIZE, +#define _SC_THREAD_ATTR_STACKSIZE _SC_THREAD_ATTR_STACKSIZE + _SC_THREAD_PRIORITY_SCHEDULING, +#define _SC_THREAD_PRIORITY_SCHEDULING _SC_THREAD_PRIORITY_SCHEDULING + _SC_THREAD_PRIO_INHERIT, +#define _SC_THREAD_PRIO_INHERIT _SC_THREAD_PRIO_INHERIT + _SC_THREAD_PRIO_PROTECT, +#define _SC_THREAD_PRIO_PROTECT _SC_THREAD_PRIO_PROTECT + _SC_THREAD_PROCESS_SHARED, +#define _SC_THREAD_PROCESS_SHARED _SC_THREAD_PROCESS_SHARED + + _SC_NPROCESSORS_CONF, +#define _SC_NPROCESSORS_CONF _SC_NPROCESSORS_CONF + _SC_NPROCESSORS_ONLN, +#define _SC_NPROCESSORS_ONLN _SC_NPROCESSORS_ONLN + _SC_PHYS_PAGES, +#define _SC_PHYS_PAGES _SC_PHYS_PAGES + _SC_AVPHYS_PAGES, +#define _SC_AVPHYS_PAGES _SC_AVPHYS_PAGES + _SC_ATEXIT_MAX, +#define _SC_ATEXIT_MAX _SC_ATEXIT_MAX + _SC_PASS_MAX, +#define _SC_PASS_MAX _SC_PASS_MAX + + _SC_XOPEN_VERSION, +#define _SC_XOPEN_VERSION _SC_XOPEN_VERSION + _SC_XOPEN_XCU_VERSION, +#define _SC_XOPEN_XCU_VERSION _SC_XOPEN_XCU_VERSION + _SC_XOPEN_UNIX, +#define _SC_XOPEN_UNIX _SC_XOPEN_UNIX + _SC_XOPEN_CRYPT, +#define _SC_XOPEN_CRYPT _SC_XOPEN_CRYPT + _SC_XOPEN_ENH_I18N, +#define _SC_XOPEN_ENH_I18N _SC_XOPEN_ENH_I18N + _SC_XOPEN_SHM, +#define _SC_XOPEN_SHM _SC_XOPEN_SHM + + _SC_2_CHAR_TERM, +#define _SC_2_CHAR_TERM _SC_2_CHAR_TERM + _SC_2_C_VERSION, +#define _SC_2_C_VERSION _SC_2_C_VERSION + _SC_2_UPE, +#define _SC_2_UPE _SC_2_UPE + + _SC_XOPEN_XPG2, +#define _SC_XOPEN_XPG2 _SC_XOPEN_XPG2 + _SC_XOPEN_XPG3, +#define _SC_XOPEN_XPG3 _SC_XOPEN_XPG3 + _SC_XOPEN_XPG4, +#define _SC_XOPEN_XPG4 _SC_XOPEN_XPG4 + + _SC_CHAR_BIT, +#define _SC_CHAR_BIT _SC_CHAR_BIT + _SC_CHAR_MAX, +#define _SC_CHAR_MAX _SC_CHAR_MAX + _SC_CHAR_MIN, +#define _SC_CHAR_MIN _SC_CHAR_MIN + _SC_INT_MAX, +#define _SC_INT_MAX _SC_INT_MAX + _SC_INT_MIN, +#define _SC_INT_MIN _SC_INT_MIN + _SC_LONG_BIT, +#define _SC_LONG_BIT _SC_LONG_BIT + _SC_WORD_BIT, +#define _SC_WORD_BIT _SC_WORD_BIT + _SC_MB_LEN_MAX, +#define _SC_MB_LEN_MAX _SC_MB_LEN_MAX + _SC_NZERO, +#define _SC_NZERO _SC_NZERO + _SC_SSIZE_MAX, +#define _SC_SSIZE_MAX _SC_SSIZE_MAX + _SC_SCHAR_MAX, +#define _SC_SCHAR_MAX _SC_SCHAR_MAX + _SC_SCHAR_MIN, +#define _SC_SCHAR_MIN _SC_SCHAR_MIN + _SC_SHRT_MAX, +#define _SC_SHRT_MAX _SC_SHRT_MAX + _SC_SHRT_MIN, +#define _SC_SHRT_MIN _SC_SHRT_MIN + _SC_UCHAR_MAX, +#define _SC_UCHAR_MAX _SC_UCHAR_MAX + _SC_UINT_MAX, +#define _SC_UINT_MAX _SC_UINT_MAX + _SC_ULONG_MAX, +#define _SC_ULONG_MAX _SC_ULONG_MAX + _SC_USHRT_MAX, +#define _SC_USHRT_MAX _SC_USHRT_MAX + + _SC_NL_ARGMAX, +#define _SC_NL_ARGMAX _SC_NL_ARGMAX + _SC_NL_LANGMAX, +#define _SC_NL_LANGMAX _SC_NL_LANGMAX + _SC_NL_MSGMAX, +#define _SC_NL_MSGMAX _SC_NL_MSGMAX + _SC_NL_NMAX, +#define _SC_NL_NMAX _SC_NL_NMAX + 
_SC_NL_SETMAX, +#define _SC_NL_SETMAX _SC_NL_SETMAX + _SC_NL_TEXTMAX, +#define _SC_NL_TEXTMAX _SC_NL_TEXTMAX + + _SC_XBS5_ILP32_OFF32, +#define _SC_XBS5_ILP32_OFF32 _SC_XBS5_ILP32_OFF32 + _SC_XBS5_ILP32_OFFBIG, +#define _SC_XBS5_ILP32_OFFBIG _SC_XBS5_ILP32_OFFBIG + _SC_XBS5_LP64_OFF64, +#define _SC_XBS5_LP64_OFF64 _SC_XBS5_LP64_OFF64 + _SC_XBS5_LPBIG_OFFBIG, +#define _SC_XBS5_LPBIG_OFFBIG _SC_XBS5_LPBIG_OFFBIG + + _SC_XOPEN_LEGACY, +#define _SC_XOPEN_LEGACY _SC_XOPEN_LEGACY + _SC_XOPEN_REALTIME, +#define _SC_XOPEN_REALTIME _SC_XOPEN_REALTIME + _SC_XOPEN_REALTIME_THREADS, +#define _SC_XOPEN_REALTIME_THREADS _SC_XOPEN_REALTIME_THREADS + + _SC_ADVISORY_INFO, +#define _SC_ADVISORY_INFO _SC_ADVISORY_INFO + _SC_BARRIERS, +#define _SC_BARRIERS _SC_BARRIERS + _SC_BASE, +#define _SC_BASE _SC_BASE + _SC_C_LANG_SUPPORT, +#define _SC_C_LANG_SUPPORT _SC_C_LANG_SUPPORT + _SC_C_LANG_SUPPORT_R, +#define _SC_C_LANG_SUPPORT_R _SC_C_LANG_SUPPORT_R + _SC_CLOCK_SELECTION, +#define _SC_CLOCK_SELECTION _SC_CLOCK_SELECTION + _SC_CPUTIME, +#define _SC_CPUTIME _SC_CPUTIME + _SC_THREAD_CPUTIME, +#define _SC_THREAD_CPUTIME _SC_THREAD_CPUTIME + _SC_DEVICE_IO, +#define _SC_DEVICE_IO _SC_DEVICE_IO + _SC_DEVICE_SPECIFIC, +#define _SC_DEVICE_SPECIFIC _SC_DEVICE_SPECIFIC + _SC_DEVICE_SPECIFIC_R, +#define _SC_DEVICE_SPECIFIC_R _SC_DEVICE_SPECIFIC_R + _SC_FD_MGMT, +#define _SC_FD_MGMT _SC_FD_MGMT + _SC_FIFO, +#define _SC_FIFO _SC_FIFO + _SC_PIPE, +#define _SC_PIPE _SC_PIPE + _SC_FILE_ATTRIBUTES, +#define _SC_FILE_ATTRIBUTES _SC_FILE_ATTRIBUTES + _SC_FILE_LOCKING, +#define _SC_FILE_LOCKING _SC_FILE_LOCKING + _SC_FILE_SYSTEM, +#define _SC_FILE_SYSTEM _SC_FILE_SYSTEM + _SC_MONOTONIC_CLOCK, +#define _SC_MONOTONIC_CLOCK _SC_MONOTONIC_CLOCK + _SC_MULTI_PROCESS, +#define _SC_MULTI_PROCESS _SC_MULTI_PROCESS + _SC_SINGLE_PROCESS, +#define _SC_SINGLE_PROCESS _SC_SINGLE_PROCESS + _SC_NETWORKING, +#define _SC_NETWORKING _SC_NETWORKING + _SC_READER_WRITER_LOCKS, +#define _SC_READER_WRITER_LOCKS _SC_READER_WRITER_LOCKS + _SC_SPIN_LOCKS, +#define _SC_SPIN_LOCKS _SC_SPIN_LOCKS + _SC_REGEXP, +#define _SC_REGEXP _SC_REGEXP + _SC_REGEX_VERSION, +#define _SC_REGEX_VERSION _SC_REGEX_VERSION + _SC_SHELL, +#define _SC_SHELL _SC_SHELL + _SC_SIGNALS, +#define _SC_SIGNALS _SC_SIGNALS + _SC_SPAWN, +#define _SC_SPAWN _SC_SPAWN + _SC_SPORADIC_SERVER, +#define _SC_SPORADIC_SERVER _SC_SPORADIC_SERVER + _SC_THREAD_SPORADIC_SERVER, +#define _SC_THREAD_SPORADIC_SERVER _SC_THREAD_SPORADIC_SERVER + _SC_SYSTEM_DATABASE, +#define _SC_SYSTEM_DATABASE _SC_SYSTEM_DATABASE + _SC_SYSTEM_DATABASE_R, +#define _SC_SYSTEM_DATABASE_R _SC_SYSTEM_DATABASE_R + _SC_TIMEOUTS, +#define _SC_TIMEOUTS _SC_TIMEOUTS + _SC_TYPED_MEMORY_OBJECTS, +#define _SC_TYPED_MEMORY_OBJECTS _SC_TYPED_MEMORY_OBJECTS + _SC_USER_GROUPS, +#define _SC_USER_GROUPS _SC_USER_GROUPS + _SC_USER_GROUPS_R, +#define _SC_USER_GROUPS_R _SC_USER_GROUPS_R + _SC_2_PBS, +#define _SC_2_PBS _SC_2_PBS + _SC_2_PBS_ACCOUNTING, +#define _SC_2_PBS_ACCOUNTING _SC_2_PBS_ACCOUNTING + _SC_2_PBS_LOCATE, +#define _SC_2_PBS_LOCATE _SC_2_PBS_LOCATE + _SC_2_PBS_MESSAGE, +#define _SC_2_PBS_MESSAGE _SC_2_PBS_MESSAGE + _SC_2_PBS_TRACK, +#define _SC_2_PBS_TRACK _SC_2_PBS_TRACK + _SC_SYMLOOP_MAX, +#define _SC_SYMLOOP_MAX _SC_SYMLOOP_MAX + _SC_STREAMS, +#define _SC_STREAMS _SC_STREAMS + _SC_2_PBS_CHECKPOINT, +#define _SC_2_PBS_CHECKPOINT _SC_2_PBS_CHECKPOINT + + _SC_V6_ILP32_OFF32, +#define _SC_V6_ILP32_OFF32 _SC_V6_ILP32_OFF32 + _SC_V6_ILP32_OFFBIG, +#define _SC_V6_ILP32_OFFBIG _SC_V6_ILP32_OFFBIG + _SC_V6_LP64_OFF64, +#define 
_SC_V6_LP64_OFF64 _SC_V6_LP64_OFF64 + _SC_V6_LPBIG_OFFBIG, +#define _SC_V6_LPBIG_OFFBIG _SC_V6_LPBIG_OFFBIG + + _SC_HOST_NAME_MAX, +#define _SC_HOST_NAME_MAX _SC_HOST_NAME_MAX + _SC_TRACE, +#define _SC_TRACE _SC_TRACE + _SC_TRACE_EVENT_FILTER, +#define _SC_TRACE_EVENT_FILTER _SC_TRACE_EVENT_FILTER + _SC_TRACE_INHERIT, +#define _SC_TRACE_INHERIT _SC_TRACE_INHERIT + _SC_TRACE_LOG, +#define _SC_TRACE_LOG _SC_TRACE_LOG + + _SC_LEVEL1_ICACHE_SIZE, +#define _SC_LEVEL1_ICACHE_SIZE _SC_LEVEL1_ICACHE_SIZE + _SC_LEVEL1_ICACHE_ASSOC, +#define _SC_LEVEL1_ICACHE_ASSOC _SC_LEVEL1_ICACHE_ASSOC + _SC_LEVEL1_ICACHE_LINESIZE, +#define _SC_LEVEL1_ICACHE_LINESIZE _SC_LEVEL1_ICACHE_LINESIZE + _SC_LEVEL1_DCACHE_SIZE, +#define _SC_LEVEL1_DCACHE_SIZE _SC_LEVEL1_DCACHE_SIZE + _SC_LEVEL1_DCACHE_ASSOC, +#define _SC_LEVEL1_DCACHE_ASSOC _SC_LEVEL1_DCACHE_ASSOC + _SC_LEVEL1_DCACHE_LINESIZE, +#define _SC_LEVEL1_DCACHE_LINESIZE _SC_LEVEL1_DCACHE_LINESIZE + _SC_LEVEL2_CACHE_SIZE, +#define _SC_LEVEL2_CACHE_SIZE _SC_LEVEL2_CACHE_SIZE + _SC_LEVEL2_CACHE_ASSOC, +#define _SC_LEVEL2_CACHE_ASSOC _SC_LEVEL2_CACHE_ASSOC + _SC_LEVEL2_CACHE_LINESIZE, +#define _SC_LEVEL2_CACHE_LINESIZE _SC_LEVEL2_CACHE_LINESIZE + _SC_LEVEL3_CACHE_SIZE, +#define _SC_LEVEL3_CACHE_SIZE _SC_LEVEL3_CACHE_SIZE + _SC_LEVEL3_CACHE_ASSOC, +#define _SC_LEVEL3_CACHE_ASSOC _SC_LEVEL3_CACHE_ASSOC + _SC_LEVEL3_CACHE_LINESIZE, +#define _SC_LEVEL3_CACHE_LINESIZE _SC_LEVEL3_CACHE_LINESIZE + _SC_LEVEL4_CACHE_SIZE, +#define _SC_LEVEL4_CACHE_SIZE _SC_LEVEL4_CACHE_SIZE + _SC_LEVEL4_CACHE_ASSOC, +#define _SC_LEVEL4_CACHE_ASSOC _SC_LEVEL4_CACHE_ASSOC + _SC_LEVEL4_CACHE_LINESIZE, +#define _SC_LEVEL4_CACHE_LINESIZE _SC_LEVEL4_CACHE_LINESIZE + /* Leave room here, maybe we need a few more cache levels some day. */ + + _SC_IPV6 = _SC_LEVEL1_ICACHE_SIZE + 50, +#define _SC_IPV6 _SC_IPV6 + _SC_RAW_SOCKETS, +#define _SC_RAW_SOCKETS _SC_RAW_SOCKETS + + _SC_V7_ILP32_OFF32, +#define _SC_V7_ILP32_OFF32 _SC_V7_ILP32_OFF32 + _SC_V7_ILP32_OFFBIG, +#define _SC_V7_ILP32_OFFBIG _SC_V7_ILP32_OFFBIG + _SC_V7_LP64_OFF64, +#define _SC_V7_LP64_OFF64 _SC_V7_LP64_OFF64 + _SC_V7_LPBIG_OFFBIG, +#define _SC_V7_LPBIG_OFFBIG _SC_V7_LPBIG_OFFBIG + + _SC_SS_REPL_MAX, +#define _SC_SS_REPL_MAX _SC_SS_REPL_MAX + + _SC_TRACE_EVENT_NAME_MAX, +#define _SC_TRACE_EVENT_NAME_MAX _SC_TRACE_EVENT_NAME_MAX + _SC_TRACE_NAME_MAX, +#define _SC_TRACE_NAME_MAX _SC_TRACE_NAME_MAX + _SC_TRACE_SYS_MAX, +#define _SC_TRACE_SYS_MAX _SC_TRACE_SYS_MAX + _SC_TRACE_USER_EVENT_MAX, +#define _SC_TRACE_USER_EVENT_MAX _SC_TRACE_USER_EVENT_MAX + + _SC_XOPEN_STREAMS, +#define _SC_XOPEN_STREAMS _SC_XOPEN_STREAMS + + _SC_THREAD_ROBUST_PRIO_INHERIT, +#define _SC_THREAD_ROBUST_PRIO_INHERIT _SC_THREAD_ROBUST_PRIO_INHERIT + _SC_THREAD_ROBUST_PRIO_PROTECT +#define _SC_THREAD_ROBUST_PRIO_PROTECT _SC_THREAD_ROBUST_PRIO_PROTECT + +}; +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/bits/posix1_lim.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/bits/posix1_lim.h new file mode 100755 index 0000000000000..0739958c5a6c4 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/bits/posix1_lim.h @@ -0,0 +1,34 @@ +#ifndef POSIX1_LIM_H +#define POSIX1_LIM_H +/** + @file posix1_lim.h + @brief POSIX Minimum values + +EXTERNAL FUNCTIONS + None + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None + +TODO + This header should ideally be relocated under api/posix/bits (something that + doesn't exist today) and be included from api/posix/bits/limits.h which in turn + should be
included from toolchain's limits.h + +Copyright (c) 2018, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +==============================================================================*/ + +#ifndef _POSIX_PATH_MAX +/** @brief Maximum number of bytes in a pathname, including the terminating + nul character */ +#define _POSIX_PATH_MAX 256 +#endif + +#ifndef _POSIX_SEM_NSEMS_MAX +/** @brief Maximum number of semaphores that a process may have */ +#define _POSIX_SEM_NSEMS_MAX 16 +#endif +#endif + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/common/time.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/common/time.h new file mode 100755 index 0000000000000..76b0d39ab7039 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/common/time.h @@ -0,0 +1 @@ +#include \ No newline at end of file diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/fcntl.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/fcntl.h new file mode 100755 index 0000000000000..c80ec98a449b6 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/fcntl.h @@ -0,0 +1,51 @@ +#ifndef _FCNTL_H +#define _FCNTL_H + +/*========================================================================== + * FILE: fcntl.h + * + * SERVICES: POSIX fcntl.h + * + * DESCRIPTION: The header is needed by the open() and fcntl() + * system calls, which have a variety of parameters and + * flags. They are described here. + * + * The formats of the calls to each of these are: + * + * open(path, oflag [,mode]) open a file + * fcntl(fd, cmd [,arg]) get or set file attributes + * + * Copyright (c) 2013,2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential. + + *==========================================================================*/ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* Oflag values for open(). POSIX Table 6-4. */ +#define POSIX_O_CREAT 0x100 /* creat file if it doesn't exist */ +#define POSIX_O_EXCL 0x200 /* exclusive use flag */ +#define POSIX_O_NOCTTY 0x400 /* do not assign a controlling terminal */ +#define POSIX_O_TRUNC 0x1000 /* truncate flag */ + +/* File status flags for open() and fcntl(). POSIX Table 6-5. */ +#define POSIX_O_APPEND 0x2000 /* set append mode */ +#define POSIX_O_NONBLOCK 0x4000 /* no delay */ + +/* File access modes for open() and fcntl(). POSIX Table 6-6. */ +#define POSIX_O_RDONLY 0 /* open(name, POSIX_O_RDONLY) opens read only */ +#define POSIX_O_WRONLY 1 /* open(name, POSIX_O_WRONLY) opens write only */ +#define POSIX_O_RDWR 2 /* open(name, POSIX_O_RDWR) opens read/write */ + +/* Mask for use with file access modes. POSIX Table 6-7. 
*/ +#define POSIX_O_ACCMODE 0x3 /* mask for file access modes */ + +#ifdef __cplusplus +} +#endif + +#endif /* _FCNTL_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/hooks/unistd.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/hooks/unistd.h new file mode 100755 index 0000000000000..1c618bfe36b4f --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/hooks/unistd.h @@ -0,0 +1,115 @@ +#ifndef UNISTD_H +#define UNISTD_H +/** + @file posix/hooks/unistd.h + @brief POSIX-related declarations that are missing in the toolchain + header + +EXTERNAL FUNCTIONS + None + +INITIALIZATION AND SEQUENCING REQUIREMENTS + Don't include this header directly! Instead, include unistd.h. + +Copyright (c) 2018, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +==============================================================================*/ +#include /* For various POSIX ID types from toolchain headers */ + +#ifdef __cplusplus +extern "C" { +#endif +extern long pathconf (char const * path, int name); + +/* Process */ + +/** The getppid() function shall return the parent process ID of the calling process. + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] the parent process ID + */ +pid_t getppid(void); + +/** The getpgid() function shall return the process group ID of the process whose process ID is equal to pid + * Please refer to POSIX standard for details. + * @param thread [in] process ID + * @param value_ptr [out] process group ID + */ +pid_t getpgid(pid_t pid); + +/** The getpgrp() function shall return the process group ID of the calling process + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] process group ID of the calling process + */ +pid_t getpgrp(void); + +/** The getuid() function shall return the real user ID of the calling process. + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] the real user ID of the calling process. + */ +uid_t getuid(void); + +/** The geteuid() function shall return the effective user ID of the calling process + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] effective user ID of the calling process + */ +uid_t geteuid(void); + +/** The getegid() function shall return the effective group ID of the calling process. + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] effective group ID of the calling process. + */ +gid_t getegid(void); + +/** The getgid() function shall return the real group ID of the calling process + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] real group ID of the calling process. + */ + gid_t getgid(void); + +/** seteuid - set effective user ID + * Please refer to POSIX standard for details. + * @param thread [in] effective user ID + * @param value_ptr [out] Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error. + */ +int seteuid(uid_t uid); + +/** setpgrp - set the process group ID + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error. + */ +pid_t setpgrp(void);
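The getter family above is query-only; a minimal usage sketch follows (the include paths and the printf transport are illustrative — on the real target these declarations live under posix/hooks/ in the SDK tree, and on this single-process PSE51-style subset the returned IDs may well be fixed values):

#include <stdio.h>
#include <unistd.h>

static void log_identity(void)
{
    /* All six calls are simple value queries with no failure mode to handle. */
    printf("ppid=%d pgrp=%d uid=%d euid=%d gid=%d egid=%d\n",
           (int)getppid(), (int)getpgrp(), (int)getuid(),
           (int)geteuid(), (int)getgid(), (int)getegid());
}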
+ +/** setuid - set user ID + * Please refer to POSIX standard for details. + * @param thread [in] user ID + * @param value_ptr [out] Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error. + */ +int setuid(uid_t uid); + +/** setpgid - set process group ID for job control + * Please refer to POSIX standard for details. + * @param thread [in] PID of process, PGID to be set + * @param value_ptr [out] Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error. + */ +int setpgid(pid_t pid, pid_t pgid); + +/** setsid - create session and set process group ID + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error. + */ +pid_t setsid(void); + +#ifdef __cplusplus +} +#endif +#endif + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/mqueue.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/mqueue.h new file mode 100755 index 0000000000000..74dcc2fa202c6 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/mqueue.h @@ -0,0 +1,203 @@ +#ifndef _POSIX_MQUEUE_H_ +#define _POSIX_MQUEUE_H_ + +/*========================================================================== + * FILE: mqueue.h + * + * SERVICES: POSIX Message Queue API interface + * + * DESCRIPTION: POSIX Message Queue API interface based upon POSIX 1003.1-2004 + * + * Copyright (c) 2013, 2016, 2023 Qualcomm Technologies, Inc. + * All Rights Reserved. + * Confidential and Proprietary - Qualcomm Technologies, Inc. + *==========================================================================*/ + +#include /*ssize_t */ +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define MQ_PRIO_MAX 255 /* max priority */ +#define MQ_PRIO_DEFAULT 0 /* default priority */ + +typedef int mqd_t; + +struct mq_attr +{ + long mq_flags; /* message queue flags */ + long mq_maxmsg; /* maximum number of messages */ + long mq_msgsize; /* maximum message size */ + long mq_curmsgs; /* number of messages currently queued */ +}; + +typedef struct mq_attr mqueue_attr; + +/** \details + * This provides POSIX Message Queue API. + * + * mq_notify is not supported. + * + * Since this implementation of POSIX kernel API is a subset of PSE51, + * it only supports message sending and receiving within one process. + * Message sending and receiving among processes are not supported. + */ + +/** \defgroup mqueue POSIX Message Queue API */ +/** \ingroup mqueue */ +/** @{ */ + +/** Open a message queue. + * Please refer to POSIX standard for details. + */ +mqd_t mq_open(const char *name, int oflag, /* mode_t mode, struct mq_attr *attr */...); + +/** Close a message queue. + * Please refer to POSIX standard for details. + */ +int mq_close(mqd_t mq_desc); + +/** Remove a message queue. + * Please refer to POSIX standard for details. + */ +int mq_unlink(const char *name); + +/** Send a message to a message queue. + * Please refer to POSIX standard for details. + * + * If the queue is full, instead of blocking the sender, this function + * will return -1 with errno EAGAIN in this implementation. This behavior + * may change in the future. + */ +int mq_send(mqd_t mqdes, const char *msg_ptr, size_t msg_len, unsigned int msg_prio);
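Because mq_send() never blocks here, a full queue surfaces as -1 with errno EAGAIN rather than as a stalled sender. A minimal send-or-drop sketch built on that documented contract (the helper name and error policy are illustrative):

#include <errno.h>
#include <mqueue.h>

/* Returns 0 when queued, 1 when dropped because the queue was full, -1 otherwise. */
static int mq_send_or_drop(mqd_t q, const char *buf, size_t len)
{
    if (mq_send(q, buf, len, MQ_PRIO_DEFAULT) == 0)
        return 0;
    return (errno == EAGAIN) ? 1 : -1;   /* EAGAIN: full queue, no blocking */
}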
+ +/** Send a message to a message queue with timeout. + * Please refer to POSIX standard for details. + * @param abs_timeout [in] Only abs_timeout={0,0} is supported in this + * implementation. This behavior may change in the future. + */ +int mq_timedsend(mqd_t mqdes, const char *msg_ptr, size_t msg_len, unsigned int msg_prio, const struct timespec *abs_timeout); + +/** Receive a message from a message queue. + * Please refer to POSIX standard for details. + */ +ssize_t mq_receive(mqd_t mqdes, char *msg_ptr, size_t msg_len, unsigned int *msg_prio); + +/** Receive a message from a message queue with timeout. + * Please refer to POSIX standard for details. + * @param abs_timeout [in] Only abs_timeout={0,0} is supported in this + * implementation. This behavior may change in the future. + */ +ssize_t mq_timedreceive(mqd_t mqdes, char *restrict msg_ptr, size_t msg_len, unsigned int *restrict msg_prio, const struct timespec *restrict abs_timeout); + +/** Get message queue attributes. + * Please refer to POSIX standard for details. + */ +int mq_getattr(mqd_t mqdes, struct mq_attr *mqstat); + +/** Set message queue attributes. + * Please refer to POSIX standard for details. + */ +int mq_setattr(mqd_t mqdes, const struct mq_attr *restrict mqstat, struct mq_attr *restrict omqstat); + +/** @} */ + +#define NBBY 8U /* number of bits in a byte */ + +/* + * Select uses bit masks of file descriptors in longs. These macros + * manipulate such bit fields (the filesystem macros use chars). + * FD_SETSIZE may be defined by the user, but the default here should + * be enough for most uses. + */ +#ifndef FD_SETSIZE +#define FD_SETSIZE 256U +#endif + +typedef unsigned long fd_mask; +#define NFDBITS (sizeof(fd_mask) * (unsigned int)NBBY) /* bits per mask */ + +#ifndef howmany +#define howmany(x, y) (((x) + ((y) - 1U)) / (y)) +#endif + +// equivalent of fd_set for the WinNT env +typedef struct fd_set +{ + fd_mask fds_bits[howmany(FD_SETSIZE, NFDBITS)]; +} fd_set; + +/** \addtogroup mqueue */ +/** @{ */ + +/** Sets the bit for the file descriptor fd in the file descriptor set fdset. + */ +#define FD_SET(n, p) ((p)->fds_bits[((unsigned int) (n)) / NFDBITS] |= (1UL << (((unsigned int) (n)) % NFDBITS))) + +/** Clears the bit for the file descriptor fd in the file descriptor set fdset. + */ +#define FD_CLR(n, p) ((p)->fds_bits[((unsigned int) (n)) / NFDBITS] &= ~(1UL << (((unsigned int) (n)) % NFDBITS))) + +/** Returns a non-zero value if the bit for the file descriptor fd is set in the file descriptor set pointed to by fdset, and 0 otherwise. + */ +#define FD_ISSET(n, p) ((unsigned long)(p)->fds_bits[((unsigned int) (n)) / NFDBITS] & (unsigned long)((unsigned)1U << (((unsigned int) (n)) % NFDBITS))) + +/** Copies the file descriptor set. + */ +#define FD_COPY(f, t) (void)(memcpy)((t), (f), sizeof(*(f))) + +/** Initializes the file descriptor set fdset to have zero bits for all file descriptors. + */ +#define FD_ZERO(p) (void)memset((p), 0, sizeof(*(p))) + +/** Error check the file descriptor set. + */ +#define FD_BAD(fd) ((fd) < 0 /*|| fd >= fd_arraylen || fd_array[fd].obj == 0*/) + +/*! Wait for both message queues and signals. In this implementation, only + * message queue file descriptors are supported. + * @param nfds [in] This is an integer one more than the maximum of any file + * descriptor in any of the sets. In other words, while you are busy + * adding file descriptors to your sets, you must calculate the maximum + * integer value of all of them, then increment this value by one, and + * then pass this as nfds to select(). + * @param readfds [in] the file descriptor set on all message queues. + * @param writefds [in] ignored in this implementation. + * @param errorfds [in] ignored in this implementation. + * @param timeout [in] Only timeout={0,0} is supported in this + * implementation. This behavior may change in the future. + */ +int pselect(int nfds, fd_set *restrict readfds, + fd_set *restrict writefds, fd_set *restrict errorfds, + const struct timespec *restrict timeout, + const sigset_t *restrict sigmask); + +/*! Wait for multiple message queues. In this implementation, only + * message queue file descriptors are supported. + * @param nfds [in] This is an integer one more than the maximum of any file + * descriptor in any of the sets. In other words, while you are busy + * adding file descriptors to your sets, you must calculate the maximum + * integer value of all of them, then increment this value by one, and + * then pass this as nfds to select(). + * @param readfds [in] the file descriptor set on all message queues. + * @param writefds [in] ignored in this implementation. + * @param errorfds [in] ignored in this implementation. + * @param timeout [in] Only timeout={0,0} is supported in this + * implementation. This behavior may change in the future. + */ +int select(int nfds, fd_set *restrict readfds, + fd_set *restrict writefds, fd_set *restrict errorfds, + struct timeval *restrict timeout);
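The nfds rule spelled out above (largest descriptor in any set, plus one) and the {0,0}-only timeout restriction combine into the following polling sketch; the two queue descriptors are illustrative, and the include is hedged since fd_set and select() come from this mqueue.h rather than a separate sys/select.h on this SDK:

#include <mqueue.h>

static int poll_queues(mqd_t a, mqd_t b)
{
    fd_set readfds;
    struct timeval tv = { 0, 0 };          /* only {0,0} is supported here */
    int maxfd = (a > b) ? (int)a : (int)b; /* mqd_t is a plain int in this header */

    FD_ZERO(&readfds);
    FD_SET(a, &readfds);
    FD_SET(b, &readfds);

    /* nfds: maximum descriptor value in any set, incremented by one */
    return select(maxfd + 1, &readfds, NULL, NULL, &tv);
}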
+ +/** @} */ + +/* this function is needed for the test framework, which needs to clean up memory at teardown */ +void _mq_teardown(void); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/pthread.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/pthread.h new file mode 100755 index 0000000000000..f64242e8dc683 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/pthread.h @@ -0,0 +1,287 @@ +#ifndef QURT_PTHREAD_H +#define QURT_PTHREAD_H + +/*========================================================================== + * FILE: pthread.h + * + * SERVICES: POSIX pthread API interface + * + * DESCRIPTION: POSIX pthread API interface based upon POSIX 1003.1-2004 + * + * Copyright (c) 2013,2016,2023 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential. + *========================================================================== + * + * EDIT HISTORY FOR MODULE + * + * This section contains comments describing changes made to the module. + * Notice that changes are listed in reverse chronological order. + * + * + * + * when who what, where, why + * -------- --- ------------------------------------------------------- + * 10/13/08 cz Initial version. + *==========================================================================*/ + +#include +#include "sys/sched.h" /* For struct sched_param */ +#include "sys/errno.h" /* error values */ +#include +#include +#include +#include +#include +#include "pthread_types.h" +#ifdef __cplusplus +extern "C" { +#endif + +/* the range of the set supported by the kernel data type used to represent CPU sets. */ +#define CONFIG_NR_CPUS QURT_THREAD_CFG_BITMASK_ALL + +#define UNIMPLEMENTED(FUNC, RETURNTYPE, ARGS) static inline RETURNTYPE FUNC ARGS { qurt_printf("Unimplemented: %s...
exiting\n", __FUNCTION__); exit(1); } + +/** @brief Magic (non-portable) value for a stack's address to enable usage + of auto-stack feature (if available) */ +#define PTHREAD_AUTO_STACK_MAGIC_ADDR_NP ((void *)0xFFF) + +/** \details + * This provides POSIX thread API. + * + */ + +/** \defgroup pthread POSIX pthread API */ +/** \ingroup pthread */ +/** @{ */ + +/** Compare Two Threads. + * Please refer to POSIX standard for details. + */ +static inline int pthread_equal(pthread_t t1, pthread_t t2) +{ + return (t1 == t2) ? 1 : 0; +} + +/** Create Thread. + * Please refer to POSIX standard for details. + */ +int pthread_create(pthread_t * tid, const pthread_attr_t * attr, void *(*start)(void *), void *arg); + +/** Terminate Calling Thread. + * Please refer to POSIX standard for details. + */ +void pthread_exit(void *value_ptr); + +/** Wait for thread termination. + * Please refer to POSIX standard for details. + * @param thread [in] the thread to be joined + * @param value_ptr [out] the pointer of the exit status + */ +int pthread_join(pthread_t thread, void **value_ptr); + +/** Detach a joinable thread. + * Please refer to POSIX standard for details. + * @param id [in] id of the tread the thread to be detached. + */ +int pthread_detach(pthread_t id); + +/** Dynamic package initialisation + * Please refer to POSIX standard for details. + */ +int pthread_once(pthread_once_t *once_control, void (*init_routine)(void)); + +pthread_t pthread_self(void); +int pthread_cancel(pthread_t thread); +static inline void pthread_yield(void) +{ + return; +} + +int pthread_kill(pthread_t thread, int sig); + +/** + * @brief Return name of thread + * @warning Donot call this in the error handling path as it may cause deadlock + * due to underlying OS calls + * @param thread [in] thread Thread whose name is to be retrieved + * @param name [out] name Buffer used to return thread name + * @param len [in] len Number of bytes available in name + * @return 0 on success, ESRCH, ERANGE on failure + */ +extern int pthread_getname_np (pthread_t thread, char * name, size_t len); + +int pthread_getschedparam(pthread_t thread, int *restrict policy, struct sched_param *restrict param); +int pthread_setschedparam(pthread_t thread, int policy, const struct sched_param *param); +int pthread_setschedprio(pthread_t thread, int prio); +int pthread_setcancelstate(int state, int *oldstate); +int pthread_setcanceltype(int type, int *oldtype); + +/* Attribute functions */ +int pthread_attr_init(pthread_attr_t *attr); +int pthread_attr_destroy(pthread_attr_t *attr); +int pthread_attr_setschedparam(pthread_attr_t *restrict attr, const sched_param *restrict param); +int pthread_attr_getschedparam(const pthread_attr_t *restrict attr, sched_param *restrict param); +int pthread_attr_setstacksize(pthread_attr_t *attr, size_t stacksize); +int pthread_attr_getstacksize(const pthread_attr_t *attr, size_t *stacksize); +int pthread_attr_setstackaddr(pthread_attr_t *attr, void * stackaddr); +int pthread_attr_getstackaddr(const pthread_attr_t *attr, void ** stackaddr); +int pthread_attr_getdetachstate(const pthread_attr_t *attr, int *detachstate); +int pthread_attr_setdetachstate(pthread_attr_t *attr, int detachstate); +int pthread_attr_setstack(pthread_attr_t *attr, void *stackaddr, size_t stacksize); +int pthread_attr_getstack(const pthread_attr_t *attr, void **stackaddr, size_t *stacksize); +int pthread_attr_setscope(pthread_attr_t *attr, int scope); +int pthread_attr_getscope(const pthread_attr_t *attr, int *scope); +int 
pthread_attr_setinheritsched(pthread_attr_t *attr, int inheritsched); +int pthread_attr_getinheritsched(const pthread_attr_t *attr, int *inheritsched); +int pthread_attr_getguardsize(const pthread_attr_t * attr, size_t * guardsize); +int pthread_attr_setautostack(pthread_attr_t *attr); +int pthread_attr_setbuspriority(pthread_attr_t *attr, unsigned short bus_priority); + +/* Qualcomm additions to pthread get/set attribute functions */ +int pthread_attr_setthreadname(pthread_attr_t *attr, const char * name); +int pthread_attr_getthreadname(const pthread_attr_t *attr, char * name, int size); +int pthread_attr_settimetestid(pthread_attr_t *attr, unsigned int tid); +int pthread_attr_gettimetestid(const pthread_attr_t *attr, unsigned int* tid); + +/* Mutexes */ +int pthread_mutex_init(pthread_mutex_t *mutex, pthread_mutexattr_t *attr); +int pthread_mutex_lock(pthread_mutex_t *mutex); +int pthread_mutex_unlock(pthread_mutex_t *mutex); +int pthread_mutex_trylock(pthread_mutex_t *mutex); +int pthread_mutex_destroy(pthread_mutex_t *mutex); +int pthread_mutex_getprioceiling(const pthread_mutex_t *restrict mutex, int *restrict prioceiling); +int pthread_mutex_setprioceiling(pthread_mutex_t *restrict mutex, int prioceiling, int *restrict old_ceiling); + +/* For Mutex with type PTHREAD_MUTEX_NORMAL, Priority Inheritance is not + * supported even if PTHREAD_PRIO_INHERIT is defined, since QURT does not support + * this kind of Mutex */ +int pthread_mutexattr_init(pthread_mutexattr_t *attr); +int pthread_mutexattr_destroy(pthread_mutexattr_t *attr); +int pthread_mutexattr_gettype(const pthread_mutexattr_t *restrict, int *restrict); +int pthread_mutexattr_settype(pthread_mutexattr_t *attr, int type); +int pthread_mutexattr_getprotocol(const pthread_mutexattr_t *restrict, int *restrict); +int pthread_mutexattr_setprotocol(pthread_mutexattr_t *attr, int protocol); +int pthread_mutexattr_getpshared(const pthread_mutexattr_t *restrict, int *restrict); +int pthread_mutexattr_setpshared(pthread_mutexattr_t *, int); +int pthread_mutexattr_getprioceiling(const pthread_mutexattr_t *restrict attr, int *restrict prioceiling); +int pthread_mutexattr_setprioceiling(pthread_mutexattr_t *attr, int prioceiling); + +/* Spinlocks */ +int pthread_spin_init(pthread_spinlock_t *lock, int pshared); +int pthread_spin_destroy(pthread_spinlock_t *lock); +int pthread_spin_lock(pthread_spinlock_t *lock); +int pthread_spin_trylock(pthread_spinlock_t *lock); +int pthread_spin_unlock(pthread_spinlock_t *lock); + +/* Condition variables */ +int pthread_condattr_init(pthread_condattr_t *attr); +int pthread_condattr_destroy(pthread_condattr_t *attr); +int pthread_condattr_setpshared(pthread_condattr_t *attr, int pshared); +int pthread_condattr_getpshared(const pthread_condattr_t *restrict attr, int *restrict pshared); +int pthread_condattr_setclock(pthread_condattr_t *attr, clockid_t clock); +int pthread_condattr_getclock(const pthread_condattr_t *restrict attr, clockid_t *restrict clock); +int pthread_cond_init(pthread_cond_t *cond, pthread_condattr_t *attr); +int pthread_cond_destroy(pthread_cond_t *cond); +int pthread_cond_signal(pthread_cond_t *cond); +int pthread_cond_broadcast(pthread_cond_t *cond); +int pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex); +int pthread_cond_timedwait(pthread_cond_t * cond, pthread_mutex_t * mutex, const struct timespec *time); + +/* Barriers */ +int pthread_barrier_init(pthread_barrier_t *restrict barrier, const pthread_barrierattr_t *restrict attr, unsigned count); +int pthread_barrier_destroy(pthread_barrier_t *barrier);
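The mutex and condition-variable declarations above follow the usual POSIX shape, so the standard predicate-loop pattern applies unchanged; a minimal sketch (the flag and its meaning are illustrative, and the static initializers are the ones defined in pthread_types.h below):

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t ready = PTHREAD_COND_INITIALIZER;
static int flag;

static void wait_until_ready(void)
{
    pthread_mutex_lock(&lock);
    while (!flag)                       /* re-check the predicate on every wakeup */
        pthread_cond_wait(&ready, &lock);
    pthread_mutex_unlock(&lock);
}

static void signal_ready(void)
{
    pthread_mutex_lock(&lock);
    flag = 1;
    pthread_cond_signal(&ready);
    pthread_mutex_unlock(&lock);
}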
+int pthread_barrier_wait(pthread_barrier_t *barrier); +int pthread_barrierattr_init(pthread_barrierattr_t *attr); +int pthread_barrierattr_destroy(pthread_barrierattr_t *attr); +int pthread_barrierattr_getpshared(const pthread_barrierattr_t *restrict attr, int *restrict pshared); + + +/*Read-Write locks*/ +int pthread_rwlock_init(pthread_rwlock_t *, const pthread_rwlockattr_t *); +int pthread_rwlock_destroy(pthread_rwlock_t *); +int pthread_rwlockattr_init(pthread_rwlockattr_t *); +int pthread_rwlockattr_destroy(pthread_rwlockattr_t *); +int pthread_rwlockattr_getpshared(const pthread_rwlockattr_t *, int *); +int pthread_rwlockattr_setpshared(pthread_rwlockattr_t *, int); +int pthread_rwlock_rdlock(pthread_rwlock_t *); +int pthread_rwlock_tryrdlock(pthread_rwlock_t *); +int pthread_rwlock_wrlock(pthread_rwlock_t *); +int pthread_rwlock_trywrlock(pthread_rwlock_t *); +int pthread_rwlock_unlock(pthread_rwlock_t *); + + +/** please refer to POSIX standard document + */ +int pthread_barrierattr_setpshared(pthread_barrierattr_t *attr, int pshared); + +/** set CPU affinity attribute in thread attributes object. + + * @param attr [in] pthread attributes + * @param cpusetsize [in] The argument cpusetsize is the length (in bytes) + of the buffer pointed to by cpuset. Typically, + this argument would be specified as + sizeof(cpu_set_t). + * @param cpuset [in] This data set is a bitset where each bit represents + a CPU (hw thread). How the system's CPUs are mapped + to bits in the bitset is system dependent. + For the QURT kernel, bit 0 corresponds to hw + thread 0, and so on. If the corresponding bit is + set to 1, then the software thread is eligible to + run on this hw thread. 0x3f means it can run on any hw + thread; 0x0 also means it can run on any hw thread. + @return On success, this function returns 0; on error, it returns a + non-zero error number. + EINVAL - cpuset specified a CPU that was outside the set supported + by the kernel. (The kernel configuration option + CONFIG_NR_CPUS defines the range of the set supported by + the kernel data type used to represent CPU sets.) + * @note This function is a non-standard GNU extension; hence the suffix "_np" + (non-portable) in the name. + */ +int pthread_attr_setaffinity_np(pthread_attr_t *attr, size_t cpusetsize, const cpu_set_t *cpuset); + +/** get CPU affinity attribute in thread attributes object. + * @param attr [in] pthread attributes + * @param cpusetsize [in] The argument cpusetsize is the length (in bytes) + of the buffer pointed to by cpuset. Typically, + this argument would be specified as + sizeof(cpu_set_t). + * @param cpuset [out] This data set is a bitset where each bit represents + a CPU (hw thread). How the system's CPUs are mapped + to bits in the bitset is system dependent. + For the QURT kernel, bit 0 corresponds to hw + thread 0, and so on. If the corresponding bit is + set to 1, then the software thread is eligible to + run on this hw thread. 0x3f means it can run on any hw + thread; 0x0 also means it can run on any hw thread. + @return On success, this function returns 0; on error, it returns a + non-zero error number. + EINVAL - cpusetsize is smaller than the size of the affinity mask + used by the kernel. + * @note This function is a non-standard GNU extension; hence the suffix "_np" + (non-portable) in the name. + */ +int pthread_attr_getaffinity_np(pthread_attr_t *attr, size_t cpusetsize, cpu_set_t *cpuset);
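Since cpu_set_t is a plain bitmask on this platform (bit N = hardware thread N, per the parameter description above), pinning a new thread reduces to seeding the attribute before creation; a sketch, with the mask value illustrative:

#include <pthread.h>

static int create_pinned(pthread_t *tid, void *(*fn)(void *), void *arg)
{
    pthread_attr_t attr;
    cpu_set_t mask = 0x3;   /* eligible on hw threads 0 and 1 only */
    int rc;

    (void)pthread_attr_init(&attr);
    rc = pthread_attr_setaffinity_np(&attr, sizeof mask, &mask);
    if (rc == 0)
        rc = pthread_create(tid, &attr, fn, arg);
    (void)pthread_attr_destroy(&attr);
    return rc;
}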
+ +/* TLS */ +int pthread_key_create(pthread_key_t *key, void (*destructor)(void*)); +int pthread_key_delete(pthread_key_t key); +int pthread_setspecific(pthread_key_t key, const void *value); +void *pthread_getspecific(pthread_key_t key); +int pthread_getattr_np(pthread_t thread, pthread_attr_t * restrict attr); + +/** @} */ + +/* A non-pthread caller calls this function to create a pthread TCB w/o creating an actual thread */ +int pthread_fake(pthread_t * restrict thread, const pthread_attr_t * restrict attr); +int pthread_fake_destroy(pthread_t thread); + +//amitkulk: move these to unistd.h after we move that header within qurt +int posix_memalign(void **memptr, size_t alignment, size_t size); +void exit(int status); +#ifdef __cplusplus +} +#endif + +#endif /* QURT_PTHREAD_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/pthread_types.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/pthread_types.h new file mode 100755 index 0000000000000..51c3b9dbca243 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/pthread_types.h @@ -0,0 +1,193 @@ +#ifndef _PTHREAD_TYPES_H_ +#define _PTHREAD_TYPES_H_ + +/*========================================================================== + * FILE: pthread_types.h + * + * SERVICES: types used in POSIX API interface + * + * DESCRIPTION: POSIX API interface based upon POSIX 1003.1-2004 + * + * Copyright (c) 2016, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential. + + *==========================================================================*/ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __GNUC__ +#define restrict __restrict__ +#else +#define restrict +#endif + +#define _SSIZE_T + +#ifndef TRUE +#define TRUE 1 +#endif + +#ifndef FALSE +#define FALSE 0 +#endif + +#define PTHREAD_MAX_THREADS 512U + +#define PTHREAD_NAME_LEN 16 +#define PTHREAD_MIN_STACKSIZE 512 //4096 +#define PTHREAD_MAX_STACKSIZE 1048576 +#define PTHREAD_DEFAULT_STACKSIZE 16384 + +#define PTHREAD_STACK_MIN (4096U*2U) +#define PTHREAD_MIN_PRIORITY 0U +#define PTHREAD_MAX_PRIORITY 255U +#define PTHREAD_DEFAULT_PRIORITY 1 + +/*Mutex initialization status*/ +#define PTHREAD_MUTEX_ATTR_UNINITIALIZED 0 +#define PTHREAD_MUTEX_ATTR_INITIALIZED 1 + +/*Conditional attributes initialization status*/ +#define PTHREAD_COND_ATTR_UNINITIALIZED 0 +#define PTHREAD_COND_ATTR_INITIALIZED 1 + +#define PTHREAD_DEFAULT_NAME "Anonymous" + +#define PTHREAD_MUTEX_INITIALIZER ((pthread_mutex_t) 0xFFFFFFFFU) + +#define PTHREAD_COND_INITIALIZER ((pthread_cond_t) 0xFFFFFFFFU) + +/* mutex and cond_var shared */ +#define PTHREAD_PROCESS_PRIVATE 0 +#define PTHREAD_PROCESS_SHARED 1 + +/* mutex type */ +#define PTHREAD_MUTEX_ERRORCHECK 0 +#define PTHREAD_MUTEX_NORMAL 1 +#define PTHREAD_MUTEX_RECURSIVE 2 +#define PTHREAD_MUTEX_DEFAULT 3 + +/* mutex protocol */ +#define PTHREAD_PRIO_NONE 0 +#define PTHREAD_PRIO_INHERIT 1 +#define PTHREAD_PRIO_PROTECT 2 + +#define PTHREAD_SPINLOCK_UNLOCKED 0 +#define PTHREAD_SPINLOCK_LOCKED 1 + +#define PTHREAD_ONCE_INIT (0) + +#define PTHREAD_MUTEX_OPAQUE //ToDo: amitkulk: debug + +typedef signed int ssize_t; + +/*detachstate of a pthread*/ +#define PTHREAD_CREATE_JOINABLE 1 +#define PTHREAD_CREATE_DETACHED 0 + +/*contention scope*/ +#define PTHREAD_SCOPE_PROCESS 1 +#define PTHREAD_SCOPE_SYSTEM 0 + +/*scheduler*/ +#define PTHREAD_INHERIT_SCHED 1 +#define PTHREAD_EXPLICIT_SCHED 0
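The stack-size bounds and name length just defined constrain what a caller may put into a pthread_attr_t; a sketch that wires the Qualcomm thread-name extension and the default stack size into thread creation (the worker function and name are illustrative):

#include <pthread.h>

static void *worker(void *arg) { (void)arg; return NULL; }

static int spawn_worker(pthread_t *tid)
{
    pthread_attr_t attr;
    int rc;

    (void)pthread_attr_init(&attr);
    (void)pthread_attr_setstacksize(&attr, PTHREAD_DEFAULT_STACKSIZE);
    (void)pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
    /* Qualcomm extension declared in pthread.h above; the name must fit PTHREAD_NAME_LEN */
    (void)pthread_attr_setthreadname(&attr, "worker0");

    rc = pthread_create(tid, &attr, worker, NULL);
    (void)pthread_attr_destroy(&attr);
    return rc;
}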
+ +/* + * Types and structure definitions + * + */ +typedef unsigned int cpu_set_t; + +typedef unsigned int pthread_t; + +typedef struct pthread_attr_t +{ + void *stackaddr; + int internal_stack; /* this flag==1 means the stack needs to be freed by posix */ + size_t stacksize; + int priority; + unsigned short timetest_id; + /* This flag indicates whether the thread will be an autostack thread */ + unsigned short autostack:1; + /* This flag indicates whether the thread's bus_priority is high or low + bus_priority = 0 -- Bus_priority is low + bus_priority = 1 -- Bus_priority is high + bus_priority = 3 -- Bus_priority is default (takes the default set for the process) + */ + unsigned short bus_priority:2; + unsigned short reserved:13; + cpu_set_t cpumask; + char name[PTHREAD_NAME_LEN]; + /* This flag indicates whether pthread lib should create thread contexts for other OSALs */ + /* This is used internally by POSIX and not available for general usage */ + int ext_context; + int detachstate; +} pthread_attr_t; + +//mutex attr +typedef struct pthread_mutexattr_t pthread_mutexattr_t; +struct pthread_mutexattr_t +{ + int is_initialized; + int type; + int pshared; + int protocol; +}; + +typedef unsigned int pthread_mutex_t; + +typedef unsigned int pthread_spinlock_t; + +typedef struct pthread_condattr_t +{ + int is_initialized; + int pshared; + clockid_t clock_id; +} pthread_condattr_t; + +typedef unsigned int pthread_cond_t; + +typedef struct pthread_barrierattr_t +{ + int is_initialized; + int pshared; +} pthread_barrierattr_t; + +typedef unsigned int pthread_barrier_t; + +typedef int pthread_key_t; + +typedef int pthread_once_t; + + +/*Read-Write locks*/ +#define PTW32_RWLOCK_MAGIC 0xfacade2 +#define PTHREAD_RWLOCK_INITIALIZER ((pthread_rwlock_t)(size_t) -1) + +struct pthread_rwlockattr_t_ +{ + int pshared; +}; + +struct pthread_rwlock_t_ +{ + pthread_mutex_t mtxExclusiveAccess; + pthread_mutex_t mtxSharedAccessCompleted; + pthread_cond_t cndSharedAccessCompleted; + int nSharedAccessCount; + int nExclusiveAccessCount; + int nCompletedSharedAccessCount; + int nMagic; +}; + +typedef struct pthread_rwlock_t_ * pthread_rwlock_t; +typedef struct pthread_rwlockattr_t_ * pthread_rwlockattr_t; +#ifdef __cplusplus +} +#endif + +#endif /* _PTHREAD_TYPES_H_ */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/sched.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/sched.h new file mode 100755 index 0000000000000..faf3365be9f82 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/sched.h @@ -0,0 +1,21 @@ +/*============================================================================= + + sched.h + +GENERAL DESCRIPTION + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2013,2016 by Qualcomm Technologies, Inc. All Rights Reserved.
+=============================================================================*/ +#ifndef __SCHED_H__ +#define __SCHED_H__ + +#include "sys/sched.h" + +#endif //__SCHED_H__ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/semaphore.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/semaphore.h new file mode 100755 index 0000000000000..d9145b295ae62 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/semaphore.h @@ -0,0 +1,114 @@ +#ifndef SEMAPHORE_H +#define SEMAPHORE_H + +/*========================================================================== + * FILE: semaphore.h + * + * SERVICES: POSIX semaphore API interface + * + * DESCRIPTION: POSIX semaphore API interface based upon POSIX 1003.1-2004 + * + * Copyright (c) 2013, 2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential. + + *==========================================================================*/ +#include // Get all C sys types - includes POSIX specific +#include "sys/errno.h" // error values + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + TYPEDEFS +=============================================================================*/ +/** User facing semaphore container with opaque pointer to implementation */ +typedef struct +{ + unsigned int *opaque; +} sem_t; +#define _SEM_T + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/* constant definitions */ +#define SEM_FAILED ((sem_t*) 0) + +/* @todo siqbal Should we put such configuration items in a common place + instead of this user-facing header? */ +#define SEM_VALUE_MAX ((unsigned int) 30) // If need be increase this + +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/** \details + * POSIX standard comes with two kinds of semaphores: named and unnamed + * semaphores. + * + * This implementation of POSIX kernel API provides unnamed and named semaphores. + * + * + * sem_timedwait() is not provided. + */ + +/** \defgroup semaphore POSIX Semaphore API */ + +/** \ingroup semaphore */ +/** @{ */ + +/** Initialize an unnamed semaphore. + * Please refer to POSIX standard for details. + * @param pshared [in] This implementation does not support non-zero value, + * i.e., semaphore cannot be shared between processes in this implementation. + */ +int sem_init(sem_t *sem, int pshared, unsigned int value); + +/** Lock a semaphore. + * Please refer to POSIX standard for details. + */ +int sem_wait(sem_t *sem); + +/** Lock a semaphore. + * Please refer to POSIX standard for details. + */ +int sem_trywait(sem_t *sem); + +/** Unlock a semaphore. + * Please refer to POSIX standard for details. + */ +int sem_post(sem_t *sem); + +/** Get the value of a semaphore. + * Please refer to POSIX standard for details. + */ +int sem_getvalue(sem_t *sem, int *value); + +/** Destroy an unnamed semaphore. + * Please refer to POSIX standard for details. + */ +int sem_destroy(sem_t *sem); + +/** Creates and initializes a named semaphore. + * Please refer to POSIX standard for details. + */ +sem_t * sem_open(const char* name , int oflag , ...);
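Since pshared must be zero here (semaphores stay process-local, per the sem_init note above), unnamed semaphores reduce to in-process counting; a minimal sketch with an illustrative initial count:

#include <semaphore.h>

static sem_t slots;

static int init_and_use_slots(void)
{
    if (sem_init(&slots, 0, 4) != 0)   /* pshared must be 0 in this implementation */
        return -1;
    if (sem_wait(&slots) == 0)         /* claim a slot... */
        (void)sem_post(&slots);        /* ...and give it back */
    return sem_destroy(&slots);
}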
+ +/** Closes a semaphore. + * Please refer to POSIX standard for details. + */ +int sem_close(sem_t *sem); + +/** Unlinks a named semaphore. + * Please refer to POSIX standard for details. + */ +int sem_unlink(const char *name); +/** @} */ + + +#ifdef __cplusplus +} +#endif + +#endif /* SEMAPHORE_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/signal.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/signal.h new file mode 100755 index 0000000000000..35cb1f1a9a319 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/signal.h @@ -0,0 +1,201 @@ +#ifndef _SIGNAL_H_ +#define _SIGNAL_H_ + +/*========================================================================== + * FILE: signal.h + * + * SERVICES: POSIX Signal API interface + * + * DESCRIPTION: POSIX Signal API interface based upon POSIX 1003.1-2004 + * + * Copyright (c) 2013, 2016, 2023 Qualcomm Technologies, Inc. + * All Rights Reserved. + * Confidential and Proprietary - Qualcomm Technologies, Inc. + + *==========================================================================*/ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* POSIX signal bits */ + +#define POSIX_MSG 7 /* POSIX msg type used in Qube API */ +#define POSIX_NOTIF 8 /* POSIX msg type used in Qube API */ +#define SIGKILL 9 /* kill (cannot be caught or ignored) */ + +#define SIGRTMIN 10 +#define SIGRTMAX 32 + +/* Notification Types. */ +/* No asynchronous notification is delivered when the event of interest occurs. */ +#define SIGEV_NONE 0 +/* The signal specified in sigev_signo shall be generated for the process when + the event of interest occurs. */ +#define SIGEV_SIGNAL 1 +/* A notification function is called to perform notification. */ +#define SIGEV_THREAD 2 +#define SA_SIGINFO 1 + +/* + * Flags for sigprocmask: + */ +#define SIG_BLOCK 1 /* block specified signal set */ +#define SIG_UNBLOCK 2 /* unblock specified signal set */ +#define SIG_SETMASK 3 /* set specified signal set */ + +typedef unsigned long int sigset_t; + +union sigval +{ + int sival_int; /* Integer signal value. */ + void *sival_ptr; /* Pointer signal value. */ +}; + +typedef struct sigevent sigevent; +struct sigevent +{ + int sigev_notify; /* Notification type. */ + int sigev_signo; /* Signal number. */ + union sigval sigev_value; /* Signal value. */ + void (*sigev_notify_function)(union sigval); /* Notification function. */ + pthread_attr_t *sigev_notify_attributes; +}; + +typedef struct siginfo_t siginfo_t; +struct siginfo_t +{ + int si_signo; + int si_code; + union sigval si_value; +/* int si_errno; + pid_t si_pid; + uid_t si_uid; + void *si_addr; + int si_status; + long si_band;*/ +}; +struct sigaction +{ + void (*sa_handler)(int); + sigset_t sa_mask; + int sa_flags; + void (*sa_sigaction)(int, siginfo_t *, void *); +}; + +/* Signal functions */ + +/** \details + * This provides POSIX Signal API. Please note that this + * implementation does not fully comply with POSIX standard. + * + * In POSIX standard, Signal can be used as 'interrupt', which means + * an incoming signal will interrupt a running thread. After the + * registered signal handler is executed, the thread will resume. + * This behavior cannot be implemented w/o modifying L4 or QURT kernel. + * On the other hand, applications need to be carefully written to avoid + * problems caused by 'interrupting' signals.
+ * + * Therefore, in this implementation of POSIX signal, a thread will + * only receive signals when it explicitly waits for signals, i.e., when + * the thread calls either sigwait() or sigsuspend(). + * + * Therefore, pthread_sigmask(), which sets or gets the signal mask for a thread, + * is not supported, since the signal mask will be set by sigwait() and + * sigsuspend(). + * + * Since this implementation of POSIX kernel API is a subset of PSE51, + * only threads can send and receive signals. The functions related to + * signal operations with processes, such as kill(), sigqueue(), + * sigprocmask(), are not provided. + * + * Queued signals are not supported. + * + * Applications will use signals from SIGRTMIN to SIGRTMAX. + * + * SIGEV_SIGNAL and SIGEV_THREAD are supported. SIGEV_NONE is not + * supported. + * + */ + +/** \defgroup signal POSIX Signal API */ +/** \ingroup signal */ +/** @{ */ + +/** Wait for signals. This implementation does not support queued signals. + * + * Please refer to POSIX standard for details. + */ +int sigwait(const sigset_t *restrict set, int *restrict sig); + +/** Examine and Change Signal Action. + * Please refer to POSIX standard for details. + * + * @param act [in] A pointer to the sigaction structure that describes the + * action to be taken for the signal. Can be NULL. + * The following flags for sa_flags field in struct sigaction are not + * supported: SA_NOCLDSTOP, SA_ONSTACK, SA_RESETHAND, SA_RESTART, + * SA_NOCLDWAIT and SA_NODEFER. Only flag SA_SIGINFO is supported. + * + * @note Define sigaction as macro to avoid a warning when included from + * C++ code - it's causing a "sigaction(...) hides constructor for + * 'struct sigaction'" warning. + */ +/*lint -esym(123,sigaction) Suppress "macro used with no arguments" */ +#define sigaction(sig,act,oact) _sigaction((sig),(act),(oact)) + +/** Wait for signals. + * Please refer to POSIX standard for details. + */ +int sigsuspend(const sigset_t *sigmask); + +/** Add Signal to Signal Set. + * Please refer to POSIX standard for details. + */ +int sigaddset(sigset_t *set, int signo); + +/** Delete Signal from Signal Set. + * Please refer to POSIX standard for details. + */ +int sigdelset(sigset_t *set, int signo); + +/** Initialize and Empty Signal Set. + * Please refer to POSIX standard for details. + */ +int sigemptyset(sigset_t *set); + +/** Initialize and Fill Signal Set. + * Please refer to POSIX standard for details. + */ +int sigfillset(sigset_t *set); + +/** Test for Signal in Signal Set. + * Please refer to POSIX standard for details. + */ +int sigismember(const sigset_t *set, int signo); + +/** @} */ + +/* this is not a public api function */ +int _sigaction(int sig, const struct sigaction *act, struct sigaction *oact); + +/* have to move #include here to solve circular include problems between time.h and signal.h */ +#include + +/** Wait for the time interval specified in the timespec structure referenced + * by timeout. This implementation does not support queued signals. + * For struct siginfo_t, si_code and si_value are ignored in this implementation. + * + * Please refer to POSIX standard for details. + */ +int sigtimedwait(const sigset_t *restrict set, siginfo_t *restrict info, + const struct timespec *restrict timeout);
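Under the delivery model described above, a receiver must park itself in sigwait() to see a signal at all; a minimal sketch of such a rendezvous on the first application signal (the choice of SIGRTMIN is illustrative):

#include <signal.h>

static void await_start(void)
{
    sigset_t set;
    int sig = 0;

    (void)sigemptyset(&set);
    (void)sigaddset(&set, SIGRTMIN);   /* application signals live in SIGRTMIN..SIGRTMAX */

    /* Blocks until another thread delivers SIGRTMIN; nothing arrives asynchronously. */
    if (sigwait(&set, &sig) == 0 && sig == SIGRTMIN) {
        /* proceed with startup */
    }
}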
+ +#ifdef __cplusplus +} +#endif + +#endif /* _POSIX_SIGNAL_H_ */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/sys/errno.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/sys/errno.h new file mode 100755 index 0000000000000..b9edf57bab6c3 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/sys/errno.h @@ -0,0 +1,20 @@ +#ifndef _SYS_ERRNO_H_ +#define _SYS_ERRNO_H_ + +/*========================================================================== + * FILE: errno.h + * + * SERVICES: POSIX errno header file + * + * DESCRIPTION: POSIX errno based upon POSIX 1003.1-2004 + * + * Copyright (c) 2013, 2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential. + + *==========================================================================*/ + +#include +#ifndef EOK +#define EOK 0 +#endif + +#endif /* _SYS_ERRNO_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/sys/sched.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/sys/sched.h new file mode 100755 index 0000000000000..2acc34d821725 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/sys/sched.h @@ -0,0 +1,67 @@ +#ifndef _POSIX_SCHED_H_ +#define _POSIX_SCHED_H_ + +/*========================================================================== + * FILE: sched.h + * + * SERVICES: POSIX Thread sched API interface + * + * DESCRIPTION: POSIX Thread sched API interface based upon POSIX 1003.1-2004 + * + * Copyright (c) 2013, 2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential. + + + *==========================================================================*/ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define SCHED_FIFO 0 /* First in, first out (FIFO) scheduling policy. */ +#define SCHED_RR 1 /* Round robin scheduling policy. */ +#define SCHED_SPORADIC 2 /* Sporadic server scheduling policy. */ +#define SCHED_OTHER 3 /* Another scheduling policy. */ + +typedef struct sched_param sched_param; +struct sched_param +{ + void *unimplemented; + int sched_priority; +}; + +/** \details + * This provides POSIX sched API. + */ + +/** \defgroup sched POSIX sched API */ +/** \ingroup sched */ +/** @{ */ + +/** Relinquish the CPU. + * Please refer to POSIX standard for details. + */ +static inline int sched_yield(void) +{ + return 0; +} + +/** Get the maximum priority. + * Please refer to POSIX standard for details. + * @param policy [in] SCHED_FIFO is the only valid input for this implementation. + */ +int sched_get_priority_max(int policy); + +/** Get the minimum priority. + * Please refer to POSIX standard for details. + * @param policy [in] SCHED_FIFO is the only valid input for this implementation. + */ +int sched_get_priority_min(int policy);
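With SCHED_FIFO as the only accepted policy, the priority range is better probed than hard-coded; a small sketch:

#include <sched.h>

static int middle_priority(void)
{
    int lo = sched_get_priority_min(SCHED_FIFO);   /* SCHED_FIFO only, per the notes above */
    int hi = sched_get_priority_max(SCHED_FIFO);
    return lo + (hi - lo) / 2;
}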
+ +/** @} */ +#ifdef __cplusplus +} +#endif + +#endif /* _POSIX_SCHED_H_ */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/sys/types.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/sys/types.h new file mode 100755 index 0000000000000..700026f9f9e4e --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/sys/types.h @@ -0,0 +1,35 @@ +#ifndef _SYS_TYPES_H_ +#define _SYS_TYPES_H_ + +/*========================================================================== + * FILE: types.h + * + * SERVICES: types used in POSIX API interface + * + * DESCRIPTION: POSIX API interface based upon POSIX 1003.1-2004 + * + * Copyright (c) 2013, 2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential. + + *==========================================================================*/ + +#if !defined( _PID_T ) || !defined( __pid_t_defined ) +/* POSIX defines pid_t as signed 32-bit type. Hexagon toolchain's header + defines it as unsigned 32-bit type citing conflict with QuRT POSIX + compatibility layer. If any such conflicts exist, we should fix them. + pid_t is being defined *BEFORE* inclusion of generic/sys/types.h + *INTENTIONALLY* to fix this */ +typedef int pid_t; +#define _PID_T +#define __pid_t_defined +#endif +#include +#include +#include +#include + +#ifndef __DEFINED_off_t +typedef long off_t; +#define __DEFINED_off_t +#endif + +#endif /* _SYS_TYPES_H_ */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/time.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/time.h new file mode 100755 index 0000000000000..13aeb1ea9920d --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/time.h @@ -0,0 +1,142 @@ +#ifndef _POSIX_TIME_H_ +#define _POSIX_TIME_H_ + +/*========================================================================== + * FILE: time.h + * + * SERVICES: POSIX Timer API interface + * + * DESCRIPTION: POSIX Timer API interface based upon POSIX 1003.1-2004 + * + * Copyright (c) 2013,2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential. + *==========================================================================*/ + + +#include + +typedef int clockid_t; /* ignored */ +#define _CLOCKID_T +#define _PROVIDE_POSIX_TIME_DECLS 1 +#include +/* @todo anandj sys/time.h has definition for struct timeval but is not + included by generic/time.h */ +#include + +#define CLOCK_FREQ_NOT_DEFINED -1 +/* Frequency of Sclk used */ +#define TIME_CONV_SCLK_FREQ 19200000 + +#define RES_CONV_FACTOR1 1 +#define RES_CONV_FACTOR2 1000000000 + +#if !defined(CLOCK_REALTIME) +# define CLOCK_REALTIME 0 +#endif + +#if !defined(CLOCK_MONOTONIC) +# define CLOCK_MONOTONIC 1 +#endif + +#if !defined(CLOCK_THREAD_CPUTIME_ID) +# define CLOCK_THREAD_CPUTIME_ID 2 +#endif + +#if !defined(CLOCK_PROCESS_CPUTIME_ID) +# define CLOCK_PROCESS_CPUTIME_ID 3 +#endif + +#if !defined(CLOCK_MONOTONIC_RAW) +# define CLOCK_MONOTONIC_RAW 4 +#endif + +#if !defined(CLOCK_REALTIME_COARSE) +# define CLOCK_REALTIME_COARSE 5 +#endif + +#if !defined(CLOCK_MONOTONIC_COARSE) +# define CLOCK_MONOTONIC_COARSE 6 +#endif + +#if !defined(CLOCK_BOOTTIME) +# define CLOCK_BOOTTIME 7 +#endif + +struct itimerspec +{ + struct timespec it_interval; /* Timer period. */ + struct timespec it_value; /* Timer expiration.
*/ +}; + +/* have to move #include here to solve circular include problems between time.h and signal.h */ +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* Timer functions */ + +/** \details + * POSIX timers can be either of two types: a one-shot type or a periodic + * type. + * + * A one-shot is an armed timer that is set to an expiration time relative + * to either the current time or an absolute time. The timer expires once and + * is disarmed. + * + * A periodic timer is armed with an initial expiration time and a repetition + * interval. Every time the interval timer + * expires, the timer is reloaded with the repetition interval. The timer + * is then rearmed. + */ + +/** \defgroup timer POSIX Timer API */ + +/** \ingroup timer */ +/** @{ */ + +/** Create a POSIX timer. + * Please refer to POSIX standard for details. + * @param clockid [in] ignored in this implementation + * @param evp [in] if non-NULL, points to a sigevent structure. This + * structure, allocated by the application, defines the asynchronous + * notification to occur when the timer expires. If the evp argument is + * NULL, the effect is as if the evp argument pointed to a sigevent + * structure with the sigev_notify member having the value SIGEV_SIGNAL, + * the sigev_signo having a default signal number (SIGALRM), and the + * sigev_value member having the value of the timer ID. + */ +int timer_create(clockid_t clockid, struct sigevent *restrict evp, + timer_t *restrict timerid); + +/** Delete a POSIX timer. + * Please refer to POSIX standard for details. + */ +int timer_delete(timer_t timerid); + +/** Get the time remaining on a POSIX timer. + * Please refer to POSIX standard for details. + */ +int timer_gettime(timer_t timerid, struct itimerspec *value); + + +/** Set the time remaining on a POSIX timer. + * Please refer to POSIX standard for details. + * @param flags [in] ignored in this implementation + */ +int timer_settime(timer_t timerid, int flags, + const struct itimerspec *restrict value, + struct itimerspec *restrict ovalue); +/** Obtain ID of a process CPU-time clock + * @param pid [in] Process ID + * @param clock_id [out] Clock ID + * @return Error values as per POSIX standard + */ +int clock_getcpuclockid (pid_t pid, clockid_t * clock_id); +/** @} */ + +#ifdef __cplusplus +} +#endif + +#endif /* _POSIX_TIME_H_ */
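Putting the one-shot/periodic distinction above into code: a nonzero it_interval is what makes the timer reload after each expiry. A sketch of a 1 s periodic timer relying on the default (evp == NULL) notification described for timer_create(); the period is illustrative:

#include <time.h>

static int start_tick(timer_t *tid)
{
    struct itimerspec spec;

    if (timer_create(CLOCK_REALTIME, NULL, tid) != 0)  /* clockid is ignored here */
        return -1;

    spec.it_value.tv_sec     = 1;  /* first expiry after 1 s */
    spec.it_value.tv_nsec    = 0;
    spec.it_interval.tv_sec  = 1;  /* nonzero interval: reload and rearm on each expiry */
    spec.it_interval.tv_nsec = 0;

    return timer_settime(*tid, 0, &spec, NULL);        /* flags are ignored here */
}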
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qube/qube.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qube/qube.h new file mode 100755 index 0000000000000..1e31e2deedb38 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qube/qube.h @@ -0,0 +1,51 @@ +#ifndef QUBE_H +#define QUBE_H +/*============================================================================= + + qube.h -- H E A D E R F I L E + +GENERAL DESCRIPTION + Prototypes of qpd API + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2013 by Qualcomm Technologies, Inc. All Rights Reserved. + +=============================================================================*/ + + + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/* Define error codes as QuRT error codes prefixed with QURT_ */ +#ifndef EOK +#define EOK QURT_EOK +#endif /* EOK */ +#ifndef EVAL +#define EVAL QURT_EVAL +#endif /* EVAL */ +#ifndef EMEM +#define EMEM QURT_EMEM +#endif /* EMEM */ +#ifndef EINVALID +#define EINVALID QURT_EINVALID +#endif /* EINVALID */ + + +/*============================================================================= + FUNCTION DECLARATIONS +=============================================================================*/ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QUBE_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/atomic_ops.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/atomic_ops.h new file mode 100755 index 0000000000000..0a9a9f8ba7db5 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/atomic_ops.h @@ -0,0 +1,197 @@ +#ifndef ATOMIC_OPS_H +#define ATOMIC_OPS_H +/** + @file atomic_ops.h + + @brief Backwards-compatible type definitions. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved +Confidential and Proprietary - Qualcomm Technologies, Inc. +=============================================================================*/ + + +/* + * Australian Public Licence B (OZPLB) + * + * Version 1-0 + * + * Copyright (c) 2007, Open Kernel Labs, Inc. + * + * All rights reserved. + * + * Developed by: Embedded, Real-time and Operating Systems Program (ERTOS) + * National ICT Australia + * http://www.ertos.nicta.com.au + * + * Permission is granted by National ICT Australia, free of charge, to + * any person obtaining a copy of this software and any associated + * documentation files (the "Software") to deal with the Software without + * restriction, including (without limitation) the rights to use, copy, + * modify, adapt, merge, publish, distribute, communicate to the public, + * sublicense, and/or sell, lend or rent out copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject + * to the following conditions: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimers. + * + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimers in the documentation and/or other materials provided + * with the distribution. + * + * * Neither the name of National ICT Australia, nor the names of its + * contributors, may be used to endorse or promote products derived + * from this Software without specific prior written permission. + * + * EXCEPT AS EXPRESSLY STATED IN THIS LICENCE AND TO THE FULL EXTENT + * PERMITTED BY APPLICABLE LAW, THE SOFTWARE IS PROVIDED "AS-IS", AND + * NATIONAL ICT AUSTRALIA AND ITS CONTRIBUTORS MAKE NO REPRESENTATIONS, + * WARRANTIES OR CONDITIONS OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO ANY REPRESENTATIONS, WARRANTIES OR CONDITIONS + * REGARDING THE CONTENTS OR ACCURACY OF THE SOFTWARE, OR OF TITLE, + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, + * THE ABSENCE OF LATENT OR OTHER DEFECTS, OR THE PRESENCE OR ABSENCE OF + * ERRORS, WHETHER OR NOT DISCOVERABLE.
+ *
+ * TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL
+ * NATIONAL ICT AUSTRALIA OR ITS CONTRIBUTORS BE LIABLE ON ANY LEGAL
+ * THEORY (INCLUDING, WITHOUT LIMITATION, IN AN ACTION OF CONTRACT,
+ * NEGLIGENCE OR OTHERWISE) FOR ANY CLAIM, LOSS, DAMAGES OR OTHER
+ * LIABILITY, INCLUDING (WITHOUT LIMITATION) LOSS OF PRODUCTION OR
+ * OPERATION TIME, LOSS, DAMAGE OR CORRUPTION OF DATA OR RECORDS; OR LOSS
+ * OF ANTICIPATED SAVINGS, OPPORTUNITY, REVENUE, PROFIT OR GOODWILL, OR
+ * OTHER ECONOMIC LOSS; OR ANY SPECIAL, INCIDENTAL, INDIRECT,
+ * CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES, ARISING OUT OF OR IN
+ * CONNECTION WITH THIS LICENCE, THE SOFTWARE OR THE USE OF OR OTHER
+ * DEALINGS WITH THE SOFTWARE, EVEN IF NATIONAL ICT AUSTRALIA OR ITS
+ * CONTRIBUTORS HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH CLAIM, LOSS,
+ * DAMAGES OR OTHER LIABILITY.
+ *
+ * If applicable legislation implies representations, warranties, or
+ * conditions, or imposes obligations or liability on National ICT
+ * Australia or one of its contributors in respect of the Software that
+ * cannot be wholly or partly excluded, restricted or modified, the
+ * liability of National ICT Australia or the contributor is limited, to
+ * the full extent permitted by the applicable legislation, at its
+ * option, to:
+ * a.  in the case of goods, any one or more of the following:
+ * i.   the replacement of the goods or the supply of equivalent goods;
+ * ii.  the repair of the goods;
+ * iii. the payment of the cost of replacing the goods or of acquiring
+ *      equivalent goods;
+ * iv.  the payment of the cost of having the goods repaired; or
+ * b.  in the case of services:
+ * i.   the supplying of the services again; or
+ * ii.  the payment of the cost of having the services supplied again.
+ *
+ * The construction, validity and performance of this licence is governed
+ * by the laws in force in New South Wales, Australia.
+ */
+
+/*
+ * Author: Malcolm Purvis
+ * Author: Carlos Dyonisio
+ */
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef unsigned int atomic_plain_word_t;
+
+/*-------------------------------------------------------------------------*/
+                         /* Atomic Ops API. */
+
+/*
+ * IMPORTANT!
+ * If you plan to change the structure atomic_word_t, please add the new
+ * elements after value. For more information, read the comment in
+ * arch/arm/libs/atomic_ops/v5/src/arm_atomic_ops.spp:66
+ */
+
+typedef struct {
+    volatile atomic_plain_word_t value;
+} atomic_word_t;
+
+#define ATOMIC_INIT(i) { (i) }
+
+static inline void
+atomic_init(atomic_word_t *a, atomic_plain_word_t v)
+{
+    a->value = v;
+}
+
+#if defined(ARCH_ARM) && defined(ARCH_VER) && (ARCH_VER < 6) && \
+        (!defined(__ATOMIC_OPS_IN_KERNEL__) || defined(MACHINE_SMP))
+
+/*
+ * If it is ARMv4/v5, the function declarations may change
+ * and are defined in the arch-specific header file,
+ * as some of them cannot be declared static because of
+ * the assembler implementation.
+ */
+
+#else
+
+/* Arithmetic operations. */
+
+void atomic_sub(atomic_word_t *target, atomic_plain_word_t v);
+
+/* Architecture independent definitions.
*/ + +static inline atomic_plain_word_t atomic_read(atomic_word_t *target) +{ + return target->value; +} + +typedef unsigned long long atomic64_plain_word_t; + +typedef struct { + volatile atomic64_plain_word_t value; +} atomic64_word_t; + +static inline void +atomic64_init(atomic64_word_t *a, atomic64_plain_word_t v) +{ + a->value = v; +} + +/********************* + Support 64-bit + *********************/ + +atomic64_plain_word_t atomic64_set(atomic64_word_t* target, + atomic64_plain_word_t value); + +void atomic64_xor(atomic64_word_t* target, + atomic64_plain_word_t mask); + +/*---------------------------------------------------------------------------*/ + +/* Architecture independent definitions. */ + +static inline atomic64_plain_word_t atomic64_read(atomic64_word_t *target) +{ + return target->value; +} + +#endif + + +/* Architecture dependent definitions. */ +#include + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* ATOMIC_OPS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/atomic_ops_plat.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/atomic_ops_plat.h new file mode 100755 index 0000000000000..b54b3ff83d978 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/atomic_ops_plat.h @@ -0,0 +1,86 @@ +#ifndef ATOMIC_OPS_PLAT_H +#define ATOMIC_OPS_PLAT_H +/** + @file atomic_ops_plat.h + + @brief Prototypes of atomic operations API backwards compatible. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2013, 2023 Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. +=============================================================================*/ + + +#include + +#ifdef __cplusplus +extern "C" { +#endif +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +#define atomic_set(a,b) qurt_atomic_set((unsigned int *)(a),(unsigned int)(b)) +#define atomic_and(a,b) qurt_atomic_and((unsigned int *)(a),(unsigned int)(b)) +#define atomic_and_return(a,b) qurt_atomic_and_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_or(a,b) qurt_atomic_or((unsigned int *)(a),(unsigned int)(b)) +#define atomic_or_return(a,b) qurt_atomic_or_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_xor(a,b) qurt_atomic_xor((unsigned int *)(a),(unsigned int)(b)) +#define atomic_xor_return(a,b) qurt_atomic_xor_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_set_bit(a,b) qurt_atomic_set_bit((unsigned int *)(a),(unsigned int)(b)) +#define atomic_clear_bit(a,b) qurt_atomic_clear_bit((unsigned int *)(a),(unsigned int)(b)) +#define atomic_change_bit(a,b) qurt_atomic_change_bit((unsigned int *)(a),(unsigned int)(b)) +#define atomic_add(a,b) qurt_atomic_add((unsigned int *)(a),(unsigned int)(b)) +#define atomic_add_return(a,b) qurt_atomic_add_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_add_unless(a,b,c) qurt_atomic_add_unless((unsigned int *)(a),(unsigned int)(b),(unsigned int)(c)) +#define atomic_sub(a,b) qurt_atomic_sub((unsigned int *)(a),(unsigned int)(b)) +#define atomic_sub_return(a,b) qurt_atomic_sub_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_inc(a) qurt_atomic_inc((unsigned int *)(a)) +#define atomic_inc_return(a) qurt_atomic_inc_return((unsigned int *)(a)) +#define atomic_dec(a) qurt_atomic_dec((unsigned 
int *)(a)) +#define atomic_dec_return(a) qurt_atomic_dec_return((unsigned int *)(a)) +#define atomic_compare_and_set(a,b,c) qurt_atomic_compare_and_set((unsigned int *)(a),(unsigned int)(b),(unsigned int)(c)) +#define atomic_barrier qurt_atomic_barrier +#define atomic_barrier_write qurt_atomic_barrier_write +#define atomic_barrier_write_smp qurt_atomic_barrier_write_smp +#define atomic_barrier_read_smp qurt_atomic_barrier_read_smp +#define atomic_barrier_smp qurt_atomic_barrier_smp + +/*============================ + * 64 bits support + *============================ */ +#define atomic64_set(a,b) qurt_atomic64_set((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_and(a,b) qurt_atomic64_and((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_and_return(a,b) qurt_atomic64_and_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_or(a,b) qurt_atomic64_or((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_or_return(a,b) qurt_atomic64_or_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_xor(a,b) qurt_atomic64_xor((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_xor_return(a,b) qurt_atomic64_xor_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_set_bit(a,b) qurt_atomic64_set_bit((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_clear_bit(a,b) qurt_atomic64_clear_bit((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_change_bit(a,b) qurt_atomic64_change_bit((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_add(a,b) qurt_atomic64_add((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_add_return(a,b) qurt_atomic64_add_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_sub(a,b) qurt_atomic64_sub((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_sub_return(a,b) qurt_atomic64_sub_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_inc(a) qurt_atomic64_inc((unsigned long long *)(a)) +#define atomic64_inc_return(a) qurt_atomic64_inc_return((unsigned long long *)(a)) +#define atomic64_dec(a) qurt_atomic64_dec((unsigned long long *)(a)) +#define atomic64_dec_return(a) qurt_atomic64_dec_return((unsigned long long *)(a)) +#define atomic64_compare_and_set(a,b,c) qurt_atomic64_compare_and_set((unsigned long long *)(a),(unsigned long long )(b),(unsigned long long )(c)) +#define atomic64_barrier qurt_atomic64_barrier +#define atomic64_barrier_write qurt_atomic64_barrier_write +#define atomic64_barrier_write_smp qurt_atomic64_barrier_write_smp +#define atomic64_barrier_read_smp qurt_atomic64_barrier_read_smp +#define atomic64_barrier_smp qurt_atomic64_barrier_smp + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* ATOMIC_OPS_PLAT_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt.h new file mode 100755 index 0000000000000..4d25c9b2b6243 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt.h @@ -0,0 +1,111 @@ +#ifndef QURT_H +#define QURT_H + +/** + @file qurt.h + @brief Contains kernel header files that provide kernel OS API functions, constants, and + definitions + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013,2021,2023 by Qualcomm Technologies, Inc. All Rights Reserved. 
+ Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+/*======================================================================
+ *
+ *                       EDIT HISTORY FOR FILE
+ *
+ *   This section contains comments describing changes made to the
+ *   module. Notice that changes are listed in reverse chronological
+ *   order.
+ *
+ *
+ *
+ *
+ * when       who     what, where, why
+ * ---------- ---     ------------------------------------------------
+ * 2011-02-25 op      Add Header file
+ * 2012-12-16 cm      (Tech Pubs) Edited/added Doxygen comments and markup.
+ ======================================================================*/
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "qurt_consts.h"
+#include "qurt_api_version.h"
+#include "qurt_alloc.h"
+#include "qurt_futex.h"
+#include "qurt_mutex.h"
+#include "qurt_pipe.h"
+#include "qurt_printf.h"
+#include "qurt_assert.h"
+#include "qurt_thread.h"
+#include "qurt_trace.h"
+#include "qurt_cycles.h"
+#include "qurt_profile.h"
+#include "qurt_sem.h"
+#include "qurt_cond.h"
+#include "qurt_barrier.h"
+#include "qurt_fastint.h"
+#include "qurt_allsignal.h"
+#include "qurt_anysignal.h"
+#include "qurt_signal.h"
+#include "qurt_rmutex.h"
+#include "qurt_pimutex.h"
+#include "qurt_signal2.h"
+#include "qurt_rmutex2.h"
+#include "qurt_pimutex2.h"
+#include "qurt_int.h"
+#include "qurt_lifo.h"
+#include "qurt_power.h"
+#include "qurt_event.h"
+#include "qurt_pmu.h"
+#include "qurt_stid.h"
+//#include "qurt_version.h"
+#include "qurt_tlb.h"
+#include "qurt_vtlb.h"
+#include "qurt_memory.h"
+#include "qurt_qdi.h"
+#include "qurt_sclk.h"
+#include "qurt_space.h"
+#include "qurt_process.h"
+#include "qurt_timer.h"
+#include "qurt_tls.h"
+#include "qurt_thread_context.h"
+#include "qurt_hvx.h"
+#include "qurt_hmx.h"
+#include "qurt_mailbox.h"
+#include "qurt_island.h"
+#include "qurt_qdi_proxy.h"
+#include "qurt_l2cfg.h"
+#include "qurt_mmap.h"
+#include "qurt_isr.h"
+#include "qurt_busywait.h"
+#include "qurt_ecc.h"
+#include "qurt_callback.h"
+#include "qurt_error.h"
+#include "qurt_except.h"
+#include "qurt_mq.h"
+#include "qurt_user_dma.h"
+#include "qurt_fs_hub.h"
+#include "qurt_os_services.h"
+
+#ifndef MAIN_ONLY
+#define INCLUDE_ISLAND_CONTENTS
+#endif
+#ifndef ISLAND_ONLY
+#define INCLUDE_MAIN_CONTENTS
+#endif
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_alloc.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_alloc.h
new file mode 100755
index 0000000000000..da37a4c0a714e
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_alloc.h
@@ -0,0 +1,145 @@
+#ifndef QURT_ALLOC_H
+#define QURT_ALLOC_H
+
+/**
+  @file qurt_alloc.h
+  @brief Prototypes of kernel memory allocation API functions.
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2021, 2023 Qualcomm Technologies, Inc.
+  All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+/*======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup func_qurt_malloc
+  Dynamically allocates a memory area of the specified size on the QuRT system heap.
+  The return value is the address of the allocated memory area.
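+
+  A minimal illustrative sketch (not from the SDK documentation):
+
+    unsigned int *buf = (unsigned int *)qurt_malloc(64U * sizeof(unsigned int));
+    if (buf != NULL) {
+        buf[0] = 0xA5A5A5A5U;   // use the allocation
+        qurt_free(buf);         // return it to the QuRT system heap
+    }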
+
+  @note1hang The allocated memory area is automatically initialized to zero.
+
+  @param[in] size Size (in bytes) of the memory area.
+
+  @return
+  Nonzero -- Pointer to the allocated memory area. \n
+  0 -- Not enough memory in heap to allocate memory area.
+
+  @dependencies
+  None.
+
+ */
+/* ======================================================================*/
+void *qurt_malloc( unsigned int size);
+
+/*======================================================================*/
+/**@ingroup func_qurt_calloc
+  Dynamically allocates the specified array on the QuRT system heap.
+  The return value is the address of the allocated array.
+
+  @note1hang The allocated memory area is automatically initialized to zero.
+
+  @param[in] elsize Size (in bytes) of each array element.
+  @param[in] num    Number of array elements.
+
+  @return
+  Nonzero -- Pointer to allocated array.\n
+  Zero -- Not enough memory in heap to allocate array.
+
+  @dependencies
+  None.
+
+ */
+ /* ======================================================================*/
+void *qurt_calloc(unsigned int elsize, unsigned int num);
+
+/*======================================================================*/
+/**@ingroup func_qurt_realloc
+  Reallocates memory on the heap. \n
+  Changes the size of a memory area that is already allocated on the QuRT system heap.
+  The reallocate memory operation is functionally similar to realloc. It accepts a pointer
+  to an existing memory area on the heap, and resizes the memory area to the specified size
+  while preserving the original contents of the memory area.
+
+  @note1hang This function might change the address of the memory area.
+             If the value of ptr is NULL, this function is equivalent to
+             qurt_malloc().
+             If the value of newsize is 0, it is equivalent to qurt_free().
+             If the memory area is expanded, the added memory is not initialized.
+
+  @param[in] *ptr    Pointer to the address of the memory area.
+  @param[in] newsize Size (in bytes) of the reallocated memory area.
+
+  @return
+  Nonzero -- Pointer to reallocated memory area. \n
+  0 -- Not enough memory in heap to reallocate the memory area.
+
+  @dependencies
+  None.
+
+ */
+ /* ======================================================================*/
+void *qurt_realloc(void *ptr, int newsize);
+
+/*======================================================================*/
+/**@ingroup func_qurt_free
+  Frees allocated memory from the heap.\n
+  Deallocates the specified memory from the QuRT system heap.
+
+  @param[in] *ptr Pointer to the address of the memory to deallocate.
+
+  @return
+  None.
+
+  @dependencies
+  The memory item that the ptr value specifies must have been previously
+  allocated using one of the qurt_calloc(),
+  qurt_malloc(), or qurt_realloc() memory allocation functions.
+  Otherwise the behavior of QuRT is undefined.
+
+ */
+ /* ======================================================================*/
+void qurt_free( void *ptr);
+
+
+/**@ingroup func_qurt_memalign
+  Allocates a memory area of the specified size on the QuRT system heap,
+  aligned to the specified alignment (in bytes).
+
+  @param[in] alignment Alignment (in bytes) of the allocated memory area.
+  @param[in] size      Size (in bytes) of the memory area.
+
+  @return
+  Nonzero -- Pointer to the allocated memory area. \n
+  0 -- Not enough memory in heap to allocate memory area.
+ */
+void *qurt_memalign(unsigned int alignment, unsigned int size);
+
+/*
+|| Macro to define a static heap for a QuRT program.
+||
+|| Usage:
+||    Declare at the top-level of any C source file that
+||    is part of the build (and is guaranteed
+||    to actually be pulled into the build). Place
+||    it in the same file as main():
+||
+||    QURT_DECLARE_STATIC_HEAP(512000);
+||
+||    The only argument is the size in bytes, and it is
+||    rounded up to the nearest 64 bytes (size of an
+||    L2 cache block).
+|| +*/ + +#define QURT_DECLARE_STATIC_HEAP(sz) \ + static struct qurt_static_heap { \ + char space[(sz)] __attribute__((aligned(64))); \ + } static_heap[1]; \ + void * const override_heap_Base = &static_heap[0]; \ + void * const override_heap_Limit = &static_heap[1] + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ALLOC_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_allsignal.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_allsignal.h new file mode 100755 index 0000000000000..5dc89e495130d --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_allsignal.h @@ -0,0 +1,176 @@ + +#ifndef QURT_ALLSIGNAL_H +#define QURT_ALLSIGNAL_H + +/** + @file qurt_allsignal.h + @brief Prototypes of kernel signal API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup all_signal_types +@{ */ +/*===================================================================== + Typedefs + ======================================================================*/ + +/** +qurt_signal_t supersedes qurt_allsignal_t. This type definition was added for backwards compatibility. */ +typedef union { + /** @cond */ + unsigned long long int raw; + struct { + unsigned int waiting; /**< */ + unsigned int signals_in; /**< */ + unsigned int queue; /**< */ + unsigned int reserved; /**< */ + }X; + /** @endcond */ +} qurt_allsignal_t; +/** @} */ /* end_addtogroup all_signal_types */ + +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_init + Initializes an all-signal object.\n + The all-signal object is initially cleared. + + @datatypes + #qurt_allsignal_t + + @param[out] signal Pointer to the all-signal object to initialize. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_allsignal_init(qurt_allsignal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_destroy + Destroys the specified all-signal object.\n + @note1hang All-signal objects must be destroyed when they are no longer in use. + Failure to do this causes resource leaks in the QuRT kernel. \n + @note1cont All-signal objects must not be destroyed while they are still in use. + If this occurs, the behavior of QuRT is undefined. + + @datatypes + #qurt_allsignal_t + + @param[in] signal Pointer to the all-signal object to destroy. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_allsignal_destroy(qurt_allsignal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_get + Gets signal values from the all-signal object. + + Returns the current signal values of the specified all-signal object. + + @datatypes + #qurt_allsignal_t + + @param[in] signal Pointer to the all-signal object to access. 
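+
+ A minimal illustrative sketch (assumes no thread is currently waiting on
+ the object; not from the SDK documentation):
+
+   qurt_allsignal_t done;
+   qurt_allsignal_init(&done);
+   qurt_allsignal_set(&done, 0x3U);                  // set signals 0 and 1
+   unsigned int pending = qurt_allsignal_get(&done); // pending == 0x3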
+ + @return + Bitmask with current signal values. + + @dependencies + None. +*/ +/* ======================================================================*/ +static inline unsigned int qurt_allsignal_get(qurt_allsignal_t *signal) +{ return signal->X.signals_in; } + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_wait + Waits on the all-signal object.\n + Suspends the current thread until all of the specified signals are set. + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be waited on, and 0 that it is not to be waited on. + + If a signal is set in an all-signal object, and a thread is waiting on the all-signal object for + that signal, the thread is awakened. If the awakened thread has higher priority than + the current thread, a context switch can occur. + + Unlike any-signals, all-signals do not need to explicitly clear any set signals in an all-signal + object before waiting on them again -- clearing is done automatically by the wait + operation. + + @note1hang At most, one thread can wait on an all-signal object at any given time. + Because signal clearing is done by the wait operation, no clear operation is + defined for all-signals. + + @datatypes + #qurt_allsignal_t + + @param[in] signal Pointer to the all-signal object to wait on. + @param[in] mask Signal mask value, which identifies the individual signals in the all-signal object + to wait on. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_allsignal_wait(qurt_allsignal_t *signal, unsigned int mask); + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_set + Set signals in the specified all-signal object. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit + value of 1 indicates that a signal must be set, and 0 indicates not to set the signal. + + @datatypes + #qurt_allsignal_t + + @param[in] signal Pointer to the all-signal object to modify. + @param[in] mask Signal mask value identifying the individual signals to + set in the all-signal object. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_allsignal_set(qurt_allsignal_t *signal, unsigned int mask); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ALLSIGNAL_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_anysignal.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_anysignal.h new file mode 100755 index 0000000000000..9619e2de562b4 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_anysignal.h @@ -0,0 +1,225 @@ +#ifndef QURT_ANYSIGNAL_H +#define QURT_ANYSIGNAL_H +/** + @file qurt_anysignal.h + Prototypes of kernel signal API functions. + + EXTERNALIZED FUNCTIONS + None + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None + +Copyright (c) 2021 Qualcomm Technologies, Inc. +All rights reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. 
+ ======================================================================*/ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*===================================================================== +Typedefs +======================================================================*/ + +/**@ingroup anysignals_types + qurt_signal_t supersedes qurt_anysignal_t. This type definition was added for backwards compatibility. */ +typedef qurt_signal_t qurt_anysignal_t; + +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_anysignal_init + Initializes an any-signal object.\n + The any-signal object is initially cleared. + + @datatypes + #qurt_anysignal_t + + @param[out] signal Pointer to the initialized any-signal object. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +static inline void qurt_anysignal_init(qurt_anysignal_t *signal) +{ + qurt_signal_init(signal); +} + +/*======================================================================*/ +/**@ingroup func_qurt_anysignal_destroy + Destroys the specified any-signal object. + + @note1hang Any-signal objects must be destroyed when they are no longer in use. Failure + to do this causes resource leaks in the QuRT kernel.\n + @note1cont Any-signal objects must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + + @datatypes + #qurt_anysignal_t + + @param[in] signal Pointer to the any-signal object to destroy. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +static inline void qurt_anysignal_destroy(qurt_anysignal_t *signal) +{ + qurt_signal_destroy(signal); +} + +/*======================================================================*/ +/**@ingroup func_qurt_anysignal_wait + Wait on the any-signal object. \n + Suspends the current thread until any one of the specified signals is set. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be waited on, and 0 indicates not to wait on the signal. + If a signal is set in an any-signal object, and a thread is waiting on the any-signal object for + that signal, the thread is awakened. If the awakened thread has higher priority than + the current thread, a context switch can occur. + + @note1hang At most, one thread can wait on an any-signal object at any given time. + + @datatypes + #qurt_anysignal_t + + @param[in] signal Pointer to the any-signal object to wait on. + @param[in] mask Signal mask value, which specifies the individual signals in the any-signal + object to wait on. + + @return + Bitmask of current signal values. + + @dependencies + None. + */ +/* ======================================================================*/ +static inline unsigned int qurt_anysignal_wait(qurt_anysignal_t *signal, unsigned int mask) +{ + return qurt_signal_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ANY); +} + +/*======================================================================*/ +/**@ingroup func_qurt_anysignal_set + Sets signals in the specified any-signal object. \n + Signals are represented as bits 0 through 31 in the 32-bit mask value. 
A mask bit value of 1
+ indicates that a signal must be set, and 0 indicates not to set the signal.
+
+ @datatypes
+ #qurt_anysignal_t
+
+ @param[in] signal Pointer to the any-signal object to modify.
+ @param[in] mask   Signal mask value identifying the individual signals to
+                   set in the any-signal object.
+
+ @return
+ Bitmask of old signal values (before set).
+
+ @dependencies
+ None.
+ */
+/* ======================================================================*/
+unsigned int qurt_anysignal_set(qurt_anysignal_t *signal, unsigned int mask);
+
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_anysignal_get
+ Gets signal values from the any-signal object.\n
+ Returns the current signal values of the specified any-signal object.
+
+ @datatypes
+ #qurt_anysignal_t
+
+ @param[in] signal Pointer to the any-signal object to access.
+
+ @return
+ A bitmask with the current signal values of the specified any-signal object.
+
+ @dependencies
+ None.
+ */
+/* ======================================================================*/
+static inline unsigned int qurt_anysignal_get(qurt_anysignal_t *signal)
+{
+    return qurt_signal_get(signal);
+}
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_anysignal_clear
+ @xreflabel{sec:anysignal_clear}
+ Clears signals in the specified any-signal object.\n
+ Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1
+ indicates that a signal must be cleared, and 0 indicates not to clear the signal.
+
+ @datatypes
+ #qurt_anysignal_t
+
+ @param[in] signal Pointer to the any-signal object to modify.
+ @param[in] mask   Signal mask value identifying the individual signals to
+                   clear in the any-signal object.
+
+ @return
+ Bitmask -- Old signal values (before clear).
+
+ @dependencies
+ None.
+ */
+/* ======================================================================*/
+unsigned int qurt_anysignal_clear(qurt_anysignal_t *signal, unsigned int mask);
+
+/*======================================================================*/
+/**@ingroup func_qurt_anysignal_wait_timed
+ Waits on the any-signal object. \n
+ Suspends the current thread until any of the specified signals is set or the timeout expires.
+
+ Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1
+ indicates that a signal must be waited on, and 0 indicates not to wait on the signal.
+ If a signal is set in an any-signal object, and a thread was waiting on the any-signal object for
+ that signal, the thread is awakened. If the awakened thread has higher priority than
+ the current thread, a context switch can occur.
+
+ @note1hang At most, one thread can wait on an any-signal object at any given time.
+
+ @datatypes
+ #qurt_anysignal_t
+
+ @param[in]  signal   Pointer to the any-signal object to wait on.
+ @param[in]  mask     Signal mask value, which specifies the individual signals in the any-signal
+                      object to wait on.
+ @param[out] signals  Bitmask of current signal values.
+ @param[in]  duration Interval (in microseconds); the value must be between #QURT_TIMER_MIN_DURATION and
+                      #QURT_TIMER_MAX_DURATION.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_ETIMEDOUT -- Timeout. \n
+ #QURT_EINVALID -- Duration out of range.
+
+ @dependencies
+ None.
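+
+ A minimal illustrative sketch (assumes a previously initialized any-signal
+ object sig_obj, and that 10 ms lies within the allowed duration bounds;
+ not from the SDK documentation):
+
+   unsigned int sigs = 0;
+   int rc = qurt_anysignal_wait_timed(&sig_obj, 0x1U, &sigs, 10000ULL);
+   if (rc == QURT_EOK) {
+       // signal 0 was set within 10 ms; sigs holds the signal values
+   } else if (rc == QURT_ETIMEDOUT) {
+       // 10 ms elapsed with no signal
+   }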
+ */ +/* ======================================================================*/ + +int qurt_anysignal_wait_timed(qurt_anysignal_t *signal, unsigned int mask, unsigned int *signals, unsigned long long int duration); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ANYSIGNAL_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_api_version.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_api_version.h new file mode 100755 index 0000000000000..dfe53ae755054 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_api_version.h @@ -0,0 +1,77 @@ +#ifndef QURT_API_VERSION_H +#define QURT_API_VERSION_H +/*============================================================================== + +qurt_api_version.h + +GENERAL DESCRIPTION + API version file + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +==============================================================================*/ + +/*============================================================================== + CONSTANTS AND DEFINITIONS +==============================================================================*/ +/** + * Each field of the QURT_API_VERSION definitions is an 8-bit unsigned integer. + * Main release has first 3 fields updated - Major, Minor and Release. + * - QURT_API_VERSION = Major, Minor, Release. + * Patch releases are supported by adding the extra field. + * - QURT_API_VERSION = Major, Minor, Release, Patch. + */ +// Major version is incremented for incompatible API changes. +#define QURT_API_VER_MAJOR 1 + +// Minor version is incremented for backward-compatible enhancements in the API +// set. +#define QURT_API_VER_MINOR 4 + +// RELEASE version is incremented for each release within a `MAJOR.MINOR` +// release. +#define QURT_API_VER_RELEASE 1 + +// Patch version is incremented when new API content is introduced on older LTS +// release. +#define QURT_API_VER_PATCH 0 + +/* Update the QURT_API_VERSION function macro. */ +#define QURT_API_VERSION_ENCODE(major, minor, release, patch) \ + ((((major) & 0xFF) << 24) | (((minor) & 0xFF) << 16) | \ + (((release) & 0xFF) << 8) | ((patch) & 0xFF)) + +/* Update the QURT_API_VERSION Macro. */ +#define QURT_API_VERSION \ + QURT_API_VERSION_ENCODE(QURT_API_VER_MAJOR, QURT_API_VER_MINOR, \ + QURT_API_VER_RELEASE, QURT_API_VER_PATCH) + +/** Usage: + * + * #if QURT_API_VERSION >= QURT_API_VERSION_ENCODE(1,4,0,0) + * qurt_func_2(a,b,c); + * #else + * qurt_func(a); + * #endif + * + */ +/* + Gets the QuRT API version. + + @return + QuRT API version. + + @dependencies + None. + */ +unsigned int qurt_api_version(void); + +#endif /* QURT_API_VERSION_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_assert.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_assert.h new file mode 100755 index 0000000000000..13cc2afd2e973 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_assert.h @@ -0,0 +1,51 @@ +#ifndef QURT_ASSERT_H +#define QURT_ASSERT_H +/** + @file qurt_assert.h + @brief Prototypes of qurt_assert API + + EXTERNAL FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. 
+ Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/**@ingroup func_qurt_assert_error + Writes diagnostic information to the debug buffer, and raises an error to the QuRT kernel. + + @datatypes + None. + + @param[in] filename Pointer to the file name string. + @param[in] lineno Line number. + + @return + None. + + @dependencies + None. + */ +void qurt_assert_error(const char *filename, int lineno) __attribute__((noreturn)); + +#define qurt_assert(cond) ((cond)?(void)0:qurt_assert_error(__QURTFILENAME__,__LINE__)) + +/** @} */ /* end_ingroup func_qurt_assert */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ASSERT_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_atomic_ops.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_atomic_ops.h new file mode 100755 index 0000000000000..d9b2cff7d737c --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_atomic_ops.h @@ -0,0 +1,1298 @@ +#ifndef QURT_ATOMIC_OPS_H +#define QURT_ATOMIC_OPS_H +/** + @file qurt_atomic_ops.h + @brief Prototypes of kernel atomic operations API. + + EXTERNAL FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021, 2022 by Qualcomm Technologies, Inc. All Rights Reserved +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +/* + * Australian Public Licence B (OZPLB) + * + * Version 1-0 + * + * Copyright (c) 2007, Open Kernel Labs, Inc. + * + * All rights reserved. + * + * Developed by: Embedded, Real-time and Operating Systems Program (ERTOS) + * National ICT Australia + * http://www.ertos.nicta.com.au + * + * Permission is granted by National ICT Australia, free of charge, to + * any person obtaining a copy of this software and any associated + * documentation files (the "Software") to deal with the Software without + * restriction, including (without limitation) the rights to use, copy, + * modify, adapt, merge, publish, distribute, communicate to the public, + * sublicense, and/or sell, lend or rent out copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject + * to the following conditions: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimers. + * + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimers in the documentation and/or other materials provided + * with the distribution. + * + * * Neither the name of National ICT Australia, nor the names of its + * contributors, may be used to endorse or promote products derived + * from this Software without specific prior written permission. 
+ * + * EXCEPT AS EXPRESSLY STATED IN THIS LICENCE AND TO THE FULL EXTENT + * PERMITTED BY APPLICABLE LAW, THE SOFTWARE IS PROVIDED "AS-IS", AND + * NATIONAL ICT AUSTRALIA AND ITS CONTRIBUTORS MAKE NO REPRESENTATIONS, + * WARRANTIES OR CONDITIONS OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO ANY REPRESENTATIONS, WARRANTIES OR CONDITIONS + * REGARDING THE CONTENTS OR ACCURACY OF THE SOFTWARE, OR OF TITLE, + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, + * THE ABSENCE OF LATENT OR OTHER DEFECTS, OR THE PRESENCE OR ABSENCE OF + * ERRORS, WHETHER OR NOT DISCOVERABLE. + * + * TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL + * NATIONAL ICT AUSTRALIA OR ITS CONTRIBUTORS BE LIABLE ON ANY LEGAL + * THEORY (INCLUDING, WITHOUT LIMITATION, IN AN ACTION OF CONTRACT, + * NEGLIGENCE OR OTHERWISE) FOR ANY CLAIM, LOSS, DAMAGES OR OTHER + * LIABILITY, INCLUDING (WITHOUT LIMITATION) LOSS OF PRODUCTION OR + * OPERATION TIME, LOSS, DAMAGE OR CORRUPTION OF DATA OR RECORDS; OR LOSS + * OF ANTICIPATED SAVINGS, OPPORTUNITY, REVENUE, PROFIT OR GOODWILL, OR + * OTHER ECONOMIC LOSS; OR ANY SPECIAL, INCIDENTAL, INDIRECT, + * CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES, ARISING OUT OF OR IN + * CONNECTION WITH THIS LICENCE, THE SOFTWARE OR THE USE OF OR OTHER + * DEALINGS WITH THE SOFTWARE, EVEN IF NATIONAL ICT AUSTRALIA OR ITS + * CONTRIBUTORS HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH CLAIM, LOSS, + * DAMAGES OR OTHER LIABILITY. + * + * If applicable legislation implies representations, warranties, or + * conditions, or imposes obligations or liability on National ICT + * Australia or one of its contributors in respect of the Software that + * cannot be wholly or partly excluded, restricted or modified, the + * liability of National ICT Australia or the contributor is limited, to + * the full extent permitted by the applicable legislation, at its + * option, to: + * a. in the case of goods, any one or more of the following: + * i. the replacement of the goods or the supply of equivalent goods; + * ii. the repair of the goods; + * iii. the payment of the cost of replacing the goods or of acquiring + * equivalent goods; + * iv. the payment of the cost of having the goods repaired; or + * b. in the case of services: + * i. the supplying of the services again; or + * ii. the payment of the cost of having the services supplied again. + * + * The construction, validity and performance of this licence is governed + * by the laws in force in New South Wales, Australia. + */ + +/* + * Author: Malcolm Purvis + * + * This file is only included by the main atomic_ops.h, so all of that + * file's definitions are available. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ + +///* Sanity check to ensure the smp flag is set in machines.py */ +//#if defined(__ATOMIC_OPS_IN_KERNEL__) && !defined(MACHINE_SMP) && CONFIG_NUM_UNITS > 1 +//#error CONFIG_NUM_UNITS > 1 but smp not defined in machines.py. +//#endif +#define QURT_INLINE __attribute__((always_inline)) + +/*============================================================================= + FUNCTIONS +=============================================================================*/ +/**@ingroup func_qurt_atomic_set + Sets the atomic variable with the specified value. 
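+
+  A minimal illustrative sketch (not from the SDK documentation):
+
+    unsigned int word = 0;
+    (void)qurt_atomic_set(&word, 0x10U);   // word is now 0x10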
+ + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] value Value to set. + + @return + Value successfuly set. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_set(unsigned int* target, unsigned int value) +{ + unsigned long tmp; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " memw_locked(%2, p0) = %3\n" + " if !p0 jump 1b\n" + : "=&r" (tmp),"+m" (*target) + : "r" (target), "r" (value) + : "p0"); + return value; +} + +/**@ingroup func_qurt_atomic_and + Bitwise AND operation of the atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask Mask for bitwise AND. + + @return + None + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_and(unsigned int* target, unsigned int mask) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = and(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target),"r" (mask) + : "p0"); +} + +/**@ingroup func_qurt_atomic_and_return + Bitwise AND operation of the atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask Mask for bitwise AND. + + @return + AND result of atomic variable with mask. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_and_return(unsigned int* target, unsigned int mask) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = and(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_or + Bitwise OR operation of the atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask Mask for bitwise OR. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_or(unsigned int* target, unsigned int mask) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = or(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); +} + +/**@ingroup func_qurt_atomic_or_return + Bitwise OR operation of the atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask Mask for bitwise OR. + + @return + Returns the OR result of the atomic variable with mask. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_or_return(unsigned int* target, unsigned int mask) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = or(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_xor + Bitwise XOR operation of the atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. 
+ + @param[in,out] target Pointer to the atomic variable. + @param[in] mask Mask for bitwise XOR. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_xor(unsigned int* target, unsigned int mask) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = xor(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); +} + +/**@ingroup func_qurt_atomic_xor_return + Bitwise XOR operation of the atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask Mask for bitwise XOR. + + @return + XOR result of atomic variable with mask. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_xor_return(unsigned int* target, unsigned int mask) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = xor(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_set_bit + Sets a bit in the atomic variable at a specified position. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] bit Bit position to set. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_set_bit(unsigned int *target, unsigned int bit) +{ + unsigned int result; + unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int sbit = bit % ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int *wtarget= (unsigned int *)&target[aword]; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = setbit(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*wtarget) + : "r" (wtarget), "r" (sbit) + : "p0"); +} + +/**@ingroup func_qurt_atomic_clear_bit + Clears a bit in the atomic variable at a specified position. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] bit Bit position to clear. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_clear_bit(unsigned int *target, unsigned int bit) +{ + unsigned int result; + unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int sbit = bit % ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int *wtarget= (unsigned int *)&target[aword]; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = clrbit(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*wtarget) + : "r" (wtarget), "r" (sbit) + : "p0"); +} + +/**@ingroup func_qurt_atomic_change_bit + Toggles a bit in a atomic variable at a bit position. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] bit Bit position to toggle. + + @return + None. + + @dependencies + None. 
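+
+  A minimal illustrative sketch (toggles bit 3 of a shared word twice;
+  not from the SDK documentation):
+
+    unsigned int flags = 0;
+    qurt_atomic_change_bit(&flags, 3U);   // flags == 0x8
+    qurt_atomic_change_bit(&flags, 3U);   // flags == 0x0 again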
+*/ +static inline QURT_INLINE void +qurt_atomic_change_bit(unsigned int *target, unsigned int bit) +{ + unsigned int result; + unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int sbit = bit & 0x1fU; + unsigned int *wtarget= (unsigned int *)&target[aword]; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = togglebit(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*wtarget) + : "r" (wtarget),"r" (sbit) + : "p0"); +} + +/**@ingroup func_qurt_atomic_add + Adds an integer to atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v Integer value to add. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_add(unsigned int *target, unsigned int v) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); +} + +/**@ingroup func_qurt_atomic_add_return + Adds an integer to atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v Integer value to add. + + @return + Result of arithmetic sum. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_add_return(unsigned int *target, unsigned int v) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_add_unless + Adds the delta value to an atomic variable unless the current value in the target + matches the unless variable. + + @note1hang The function retries until load lock and store conditional + are successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] delta Value to add to the current value. + @param[in] unless Perform the addition only when the current value is not + equal to this unless value. + @return + TRUE -- 1 - Addition was performed. \n + FALSE -- 0 - Addition was not done. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_add_unless(unsigned int* target, + unsigned int delta, + unsigned int unless) +{ + unsigned int current_val; + unsigned int new_val; + + __asm__ __volatile__( + "1: %0 = memw_locked(%3)\n" + " p0 = cmp.eq(%0, %5)\n" + " if p0 jump 2f\n" + " %1 = add(%0, %4)\n" + " memw_locked(%3, p0) = %1\n" + " if !p0 jump 1b\n" + "2:\n" + : "=&r" (current_val),"=&r" (new_val),"+m" (*target) + : "r" (target), "r" (delta), "r" (unless) + : "p0"); + + return (unsigned int)(current_val != unless); +} + +/**@ingroup func_qurt_atomic_sub + Subtracts an integer from an atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v Integer value to subtract. + + @return + None. + + @dependencies + None. 
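+
+  A minimal illustrative sketch (not from the SDK documentation):
+
+    unsigned int credits = 10U;
+    qurt_atomic_sub(&credits, 3U);   // credits == 7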
+*/ +static inline QURT_INLINE void +qurt_atomic_sub(unsigned int *target, unsigned int v) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = sub(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); +} + +/**@ingroup func_qurt_atomic_sub_return + Subtracts an integer from an atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v Integer value to subtract. + + @return + Result of arithmetic subtraction. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_sub_return(unsigned int *target, unsigned int v) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = sub(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_inc + Increments an atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_inc(unsigned int *target) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, #1)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target) + : "p0"); +} + +/**@ingroup func_qurt_atomic_inc_return + Increments an atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + Incremented value. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_inc_return(unsigned int *target) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, #1)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_dec + Decrements an atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_dec(unsigned int *target) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, #-1)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target) + : "p0"); +} + +/**@ingroup func_qurt_atomic_dec_return + Decrements an atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + Decremented value. + + @dependencies + None. 
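+
+  A minimal illustrative sketch of a reference-count release (assumes a
+  shared counter named refcount; not from the SDK documentation):
+
+    if (qurt_atomic_dec_return(&refcount) == 0U) {
+        // last reference dropped; safe to tear down the shared object
+    }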
+*/ +static inline QURT_INLINE unsigned int +qurt_atomic_dec_return(unsigned int *target) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, #-1)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_compare_and_set + Compares the current value of the atomic variable with the + specified value and set to a new value when compare is successful. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] old_val Old value to compare. + @param[in] new_val New value to set. + + @return + FALSE -- Specified value is not equal to the current value. \n + TRUE --Specified value is equal to the current value. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_compare_and_set(unsigned int* target, + unsigned int old_val, + unsigned int new_val) +{ + unsigned int current_val; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " p0 = cmp.eq(%0, %3)\n" + " if !p0 jump 2f\n" + " memw_locked(%2, p0) = %4\n" + " if !p0 jump 1b\n" + "2:\n" + : "=&r" (current_val),"+m" (*target) + : "r" (target), "r" (old_val), "r" (new_val) + : "p0"); + + return (unsigned int)(current_val == old_val); +} + +/**@ingroup func_qurt_atomic_barrier + Allows the compiler to enforce an ordering constraint on memory operation issued + before and after the function. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_barrier(void) +{ + __asm__ __volatile__ ( + "" + : + : + : + "memory"); +} + + +/**@ingroup func_qurt_atomic64_set + Sets the 64-bit atomic variable with the specified value. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] value 64-bit value to set. + + @return + Successfuly set value. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned long long +qurt_atomic64_set(unsigned long long* target, unsigned long long value) +{ + unsigned long long tmp; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " memd_locked(%2, p0) = %3\n" + " if !p0 jump 1b\n" + : "=&r" (tmp),"+m" (*target) + : "r" (target), "r" (value) + : "p0"); + return value; +} + +/**@ingroup func_qurt_atomic64_and_return + Bitwise AND operation of a 64-bit atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask 64-bit mask for bitwise AND. + + @return + AND result of 64-bit atomic variable with mask. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned long long +qurt_atomic64_and_return(unsigned long long* target, unsigned long long mask) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = and(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic64_or + Bitwise OR operation of a 64-bit atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask 64-bit mask for bitwise OR. + + @return + None. + + @dependencies + None. 
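+
+  A minimal illustrative sketch (not from the SDK documentation):
+
+    unsigned long long mask64 = 0ULL;
+    qurt_atomic64_or(&mask64, 1ULL << 40);   // set bit 40 atomically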
+*/ +static inline QURT_INLINE void +qurt_atomic64_or(unsigned long long* target, unsigned long long mask) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = or(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); +} + +/**@ingroup func_qurt_atomic64_or_return + Bitwise OR operation of a 64-bit atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask 64-bit mask for bitwise OR. + + @return + OR result of the atomic variable with mask. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned long long +qurt_atomic64_or_return(unsigned long long* target, unsigned long long mask) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = or(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic64_xor_return + Bitwise XOR operation of 64-bit atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask 64-bit mask for bitwise XOR. + + @return + XOR result of atomic variable with mask. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned long long +qurt_atomic64_xor_return(unsigned long long* target, unsigned long long mask) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = xor(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic64_set_bit + Sets a bit in a 64-bit atomic variable at a specified position. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] bit Bit position to set. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic64_set_bit(unsigned long long *target, unsigned int bit) +{ + unsigned int result; + unsigned int *wtarget; + unsigned int *pwtarget = (unsigned int *)target; + unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int sbit = bit & 0x1FU; + wtarget = (unsigned int *)&pwtarget[aword]; + + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = setbit(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*wtarget) + : "r" (wtarget), "r" (sbit) + : "p0"); +} + +/**@ingroup func_qurt_atomic64_clear_bit + Clears a bit in a 64-bit atomic variable at a specified position. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] bit Bit position to clear. + + @return + None. + + @dependencies + None. 
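+
+   @par Example
+   A bitmap sketch (illustrative only; the mask and slot number are
+   hypothetical):
+   @code
+   unsigned long long slot_mask = 0ULL;
+
+   qurt_atomic64_set_bit(&slot_mask, 42U);    // mark slot 42 in use
+   qurt_atomic64_clear_bit(&slot_mask, 42U);  // release slot 42
+   @endcode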
+*/ +static inline QURT_INLINE void +qurt_atomic64_clear_bit(unsigned long long *target, unsigned int bit) +{ + unsigned int result; + unsigned int *wtarget; + unsigned int *pwtarget = (unsigned int *)target; + unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int sbit = bit & 0x1FU; + wtarget = (unsigned int *)&pwtarget[aword]; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = clrbit(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*wtarget) + : "r" (wtarget), "r" (sbit) + : "p0"); +} + +/**@ingroup func_qurt_atomic64_change_bit + Toggles a bit in a 64-bit atomic variable at a bit position. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] bit Bit position to toggle. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic64_change_bit(unsigned long long *target, unsigned int bit) +{ + unsigned int result; + unsigned int *wtarget; + unsigned int *pwtarget = (unsigned int *)target; + unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int sbit = bit & 0x1FU; + wtarget = (unsigned int *)&pwtarget[aword]; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = togglebit(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*wtarget) + : "r" (wtarget),"r" (sbit) + : "p0"); +} + +/**@ingroup func_qurt_atomic64_add + Adds a 64-bit integer to 64-bit atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v 64-bit integer value to add. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic64_add(unsigned long long *target, unsigned long long v) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = add(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); +} + +/**@ingroup func_qurt_atomic64_add_return + Adds a 64-bit integer to 64-bit atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v 64-bit integer value to add. + + @return + Result of arithmetic sum. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned long long +qurt_atomic64_add_return(unsigned long long *target, unsigned long long v) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = add(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic64_sub_return + Subtracts a 64-bit integer from an atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v 64-bit integer value to subtract. + + @return + Result of arithmetic subtraction. + + @dependencies + None. 
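+
+   @par Example
+   A credit-counter sketch (illustrative only; the variable is hypothetical):
+   @code
+   unsigned long long credits = 100ULL;
+
+   // Consume 10 credits and observe the remaining balance atomically.
+   unsigned long long remaining = qurt_atomic64_sub_return(&credits, 10ULL);
+   @endcode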
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_sub_return(unsigned long long *target, unsigned long long v)
+{
+   unsigned long long result;
+
+   __asm__ __volatile__(
+       "1: %0 = memd_locked(%2)\n"
+       "   %0 = sub(%0, %3)\n"
+       "   memd_locked(%2, p0) = %0\n"
+       "   if !p0 jump 1b\n"
+       : "=&r" (result),"+m" (*target)
+       : "r" (target), "r" (v)
+       : "p0");
+
+   return result;
+}
+
+/**@ingroup func_qurt_atomic64_inc
+   Increments a 64-bit atomic variable by one.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target  Pointer to the atomic variable.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic64_inc(unsigned long long *target)
+{
+   unsigned long long result;
+   unsigned long long inc = 1;
+
+   __asm__ __volatile__(
+       "1: %0 = memd_locked(%2)\n"
+       "   %0 = add(%0, %3)\n"
+       "   memd_locked(%2, p0) = %0\n"
+       "   if !p0 jump 1b\n"
+       : "=&r" (result),"+m" (*target)
+       : "r" (target),"r" (inc)
+       : "p0");
+}
+
+/**@ingroup func_qurt_atomic64_inc_return
+   Increments a 64-bit atomic variable by one.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target  Pointer to the atomic variable.
+
+   @return
+   Incremented value.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_inc_return(unsigned long long *target)
+{
+   unsigned long long result;
+   unsigned long long inc = 1;
+
+   __asm__ __volatile__(
+       "1: %0 = memd_locked(%2)\n"
+       "   %0 = add(%0, %3)\n"
+       "   memd_locked(%2, p0) = %0\n"
+       "   if !p0 jump 1b\n"
+       : "=&r" (result),"+m" (*target)
+       : "r" (target),"r" (inc)
+       : "p0");
+
+   return result;
+}
+
+/**@ingroup func_qurt_atomic64_dec_return
+   Decrements a 64-bit atomic variable by one.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target  Pointer to the atomic variable.
+
+   @return
+   Decremented value.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_dec_return(unsigned long long *target)
+{
+   unsigned long long result;
+   long long minus1 = 0xFFFFFFFFFFFFFFFFLL;
+
+   __asm__ __volatile__(
+       "1: %0 = memd_locked(%2)\n"
+       "   %0 = add(%0, %3)\n"
+       "   memd_locked(%2, p0) = %0\n"
+       "   if !p0 jump 1b\n"
+       : "=&r" (result),"+m" (*target)
+       : "r" (target),"r" (minus1)
+       : "p0");
+
+   return result;
+}
+
+/**@ingroup func_qurt_atomic64_compare_and_set
+   Compares the current value of a 64-bit atomic variable with
+   the specified value and sets it to a new value when the comparison is successful.
+
+   @note1hang The function keeps retrying until load lock and store conditional
+              is successful.
+
+   @param[in,out] target   Pointer to the atomic variable.
+   @param[in]     old_val  64-bit old value to compare.
+   @param[in]     new_val  64-bit new value to set.
+
+   @return
+   FALSE -- Specified value is not equal to the current value. \n
+   TRUE -- Specified value is equal to the current value.
+
+   @dependencies
+   None.
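+
+   @par Example
+   A typical retry loop that atomically doubles a shared counter (a sketch;
+   shared_val is a hypothetical global, not part of this header):
+   @code
+   unsigned long long old_val;
+   do {
+       old_val = shared_val;            // snapshot the current value
+   } while (!qurt_atomic64_compare_and_set(&shared_val,
+                                           old_val, old_val * 2ULL));
+   @endcode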
+*/
+static inline QURT_INLINE int
+qurt_atomic64_compare_and_set(unsigned long long *target,
+                              unsigned long long old_val,
+                              unsigned long long new_val)
+{
+   unsigned long long current_val;
+
+   __asm__ __volatile__(
+       "1: %0 = memd_locked(%2)\n"
+       "   p0 = cmp.eq(%0, %3)\n"
+       "   if !p0 jump 2f\n"
+       "   memd_locked(%2, p0) = %4\n"
+       "   if !p0 jump 1b\n"
+       "2:\n"
+       : "=&r" (current_val),"+m" (*target)
+       : "r" (target), "r" (old_val), "r" (new_val)
+       : "p0");
+
+   return (int)(current_val == old_val);
+}
+
+/**@ingroup func_qurt_atomic64_barrier
+   Allows the compiler to enforce an ordering constraint on memory operations
+   issued before and after the function.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic64_barrier(void)
+{
+   /** @cond */
+    __asm__ __volatile__ (
+        ""
+        :
+        :
+        :
+        "memory");
+   /** @endcond */
+}
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_ATOMIC_OPS_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_barrier.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_barrier.h
new file mode 100755
index 0000000000000..7c6f787d43bc2
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_barrier.h
@@ -0,0 +1,140 @@
+#ifndef QURT_BARRIER_H
+#define QURT_BARRIER_H
+
+/**
+  @file qurt_barrier.h
+  @brief Prototypes of Kernel barrier API functions.
+
+  EXTERNALIZED FUNCTIONS
+  None.
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+  Copyright (c) 2021 Qualcomm Technologies, Inc. All rights reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup barrier_types
+@{ */
+/*=====================================================================
+ Constants and macros
+======================================================================*/
+#define QURT_BARRIER_SERIAL_THREAD 1 /**< Serial thread. */
+#define QURT_BARRIER_OTHER         0 /**< Other. */
+
+#ifndef ASM
+#include
+
+/*=====================================================================
+Typedefs
+======================================================================*/
+
+/** QuRT barrier type.
+ */
+typedef union {
+   /** @cond */
+   struct {
+      unsigned short threads_left;
+      unsigned short count;
+      unsigned int threads_total;
+      unsigned int queue;
+      unsigned int reserved;
+   };
+   unsigned long long int raw;
+   /** @endcond */
+} qurt_barrier_t;
+
+/** @} */ /* end_addtogroup barrier_types */
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+/*======================================================================*/
+/**@ingroup func_qurt_barrier_init
+   Initializes a barrier object.
+
+   @datatypes
+   #qurt_barrier_t
+
+   @param[out] barrier        Pointer to the barrier object to initialize.
+   @param[in]  threads_total  Total number of threads to synchronize on the barrier.
+
+   @return
+   Unused integer value.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+int qurt_barrier_init(qurt_barrier_t *barrier, unsigned int threads_total);
+
+/*======================================================================*/
+/**@ingroup func_qurt_barrier_destroy
+   Destroys the specified barrier.
+
+   @note1hang Barriers must be destroyed when they are no longer in use.
Failure + to do this causes resource leaks in the QuRT kernel.\n + @note1cont Barriers must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + + @datatypes + #qurt_barrier_t + + @param[in] barrier Pointer to the barrier object to destroy. + + @return + Unused integer value. + + @dependencies + None. +*/ +/* ======================================================================*/ +int qurt_barrier_destroy(qurt_barrier_t *barrier); + +/*======================================================================*/ +/**@ingroup func_qurt_barrier_wait + Waits on the barrier.\n + Suspends the current thread on the specified barrier. \n + The function return value indicates whether the thread was the last one to + synchronize on the barrier. + When a thread waits on a barrier, it is suspended on the barrier: \n + - If the total number of threads waiting on the barrier is less than the assigned value + of the barrier, no other action occurs. \n + - If the total number of threads waiting on the barrier equals the assigned value of the + barrier, all threads currently waiting on the barrier are awakened, allowing them to + execute past the barrier. + + @note1hang After its waiting threads are awakened, a barrier is automatically reset + and can be used again in the program without the need for re-initialization. + + @datatypes + #qurt_barrier_t + + @param[in] barrier Pointer to the barrier object to wait on. + + @return + #QURT_BARRIER_OTHER -- Current thread awakened from barrier. \n + #QURT_BARRIER_SERIAL_THREAD -- Current thread is last caller of barrier. + + @dependencies + None. +*/ +/* ======================================================================*/ +int qurt_barrier_wait(qurt_barrier_t *barrier); + + +#endif + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_BARRIER_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_busywait.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_busywait.h new file mode 100755 index 0000000000000..a4dab80a2520a --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_busywait.h @@ -0,0 +1,62 @@ +#ifndef QURT_BUSYWAIT_H +#define QURT_BUSYWAIT_H + +/** + @file qurt_busywait.h + @brief Implementation of the busywait() function for + hardware based blocking waits that use the QTIMER as a reference. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ============================================================================*/ +/*============================================================================= + * + * EDIT HISTORY FOR FILE + * + * This section contains comments describing changes made to the + * module. Changes are listed in reverse chronological + * order. 
+ *
+ *
+ * when       who     what, where, why
+ * ---------- ---     -------------------------------------------------------
+ * 2018-03-20 pg      Add Header file
+ ============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_busywait
+   Pauses the execution of a thread for a specified time.\n
+   Use for small microsecond delays.
+
+   @note1hang The function does not return to the caller until
+              the time duration has expired.
+
+   @param[in] pause_time_us  Time to pause in microseconds.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+void qurt_busywait (unsigned int pause_time_us);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_BUSYWAIT_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_callback.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_callback.h
new file mode 100755
index 0000000000000..dc9b896c63454
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_callback.h
@@ -0,0 +1,235 @@
+#ifndef QURT_CALLBACK_H
+#define QURT_CALLBACK_H
+
+/**
+  @file qurt_callback.h
+  Definitions, macros, and prototypes for the QuRT callback framework.
+
+  The QDI framework allows the development of root process drivers and services that
+  a user process client can interact with in a secure manner. The QDI framework does
+  this by elevating the privilege of the user process thread, temporarily allowing
+  the thread to execute in root context and letting it fall back to user context once
+  the QDI invocation is finished.
+
+  The QuRT callback framework provides a safe mechanism for root process drivers
+  to execute callback functions in a user process. The framework hosts
+  dedicated worker threads in corresponding processes that handle the execution
+  of the callback function. This ensures that the callbacks occur in the context of
+  the appropriate process thread, as a result maintaining privilege boundaries.
+
+  Prerequisites for use of this framework are:
+  1. Driver is a QDI driver and the client communicates with drivers using QDI
+     invocations.
+  2. Appropriate callback configuration is specified in cust_config.xml for
+     the user process that intends to use this framework.
+
+  qurt_cb_data_t is the public data structure that allows a client to store all
+  the required information about the callback, including the callback function
+  and the arguments to pass to this function when it executes.
+  The client uses the QDI interface to register this structure with the root driver.
+
+  The callback framework provides the following APIs that a root driver can use to
+  invoke a callback. These functions are described in the qurt_qdi_driver.h header file.
+
+  qurt_qdi_cb_invoke_async() triggers an asynchronous callback wherein the
+  invoking thread does not wait for the callback to finish executing.
+
+  qurt_qdi_cb_invoke_sync() triggers a synchronous callback. Upon invocation
+  the invoking thread is suspended until the callback function finishes execution.
+
+  qurt_qdi_cb_invoke_sync_with_data() invokes a synchronous callback similar to
+  qurt_qdi_cb_invoke_sync(). It allows the user to pass large data along with
+  the callback invocation, to be utilized during the callback execution.
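+
+  As a rough client-side registration sketch (illustrative only -- the driver
+  handle, QDI method ID, and callback function below are hypothetical, not
+  part of this header):
+
+      qurt_cb_data_t cb_data;
+      qurt_cb_data_init(&cb_data);
+      qurt_cb_data_set_cbfunc(&cb_data, (void *)my_event_cb);
+      qurt_cb_data_set_cbarg(&cb_data, 0x1234);
+      // Hand the registration data to the root driver over QDI:
+      qurt_qdi_handle_invoke(driver_handle, MY_DRIVER_REGISTER_CB, &cb_data);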
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+#include "qurt_qdi.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef int qurt_cb_result_t;
+
+/* Callback framework error codes.
+   The callback framework returns a nonzero value if a callback invocation is
+   unsuccessful. The following macros highlight the cause of failure in more detail.
+*/
+#define QURT_CB_ERROR            -1 /* Callback registration failed.\n*/
+#define QURT_CB_OK                0 /* Success.\n*/
+#define QURT_CB_MALLOC_FAILED    -2 /* QuRTOS malloc failure.\n*/
+#define QURT_CB_WAIT_CANCEL      -3 /* Process exit cancelled wait operation.\n*/
+#define QURT_CB_CONFIG_NOT_FOUND -4 /* Callback configuration for process was not found.\n*/
+#define QURT_CB_QUEUE_FULL       -5 /* Callback queue is serving at maximum capacity.*/
+/** @addtogroup cb_types
+@{ */
+/** Callback registration data structure.
+    This data structure is used by a client attempting to register a callback with a QDI driver.
+    It holds the address of the callback function and the argument supplied to the callback
+    function when it executes.
+*/
+typedef struct {
+   /** @cond */
+   void* cb_func;   /*< Pointer to the callback function. */
+   unsigned cb_arg; /*< Not interpreted by the framework.*/
+   /** @endcond */
+} qurt_cb_data_t;
+
+/** @cond */
+/* Defines used as default if cust_config does not specify them. */
+#define CALLBACK_WORKER_STACK_SIZE 0x2000
+/** @endcond */
+/** @} */ /* end_addtogroup cb_types */
+/**@ingroup func_qurt_cb_data_init
+   Initializes the callback data structure.
+   An entity registering a callback with the root process driver must call this function
+   to initialize the callback registration data structure to the default value.
+
+   @datatypes
+   #qurt_cb_data_t
+
+   @param[in] cb_data  Pointer to the callback data structure.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+static inline void qurt_cb_data_init (qurt_cb_data_t* cb_data){
+   cb_data->cb_func = NULL;
+   cb_data->cb_arg = 0;
+}
+
+/**@ingroup func_qurt_cb_data_set_cbfunc
+   Sets up the callback function in the callback registration data structure.
+
+   @datatypes
+   #qurt_cb_data_t
+
+   @param[in] cb_data  Pointer to the callback data structure.
+   @param[in] cb_func  Pointer to the callback function.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+static inline void qurt_cb_data_set_cbfunc (qurt_cb_data_t* cb_data, void* cb_func){
+   cb_data->cb_func = cb_func;
+}
+
+/**@ingroup func_qurt_cb_data_set_cbarg
+   Sets up the callback argument.
+   This function sets up the argument passed to the callback function when it executes.
+
+   @datatypes
+   #qurt_cb_data_t
+
+   @param[in] cb_data  Pointer to the callback data structure.
+   @param[in] cb_arg   Argument for the callback function.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+static inline void qurt_cb_data_set_cbarg (qurt_cb_data_t* cb_data, unsigned cb_arg){
+   cb_data->cb_arg = cb_arg;
+}
+
+/** @cond */
+/**@ingroup driver_support_functions
+   Invokes an asynchronous callback for a specified process.
+   A driver that resides in the root process calls this API to launch a callback in
+   a process described by the client_handle.
+   After the callback is invoked, the framework queues the callback as per its
+   priority and subsequently executes it.
+   The caller of this function is not suspended during the callback execution period.
+   The API returns immediately with a success/failure error code.
+
+   @note1hang This function is only accessible to drivers in the root process.
+              User process invocations shall fail with a negative error code return value.
+
+   @param client_handle  Obtained from the current invocation function (Section 4.3.1).
+   @param cb_data        Pointer to the callback data structure (refer to qurt_callback.h).
+   @param prio           Priority at which the callback should execute.
+                         This parameter is optional. If -1 is passed, the callback framework
+                         executes the callback at the priority of the API caller.
+   @return
+   QURT_EOK -- Callback was successfully communicated to the framework.
+   Negative error code -- Callback cannot be communicated to the framework.
+ */
+qurt_cb_result_t qurt_qdi_cb_invoke_async(int client_handle,
+                                          qurt_cb_data_t* cb_data,
+                                          int prio);
+
+
+/**@ingroup driver_support_functions
+   Invokes a synchronous callback for a specified process.
+   A driver that resides in a root process calls this API to launch a sync callback in
+   a process described by the client_handle.
+   After the callback is invoked, the framework queues the callback as per its
+   priority and subsequently executes it.
+   The caller of this function is suspended during the callback execution period.
+   If the process in which to execute the callback exits or terminates, the caller is
+   woken up with error code #QURT_CB_WAIT_CANCEL (refer to qurt_callback.h).
+
+   @note1hang This function is only accessible to drivers in the root process.
+              User process invocations shall fail with a negative error code return value.
+
+   @param client_handle  Obtained from the current invocation function (Section 4.3.1).
+   @param cb_data        Pointer to the callback data structure (refer to qurt_callback.h).
+   @param prio           Priority at which the callback should execute.
+                         This parameter is optional. If -1 is passed, the callback framework
+                         executes the callback at the priority of the API caller.
+   @return
+   QURT_EOK -- Callback was successfully communicated to the framework.
+   Negative error code -- Callback cannot be communicated to the framework.
+ */
+qurt_cb_result_t qurt_qdi_cb_invoke_sync(int client_handle,
+                                         qurt_cb_data_t* cb_data,
+                                         int prio);
+
+/**@ingroup driver_support_functions
+   Invokes a synchronous callback for a specified process, passing driver data to the user PD.
+   This function is similar to qurt_qdi_cb_invoke_sync() and allows the driver to pass arbitrary data to
+   the user process as part of the callback invocation.
+
+   @param client_handle  Obtained from the current invocation function (Section 4.3.1).
+   @param cb_data        Pointer to the callback data structure (refer to qurt_callback.h).
+   @param prio           Priority at which the callback should execute.
+                         This parameter is optional. If -1 is passed, the callback framework
+                         executes the callback at the priority of the API caller.
+   @param data           Driver arbitrary data to pass to the user process. Memory pointed to by data
+                         must be accessible to the user PD. The root driver can allocate such memory by
+                         using qurt_mem_mmap().
+   @param data_len       Driver arbitrary data length.
+
+   @return
+   QURT_EOK -- Callback was successfully communicated to the framework.
+   Negative error code -- Callback cannot be communicated to the framework.
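+
+   @par Example
+   A driver-side sketch (illustrative only; the handle, registration data, and
+   buffer below are hypothetical):
+   @code
+   // Runs cb_data->cb_func in the client process and blocks until it
+   // returns, passing a shared buffer of driver data along with it.
+   qurt_cb_result_t rc = qurt_qdi_cb_invoke_sync_with_data(client_handle,
+                                                           &cb_data, -1,
+                                                           shared_buf, 256U);
+   if (rc != QURT_CB_OK) {
+       // callback could not be delivered to the framework
+   }
+   @endcode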
+ */ +qurt_cb_result_t qurt_qdi_cb_invoke_sync_with_data( int client_handle, + qurt_cb_data_t* cb_data, + int prio, + void *data, + unsigned data_len + ); +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_clade.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_clade.h new file mode 100755 index 0000000000000..d7442cf98dd94 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_clade.h @@ -0,0 +1,62 @@ +#ifndef QURT_CLADE_H +#define QURT_CLADE_H +/** + @file qurt_clade.h + @brief Prototypes of Cache Line Accelerated Decompression Engine (CLADE) API. + CLADE is a cache line level memory compression system that is used to + decrease DRAM usage. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2019-2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_clade2_get + Reads the value of the clade2 register. + + @param[in] offset Offset from the clade2 cfg base. + @param[out] *value Pointer to the register value read from the offset. + + @return + #QURT_EOK - Successfully read the value from the register at offset \n + #QURT_EINVALID - Offset passed is incorrect + + @dependencies + None. + */ +int qurt_clade2_get(unsigned short offset, unsigned int *value); + +/**@ingroup func_qurt_clade2_set + Sets the PMU register; only PMU_SEL register can be set. + + @param[in] offset Offset from the QURTK_clade2_cfg_base. + @param[in] value Value to set at offset. + + @return + #QURT_EOK -- Successfully set the value at offset. \n + #QURT_ENOTALLOWED -- Set operation performed at an offset other than CLADE2_PMU_SELECTION_REG. + + @dependencies + None. + */ +int qurt_clade2_set(unsigned short offset, unsigned int value); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_CLADE_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_cond.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_cond.h new file mode 100755 index 0000000000000..6e65ed82a8393 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_cond.h @@ -0,0 +1,219 @@ +#ifndef QURT_COND_H +#define QURT_COND_H +/** + @file qurt_cond.h + @brief Prototypes of kernel condition variable object API functions. + + EXTERNALIZED FUNCTIONS + None + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021 Qualcomm Technologies, Inc. + All rights reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup condition_variables_types +@{ */ +/*===================================================================== + Typedefs + ======================================================================*/ + +/** QuRT condition variable type. 
*/
+typedef union {
+   /** @cond */
+   unsigned long long raw;
+   struct {
+      unsigned int count;
+      unsigned int n_waiting;
+      unsigned int queue;
+      unsigned int reserved;
+   }X;
+   /** @endcond */
+} qurt_cond_t;
+
+/** @} */ /* end_addtogroup condition_variables_types */
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+/*======================================================================*/
+/**@ingroup func_qurt_cond_init
+   Initializes a condition variable object.
+
+   @datatypes
+   #qurt_cond_t
+
+   @param[out] cond  Pointer to the initialized condition variable object.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+/* ======================================================================*/
+void qurt_cond_init(qurt_cond_t *cond);
+
+/*======================================================================*/
+/**@ingroup func_qurt_cond_destroy
+   Destroys the specified condition variable.
+
+   @note1hang Conditions must be destroyed when they are no longer in use. Failure to do
+              this causes resource leaks in the QuRT kernel.\n
+   @note1cont Conditions must not be destroyed while they are still in use. If this occurs,
+              the behavior of QuRT is undefined.
+
+   @datatypes
+   #qurt_cond_t
+
+   @param[in] cond  Pointer to the condition variable object to destroy.
+
+   @return
+   None.
+
+ */
+/* ======================================================================*/
+void qurt_cond_destroy(qurt_cond_t *cond);
+
+/*======================================================================*/
+/**@ingroup func_qurt_cond_signal
+   Signals a waiting thread that the specified condition is true. \n
+
+   When a thread wishes to signal that a condition is true on a shared data item, it must
+   perform the following procedure: \n
+   -# Lock the mutex that controls access to the data item. \n
+   -# Perform the signal condition operation. \n
+   -# Unlock the mutex.
+
+   @note1hang Failure to properly lock and unlock a mutex of a condition variable can cause
+              the threads to never be suspended (or suspended but never awakened).
+
+   @note1cont Use condition variables only with regular mutexes -- attempting to use
+              recursive mutexes or priority inheritance mutexes results in undefined behavior.
+
+   @datatypes
+   #qurt_cond_t
+
+   @param[in] cond  Pointer to the condition variable object to signal.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+/* ======================================================================*/
+void qurt_cond_signal(qurt_cond_t *cond);
+
+/*======================================================================*/
+/**@ingroup func_qurt_cond_broadcast
+   Signals multiple waiting threads that the specified condition is true.\n
+   When a thread wishes to broadcast that a condition is true on a shared data item, it must
+   perform the following procedure: \n
+   -# Lock the mutex that controls access to the data item. \n
+   -# Perform the broadcast condition operation. \n
+   -# Unlock the mutex.\n
+
+   @note1hang Failure to properly lock and unlock the mutex of a condition variable can cause
+              the threads to never be suspended (or suspended but never awakened).
+
+   @note1cont Use condition variables only with regular mutexes -- attempting to use
+              recursive mutexes or priority inheritance mutexes results in undefined behavior.
+
+   @datatypes
+   #qurt_cond_t
+
+   @param[in] cond  Pointer to the condition variable object to signal.
+
+   @return
+   None.
+
+   @dependencies
+   None.
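+
+   @par Example
+   The canonical signal/wait pattern (a sketch; the mutex, condition variable,
+   and predicate names are hypothetical):
+   @code
+   // Producer: publish the state change, then wake all waiters.
+   qurt_mutex_lock(&mtx);
+   data_ready = 1;
+   qurt_cond_broadcast(&cond);
+   qurt_mutex_unlock(&mtx);
+
+   // Consumer: always re-check the predicate in a loop around the wait.
+   qurt_mutex_lock(&mtx);
+   while (!data_ready) {
+       qurt_cond_wait(&cond, &mtx);
+   }
+   qurt_mutex_unlock(&mtx);
+   @endcode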
+ */ +/* ======================================================================*/ +void qurt_cond_broadcast(qurt_cond_t *cond); + +/*======================================================================*/ +/**@ingroup func_qurt_cond_wait + Suspends the current thread until the specified condition is true. + When a thread wishes to wait for a specific condition on a shared data item, it must + perform the following procedure: \n + -# Lock the mutex that controls access to the data item. \n + -# If the condition is not satisfied, perform the wait condition operation on the + condition variable (suspends the thread and unlocks the mutex). + + @note1hang Failure to properly lock and unlock the mutex of a condition variable can cause + the threads to never be suspended (or suspended but never awakened). + + @note1cont Use condition variables only with regular mutexes -- attempting to use + recursive mutexes or priority inheritance mutexes results in undefined behavior. + + @datatypes + #qurt_cond_t \n + #qurt_mutex_t + + @param[in] cond Pointer to the condition variable object to wait on. + @param[in] mutex Pointer to the mutex associated with condition variable to wait on. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_cond_wait(qurt_cond_t *cond, qurt_mutex_t *mutex); + +/*======================================================================*/ +/**@ingroup func_qurt_cond_wait2 + Suspends the current thread until the specified condition is true. + When a thread wishes to wait for a specific condition on a shared data item, it must + perform the following procedure: \n + -# Lock the mutex that controls access to the data item. \n + -# If the condition is not satisfied, perform the wait condition operation on the + condition variable, which suspends the thread and unlocks the mutex. + + @note1hang Failure to properly lock and unlock the mutex of a condition variable can cause + the threads to never be suspended (or suspended but never awakened). + + @note1cont Use condition variables only with regular mutexes -- attempting to use + recursive mutexes or priority inheritance mutexes results in undefined behavior. + + @note1cont This is the same API as qurt_cond_wait(), use this version + when using mutexes of type #qurt_rmutex2_t. + + @datatypes + #qurt_cond_t \n + #qurt_rmutex2_t + + @param[in] cond Pointer to the condition variable object to wait on. + @param[in] mutex Pointer to the mutex associated with the condition variable to wait on. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_cond_wait2(qurt_cond_t *cond, qurt_rmutex2_t *mutex); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_COND_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_consts.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_consts.h new file mode 100755 index 0000000000000..b1e35998e73b6 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_consts.h @@ -0,0 +1,315 @@ +#ifndef QURT_CONSTS_H +#define QURT_CONSTS_H + +/** + @file qurt_consts.h + @brief QuRT constants and definitions + + EXTERNAL FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None + + Copyright (c) 2013-2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. 
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=====================================================================
+ Constants and macros
+ ======================================================================*/
+
+/* Definitions of system events. System events suspend
+   a thread and put it into suspending_list.
+   The system event number is saved in the CONTEXT::error::cause field
+   of the suspended thread. An event handler thread such as a
+   page fault handler or system error handler can wake up the suspended
+   thread.
+ */
+#define QURT_EVENT_PAGEFAULT    0x1 /* Page fault event. */
+#define QURT_EVENT_SYSTEM_ERR   0x2 /* System error event. */
+#define QURT_EVENT_SUSPEND      0x3
+#define QURT_EVENT_PROCESS_EXIT 0x4 /* Process termination event. */
+
+#define QURT_SYSENV_MAX_THREADS_TYPE         1  /* Maximum threads object. */
+#define QURT_SYSENV_PROCNAME_TYPE            2  /* Process name object. */
+#define QURT_SYSENV_MAX_PI_PRIO_TYPE         3  /* Maximum pi priority object. */
+#define QURT_SYSENV_ARCH_REV_TYPE            4  /* Architecture version object. */
+#define QURT_SYSENV_APP_HEAP_TYPE            5  /* Application heap object. */
+#define QURT_SYSENV_REGION_ATTR_DEFAULT      7  /* Default region attributes. */
+#define QURT_SYSENV_STACK_PROFILE_COUNT_TYPE 8  /* Stack profile count type. */
+#define QURT_SYSENV_ISLAND_CONFIG_TYPE       9  /* Island configuration check. */
+#define QURT_SYSENV_HTHREADS_TYPE            10 /* Active threads object. */
+#define QURT_SYSENV_CONFIG_IMAGE_START_LO    11 /* Config image start address for DTB parsing. */
+#define QURT_SYSENV_CONFIG_IMAGE_START_HI    12 /* Config image start address for DTB parsing. */
+#define QURT_SYSENV_CHIPPARAMS_LO            13 /* ChipParams for DTB parsing. */
+#define QURT_SYSENV_CHIPPARAMS_HI            14 /* ChipParams for DTB parsing. */
+#define QURT_SYSENV_PLATPARAMS               15 /* Platformparams for DTB parsing. */
+#define QURT_SYSENV_CONFIG_IMAGE_SIZE        16 /* Config image size for DTB parsing. */
+#define QURT_SYSENV_L2_CACHE_LINE_SIZE       17 /* L2 cache line size. */
+
+/* Get Q6 regs */
+#define QURT_GET_SSR     1
+#define QURT_GET_CCR     2
+#define QURT_GET_CFGBASE 3
+#define QURT_GET_SYSCFG  4
+#define QURT_GET_REV     5
+
+
+/** @cond rest_reg_dist */
+/** @addtogroup performance_monitor_macros
+@{ */
+
+/* PMU */
+#define QURT_PMUCNT0   0 /**< */
+#define QURT_PMUCNT1   1 /**< */
+#define QURT_PMUCNT2   2 /**< */
+#define QURT_PMUCNT3   3 /**< */
+#define QURT_PMUCFG    4 /**< */
+#define QURT_PMUEVTCFG 5 /**< */
+
+/* New since V55 */
+#define QURT_PMUCNT4    6  /**< */
+#define QURT_PMUCNT5    7  /**< */
+#define QURT_PMUCNT6    8  /**< */
+#define QURT_PMUCNT7    9  /**< */
+#define QURT_PMUEVTCFG1 10 /**< */
+
+/* New since V61 */
+#define QURT_PMUSTID0 11 /**< */
+#define QURT_PMUSTID1 12 /**< */
+
+#define QURT_PMUCNTSTID0 13 /**< */
+#define QURT_PMUCNTSTID1 14 /**< */
+#define QURT_PMUCNTSTID2 15 /**< */
+#define QURT_PMUCNTSTID3 16 /**< */
+#define QURT_PMUCNTSTID4 17 /**< */
+#define QURT_PMUCNTSTID5 18 /**< */
+#define QURT_PMUCNTSTID6 19 /**< */
+#define QURT_PMUCNTSTID7 20 /**< */
+
+/** @} */ /* end_addtogroup performance_monitor_macros */
+/** @endcond */
+
+/*
+   Power collapse operation
+*/
+#define QURT_POWER_SHUTDOWN                 0 /**< */
+#define QURT_TCXO_SHUTDOWN                  1 /**< */
+#define QURT_POWER_CMD_PREPARE              0 /**< */
+#define QURT_POWER_CMD_PERFORM              1 /**< */
+#define QURT_POWER_CMD_EXIT                 2 /**< */
+#define QURT_POWER_CMD_FAIL_EXIT            3 /**< */
+#define QURT_POWER_CMD_PERFORM_L2_RETENTION 4 /**< */
+#define QURT_POWER_CMD_PERFORM_SAVE_TCM     5 /**< */
+#define QURT_POWER_CMD_DEEP_SLEEP           6 /**< */
+
+
+/**
@addtogroup thread_macros +@{ */ +#define QURT_MAX_HTHREAD_LIMIT 8U /**< Limit on the maximum number of hardware threads supported by QuRT for any + Hexagon version. Use this definition to define arrays, and so on, in + target independent code. */ +/** @} */ /* end_addtogroup thread_macros */ + +/** @cond internal_only */ +/** @addtogroup power_management_macros +@{ */ +/** + L2 cache retention mode +*/ +#define QURT_POWER_SHUTDOWN_TYPE_L2NORET QURT_POWER_CMD_PERFORM /**< */ +#define QURT_POWER_SHUTDOWN_TYPE_L2RET QURT_POWER_CMD_PERFORM_L2_RETENTION /**< */ +#define QURT_POWER_SHUTDOWN_TYPE_SAVETCM QURT_POWER_CMD_PERFORM_SAVE_TCM /**< */ +/** @} */ /* end_addtogroup power_management_macros */ +/** @endcond */ + +/* + QURT_system_state + Use for debugging the shutdown/startup process. + + State transition for cold boot: + QURT_BOOT_SETUP_ISDB --> QURT_CBOOT_BSP_INIT --> + QURT_CBOOT_END_CLEAN_INIT --> QURT_CBOOT_END_OS_INIT --> + QURT_CBOOT_KERNEL_INIT_DONE --> QURT_CBOOT_PLAT_CONFIG_DONE --> + QURT_CBOOT_ROOT_TASK_STARTED + + State transition for power collapse: + QURT_PREPARE_SINGLE_MODE --> QURT_PERFORM_IPEND --> + QURT_PERFORM_SAVE_TLB --> QURT_PERFORM_SWITCH_PC --> + cache flush states (dependent on L2 retention config) + + State transition for warm boot: + QURT_BOOT_SETUP_ISDB --> QURT_WBOOT_INIT_TLB --> + QURT_WBOOT_SET_1TO1_MAP --> QURT_WBOOT_REMOVE_1TO1_MAP --> + QURT_CBOOT_END_CLEAN_INIT --> QURT_CBOOT_END_OS_INIT +*/ +#define QURT_PREPARE_SINGLE_MODE 1 /**< */ +#define QURT_PREPARE_END 2 /**< */ +#define QURT_PERFORM_IPEND 3 /**< */ +#define QURT_PERFORM_SAVE_ISDP 4 /**< */ +#define QURT_PERFORM_SAVE_PMU 5 /**< */ +#define QURT_PERFORM_SAVE_TLB 6 /**< */ +#define QURT_PERFORM_SWITCH_PC 7 /**< */ +#define QURT_PERFORM_EXIT 8 /**< */ +#define QURT_FLUSH_L1CACHE 9 /**< */ +#define QURT_FLUSH_L2CACHE 0xA /**< */ +#define QURT_FLUSH_CACHE_DONE 0xB /**< */ +#define QURT_SWITCH_PC_DONE 0xC /**< */ +#define QURT_BOOT_SETUP_ISDB 0xD /**< */ +#define QURT_WBOOT_INIT_TLB 0xE /**< */ +#define QURT_WBOOT_SET_1TO1_MAP 0xF /**< */ +#define QURT_WBOOT_CFG_ADV_SYSCFG 0x10 /**< */ +#define QURT_WBOOT_REMOVE_1TO1_MAP 0x11 /**< */ +#define QURT_CBOOT_BSP_INIT 0x12 /**< */ +#define QURT_CBOOT_END_CLEAN_L1CACHE 0x13 /**< */ +#define QURT_CBOOT_END_CLEAN_INIT 0x14 /**< */ +#define QURT_CBOOT_END_OS_INIT 0x15 /**< */ +#define QURT_CBOOT_TLB_DUMP_LOAD 0x16 /**< */ +#define QURT_CBOOT_TLB_STATIC_LOAD 0x17 /**< */ +#define QURT_CBOOT_KERNEL_INIT_DONE 0x18 /**< */ +#define QURT_CBOOT_PLAT_CONFIG_DONE 0x19 /**< */ +#define QURT_CBOOT_ROOT_TASK_STARTED 0x1A /**< */ +#define QURT_IMPRECISE_EXCEPTION 0x1B /**< */ +#define QURT_WBOOT_DEBUG_L2_START 0x1C /**< */ +#define QURT_WBOOT_DEBUG_L2_END 0x1D /**< */ +#define QURT_NMI_SAVE_L2VIC_COMPLETE 0x1E /**< */ +#define QURT_NMI_HANDLER_COMPLETE 0x1F /**< */ +#define QURT_NMI_AFTER_SAVE_GLOBAL 0x20 /**< */ +#define QURT_WBOOT_START 0x21 /**< */ +#define QURT_ENTER_ISLAND 0x22 /**< */ +#define QURT_EXIT_ISLAND 0x23 /**< */ +#define QURT_LOAD_NOTIFIER_TCB 0x24 /**< */ +#define QURT_ABNORMAL_RESET 0x25 /**< */ +/* + Thread attributes +*/ + +#define QURT_THREAD_ATTR_GP 0x00000002 /*< */ +#define QURT_THREAD_ATTR_UGP 0x00000003 /*< User general pointer (UGP)*/ +#define QURT_THREAD_ATTR_PREFETCH 0x00000004 /*< */ +#define QURT_THREAD_ATTR_TID 0x00000005 /*< */ +#define QURT_THREAD_ATTR_CACHE_PART 0x00000007 /*< */ +#define QURT_THREAD_ATTR_COPROCESSOR 0x00000008 /*< */ +#define QURT_THREAD_ATTR_GET_L2CACHE_PART 0x00000009 /*< */ +#define QURT_THREAD_ATTR_SET_FRML 
0x0000000A /*< */ +#define QURT_THREAD_ATTR_STID_GET 0x0000000B /*< */ +#define QURT_THREAD_ATTR_STID_SET 0x0000000C /*< */ +#define QURT_THREAD_ATTR_AUTOSTACK 0x0000000D /*< */ +#define QURT_THREAD_ATTR_SYSTEM_THREAD 0x0000000E /*< */ +#define QURT_THREAD_ATTR_STID_SET2 0x0000000F /*< */ +#define QURT_THREAD_ATTR_STID_SET2_ACKNOWLEDGE 0x00000010 /*< */ +#define QURT_THREAD_ATTR_STID_GET2 0x00000011 /*< */ + +/** Cache operations*/ +#define QURT_DCCLEAN 0U /* Clean Dcache. */ +#define QURT_DCINV 1U /* Invalidate Dcache. */ +#define QURT_DCCLEANINV 2U /* Clean and invalidate Dcache. */ +#define QURT_ICINV 3U /* Invalidate Icache. */ +#define QURT_DUMP_DCTAGS 4U /* For testing purpose. */ +#define QURT_FLUSH_ALL 5U /* Flush entire L1 and L2 cache. */ +#define QURT_TABLE_FLUSH 6U /* Flush based on table of physical pages */ +#define QURT_CLEAN_INVALIDATE_ALL 7U /* Flush and invalidate entire L1 and L2 cache. */ +#define QURT_L2CACHE_LOCK_LINES 8U /* l2 cache lock lines */ +#define QURT_L2CACHE_UNLOCK_LINES 9U /* l2 cache unlock lines */ +#define QURT_CLEAN 10U /* Flush L1 and L2 cache */ +#define QURT_CLEAN_INVALIDATE 11U /* Flush and invalidate L1 and L2 cache. */ +#define QURT_CLEAN_INVALIDATE_L2 12U /* Flush and invalidate entire L2 cache. */ + +/**@ingroup chapter_prefined_symbols */ +/**@xreflabel{hdr:QURT_API_VERSION}*/ + + +/* Process state. */ +#define QURT_UPDATE_PROCESS_STATE 0 /**< */ +#define QURT_MP_INIT 1 /*< */ +#define QURT_MP_RUNNING 2 /*< */ +#define QURT_MP_STOPPED 3 /*< */ + +/* QuRT reset reason. */ +#define QURT_NORMAL_BOOT 0 /* Normal boot. */ +#define QURT_WARM_BOOT 1 /* Power collapse warm boot. */ +#define QURT_WARM_BOOT_L2_RETENTION 2 /* Power collapse with L2 retention warm boot. */ +#define QURT_WARM_BOOT_SAVE_TCM 3 /* Power collapse with saving TCM. */ +#define QURT_QUICK_BOOT 4 /* Deep sleep. */ + +/* QuRT Wait for Idle command */ +#define QURT_WAIT_FOR_IDLE_DISABLE 0 /*< */ +#define QURT_WAIT_FOR_IDLE_ENABLE 1 /*< */ +#define QURT_WAIT_FOR_IDLE 2 /*< */ +#define QURT_WAIT_FOR_IDLE_CANCEL 3 /*< */ + +/*QuRT island exit stages */ +#define QURT_ISLAND_EXIT_STAGE1 1 /*< */ +#define QURT_ISLAND_EXIT_STAGE2 2 /*< */ + +#define QURT_MAX_NAME_LEN 64 /*< */ + +#define MAX_POOL_RANGES 16 /*< */ + +/* key definitions for debug thread info */ +//#define MAX_TCB_KEY 40 //whatever is a good number or makes debug thread structure be 1K +#define KEY_SCHDULER_STATE 1 /*< */ +#define KEY_PRIORITY 2 /*< */ +#define KEY_PRIORITY_ORIG 3 /*< */ +#define KEY_STACK_BOTTOM 4 // Currently not populated +#define KEY_STACK_TOP 5 // Currently not populated +#define KEY_HVX_STATE 6 /*< */ +#define KEY_FUTEX_OBJECT 7 /*< */ +#define KEY_THREAD_ID 8 /*< */ +#define KEY_PROFILE_CYCLE_LO 9 // Currently not populated +#define KEY_PROFILE_CYCLE_HI 10 // Currently not populated +#define KEY_ERROR_ADDRESS 11 // This holds the BADVA +#define KEY_ERROR_CAUSE 12 // This is the same as QURT_error_info.cause +#define KEY_ERROR_CAUSE2 13 // This is the same as QURT_error_info.cause2 +#define KEY_ERROR_SSR 14 /*< Holds the SSR value */ +#define QURT_RESERVED -1 + +/* VTLB method IDs. 
*/ +#define QURT_VTLB_ENTRY_CREATE 0U +#define QURT_VTLB_ENTRY_DELETE 1U +#define QURT_VTLB_ENTRY_READ 2U +#define QURT_VTLB_ENTRY_WRITE 3U +#define QURT_VTLB_ENTRY_PROBE 4U +#define QURT_VTLB_ENTRY_SPLIT 5U +#define QURT_VTLB_ENTRY_MERGE 6U +#define QURT_VTLB_ENTRY_STATISTICS 7U +#define QURT_VTLB_ENTRY_SET_SPECIAL 8U +#define QURT_VTLB_QUEUE_PPAGE 9U +#define QURT_VTLB_RECLAIM_STACK_PAGES 10U +#define QURT_VTLB_ASID_SET_STATE_FAST 11U +#define QURT_VTLB_ASID_SET_STATE 12U +#define QURT_VTLB_ENTRY_SET_EXTENSION 13U +#define QURT_VTLB_ENTRY_CLEAR_EXTENSION 14U + +/* VTCM window access control HWIO programming. */ +#define QURT_VTCM_WINDOW_ENABLE 1U +#define QURT_VTCM_WINDOW_DISABLE 0U +#define QURT_VTCM_WINDOW_HI_OFFSET_DEFAULT 0xFFFU +#define QURT_VTCM_WINDOW_LO_OFFSET_DEFAULT 0U + +/** @cond */ +/* ETM source - PC or data access */ +#define QURT_ETM_SOURCE_PC 0U /**< Memory source of SAC* is PC. */ +#define QURT_ETM_SOURCE_DATA 1U /**< Memory source of SAC* is data. */ + +/* ETM PID status flags */ +#define QURT_ETM_NO_PID 0xFFFFFFFF /**< No PID is selected. */ +/** @endcond */ + +/* execution context */ +#define QURT_CTX_USER 1 +#define QURT_CTX_GUEST 2 + +/* Profiling STID */ +#define QURT_STID_DEFAULT 0U + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_CONSTS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_cycles.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_cycles.h new file mode 100755 index 0000000000000..b599493f5d563 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_cycles.h @@ -0,0 +1,301 @@ + +#ifndef QURT_CYCLES_H +#define QURT_CYCLES_H 1 +/** + @file qurt_cycles.h + Prototypes of kernel pcycle API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + + /*===================================================================== + Functions + ======================================================================*/ + +/*======================================================================*/ + +/**@ingroup func_qurt_profile_reset_idle_pcycles + @xreflabel{hdr:qurt_profile_reset_idle_pcycles} + Sets the per-hardware-thread idle cycle counts to zero. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_profile_reset_idle_pcycles (void); + +/*======================================================================*/ +/**@ingroup func_qurt_profile_get_thread_pcycles + @xreflabel{hdr:qurt_profile_get_thread_pcycles} + Gets the count of the running processor cycles for the current thread.\n + Returns the current running processor cycle count for the current QuRT thread. + + @note1hang Profiling shall be enabled first to start the cycle counting. + The cycles are accumulated once the profiling is enabled and + resets on #qurt_profile_reset_threadid_pcycles + + @return + Integer -- Running processor cycle count for current thread. + + @dependencies + None. 
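+
+   @par Example
+   A profiling sketch (illustrative only; do_work() is a hypothetical
+   workload):
+   @code
+   qurt_profile_enable(1);            // start cycle counting
+
+   do_work();
+
+   unsigned long long cycles = qurt_profile_get_thread_pcycles();
+   qurt_profile_enable(0);            // stop cycle counting
+   @endcode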
+*/
+/* ======================================================================*/
+unsigned long long int qurt_profile_get_thread_pcycles(void);
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_get_core_pcycles
+   @xreflabel{hdr:qurt_get_core_pcycles}
+   Gets the count of core processor cycles executed.\n
+   Returns the current number of running processor cycles executed since the Hexagon
+   processor was last reset.
+
+   This value is based on the hardware core clock, which varies in speed according to the
+   processor clock frequency.
+
+   @note1hang Because the hardware core clock stops running when the processor shuts
+              down (due to all of the hardware threads being idle), treat the cycle values returned
+              by this operation as relative rather than absolute.
+
+   @note1cont Thread cycle counts are valid only in the V4 Hexagon processor version.
+
+   @return
+   Integer -- Current count of core processor cycles.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+unsigned long long int qurt_get_core_pcycles(void);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_get_idle_pcycles
+
+   @deprecated Use #qurt_profile_get_idle_pcycles2 instead.
+
+   Gets the current idle processor cycle counts for a maximum of 6 hardware threads. Use
+   #qurt_profile_get_idle_pcycles2 for reading pcycles without a limitation on the maximum
+   number of hardware threads.
+
+   This operation accepts a pointer to a user-defined array, and writes to the array the current
+   idle cycle count for each hardware thread.
+
+   Each count value represents the number of processor cycles that have elapsed on the
+   corresponding hardware thread while that thread has been in Wait mode.\n
+
+
+   @note1hang This operation does not return the idle cycles that occur when the Hexagon
+              processor shuts down (due to all of the hardware threads being idle).
+              Idle cycle counts are accumulated irrespective of whether profiling is enabled,
+              and reset on #qurt_profile_reset_idle_pcycles.
+
+   @param[out] pcycles  User array where the function stores the current idle cycle count values.
+                        Array size should be a minimum of the number of hardware threads intended.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+void qurt_profile_get_idle_pcycles (unsigned long long *pcycles);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_get_idle_pcycles2
+   Gets the current idle processor cycle counts for the maximum available hardware threads.
+
+   This operation accepts a pointer to a user-defined array with length in bytes, and writes
+   to the array the current idle cycle count for each hardware thread.
+
+   Each count value represents the number of processor cycles that have elapsed on the
+   corresponding hardware thread while that thread has been in Wait mode.\n
+
+   @note1hang This operation does not return the idle cycles that occur when the Hexagon
+              processor shuts down (due to all of the hardware threads being idle).
+              Idle cycle counts are accumulated irrespective of whether profiling is enabled,
+              and reset on #qurt_profile_reset_idle_pcycles.
+
+   @param[out] pcycles  User array where the function stores the current idle cycle count values.
+                        Array size should be equivalent to the number of hardware threads intended.
+ Call #qurt_sysenv_get_max_hw_threads to determine the array size required. + + @param[in] length_in_bytes Length of pcycles array in bytes. If the array size is smaller + than the required for the maximum available hardware threads, + it returns error code. + + @return + #QURT_EOK -- Successful operation. Stored all the data to the destination array + #QURT_EFAILED -- Operation failed due to smaller #pcycles array + + @dependencies + None. +*/ +/* ======================================================================*/ +int qurt_profile_get_idle_pcycles2 (unsigned long long *pcycles, unsigned int length_in_bytes); + +/*======================================================================*/ +/**@ingroup func_qurt_profile_get_threadid_pcycles + + @deprecated use #qurt_profile_get_threadid_pcycles2 instead + + Gets the current per-hardware-thread running cycle counts for the specified QuRT + thread for a maximum of 6 hardware threads. + + Each count value represents the number of processor cycles that have elapsed on the + corresponding hardware thread while that thread has been scheduled for the specified + QuRT thread. + + @note1hang Profiling shall be enabled first to start the cycle counting. + The cycles are accumulated once the profiling is enabled and + resets on #qurt_profile_reset_threadid_pcycles + + @param[in] thread_id Valid thread identifier. + @param[out] pcycles Pointer to a user array where the function stores the current running + cycle count values. Array size should be a minimum of the number of + hardware threads intended. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_profile_get_threadid_pcycles (int thread_id, unsigned long long *pcycles); + +/*======================================================================*/ +/**@ingroup func_qurt_profile_get_threadid_pcycles2 + + Gets the current per-hardware-thread running cycle counts for the specified QuRT + thread for maximum available hardware threads. + + Each count value represents the number of processor cycles that have elapsed on the + corresponding hardware thread while that thread has been scheduled for the specified + QuRT thread. + + @note1hang Profiling shall be enabled first to start the cycle counting. + The cycles are accumulated once the profiling is enabled and + resets on #qurt_profile_reset_threadid_pcycles + + @param[in] thread_id Thread identifier. + @param[out] pcycles Pointer to a user array where the function stores the current running + cycle count values. Array size should be equivalent to the number of + hardware threads intended. + Call #qurt_sysenv_get_max_hw_threads to determine the array size required. + @param[in] length_in_bytes Length of pcycles array in bytes. If the array size is smaller + than the required for the maximum available hardware threads, it + returns error code. + + @return + #QURT_EOK -- Successful operation. Stored all the data to the destination array + #QURT_EFAILED -- Operation failed due to smaller #pcycles array + #QURT_ENOTHREAD -- Operation failed due to invalid #thread_id + + @dependencies + None. 
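+
+   @par Example
+   A sizing sketch (illustrative only), using #QURT_MAX_HTHREAD_LIMIT from
+   qurt_consts.h as a safe upper bound; thread_id is hypothetical:
+   @code
+   unsigned long long pcycles[QURT_MAX_HTHREAD_LIMIT];
+
+   int rc = qurt_profile_get_threadid_pcycles2(thread_id, pcycles,
+                                               sizeof(pcycles));
+   if (rc != QURT_EOK) {
+       // buffer too small, or thread_id is invalid
+   }
+   @endcode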
+*/
+/* ======================================================================*/
+int qurt_profile_get_threadid_pcycles2 (int thread_id, unsigned long long *pcycles, unsigned int length_in_bytes);
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_reset_threadid_pcycles
+   @xreflabel{hdr:qurt_profile_reset_threadid_pcycles}
+   Sets the per-hardware-thread running cycle counts to zero for the specified QuRT thread.
+
+   @param[in] thread_id Thread identifier.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+void qurt_profile_reset_threadid_pcycles (int thread_id);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_enable
+   @xreflabel{hdr:qurt_profile_enable}
+   Enables profiling.\n
+   Enables or disables cycle counting of the running and idle processor cycles.
+   Profiling is disabled by default. \n
+
+   @note1hang Enabling profiling does not automatically reset the cycle counts -- this must be
+              done explicitly by calling the reset operations before starting cycle counting.
+              Cycle counting starts the instant profiling is enabled with this API, and
+              halts when profiling is disabled.
+
+   @param[in] enable Profiling. Values: \n
+                     - 0 -- Disable profiling \n
+                     - 1 -- Enable profiling @tablebulletend
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+void qurt_profile_enable (int enable);
+
+/*======================================================================*/
+/**@ingroup func_qurt_get_hthread_pcycles
+   @xreflabel{hdr:qurt_get_hthread_pcycles}
+   Reads the GCYCLE_nT register to allow performance measurement when N threads are in run mode.\n
+
+   @note1hang Returns 0 when the architecture is earlier than v67 or for an invalid hardware thread ID.
+
+   @param[in] n Threads in run mode. Valid values are 1 through the number of hardware
+                threads on the target.
+
+   @return
+   Value read from the GCYCLE_nT register. This value indicates the total number of pcycles executed
+   from reset to the current point of execution when n threads are in run mode.
+
+   @dependencies
+   PMU must be enabled.
+*/
+/* ======================================================================*/
+unsigned int qurt_get_hthread_pcycles(int n);
+
+/*======================================================================*/
+/**@ingroup func_qurt_get_hthread_commits
+   @xreflabel{hdr:qurt_get_hthread_commits}
+   Reads the GCOMMIT_nT register to allow performance measurement when N threads are in run mode.\n
+
+   @note1hang Returns 0 when the architecture is earlier than v67 or for an invalid hardware thread ID.
+
+   @param[in] n Threads in run mode. Valid values are 1 through the number of hardware
+                threads on the target.
+
+   @return
+   Value read from the GCOMMIT_nT register. This value indicates the total number of packets
+   committed from reset to the current point of execution when n threads are in run mode.
+
+   @dependencies
+   PMU must be enabled.
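+
+   A small sketch of how the two per-run-mode counters can be combined
+   (illustrative only; assumes a v67+ target with the PMU already enabled):
+   @code
+   // Packets per cycle while exactly two threads were in run mode.
+   unsigned int cycles_2t  = qurt_get_hthread_pcycles(2);
+   unsigned int commits_2t = qurt_get_hthread_commits(2);
+   double ppc_2t = (cycles_2t != 0U) ? (double)commits_2t / (double)cycles_2t : 0.0;
+   @endcode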
+*/
+/* ======================================================================*/
+unsigned int qurt_get_hthread_commits(int n);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_devtree.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_devtree.h
new file mode 100755
index 0000000000000..4adee45bb44a2
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_devtree.h
@@ -0,0 +1,161 @@
+#ifndef QURT_DEVTREE_H
+#define QURT_DEVTREE_H
+/**
+  @file qurt_devtree.h
+  @brief Prototypes and structures for device tree aware QuRT library functions.
+
+Copyright (c) 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+*/
+/* qurt_callback is included by qurt_qdi_driver.h and depends on NULL being defined.
+   The callback is not used here, so define NULL here to avoid pulling in extra headers. */
+#ifndef NULL
+#define NULL ((void *) 0)
+#endif
+
+#include "libfdt.h"
+#include "DTBExtnLib.h"
+#include "qurt_qdi_ext.h"
+#include "qurt_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define INVALID_BLOB_ID (-1)
+#define DEFAULT_BLOB_ID 0
+
+/** QuRT device tree mapping macros. */
+#define QURT_DT_MAPPING_FAILED (-1)
+#define QURT_DT_FLAG_ISLAND 0x1
+#define QURT_DT_FLAG_PHYSADDR 0x2
+
+/** Device Tree type for the Root PD Device Tree.
+    The Root PD Device Tree typically describes the hardware in the subsystem.
+    This is the /soc portion of the Device Tree. */
+#define QURT_DT_BLOB_TYPE_ROOT 0
+
+/** Device Tree type for the Local Device Tree.
+    The Local Device Tree typically contains the software settings.
+    This is the /sw portion of the Device Tree. */
+#define QURT_DT_BLOB_TYPE_LOCAL 1
+
+int qurt_devtree_init(void);
+
+/**@ingroup func_qurt_dt_mapping_create
+   Creates a memory mapping from the specified property of the specified device
+   tree node. Returns virtual addresses and sizes.
+
+   @param[in]  devtreeNode Device tree node handle.
+   @param[in]  flags       Flags to configure the memory mapping.
+   @param[in]  regionName  Name of the region property to use for the mapping; should
+                           resemble a region. NULL selects by index instead.
+   @param[in]  regionIdx   Index of the region to map when regionName is NULL.
+   @param[out] vaddr       Return pointer for the virtual region address.
+   @param[out] size        Return pointer for the virtual region size.
+
+   @return
+   Result code indicating success or failure. \n
+*/
+int qurt_dt_mapping_create(fdt_node_handle *devtreeNode, int flags, char *regionName, int regionIdx,
+                           unsigned long long *vaddr, unsigned long long *size);
+
+/**@ingroup func_qurt_dt_mapping_create2
+
+   Creates a memory mapping from the specified property of the specified device
+   tree node.
+
+   Returns virtual addresses and sizes according to the architecture (that is, either 32-bit or 64-bit).
+
+   @param[in] devtreeNode Device tree node.
+
+   @param[in] dt_map_flags Flags to configure the memory mapping; reserved for future use.
+                           (0) - Default value; assumes the details from the DT node are a physical address and size.
+                           QURT_DT_FLAG_ISLAND
+
+                           NOTE: The PA must be added to the corresponding island spec to create an island mapping.
+
+   @param[in] regionName  NULL, or the name of the range to return; should
+                          resemble a region. Example: reg-names = "base", "rx", "tx";
+
+   @param[in] regionIdx   Index of the range to return. Example: reg = <0x1000 0x20>, <0x10000 0x100>, <0x18000 0x100>;
+
+                          NOTE: If the client specifies both regionName and regionIdx, regionName
+                          takes precedence and regionIdx is ignored.
+
+   @param[in] dt_map_perm Mapping access permissions (R/W):
+                          QURT_PERM_READ
+                          QURT_PERM_WRITE
+
+   @param[in] cache_attr  QuRT cache mode types:
+                          QURT_MEM_CACHE_DEVICE
+                          QURT_MEM_CACHE_WRITEBACK
+                          Other cache type enums from qurt_types.h can also be passed.
+
+                          NOTE: There is no default value for the cache and permission attributes.
+                          The client must always pass one of the defined flags.
+
+   @param[out] vaddr Return pointer to the variable that holds the virtual address.
+   @param[out] size  Return pointer for the virtual region size.
+
+   @return
+   #QURT_EOK               Success; the mapping was created properly.
+   #QURT_DT_MAPPING_FAILED Failed to create the mapping.
+   #QURT_EINVALID          Mismatch in the architecture.
+
+   Otherwise, an FdtLib or third-party error code.
+
+*/
+int qurt_dt_mapping_create2(fdt_node_handle *devtreeNode, unsigned int dt_map_flags,
+        char *regionName, int regionIdx, unsigned int dt_map_perm, int cache_attr, void **vaddr, size_t *size);
+
+/**@ingroup func_qurt_dt_isr_register
+   Device tree aware registration of an interrupt service routine (ISR) to an ISR thread.
+   The interrupt defined in the specified device tree node is enabled when this function returns success.
+
+   @datatypes
+   #qurt_thread_t \n
+   #fdt_node_handle
+
+   @param[in] dt_node        Device tree node that specifies the interrupt property.
+   @param[in] dt_int_index   Index of the specific interrupt to use within the device tree node structure.
+                             Specify either this or dt_int_name; use -1 if the name string is used.
+   @param[in] dt_int_name    Name of the specific interrupt to use within the device tree node structure.
+                             Specify either this or dt_int_index; use NULL if the index is used.
+   @param[in] isr_thread_id  ISR thread ID, returned from qurt_isr_create(), defined by qurt_isr_register2().
+   @param[in] prio           Priority of the ISR, defined by qurt_isr_register2().
+   @param[in] flags          Defines the ACK type. Values: \n
+                             #QURT_INT_NON_DELAYED_ACK - ISR is acknowledged by the interrupt handle routine
+                             in the kernel.
+                             #QURT_INT_DELAYED_ACK - Client chooses to acknowledge.
+                             Defined by qurt_isr_register2().
+   @param[in] isr            ISR with prototype void isr (void *arg, int int_num), defined by qurt_isr_register2().
+   @param[in] arg            First argument of the ISR when it is called to service the interrupt, defined by qurt_isr_register2().
+
+   @return
+   #QURT_EOK -- Successfully registered the ISR for the interrupt \n
+   #QURT_EINT -- Interrupt not configured \n
+   #QURT_EINVALID -- Invalid thread ID \n
+   #QURT_EDISABLED -- The feature is disabled \n
+   #QURT_EDUPLICATE -- Interrupt is already registered
+
+   @dependencies
+   Create the thread ID with qurt_isr_create().
+   Complete the ISR registration with qurt_isr_register2().
+ */
+int qurt_dt_isr_register(fdt_node_handle *dt_node, int dt_int_index, char * dt_int_name, qurt_thread_t isr_thread_id,
+                         unsigned short prio, unsigned short flags, void (*isr) (void *, int), void *arg);
+
+/**@ingroup func_qurt_dt_blob_id_get
+   Returns the blob ID for the blob type passed.
+   The value returned from this API can be passed as the blob ID parameter to DTBExtnLib APIs.
+
+   @param[in] blob_type Blob type to look up.
+
+   @return
+   Blob ID for the passed blob type.
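+
+   Illustrative sketch (not part of the SDK header):
+   @code
+   int blob_id = qurt_dt_blob_id_get(QURT_DT_BLOB_TYPE_LOCAL);
+   if (blob_id != INVALID_BLOB_ID) {
+       // blob_id can now be passed as the blob ID parameter to DTBExtnLib APIs.
+   }
+   @endcode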
+*/ +int qurt_dt_blob_id_get(unsigned int blob_type); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_ecc.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_ecc.h new file mode 100755 index 0000000000000..09312684e99af --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_ecc.h @@ -0,0 +1,168 @@ +#ifndef QURT_ECC_H +#define QURT_ECC_H + + +/*===================================================================== + + @file qurt_ecc.h + @brief Prototypes of QuRT memory ECC API functions + + Copyright (c) 2018, 2020-2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + TYPEDEFS +=============================================================================*/ +/** @addtogroup exception_handling_types +@{ */ +// ECC memory definition +typedef enum { + QURT_ECC_MEM_L1_ICACHE = 0, /**< ECC memory L1 ICache. */ + QURT_ECC_MEM_L1_DCACHE = 1, /**< ECC memory L1 DCache.*/ + QURT_ECC_MEM_L2_CACHE = 2, /**< ECC memory L2 Cache.*/ + QURT_ECC_MEM_VTCM = 3 /**< ECC memory VTCM.*/ +} qurt_ecc_memory_t; +/** @} */ /* end_addtogroup exception_handling_types */ + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/** @addtogroup exception_handling_macros +@{ */ + +#define QURT_ECC_ERR_DETECTED_STATUS 0 /**< ECC error detected. */ +#define QURT_ECC_ERR_TYPE 1 /**< ECC error type.*/ +// ECC status type + +#define QURT_ECC_CORRECTABLE_COUNT (1<<0) /**< ECC correctable count.*/ +#define QURT_ECC_UNCORRECTABLE_COUNT (1<<1) /**< ECC uncorrectable count.*/ +#define QURT_ECC_REGION_LOGGING (1<<2) /**< ECC region logging.*/ +// ECC enable/disable definition + +#define QURT_ECC_PROTECTION_DISABLE (0<<0) /**< Bit 0. */ +#define QURT_ECC_PROTECTION_ENABLE (1<<0) /**< Bit 0. */ +/** @} */ /* end_addtogroup exception_handling_macros */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ + + +/**@ingroup func_qurt_ecc_enable + Enables or disables ECC protection on a specified memory. + + @datatypes + #qurt_ecc_memory_t + + @param[in] memory Set to one of the following values: + - #QURT_ECC_MEM_L1_ICACHE + - #QURT_ECC_MEM_L1_DCACHE + - #QURT_ECC_MEM_L2_CACHE + - #QURT_ECC_MEM_VTCM @tablebulletend + + @param[in] enable Set to one of the following values: + - #QURT_ECC_PROTECTION_ENABLE + - #QURT_ECC_PROTECTION_DISABLE @tablebulletend + + @return + - #QURT_EOK -- ECC enabling or disabling setup is performed successfully + - Others -- Failure + + @dependencies + None. + */ +int qurt_ecc_enable( qurt_ecc_memory_t memory, unsigned int enable ); + + +/**@ingroup func_qurt_ecc_get_error_status + Gets ECC error status for a specified memory. 
+ + @datatypes + #qurt_ecc_memory_t + + @param[in] memory Set to one of the following: + - #QURT_ECC_MEM_L1_ICACHE + - #QURT_ECC_MEM_L1_DCACHE + - #QURT_ECC_MEM_L2_CACHE + - #QURT_ECC_MEM_VTCM @tablebulletend + + @param[in] type Set to one of the following: + - #QURT_ECC_ERR_DETECTED_STATUS + - #QURT_ECC_ERR_TYPE @tablebulletend + + @return + Returns the following when the type is #QURT_ECC_ERR_DETECTED_STATUS: + - 0 -- No error detected \n + - 1 -- At least one error detected \n + Returns the following when the type is #QURT_ECC_ERR_TYPE: \n + - 0 through 1 -- Correctable error \n + - 2 -- Uncorrectable error + + @dependencies + None. + */ +int qurt_ecc_get_error_status( qurt_ecc_memory_t memory, unsigned int type ); + + +/**@ingroup func_qurt_ecc_get_error_count + Gets the ECC error count for a specified memory. + + @datatypes + #qurt_ecc_memory_t + + @param[in] memory Set to one of the following values:\n + - #QURT_ECC_MEM_L1_ICACHE \n + - #QURT_ECC_MEM_L1_DCACHE \n + - #QURT_ECC_MEM_L2_CACHE \n + - #QURT_ECC_MEM_VTCM @tablebulletend + + @param[in] type Set to one of the following values: \n + - #QURT_ECC_CORRECTABLE_COUNT \n + - #QURT_ECC_UNCORRECTABLE_COUNT @tablebulletend + + @return + Error count for the specified error type. + + @dependencies + None. + */ +int qurt_ecc_get_error_count( qurt_ecc_memory_t memory, unsigned int type ); + + +/**@ingroup func_qurt_ecc_clear_error_count + Clears ECC error count or region logging for a specified memory. + + @datatypes + #qurt_ecc_memory_t + + @param[in] memory Set to one of the following values: \n + - #QURT_ECC_MEM_L1_ICACHE \n + - #QURT_ECC_MEM_L1_DCACHE \n + - #QURT_ECC_MEM_L2_CACHE \n + - #QURT_ECC_MEM_VTCM @tablebulletend + + @param[in] type Set to one or multiple OR'ed of the following values: \n + - #QURT_ECC_CORRECTABLE_COUNT \n + - #QURT_ECC_UNCORRECTABLE_COUNT \n + - #QURT_ECC_REGION_LOGGING @tablebulletend + + @return + #QURT_EOK -- Error count successfully cleared \n + Others -- Failure at clearing the error count + + @dependencies + None. + */ +int qurt_ecc_clear_error_count( qurt_ecc_memory_t memory, unsigned int type ); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ECC_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_error.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_error.h new file mode 100755 index 0000000000000..f4666b396c378 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_error.h @@ -0,0 +1,149 @@ +#ifndef QURT_ERROR_H +#define QURT_ERROR_H + +/** + @file qurt_error.h + Error results- QURT defines a set of standard symbols for the error result values. This file lists the + symbols and their corresponding values. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021-2022 , 2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc.. + ======================================================================*/ +#include "qurt_except.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup chapter_error +@{ */ + +/*===================================================================== +Constants and macros +======================================================================*/ +#define QURT_EOK 0 /**< Operation successfully performed. */ +#define QURT_EVAL 1 /**< Wrong values for the parameters. The specified page does not exist. 
*/ +#define QURT_EMEM 2 /**< Not enough memory to perform the operation.*/ + +#define QURT_EINVALID 4 /**< Invalid argument value; invalid key. */ +/** @cond */ +#define QURT_EUNKNOWN 6 /**< Defined but never used in QuRT. */ +#define QURT_ENOMSGS 7 /**< Message queue is empty. */ +#define QURT_EBADF 9 /**< Bad message queue descriptor. */ +/** @endcond */ +#define QURT_EFAILED 12 /**< Operation failed. */ + +#define QURT_ENOTALLOWED 13 /**< Operation not allowed. */ + +/** @cond */ +#define QURT_EDUPCLSID 14 /*< Duplicate class ID. */ +/** @endcond */ +/** @cond rest_reg_dist */ +#define QURT_ENOREGISTERED 20 /**< No registered interrupts.*/ +/** @endcond */ + + +/** @cond */ +#define QURT_EISDB 21 /*< Power collapse failed due to ISDB being enabled. */ +#define QURT_ESTM 22 /*< Power collapse failed in a Single-threaded mode check. */ +/** @endcond */ + + +/** @cond rest_reg_dist */ +#define QURT_ETLSAVAIL 23 /**< No free TLS key is available. */ +#define QURT_ETLSENTRY 24 /**< TLS key is not already free. */ +/** @endcond */ + +#define QURT_EINT 26 /**< Invalid interrupt number (not registered). */ +/** @cond rest_reg_dist */ +#define QURT_ESIG 27 /**< Invalid signal bitmask (cannot set more than one signal at a time). */ +/** @endcond */ + +/** @cond */ +#define QURT_EHEAP 28 /**< No heap space is available. */ +#define QURT_ENOSPC 28 /**< No space to create another queue in the system. */ +#define QURT_EMEMMAP 29 /**< Physical address layout is not supported by the kernel. */ +/** @endcond */ +/** @cond rest_reg_dist */ +#define QURT_ENOTHREAD 30 /**< Thread no longer exists. */ +/** @endcond */ +/** @cond */ +#define QURT_EL2CACHE 31 /**< L2cachable is not supported in kernel invalidate/cleaninv. */ +/** @endcond */ +/** @cond rest_reg_dist */ +#define QURT_EALIGN 32 /**< Not aligned. */ +#define QURT_EDEREGISTERED 33 /**< Interrupt is already deregistered.*/ +/** @endcond */ + +/** @cond internal_only */ + +#define QURT_ETLBCREATESIZE 34 /**< TLB create error -- Incorrect size.*/ +#define QURT_ETLBCREATEUNALIGNED 35 /**< TLB create error -- Unaligned address.*/ +/** @endcond */ +/** @cond rest_reg_dist*/ +#define QURT_EEXISTS 35 /**< File or message queue already exists. */ +#define QURT_ENAMETOOLONG 36 /**< Name too long for message queue creation. */ +#define QURT_EPRIVILEGE 36 /**< Caller does not have privilege for this operation.*/ + +#define QURT_ECANCEL 37 /**< A cancellable request was canceled because the associated process was asked to exit.*/ +/** @endcond */ + +/** @cond */ +#define QURT_EISLANDTRAP 38 /*< Unsupported TRAP is called in Island mode.*/ + +#define QURT_ERMUTEXUNLOCKNONHOLDER 39 /*< Rmutex unlock by a non-holder.*/ +#define QURT_ERMUTEXUNLOCKFATAL 40 /*< Rmutex unlock error, all except the non-holder error.*/ +#define QURT_EMUTEXUNLOCKNONHOLDER 41 /*< Mutex unlock by a non-holder.*/ +#define QURT_EMUTEXUNLOCKFATAL 42 /*< Mutex unlock error, all except the non-holder error.*/ +#define QURT_EINVALIDPOWERCOLLAPSE 43 /*< Invalid power collapse mode requested. */ +/** @endcond */ +#define QURT_EISLANDUSEREXIT 44 /**< User call has resulted in island exit.*/ +#define QURT_ENOISLANDENTRY 45 /**< Island mode had not yet been entered.*/ +#define QURT_EISLANDINVALIDINT 46 /**< Exited Island mode due to an invalid island interrupt.*/ +/** @cond rest_reg_dist */ +#define QURT_ETIMEDOUT 47 /**< Operation timed-out. */ +#define QURT_EALREADY 48 /**< Operation already in progress. */ +/** @endcond */ + +#define QURT_ERETRY 49 /*< Retry the operation. 
*/ +#define QURT_EDISABLED 50 /*< Resource disabled. */ +#define QURT_EDUPLICATE 51 /*< Duplicate resource. */ +#define QURT_EBADR 53 /*< Invalid request descriptor. */ +#define QURT_ETLB 54 /*< Exceeded maximum allowed TLBs. */ +#define QURT_ENOTSUPPORTED 55 /*< Operation not supported. */ +/** @cond rest_reg_dist */ +#define QURT_ENORESOURCE 56 /**< No resource. */ +/** @endcond */ + +#define QURT_EDTINIT 57 /**< Problem with device tree intialization. */ +#define QURT_EBUFLOCK 58 /*< Buffer lock failed because it was already locked many times. */ +#define QURT_ELOCKED 59 /**< Current operation failed as the buffer is locked. */ +#define QURT_EMSGSIZE 90 /*< Message queue msg_len is greater than mq_msgsize attribute of the message queue. */ + + +#define QURT_ENOTCONFIGURED 91 /*< Interrupt is NOT configured. */ + +#define QURT_EBANDWIDTHLIMIT 92 /*< Message queue send exceed the bandwidth limit. */ + +#define QURT_ECFIVIOLATION 93 /*< CFI violation detected. */ + +#define QURT_EDESTROY 94 /**< A destroy request was made to waiting threads.*/ + +#define QURT_EHMXNOTAVAIL 95 /**< HMX is not available to target thread.*/ +#define QURT_EHMXNOTDETACHABLE 96 /**< HMX is not detachable from target thread.*/ + +#define QURT_EFATAL -1 /**< Fatal error. */ + +/** @} */ /* end_addtogroup chapter_error */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ERROR_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_event.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_event.h new file mode 100755 index 0000000000000..987f0fe79f227 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_event.h @@ -0,0 +1,452 @@ +#ifndef QURT_EVENT_H +#define QURT_EVENT_H +/** + @file qurt_event.h + @brief Prototypes of kernel event API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2018-2021, 2023 Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#include "qurt_consts.h" +#include "qurt_thread.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * System environment object type. + */ +/**@addtogroup sys_env_types +@{ */ +/** QuRT swap pool information type. */ +typedef struct qurt_sysenv_swap_pools { + /** @cond */ + unsigned int spoolsize; /* Swap pool size.*/ + unsigned int spooladdr; /* Swap pool start address.*/ + /** @endcond */ +}qurt_sysenv_swap_pools_t; + +/**QuRT application heap information type. */ +typedef struct qurt_sysenv_app_heap { + /** @cond */ + unsigned int heap_base; /* Heap base address.*/ + unsigned int heap_limit; /* Heap end address.*/ + /** @endcond */ +} qurt_sysenv_app_heap_t ; + +/** QuRT architecture version information type. */ +typedef struct qurt_sysenv_arch_version { + /** @cond */ + unsigned int arch_version; /*Architecture version.*/ + /** @endcond */ +}qurt_arch_version_t; + +/** QuRT maximum hardware threads information type. */ +typedef struct qurt_sysenv_max_hthreads { + /** @cond */ + unsigned int max_hthreads; /*Maximum number of hardware threads.*/ + /** @endcond */ +}qurt_sysenv_max_hthreads_t; + +/** QuRT active hardware threads information type. 
*/ +typedef struct qurt_sysenv_hthreads { + /** @cond */ + unsigned int hthreads; /*Maximum number of hardware threads.*/ + /** @endcond */ +}qurt_sysenv_hthreads_t; + +/** QuRT maximum pi priority information type. */ +typedef struct qurt_sysenv_max_pi_prio { + /** @cond */ + unsigned int max_pi_prio; /*Maximum pi priority.*/ + /** @endcond */ +}qurt_sysenv_max_pi_prio_t; + +/** QuRT process name information type. */ +typedef struct qurt_sysenv_procname { + /** @cond */ + union { + unsigned int asid; /*Address space ID.*/ + unsigned int pid; /*Process ID.*/ + }; + char name[QURT_MAX_NAME_LEN]; /* Process name.*/ + /** @endcond */ +}qurt_sysenv_procname_t; + +/** QuRT stack profile count information type. */ +typedef struct qurt_sysenv_stack_profile_count { + /** @cond */ + unsigned int count; /*Stack profile count for usage.*/ + unsigned int count_watermark; /*Stack profile count for watermark.*/ + /** @endcond */ +}qurt_sysenv_stack_profile_count_t; + +/** + QuRT system error event type. + */ +typedef struct _qurt_sysevent_error_t +{ + unsigned int thread_id; /**< Thread ID. */ + unsigned int fault_pc; /**< Fault PC. */ + unsigned int sp; /**< Stack pointer. */ + unsigned int badva; /**< Virtual data address where the exception occurred. */ + unsigned int cause; /**< QuRT error result. */ + unsigned int ssr; /**< Supervisor status register. */ + unsigned int fp; /**< Frame pointer. */ + unsigned int lr; /**< Link register. */ + unsigned int pid; /**< PID of the process to which this thread belongs.*/ + } qurt_sysevent_error_t ; + +typedef struct _qurt_sysevent_error_1_t +{ + unsigned int thread_id; /**< Thread ID. */ + unsigned int fault_pc; /**< Fault PC. */ + unsigned int sp; /**< Stack pointer. */ + unsigned int badva; /**< Virtual data address where the exception occurred. */ + unsigned int cause; /**< QuRT error result. */ + unsigned int ssr; /**< Supervisor status register. */ + unsigned int fp; /**< Frame pointer. */ + unsigned int lr; /**< Link register. */ + unsigned int pid; /**< PID of the process to which this thread belongs.*/ + unsigned int fkey; /**< Framekey.*/ + unsigned int reserved1; /**< Reserved.*/ + unsigned int reserved2; /**< Reserved.*/ + unsigned int reserved3; /**< Reserved.*/ + } qurt_sysevent_error_1_t ; + +/** QuRT page fault error event information type. */ +typedef struct qurt_sysevent_pagefault { + qurt_thread_t thread_id; /**< Thread ID of the page fault thread. */ + unsigned int fault_addr; /**< Accessed address that caused the page fault. */ + unsigned int ssr_cause; /**< SSR cause code for the page fault. */ +} qurt_sysevent_pagefault_t ; +/** @} */ /* @endaddtogroup sys_env_types */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/*======================================================================*/ +/** + Gets the environment swap pool 0 information from the kernel. + + @datatypes + #qurt_sysenv_swap_pools_t + + @param[out] pools Pointer to the pools information. + + @return + #QURT_EOK -- Success. + + @dependencies + None. +*/ +int qurt_sysenv_get_swap_spool0 (qurt_sysenv_swap_pools_t *pools ); + +/* + Gets the environment swap pool 1 information from the kernel. + + @datatypes + #qurt_sysenv_swap_pools_t + + @param[out] pools Pointer to the pools information. + + @return + #QURT_EOK -- Success. + + @dependencies + None. 
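+
+   A minimal call sketch (illustrative only; not part of the SDK header):
+
+     qurt_sysenv_swap_pools_t pool;
+     if (qurt_sysenv_get_swap_spool1(&pool) == QURT_EOK) {
+         // pool.spooladdr and pool.spoolsize describe swap pool 1.
+     }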
+*/ +int qurt_sysenv_get_swap_spool1(qurt_sysenv_swap_pools_t *pools ); + +/**@ingroup func_qurt_sysenv_get_app_heap + Gets information on the program heap from the kernel. + + @datatypes + #qurt_sysenv_app_heap_t + + @param[out] aheap Pointer to information on the program heap. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_app_heap(qurt_sysenv_app_heap_t *aheap ); + +/**@ingroup func_qurt_sysenv_get_arch_version + Gets the Hexagon processor architecture version from the kernel. + + @datatypes + #qurt_arch_version_t + + @param[out] vers Pointer to the Hexagon processor architecture version. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter + + @dependencies + None. +*/ +int qurt_sysenv_get_arch_version(qurt_arch_version_t *vers); + +/**@ingroup func_qurt_sysenv_get_max_hw_threads + Gets the maximum number of hardware threads supported in the Hexagon processor. + The API includes the disabled hardware threads to reflect the maximum + hardware thread count. + For example, if the image is configured for four hardware threads and hthread_mask is set to 0x5 in + cust_config.xml, only HW0 and HW2 are initialized by QuRT. + HW1 and HW3 are not used at all. Under such a scenario, + qurt_sysenv_get_max_hw_threads() still returns four. + + @datatypes + #qurt_sysenv_max_hthreads_t + + @param[out] mhwt Pointer to the maximum number of hardware threads supported in the Hexagon processor. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_max_hw_threads(qurt_sysenv_max_hthreads_t *mhwt ); + +/**@ingroup func_qurt_sysenv_get_hw_threads + Gets the number of hardware threads initialized by QuRT in Hexagon processor. + For example, if the image is configured for four hardware threads and hthread_mask is set to 0x5 in + cust_config.xml, QuRT only initializes HW0 and HW2. + HW1 and HW3 are not used. In this scenario, qurt_sysenv_get_hw_threads() returns 2. + + @datatypes + #qurt_sysenv_hthreads_t + + @param[out] mhwt Pointer to the number of hardware threads active in the Hexagon processor. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_hw_threads(qurt_sysenv_hthreads_t *mhwt ); + +/**@ingroup func_qurt_sysenv_get_max_pi_prio + Gets the maximum priority inheritance mutex priority from the kernel. + + @datatypes + #qurt_sysenv_max_pi_prio_t + + @param[out] mpip Pointer to the maximum priority inheritance mutex priority. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_max_pi_prio(qurt_sysenv_max_pi_prio_t *mpip ); + +/**@ingroup func_qurt_sysenv_get_process_name2 + Gets information on the system environment process names based on the client_handle argument. + + @datatypes + #qurt_sysenv_procname_t + + @param[in] client_handle Obtained from the current invocation function (Section 3.4.1). + @param[out] pname Pointer to information on the process names in the system. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_process_name2(int client_handle, qurt_sysenv_procname_t *pname ); + +/**@ingroup func_qurt_sysenv_get_process_name + Gets information on the system environment process names from the kernel. 
+ + @datatypes + #qurt_sysenv_procname_t + + @param[out] pname Pointer to information on the process names in the system. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_process_name(qurt_sysenv_procname_t *pname ); + +/**@ingroup func_qurt_sysenv_get_stack_profile_count + Gets information on the stack profile count from the kernel. + + @datatypes + #qurt_sysenv_stack_profile_count_t + + @param[out] count Pointer to information on the stack profile count. + + @return + #QURT_EOK -- Success. + + @dependencies + None. +*/ +int qurt_sysenv_get_stack_profile_count(qurt_sysenv_stack_profile_count_t *count ); + +/**@ingroup func_qurt_exception_wait + Registers the program exception handler. + This function assigns the current thread as the QuRT program exception handler and suspends the + thread until a program exception occurs. + + When a program exception occurs, the thread is awakened with error information + assigned to the parameters of this operation. + + @note1hang If no program exception handler is registered, or if the registered handler + calls exit, QuRT raises a kernel exception. + If a thread runs in Supervisor mode, any errors are treated as kernel + exceptions. + + @param[out] ip Pointer to the instruction memory address where the exception occurred. + @param[out] sp Stack pointer. + @param[out] badva Pointer to the virtual data address where the exception occurred. + @param[out] cause Pointer to the QuRT error result code. + + @return + Registry status: \n + Thread identifier -- Handler successfully registered. \n + #QURT_EFATAL -- Registration failed. + + @dependencies + None. +*/ +unsigned int qurt_exception_wait (unsigned int *ip, unsigned int *sp, + unsigned int *badva, unsigned int *cause); + +unsigned int qurt_exception_wait_ext (qurt_sysevent_error_t * sys_err); + +/**@ingroup func_qurt_exception_wait3 + Registers the current thread as the QuRT program exception handler, and suspends the thread until a + program exception occurs. + When a program exception occurs, the thread is awakened with error information assigned to the specified + error event record. + If a program exception is raised when no handler is registered (or when a handler is registered, but it calls + exit), the exception is treated as fatal.\n + @note1hang If a thread runs in Monitor mode, all exceptions are treated as kernel exceptions.\n + @note1cont This function differs from qurt_exception_wait() by returning the error information in a data + structure rather than as individual variables. It also returns additional information (for example, SSR, FP, and LR). + + @param[out] sys_err Pointer to the qurt_sysevent_error_1_t type structure. + @param[in] sys_err_size Size of the qurt_sysevent_error_1_t structure. + + @return + Registry status: \n + - #QURT_EFATAL -- Failure. \n + - Thread ID -- Success. + + @dependencies + None. +*/ + +unsigned int qurt_exception_wait3(void * sys_err, unsigned int sys_err_size); + +/**@ingroup func_qurt_exception_raise_nonfatal + Raises a nonfatal program exception in the QuRT program system. + + For more information on program exceptions, see Section @xref{dox:exception_handling}. + + This operation never returns -- the program exception handler is assumed to perform all + exception handling before terminating or reloading the QuRT program system. + + @note1hang The C library function abort() calls this operation to indicate software + errors. 
+
+   @param[in] error QuRT error result code (Section @xref{dox:error_results}).
+
+   @return
+   Integer -- Unused.
+
+   @dependencies
+   None.
+*/
+int qurt_exception_raise_nonfatal (int error) __attribute__((noreturn));
+
+
+/**@ingroup func_qurt_exception_raise_fatal
+   Raises a fatal program exception in the QuRT system.
+
+   Fatal program exceptions terminate the execution of the QuRT system without invoking
+   the program exception handler.
+
+   For more information on fatal program exceptions, see Section @xref{dox:exception_handling}.
+
+   This operation always returns, so the calling program can perform the necessary shutdown
+   operations (data logging, and so on).
+
+   @note1hang Context switches do not work after this operation has been called.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+void qurt_exception_raise_fatal (void);
+
+unsigned int qurt_enable_floating_point_exception(unsigned int mask);
+
+/**@ingroup func_qurt_exception_enable_fp_exceptions
+   Enables the specified floating point exceptions as QuRT program exceptions.
+
+   The exceptions are enabled by setting the corresponding bits in the Hexagon
+   control user status register (USR).
+
+   The mask argument specifies a mask value identifying the individual floating
+   point exceptions to set. The exceptions are represented as defined symbols
+   that map into bits 0 through 31 of the 32-bit flag value.
+   Multiple floating point exceptions are specified by OR'ing together the individual
+   exception symbols.\n
+   @note1hang This function must be called before performing any floating point operations.
+
+   @param[in] mask Floating point exception types. Values: \n
+                   - #QURT_FP_EXCEPTION_ALL \n
+                   - #QURT_FP_EXCEPTION_INEXACT \n
+                   - #QURT_FP_EXCEPTION_UNDERFLOW \n
+                   - #QURT_FP_EXCEPTION_OVERFLOW \n
+                   - #QURT_FP_EXCEPTION_DIVIDE0 \n
+                   - #QURT_FP_EXCEPTION_INVALID @tablebulletend
+
+   @return
+   Updated contents of the USR.
+
+   @dependencies
+   None.
+*/
+
+static inline unsigned int qurt_exception_enable_fp_exceptions(unsigned int mask)
+{
+    return qurt_enable_floating_point_exception(mask);
+}
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_EVENT_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_except.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_except.h
new file mode 100755
index 0000000000000..e1684c80e3d50
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_except.h
@@ -0,0 +1,185 @@
+#ifndef QURT_EXCEPT_H
+#define QURT_EXCEPT_H
+
+/**
+  @file qurt_except.h
+  @brief Defines Cause and Cause2 codes for error-handling.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+  Copyright (c) 2021-2022 by Qualcomm Technologies, Inc. All Rights Reserved.
+
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+  ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+   QuRT supports error handling to handle CPU-detected exceptions and software errors.
+   QuRT treats all errors as either fatal errors or nonfatal errors.
+
+   @section sec1 Fatal errors
+   All supervisor mode exceptions are treated as fatal errors.
+   If a registered exception handler calls qurt_exit(), it is treated as a fatal error.
+   Fatal errors result in saving the context of the primary hardware thread to QURT_error_info and the rest of the thread contexts to the corresponding TCBs.
+   All hardware threads are eventually stopped and the cache is flushed.
+   The NMI exception is treated a little differently from other fatal errors: QuRT saves the contexts of all the hardware threads into QURT_error_info.\n
+
+   @subsection subsection1 Debugging fatal errors
+   - QURT_error_info.status.status -- Indicates that an error occurred.
+   - QURT_error_info.status.cause -- Cause code for the fatal error; Cause and Cause2 details are listed below.
+   - QURT_error_info.status.cause2 -- Cause2 code for the fatal error; Cause and Cause2 details are listed below.
+   - QURT_error_info.status.fatal -- Indicates whether a fatal error occurred. A user error can result in a fatal error if the exception handler is not registered.
+   - QURT_error_info.status.hw_tnum -- Indicates the index of QURT_error_info.locregs[], where the context is saved when the error is a fatal error.
+   - QURT_error_info.global_regs -- Contains the values of the global registers of Q6.
+   - QURT_error_info.local_regs[QURT_error_info.status.hw_tnum] -- Provides the CPU context when the error is a supervisor error.
+
+
+
+   @subsection subsection2 Debugging nonfatal errors
+   - QURT_error_info.user_errors -- All user errors are logged here.
+   - QURT_error_info.user_errors.counter -- Index of the last logged error.
+   - QURT_error_info.user_errors.entry[0...counter] -- Structure for a logged error.
+   - QURT_error_info.user_errors.entry[0...counter].error_tcb -- TCB for the user error.
+   - QURT_error_info.user_errors.entry[0...counter].error_tcb.error -- Information about the error; Cause, Cause2, Badva, and hardware thread ID.
+   - QURT_error_info.user_errors.entry[0...counter].error_code -- ((cause2 << 8) bitwise-OR (cause)); Cause and Cause2 details are listed below.
+   - QURT_error_info.user_errors.entry[0...counter].hw_thread -- Hardware thread ID for the error.
+   - QURT_error_info.user_errors.entry[0...counter].pcycle -- Pcycle for the error.
+
+@note
+   Important usage note:
+   Cause and Cause2 are error codes that distinguish multiple errors.
+   SSR and BADVA are inconclusive without the vector number.
+   Cause and Cause2 can each range from 1 to 255, and every cause can have 1 to 255 error codes.
+   Hence the system can have up to 255 * 255 unique error codes.
+   The combination is represented as ((cause2 << 8) bitwise-OR (cause)).
+   Some Cause2 codes are statically defined, whereas others are obtained from the SSR[7:0] cause codes, depending on the cause code.
+   SSR cause codes are defined in the Hexagon reference manual.
+   All possible combinations are listed below.
+*/
+/** @addtogroup chapter_error
+@{ */
+/* cause - error type - 8-bits*/
+#define QURT_EXCEPT_PRECISE             0x01U   /**< Precise exception occurred. For this cause code, Cause2 is SSR[7:0].*/
+#define QURT_EXCEPT_NMI                 0x02U   /**< NMI occurred; Cause2 is not defined. */
+#define QURT_EXCEPT_TLBMISS             0x03U   /**< TLBMISS RW occurred; for this cause code, Cause2 is SSR[7:0]. */
+#define QURT_EXCEPT_RSVD_VECTOR         0x04U   /**< Interrupt raised on a reserved vector, which must never occur. Cause2 is not defined. */
+#define QURT_EXCEPT_ASSERT              0x05U   /**< Kernel assert. The Cause2 QURT_ABORT_* values are listed below. */
+#define QURT_EXCEPT_BADTRAP             0x06U   /**< trap0(num) called with unsupported num. Cause2 is 0. */
+#define QURT_EXCEPT_UNDEF_TRAP1         0x07U   /**< Trap1 is not supported. Using Trap1 causes this error. Cause2 is not defined. */
+#define QURT_EXCEPT_EXIT                0x08U   /**< Application called qurt_exit() or qurt_exception_raise_nonfatal(). Can be called from the C library.
Cause2 is "[Argument passed to qurt_exception_raise_nonfatal() & 0xFF]". */ +#define QURT_EXCEPT_TLBMISS_X 0x0AU /**< TLBMISS X (execution) occurred. Cause2 is not defined. */ +#define QURT_EXCEPT_STOPPED 0x0BU /**< Running thread stopped due to fatal error on other hardware thread. Cause2 is not defined. */ +#define QURT_EXCEPT_FATAL_EXIT 0x0CU /**< Application called qurt_fatal_exit(). Cause2 is not defined. */ +#define QURT_EXCEPT_INVALID_INT 0x0DU /**< Kernel received an invalid L1 interrupt. Cause2 is not defined. */ +#define QURT_EXCEPT_FLOATING_POINT 0x0EU /**< Kernel received an floating point error. Cause2 is not defined. */ +#define QURT_EXCEPT_DBG_SINGLE_STEP 0x0FU /**< Cause2 is not defined. */ +#define QURT_EXCEPT_TLBMISS_RW_ISLAND 0x10U /**< Read write miss in Island mode. Cause2 QURT_TLB_MISS_RW_MEM* are listed below. */ +#define QURT_EXCEPT_TLBMISS_X_ISLAND 0x11U /**< Execute miss in Island mode. For this cause code, Cause2 is SSR[7:0]. */ +#define QURT_EXCEPT_SYNTHETIC_FAULT 0x12U /**< Synthetic fault with user request that kernel detected. Cause2 QURT_SYNTH_* are listed below. */ +#define QURT_EXCEPT_INVALID_ISLAND_TRAP 0x13U /**< Invalid trap in Island mode. Cause2 is trap number. */ +#define QURT_EXCEPT_UNDEF_TRAP0 0x14U /**< trap0(num) was called with unsupported num. Cause2 is trap number. */ +#define QURT_EXCEPT_PRECISE_DMA_ERROR 0x28U /**< Precise DMA error. Cause2 is DM4[15:8]. Badva is DM5 register. */ + +#define QURT_ECODE_UPPER_LIBC (0U << 16) /**< Upper 16 bits is 0 for libc. */ +#define QURT_ECODE_UPPER_QURT (0U << 16) /**< Upper 16 bits is 0 for QuRT. */ +#define QURT_ECODE_UPPER_ERR_SERVICES (2U << 16) /**< Upper 16 bits is 2 for error service. */ +/** @cond */ +#define QURT_ECODE_ISLAND_INVALID_QDI 3U /**< Passing invalid QDI method in island. */ +/** @endcond */ + +/* Cause2 for QURT_EXCEPT_SYNTHETIC_FAULT cause- 8bits */ +#define QURT_SYNTH_ERR 0x01U /**< */ +#define QURT_SYNTH_INVALID_OP 0x02U /**< */ +#define QURT_SYNTH_DATA_ALIGNMENT_FAULT 0x03U /**< */ +#define QURT_SYNTH_FUTEX_INUSE 0x04U /**< */ +#define QURT_SYNTH_FUTEX_BOGUS 0x05U /**< */ +#define QURT_SYNTH_FUTEX_ISLAND 0x06U /**< */ +#define QURT_SYNTH_FUTEX_DESTROYED 0x07U /**< */ +#define QURT_SYNTH_PRIVILEGE_ERR 0x08U /**< */ + +/* Cause2 - Abort cause reason - 8 bits */ +/* ERR_ASSERT cause */ +#define QURT_ABORT_FUTEX_WAKE_MULTIPLE 0x01U /**< Abort cause - futex wake multiple. */ +#define QURT_ABORT_WAIT_WAKEUP_SINGLE_MODE 0x02U /**< Abort cause - thread waiting to wake up in Single Threaded mode. */ +#define QURT_ABORT_TCXO_SHUTDOWN_NOEXIT 0x03U /**< Abort cause - call TCXO shutdown without exit. */ +#define QURT_ABORT_FUTEX_ALLOC_QUEUE_FAIL 0x04U /**< Abort cause - futex allocation queue failure - QURTK_futexhash_lifo empty. */ +#define QURT_ABORT_INVALID_CALL_QURTK_WARM_INIT 0x05U /**< Abort cause - invalid call QURTK_warm_init() in NONE CONFIG_POWER_MGMT mode. */ +#define QURT_ABORT_THREAD_SCHEDULE_SANITY 0x06U /**< Abort cause - sanity schedule thread is not supposed to run on the current hardware thread. */ +#define QURT_ABORT_REMAP 0x07U /**< Remap in the page table; the correct behavior must remove mapping if necessary. */ +#define QURT_ABORT_NOMAP 0x08U /**< No mapping in page table when removing a user mapping. */ +#define QURT_ABORT_OUT_OF_SPACES 0x09U +#define QURT_ABORT_INVALID_MEM_MAPPING_TYPE 0x0AU /**< Invalid memory mapping type when creating qmemory. */ +#define QURT_ABORT_NOPOOL 0x0BU /**< No pool available to attach. 
*/ +#define QURT_ABORT_LIFO_REMOVE_NON_EXIST_ITEM 0x0CU /**< Cannot allocate more futex waiting queue. */ +#define QURT_ABORT_ARG_ERROR 0x0DU +#define QURT_ABORT_ASSERT 0x0EU /**< Assert abort. */ +#define QURT_ABORT_FATAL 0x0FU /**< Fatal error; must never occur. */ +#define QURT_ABORT_FUTEX_RESUME_INVALID_QUEUE 0x10U /**< Abort cause - invalid queue ID in futex resume. */ +#define QURT_ABORT_FUTEX_WAIT_INVALID_QUEUE 0x11U /**< Abort cause - invalid queue ID in futex wait. */ +#define QURT_ABORT_FUTEX_RESUME_INVALID_FUTEX 0x12U /**< Abort cause - invalid futex object in hashtable. */ +#define QURT_ABORT_NO_ERHNDLR 0x13U /**< No registered error handler. */ +#define QURT_ABORT_ERR_REAPER 0x14U /**< Exception in the reaper thread. */ +#define QURT_ABORT_FREEZE_UNKNOWN_CAUSE 0x15U /**< Abort in thread freeze operation. */ +#define QURT_ABORT_FUTEX_WAIT_WRITE_FAILURE 0x16U /**< During futex wait processing, could not perform a necessary write operation to userland data; most likely due to a DLPager eviction. */ +#define QURT_ABORT_ERR_ISLAND_EXP_HANDLER 0x17U /**< Exception in Island exception handler task. */ +#define QURT_ABORT_L2_TAG_DATA_CHECK_FAIL 0x18U /**< Detected error in L2 tag/data during warm boot. The L2 tag/data check is done when CONFIG_DEBUG_L2_POWER_COLLAPSE is enabled. */ +#define QURT_ABORT_ERR_SECURE_PROCESS 0x19U /**< Abort error in secure process. */ +#define QURT_ABORT_ERR_EXP_HANDLER 0x20U /**< No exception handler, or the handler caused an exception. */ +#define QURT_ABORT_ERR_NO_PCB 0x21U /**< PCB of the thread context failed initialization, PCB was NULL. */ +#define QURT_ABORT_NO_PHYS_ADDR 0x22U /**< Unable to find the physical address for the virtual address. */ +#define QURT_ABORT_OUT_OF_FASTINT_CONTEXTS 0x23U /**< Fast interrupt contexts exhausted. */ +#define QURT_ABORT_CLADE_ERR 0x24U /**< Fatal error seen with CLADE interrupt. */ +#define QURT_ABORT_ETM_ERR 0x25U /**< Fatal error seen with ETM interrupt. */ +#define QURT_ABORT_ECC_DED_ASSERT 0x26U /**< ECC two-bit DED error. */ +#define QURT_ABORT_VTLB_ERR 0x27U /**< Fatal error in the VTLB layer. */ +#define QURT_ABORT_TLB_ENCODE_DECODE_FAILURE 0x28U /**< Failure during the TLB encode or decode operation. */ +#define QURT_ABORT_VTLB_WALKOBJS_BOUND_FAILURE 0x29U /**< Failure to lookup entry in the page table. */ +#define QURT_ABORT_PHY_MEMORY_OWNERSHIP_FAILURE 0x30U /**< Failure to claim phy memory ownership. */ +#define QURT_ABORT_JTLB_SIZE_CHECK_FAIL 0x31U /**< JTLB size configured is more than actual size in hardware */ +#define QURT_ABORT_AUTOSTACK_ASSERT 0x32U /**< Error while handling stack flimit exception. */ + +/* Cause2 - TLB-miss_X - 8bits */ +#define QURT_TLB_MISS_X_FETCH_PC_PAGE 0x60U /**< */ +#define QURT_TLB_MISS_X_2ND_PAGE 0x61U /**< */ +#define QURT_TLB_MISS_X_ICINVA 0x62U /**< */ + +/* Cause2 - TLB-miss_RW - 8bits */ +#define QURT_TLB_MISS_RW_MEM_READ 0x70U /**< */ +#define QURT_TLB_MISS_RW_MEM_WRITE 0x71U /**< */ + +/** @cond rest_reg_dist */ +/* Cause2 - Floating point exception - 8 bits */ +#define QURT_FLOATING_POINT_EXEC_ERR 0xBFU /**< Execute floating-point. 
*/ +/** @endcond */ + +/** Cause2 - autostackv2 - 8 bits */ +#define QURT_AUTOSTACKV2_CANARY_NOT_MATCH 0xC1U +#define QURT_AUTOSTACKV2_POOL_IDX_OFF_RANGE 0xC2U + +/** Cause2 - CFI violation - 8 bits */ +#define QURT_CFI_VIOLATION 0xC3U + +/** @cond rest_reg_dist*/ +/* Enable floating point exceptions */ +#define QURT_FP_EXCEPTION_ALL 0x1FU << 25 /**< */ +#define QURT_FP_EXCEPTION_INEXACT 0x1U << 29 /**< */ +#define QURT_FP_EXCEPTION_UNDERFLOW 0x1U << 28 /**< */ +#define QURT_FP_EXCEPTION_OVERFLOW 0x1U << 27 /**< */ +#define QURT_FP_EXCEPTION_DIVIDE0 0x1U << 26 /**< */ +#define QURT_FP_EXCEPTION_INVALID 0x1U << 25 /**< */ + +/** @endcond */ +/** @} */ /* end_addtogroup chapter_error */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_EXCEPT_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_fastint.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_fastint.h new file mode 100755 index 0000000000000..ea65dc0917fc0 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_fastint.h @@ -0,0 +1,71 @@ +#ifndef QURT_FASTINT_H +#define QURT_FASTINT_H + +/** + @file qurt_fastint.h + @brief QuRT fast interrupt functions + + Copyright (c) 2013-2021 by Qualcomm Technologies, Inc. All Rights Reserved. + + ======================================================================*/ + +/*======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup func_qurt_fastint_register + Register fast interrupt callback function + + Fast interrupt callback should be designed to perform the minimal necessary + actions for the interrupt, and/or perform some operations, such as signaling + another regular software thread to start any additional processing. + The callback should be a fast and short function. When a fast interrupt callback + is running, the corresponding interrupt cannot be re-enabled until the callback + returns. + + The fast interrupt callback must not use any system blocking calls, such as + mutex lock or signal wait. Otherwise, it results in errors. + + The fast interrupt callback function has a single integer argument and the + function ends with no return. The argument value passed in is the interrupt + number, and therefore a single callback function can handle + multiple fast interrupts. + + @param[in] intno Interrupt number to register. + @param[in] fn Interrupt callback function. + + @return + #QURT_EOK -- Fast interrupt registration is successful. \n + #QURT_EINVALID -- Interrupt is already registered. \n + #QURT_EINT -- Invalid interrupt number. +*/ +/* ======================================================================*/ +unsigned int qurt_fastint_register(int intno, void (*fn)(int)); + + +/*======================================================================*/ +/**@ingroup func_qurt_fastint_deregister + Deregisters the fast interrupt callback function. + + @param[in] intno Level-one interrupt number to deregister. Valid range is 1 and 10 through 31 + (simulator only). + + @return + #QURT_EOK -- Interrupt deregistration is successful. \n + #QURT_EINT -- Invalid interrupt number (not registered). \n + #QURT_EINVALID -- Invalid interrupt number (already deregistered). + + @dependencies + None. 
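+
+   A minimal register/deregister sketch (illustrative only; the interrupt
+   number 16, the callback body, and example_register() are assumptions,
+   not SDK requirements):
+   @code
+   static void my_fastint_cb(int intno)
+   {
+       // Keep this short: e.g., signal a worker thread, then return.
+       (void)intno;
+   }
+
+   void example_register(void)
+   {
+       if (qurt_fastint_register(16, my_fastint_cb) == QURT_EOK) {
+           // ... the interrupt is live until deregistered ...
+           (void)qurt_fastint_deregister(16);
+       }
+   }
+   @endcode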
+*/
+/* ======================================================================*/
+unsigned int qurt_fastint_deregister(int intno);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_FASTINT_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_fs_hub.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_fs_hub.h
new file mode 100755
index 0000000000000..aaa050a6c838b
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_fs_hub.h
@@ -0,0 +1,58 @@
+#ifndef QURT_FS_HUB_H
+#define QURT_FS_HUB_H
+
+/**
+  @file qurt_fs_hub.h
+  @brief Definitions, macros, and prototypes used when writing a
+         QDI driver that provides file-system functionality.
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+  ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+  This structure tracks a file descriptor for a file-system-hub QDI driver.
+  A file system's QDI interface should use this object to encapsulate the
+  true file descriptor and return a QDI handle. This QDI handle
+  is then used as the file descriptor by the file-system hub.
+ */
+typedef struct qurt_qdi_fs_obj
+{
+    qurt_qdi_obj_t qdi_obj;
+    int client_handle;
+    int fd;
+} qurt_qdi_fs_obj_t;
+
+
+/**@ingroup fs_hub_support_functions
+  This function allows a file system to register its QDI interface with the file-system hub.
+  Once registered, all file open operations for any filenames containing the mount point are
+  forwarded to the QDI interface.
+
+  The mount-point string must be enclosed in two forward slashes, for example, "/mountpoint/".
+
+  @param mtpoint Mount point for the file system being registered.
+  @param opener  Opener structure for the QDI driver interface.
+
+  @return
+  QURT_EOK -- Successfully registered the QDI driver with the file-system hub. \n
+  Negative error code -- Failed to register with the file-system hub.
+ */
+int qurt_fs_hub_mtpoint_register(const char *mtpoint, qurt_qdi_obj_t *opener);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_futex.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_futex.h
new file mode 100755
index 0000000000000..1fdcc79a43f01
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_futex.h
@@ -0,0 +1,82 @@
+#ifndef QURT_FUTEX_H
+#define QURT_FUTEX_H
+/**
+  @file qurt_futex.h
+
+  @brief Prototypes of QuRT futex API functions
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2013, 2020-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+  ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=====================================================================
+  Functions
+======================================================================*/
+
+
+/**@ingroup func_qurt_futex_wait
+   Moves the caller thread into the waiting state when the memory object at the
+   given address contains a value that is the same as the specified value.
+
+   @param[in] lock Pointer to the object memory.
+   @param[in] val  Value to check against the object content.
+
+   @return
+   #QURT_EOK -- Success \n
+   Other values -- Failure
+
+   @dependencies
+   None.
+ */
+int qurt_futex_wait(void *lock, int val);
+
+
+/**@ingroup func_qurt_futex_wait_cancellable
+   If the memory object at the given address contains a value that is the same as
+   the specified value, moves the caller thread into the waiting state.
+   The kernel can cancel the waiting state when there is a special need.
+
+   @param[in] lock Pointer to the object memory.
+   @param[in] val  Value to check against the object content.
+
+   @return
+   #QURT_EOK -- Success \n
+   Other values -- Failure
+
+   @dependencies
+   None.
+ */
+int qurt_futex_wait_cancellable(void *lock, int val);
+
+
+/**@ingroup func_qurt_futex_wake
+   Wakes up a specified number of threads that have been waiting
+   for the object to change with qurt_futex_wait().
+
+   @param[in] lock      Pointer to the object memory.
+   @param[in] n_to_wake Maximum number of threads to wake up.
+
+   @return
+   Number of threads woken up by this function.
+
+   @dependencies
+   None.
+ */
+int qurt_futex_wake(void *lock, int n_to_wake);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_FUTEX_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_hmx.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_hmx.h
new file mode 100755
index 0000000000000..e4037dbeae514
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_hmx.h
@@ -0,0 +1,226 @@
+#ifndef QURT_HMX_H
+#define QURT_HMX_H
+/**
+  @file qurt_hmx.h
+  @brief Prototypes of the QuRT HMX API.
+
+Copyright (c) 2019-2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+        TYPEDEFS
+=============================================================================*/
+
+
+/** @addtogroup hmx_types
+@{ */
+/* HMX locking type */
+#define QURT_HMX_NON_SHARED_LOCK    0U  /**< HMX locking type. */
+#define QURT_HMX_SHARED_LOCK        1U  /**< HMX locking type. */
+
+/* HMX unlocking type */
+#define QURT_HMX_NON_SHARED_UNLOCK  0U  /**< HMX unlocking type. */
+#define QURT_HMX_SHARED_UNLOCK      1U  /**< HMX unlocking type. */
+
+/* HMX hardware context */
+#define QURT_HMX_UNIT_0             0U  /**< HMX hardware context #0. */
+#define QURT_HMX_UNIT_1             1U  /**< HMX hardware context #1. */
+/** @} */ /* end_addtogroup hmx_types */
+
+
+/*=============================================================================
+        FUNCTIONS
+=============================================================================*/
+
+
+/**@ingroup func_qurt_hmx_lock2
+   Locks a HMX unit with the specified locking type.
+
+   #QURT_HMX_NON_SHARED_LOCK:
+   - If a HMX unit is available, lock the unit and return success of #QURT_EOK.
+   - If the HMX unit is already locked by another thread, the caller thread is suspended
+     until the HMX is available and gets locked by this function.
+   - If no HMX hardware is supported, return #QURT_EVAL.
+
+   #QURT_HMX_SHARED_LOCK:
+   - If a HMX unit is available, enable HMX access for the caller thread, and return
+     success of #QURT_EOK.
+   - If the HMX is already enabled on the caller thread, return #QURT_EFAILED.
+ - If the HMX is locked by another thread in the same user process of the caller + thread with locking type of #QURT_HMX_SHARED_LOCK, enable HMX access for the caller + thread, and return success of #QURT_EOK. + - If the HMX is locked by another thread in the same user process of the caller + thread with locking type of #QURT_HMX_NON_SHARED_LOCK, return #QURT_EFAILED. + - If the HMX is locked by a thread from another user process different from the + user process of the caller thread, return #QURT_EFAILED. + - If there is no HMX hardware supported, return #QURT_EVAL. + + @param[in] type Locking type. + + @return + #QURT_EOK -- HMX lock successful.\n + #QURT_EFAILED -- Failure due to wrong locking condition.\n + #QURT_EVAL -- Failure because no HMX hardware is supported. + + @dependencies + None. + + */ +int qurt_hmx_lock2(unsigned int type); + + +/**@ingroup func_qurt_hmx_unlock2 + Unlocks a HMX unit with the unlocking type. + + #QURT_HMX_NON_SHARED_UNLOCK: + - If there is a HMX unit locked by the caller thread, unlock the HMX unit and clear the + HMX accumulators (assuming a fixed point type). + - If there is no HMX unit locked by the caller thread, return #QURT_EFAILED. + - If there is no HMX hardware supported, return #QURT_EVAL. + + #QURT_HMX_SHARED_UNLOCK: + - If the caller thread has locked HMX with type #QURT_HMX_SHARED_LOCK, disable the + HMX access on the caller thread, and return success of #QURT_EOK. + Note: If the caller thread is the last thread that unlocks for #QURT_HMX_SHARED_LOCK + in its user process, the unlock function clears the HMX accumulators. + - If the caller thread has locked HMX with type #QURT_HMX_NON_SHARED_LOCK, return + failure of #QURT_EFAILED. + - If the caller thread has not locked HMX, return failure of #QURT_EFAILED. + - If there is no HMX hardware supported, returns #QURT_EVAL. + + @param[in] type Locking type. + + @return + #QURT_EOK -- HMX is unlocked successful. \n + #QURT_EFAILED -- Failure due to wrong unlocking condition. \n + #QURT_EVAL -- Failure because no HMX hardware is supported. + + @dependencies + None. + + */ +int qurt_hmx_unlock2(unsigned int type); + + +/**@ingroup func_qurt_hmx_lock + Locks a HMX unit. + If a HMX unit is available, this function locks the unit and returns right away. + If there is no HMX unit available, the caller is blocked until a HMX is available + and is locked by the function. + + @return + #QURT_EOK -- HMX lock successful. \n + #QURT_EFAILED -- Failure due to wrong locking condition. \n + #QURT_EVAL -- Failure because no HMX hardware is supported. + + @dependencies + None. + */ +int qurt_hmx_lock(void); + + +/**@ingroup func_qurt_hmx_unlock + Unlocks a HMX unit. + If a HMX unit is locked by the caller thread, unlock the HMX unit and clear its + accumulators(assuming fixed point type). + If there is no HMX unit locked by the caller thread, return failure. + + @return + #QURT_EOK -- HMX unlock successful. \n + #QURT_EFAILED -- Failure due to wrong unlocking condition. \n + #QURT_EVAL -- Failure because no HMX hardware is supported. + + @dependencies + None. + */ +int qurt_hmx_unlock(void); + + +/**@ingroup func_qurt_hmx_try_lock + Tries to lock a HMX unit. + If a HMX unit is available, this function locks the unit and returns right away; + if there is no HMX unit available, the function returns failure without blocking the caller. + + @return + #QURT_EOK -- HMX lock successful \n + #QURT_EFAILED -- Failure due to wrong locking condition.\n + #QURT_EVAL -- Failure because no HMX hardware is supported. 
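+
+  Usage sketch (illustrative only; both helper functions are assumptions):
+
+  @code
+  if (qurt_hmx_try_lock() == QURT_EOK) {
+      run_hmx_kernel();        // matrix work that uses HMX
+      (void)qurt_hmx_unlock();
+  } else {
+      run_scalar_fallback();   // HMX busy or not present
+  }
+  @endcode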
+ + @dependencies + None. + */ +int qurt_hmx_try_lock(void); + + +/**@ingroup func_qurt_hmx_assign + Assign a HMX unit to a target thread specified by its thread identifier. + The HMX unit (HMX hardware context) is specified by hmx_unit. + The caller of this function is limited to the SRM process. + If the requested hmx_unit is already assigned to another thread with QURT_HMX_NON_SHARED_LOCK, + kernel will detach it from the thread, and re-assign it to the target thread. + If the target thread has HVX enabled, it cannot have HMX enabled. + + Locking type + #QURT_HMX_NON_SHARED_LOCK: + - If the HMX unit is available, lock the HMX unit and return success of #QURT_EOK. + - If the HMX unit is already enabled on the target thread, return #QURT_EOK. + - If the HMX unit is already locked by another thread, detach the HMX from the thread. + Re-assign the HMX unit to the target thread, and return #QURT_EOK. + + @param[in] thread_id Thread identifier + @param[in] type Locking type + #QURT_HMX_NON_SHARED_LOCK -- non-shared lock + @param[in] hmx_unit HMX hardware context number + #QURT_HMX_UNIT_0 + #QURT_HMX_UNIT_1 + + @return + #QURT_EOK -- The HMX is assigned successfully. This includes the case that \n + the target thread already has HMX assigned. \n + #QURT_EFAILED -- Failure due to wrong assigning conditions. \n + #QURT_EINVALID -- Failure because no HMX hardware is supported. + + @dependencies + None. + */ +int qurt_hmx_assign ( unsigned int thread_id, unsigned int type, unsigned int hmx_unit ); + + +/**@ingroup func_qurt_hmx_release + Release a HMX unit from a target thread specified by its thread identifier. + The HMX unit (HMX hardware context) is specified by hmx_unit. + The caller of this function is limited to the SRM process. + + Qurt detaches the specified HMX unit from the target thread, and return success of + #QURT_EOK. If the HMX unit is already released from the target thread, return #QURT_EOK. + + @param[in] thread_id Thread identifier + @param[in] hmx_unit HMX hardware context number + #QURT_HMX_UNIT_0 + #QURT_HMX_UNIT_1 + + @return + #QURT_EOK -- The HMX is released successfully. This includes the case that \n + the target thread already has the HMX released. \n + #QURT_EFAILED -- Failure due to wrong assigning condition. \n + #QURT_EINVALID -- Failure because no HMX hardware is supported. + + @dependencies + None. + */ +int qurt_hmx_release ( unsigned int thread_id, unsigned int hmx_unit ); + + + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_HMX_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_hvx.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_hvx.h new file mode 100755 index 0000000000000..13c213d49ac84 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_hvx.h @@ -0,0 +1,421 @@ +#ifndef QURT_HVX_H +#define QURT_HVX_H +/** + @file qurt_hvx.h + @brief Prototypes of QuRT HVX API. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021-2022 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. 
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+ TYPEDEFS
+=============================================================================*/
+/** @cond */
+
+typedef enum {
+    QURT_HVX_MODE_64B  = 0, /**< HVX mode of 64 bytes */
+    QURT_HVX_MODE_128B = 1  /**< HVX mode of 128 bytes */
+} qurt_hvx_mode_t;
+/** @endcond */
+/*=============================================================================
+ CONSTANTS AND MACROS
+=============================================================================*/
+/** @cond internal_only*/
+/** @addtogroup hvx_macros
+@{ */
+#define QURT_HVX_HW_UNITS_2X128B_4X64B 0x00000204 /**< Bits 15 through 8 are for the number of 128B units. */
+                                                  /**< Bits 7 through 0 are for the number of 64B units. */
+#define QURT_HVX_HW_UNITS_4X128B_0X64B 0x00000400
+#define QURT_HVX_HW_UNITS_6X128B_0X64B 0x00000600
+
+/* HVX locking status */
+
+#define QURT_HVX_UNLOCKED (0)  /* Has not locked an HVX unit */
+#define QURT_HVX_LOCKED   (1)  /* Has locked an HVX unit */
+#define QURT_HVX_ERROR    (-1) /* Error, no HVX support */
+
+/* Input values for HVX reservation */
+
+#define QURT_HVX_RESERVE_ALL           (4)    /* All the HVX units in terms of 64B_MODE are requested to be reserved */
+#define QURT_HVX_RESERVE_ALL_AVAILABLE (0xff) /* All remaining unlocked HVX units in terms of 64B_MODE are requested to be reserved */
+
+/* Return values for HVX reservation */
+
+#define QURT_HVX_RESERVE_NOT_SUPPORTED  (-1) /* There is no HVX hardware, or fewer units in the hardware than requested */
+#define QURT_HVX_RESERVE_NOT_SUCCESSFUL (-2) /* Some HVX units are already locked/reserved by another PD, thus not enough units are left for the reservation. */
+#define QURT_HVX_RESERVE_ALREADY_MADE   (-3) /* There is already a HVX reservation made. */
+#define QURT_HVX_RESERVE_CANCEL_ERR     (-4) /* Canceling the reservation failed because this protection domain made no reservation before. */
+
+// HVX set requests
+
+#define QURT_HVX_64B             0 /**< */
+#define QURT_HVX_128B            1 /**< */
+#define QURT_HVX_NO_USE          2 /**< */
+#define QURT_HVX_RELEASE_CONTEXT 3 /**< */
+#define QURT_HVX_IMMEDIATE_USE   4 /**< */
+
+// HVX set masks
+
+#define QURT_HVX_64B_PREFERRED   (1<<(QURT_HVX_64B + 8))  /**< */
+#define QURT_HVX_128B_PREFERRED  (1<<(QURT_HVX_128B + 8)) /**< */
+#define QURT_HVX_64B_ACCEPTABLE  (1<<(QURT_HVX_64B + 12)) /**< */
+#define QURT_HVX_128B_ACCEPTABLE (1<<(QURT_HVX_128B + 12))/**< */
+
+// HVX set return "result"
+
+#define QURT_EOK           0    /**< */
+#define QURT_HVX_SET_ERROR 0xFF /**< */
+
+// hvx_mode_assigned for QURT_HVX_IMMEDIATE_USE
+#define QURT_HVX_64B_ASSIGNED  (1<<(QURT_HVX_64B + 8))  /**< */
+#define QURT_HVX_128B_ASSIGNED (1<<(QURT_HVX_128B + 8)) /**< */
+
+// Sizes of HVX dump buffer
+
+#define QURT_HVX_V65_64B_VSIZE  2084U /**< 64 x 32 + 8 x 4 + 4 (version). */
+#define QURT_HVX_V65_128B_VSIZE 4164U /**< 128 x 32 + 16 x 4 + 4 (version). */
+#define QURT_HVX_V66_128B_VSIZE 4420U /**< 128 x (32 +2) + 16 x 4 + 4 (version). */
+#define QURT_HVX_V68_128B_VSIZE 4164U /**< 128 x 32 + 16 x 4 + 4 (version). */
+#define QURT_HVX_V79_128B_VSIZE 4740U /**< 128 x (32+4+1) + 4 (version).
*/ +#define QURT_HVX_VREG_BUF_SIZE QURT_HVX_V79_128B_VSIZE /**< */ + +// HVX dump versions + +#define QURT_HVX_DUMP_V65_64B 1U /**< */ +#define QURT_HVX_DUMP_V65_128B 2U /**< */ +#define QURT_HVX_DUMP_V66_128B 3U /**< */ +#define QURT_HVX_DUMP_V68_128B 4U /**< */ +#define QURT_HVX_DUMP_V79_128B 5U /**< */ +/** @} */ /* end_addtogroup hvx_macros */ +/** @endcond */ +/** @cond */ +// Qurt data struct for hvx_set input +typedef struct qurt_hvx_set_struct_ { + unsigned char set_req; // LSB + struct { + unsigned char preferred_mask:4; + unsigned char acceptable_mask:4; + }; + unsigned short resvd; // MSB +} qurt_hvx_set_struct_t; // 4 bytes + + +// Qurt data struct for hvx_set return +typedef struct qurt_hvx_set_return_str_ { + unsigned char result; // LSB + unsigned char hvx_mode_assigned; + unsigned short resvd; // MSB +} qurt_hvx_set_return_struct_t; // 4 bytes +/** @endcond */ + + +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_hvx_lock + Locks one HVX unit specified by the HVX mode. + + @note1hang Input variable can be 128B_MODE or 64B_MODE. If an HVX unit in this mode + is available, this function locks the unit and returns right away. + If the current HVX mode is different from the requested mode, the current + thread is blocked. When all HVX units become idle, QuRT changes + the mode, locks the HVX unit, and returns. + + Starting from Q6v65 with HVX context switch support, qurt_hvx_lock() is + mapped as qurt_hvx_set(64_BYTE or 128_BYTE). + + @datatypes + #qurt_mode_t + + @param[in] lock_mode #QURT_HVX_MODE_64B or #QURT_HVX_MODE_128B. + + @return + #QURT_EOK -- Success \n + Other value -- Failure + + @dependencies + None. + + */ +int qurt_hvx_lock(qurt_hvx_mode_t lock_mode); + +/**@ingroup func_qurt_hvx_unlock + Unlocks the HVX unit held by this software thread. + + @note1hang Starting from Q6v65 with HVX context switch support, qurt_hvx_unlock() + maps as qurt_hvx_set(QURT_HVX_RELEASE_CONTEXT). + + @return + #QURT_EOK -- Successful return \n + Other values -- Failure + + @dependencies + None. + + */ +int qurt_hvx_unlock(void); + +/**@ingroup func_qurt_hvx_try_lock + Tries to lock one HVX unit specified by the HVX mode. + + @note1hang Input variable can be 128B_MODE or 64B_MODE. If an HVX unit in this mode + is available, this function locks the unit and returns #QURT_EOK; Otherwise, + the function returns a failure, but does not block the current software + thread to wait for the HVX unit. + Starting from Q6v65 with HVX context switch support, qurt_hvx_try_lock() + maps to qurt_hvx_set(FOR_IMMEDIATE_USE| preferred_mask | acceptable_mask); + + @datatypes + #qurt_mode_t + + @return + #QURT_EOK -- Successful return \n + Other values -- Failure + + @dependencies + None. + + */ +int qurt_hvx_try_lock(qurt_hvx_mode_t lock_mode); + +/**@ingroup func_qurt_hvx_get_mode + Gets the current HVX mode configured by QuRT. + + @note1hang Returns #QURT_HVX_MODE_128B or #QURT_HVX_MODE_64B, based on + the current HVX configuration. + + @param[out] + None. + + @return + #QURT_HVX_MODE_128B \n + #QURT_HVX_MODE_64B \n + -1 -- Not available. + + @dependencies + None. + */ +int qurt_hvx_get_mode(void); + + +/**@ingroup func_qurt_hvx_get_units + Gets the HVX hardware configuration that the chipset supports. + + @note1hang The function returns the HVX hardware configuration supported by the chipset. 
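+
+  Decoding sketch (illustrative only): per the QURT_HVX_HW_UNITS_* macro
+  comments above, the 64B and 128B unit counts are packed into the low two
+  bytes of the return value.
+
+  @code
+  int units = qurt_hvx_get_units();
+  if (units > 0) {
+      unsigned num_64b  = (unsigned)units & 0xFFU;         // bits 7..0
+      unsigned num_128b = ((unsigned)units >> 8) & 0xFFU;  // bits 15..8
+  }
+  @endcode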
+ + @return + Bitmask of the units: 1X64, 2X64, 4X64, 1X128, 2X128, and so on.\n + - QURT_HVX_HW_UNITS_2X126B_4X64B -- V60, V62, or V65 HVX \n + - QURT_HVX_HW_UNITS_4X128B_0X64B -- V66 CDSP or newer \n + - 0 -- not available + + @dependencies + None. + + */ +int qurt_hvx_get_units(void); + + +/**@ingroup func_qurt_hvx_reserve + Reserves HVX units in terms of 64-byte mode for the protection domain (PD) of the caller. + + @note1hang Only one HVX reservation in the system is supported. + If one HVX unit is already locked by the application in the same PD, the unit is + added to the returned count as one reserved unit for the PD. + Starting from Q6v65 with HVX context switch support, qurt_hvx_reserve() + only does basic sanity checks on HVX units. + + @datatypes + None. + + @param[in] num_units Number of HVX units in terms of 64B_MODE to reserve for the PD. + QURT_HVX_RESERVE_ALL to reserve all the HVX units. + QURT_HVX_RESERVE_ALL_AVAILABLE to reserve the remaining unlocked units. + + @return + Number of units successfully reserved, including the units already locked in the same PD. \n + #QURT_HVX_RESERVE_NOT_SUPPORTED \n + #QURT_HVX_RESERVE_NOT_SUCCESSFUL \n + #QURT_HVX_RESERVE_ALREADY_MADE + + + @dependencies + None. + + */ +int qurt_hvx_reserve(int num_units); + + +/**@ingroup func_qurt_hvx_cancel_reserve + Cancels the HVX reservation in the protection domain (PD) of the caller. + + @note1hang Only one HVX reservation in the system is supported. + + @return + 0 -- Success \n + #QURT_HVX_RESERVE_CANCEL_ERR -- Failure + + @dependencies + None. + + */ +int qurt_hvx_cancel_reserve(void); + + +/**@ingroup func_qurt_hvx_get_lock_val + Gets the HVX locking status value of the thread of the caller. + + @note1hang Returns the status of whether the thread of the caller already locks a HVX unit or not. + + @datatypes + None. + + @return + #QURT_HVX_UNLOCKED \n + #QURT_HVX_LOCKED \n + #QURT_HVX_ERROR + + @dependencies + None. + */ +int qurt_hvx_get_lock_val(void); + +/** @cond internal_only*/ +/**@ingroup func_qurt_hvx_set + Sets the HVX configuration for the software thread of the caller. + + @datatypes + None. + + @param[in] input_arg Composed of set_request | hvx_preferred_mode_mask + | hvx_acceptable_mode_mask where set_request can be set to: \n + - #QURT_HVX_64B \n + - #QURT_HVX_128B \n + - #QURT_HVX_NO_USE \n + - #QURT_HVX_RELEASE_CONTEXT \n + - #QURT_HVX_IMMEDIATE_USE \n + When set_request is QURT_HVX_IMMEDIATE_USE, + hvx_preferred_mode_mask can be set to: \n + - #QURT_HVX_64B_PREFERRED \n + - #QURT_HVX_128B_PREFERRED + When set_request is QURT_HVX_IMMEDIATE_USE, + hvx_acceptable_mode_mask can be set to: \n + - #QURT_HVX_64B_ACCEPTABLE \n + - #QURT_HVX_128B_ACCEPTABLE @tablebulletend + + @return + Result of the HVX setting in the least significant 8 bits of the returned data. \n + #QURT_EOK -- 0 \n + #QURT_HVX_SET_ERROR -- 0xFF \n + When #QURT_HVX_IMMEDIATE_USE has a result of #QURT_EOK, + bit 8 to bit 15 of the returned data contain hvx_mode_assigned:\n + - #QURT_HVX_64B_ASSIGNED \n + - #QURT_HVX_128B_ASSIGNED + + @dependencies + None. + */ +unsigned int qurt_hvx_set(unsigned int input_arg); + + +/**@ingroup func_qurt_system_hvx_regs_get_maxsize + Returns the maximum buffer size for saving HVX registers. + + @datatypes + None. + + @return + 0 -- No HVX supported in the target. \n + #QURT_HVX_VREG_BUF_SIZE -- Maximum buffer size for saving HVX registers. + + @dependencies + None. 
+ */
+unsigned int qurt_system_hvx_regs_get_maxsize(void);
+
+
+/**@ingroup func_qurt_system_hvx_regs_get_size
+  Returns the buffer size for saving HVX registers for a specified thread.
+
+  @param[in] thread_id  Thread ID of the target thread.
+
+  @return
+  0 -- No HVX assigned to the thread. \n
+  size -- Size of the buffer in bytes for saving HVX registers for the specified thread: \n
+  - #QURT_HVX_V65_64B_VSIZE -- 64 x 32 + 8 x 4 + 4 (version) \n
+  - #QURT_HVX_V65_128B_VSIZE -- 128 x 32 + 16 x 4 + 4 (version) \n
+  - #QURT_HVX_V66_128B_VSIZE -- 128 x (32 +2) + 16 x 4 + 4 (version) \n
+  - #QURT_HVX_V68_128B_VSIZE -- 128 x 32 + 16 x 4 + 4 (version) \n
+  - #QURT_HVX_V79_128B_VSIZE -- 128 x (32+4+1) + 4 (version)
+
+
+  @dependencies
+  None.
+
+ */
+unsigned int qurt_system_hvx_regs_get_size(unsigned int thread_id);
+
+
+
+/**@ingroup func_qurt_system_hvx_regs_get
+  Saves the HVX registers into the specified buffer.
+  Returns the size of the data saved into the buffer.
+  After calling this function for the first time on a specified thread_id, the QuRT kernel removes the internal HVX saving buffer
+  from the specified thread. When calling the function on the same thread_id for the second time, this function returns 0.
+
+  @param[in] thread_id  Thread ID of the target thread.
+  @param[in] pBuf  Pointer to the buffer for HVX register saving.
+        The first four bytes of the buffer are for saving the HVX version. HVX registers are saved from
+        the fifth byte of the buffer. The address of the fifth byte should be 256-byte aligned.
+        For example, a buffer can be declared at first as: \n
+        unsigned char vbuf[QURT_HVX_VREG_BUF_SIZE+256];\n
+        unsigned char *pBuf; \n
+        then align the buffer pointer to: \n
+        pBuf = vbuf; \n
+        pBuf += (256 - 4 - (unsigned)pBuf%256);
+  @param[in] size  Size of the buffer provided, pointed to by pBuf. The buffer size should not be smaller than that
+        returned from qurt_system_hvx_regs_get_size(), and pBuf should be aligned as described above.
+  @param[out] pBuf Buffer returned with the saved HVX registers (unsigned char hvx_regs[];), which are saved from the fifth
+        byte of the buffer, and the HVX version (unsigned int hvx_version;), whose first four bytes
+        contain one of the HVX dump versions:\n
+        - #QURT_HVX_DUMP_V65_64B \n
+        - #QURT_HVX_DUMP_V65_128B \n
+        - #QURT_HVX_DUMP_V66_128B \n
+        - #QURT_HVX_DUMP_V68_128B \n
+        - #QURT_HVX_DUMP_V79_128B \n
+        @tablebulletend
+
+  @return
+  Total bytes of the data saved in the provided buffer. \n
+  0 -- No HVX assigned to the thread \n
+  #QURT_HVX_V65_64B_VSIZE -- 64 x 32 + 8 x 4 + 4 (version) \n
+  #QURT_HVX_V65_128B_VSIZE -- 128 x 32 + 16 x 4 + 4 (version) \n
+  #QURT_HVX_V66_128B_VSIZE -- 128 x (32 +2) + 16 x 4 + 4 (version) \n
+  #QURT_HVX_V68_128B_VSIZE -- 128 x 32 + 16 x 4 + 4 (version) \n
+  #QURT_HVX_V79_128B_VSIZE -- 128 x (32+4+1) + 4 (version)
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_system_hvx_regs_get(unsigned int thread_id, void *pBuf, size_t size);
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_HVX_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_int.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_int.h
new file mode 100755
index 0000000000000..386aeda1051eb
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_int.h
@@ -0,0 +1,509 @@
+#ifndef QURT_INT_H
+#define QURT_INT_H
+/**
+  @file qurt_int.h
+  @brief QuRT interrupt functions.
+ + + + Copyright (c) 2013-2021, 2023 Qualcomm Technologies, Inc. + All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ + + +/** @cond rest_reg_dist */ +/** @addtogroup interrupts_constants +@{ */ +#define SIG_INT_ABORT 0x80000000 /**< */ +#define QURT_INT_NON_DELAYED_ACK 0 +#define QURT_INT_DELAYED_ACK 1 +#define QURT_INT_ACK_DEFAULT QURT_INT_NON_DELAYED_ACK +#define QURT_INT_DRV_DEFAULT 0 +#define QURT_INT_PRIORITY_DEFAULT 0xFF + +/** QuRT interrupt property. */ +#define QURT_INT_CONFIGID_POLARITY 0x1U /**< */ +#define QURT_INT_CONFIGID_LOCK 0x2U /**< */ + +/** QuRT interrupt lock.*/ +#define QURT_INT_LOCK_DEFAULT 0x0 /**< Default. */ +#define QURT_INT_LOCK_DISABLE 0x0 /**< Interrupt can be enabled or disabled or deregistered. */ +#define QURT_INT_LOCK_ENABLE 0x1 /**< Interrupt is locked and cannot be enabled, disabled, or deregistered.*/ +/** @} */ /* end_addtogroup interrupts_constants */ + +/** @addtogroup Qurt_interrupt_type +@{ */ +/** Trigger type bit fields for a PDC interrupt:\n + @verbatim + Polarity Edge Output\n + 0 00 Level sensitive active low + 0 01 Rising edge sensitive + 0 10 Falling edge sensitive + 0 11 Dual edge sensitive + 1 00 Level sensitive active high + 1 01 Falling edge sensitive + 1 10 Rising edge sensitive + 1 11 Dual edge sensitive + @endverbatim +*/ +#define QURT_INT_TRIGGER_TYPE_SET(pol, edge) ((((pol) & 0x01U) << 2) | ((edge) & 0x03U)) /**< */ + +#define QURT_INT_TRIGGER_LEVEL_LOW QURT_INT_TRIGGER_TYPE_SET(0U, 0x00U) /**< */ +#define QURT_INT_TRIGGER_LEVEL_HIGH QURT_INT_TRIGGER_TYPE_SET(1U, 0x00U) /**< */ +#define QURT_INT_TRIGGER_RISING_EDGE QURT_INT_TRIGGER_TYPE_SET(1U, 0x02U) /**< */ +#define QURT_INT_TRIGGER_FALLING_EDGE QURT_INT_TRIGGER_TYPE_SET(0U, 0x02U) /**< */ +#define QURT_INT_TRIGGER_DUAL_EDGE QURT_INT_TRIGGER_TYPE_SET(0U, 0x03U) /**< */ +#define QURT_INT_TRIGGER_USE_DEFAULT 0xffU /**< */ +/** @} */ /* end_addtogroup Qurt_interrupt_type */ + +/*===================================================================== + Functions +======================================================================*/ + +/**@ingroup func_qurt_interrupt_register + @xreflabel{sec:interrupt_register} + Registers the interrupt.\n + Enables the specified interrupt and associates it with the specified QuRT signal object and + signal mask. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be waited on, and 0 indicates not to wait. + + When the interrupt occurs, the signal specified in the signal mask is set in the signal + object. An IST conventionally waits on that signal to + handle the interrupt. The thread that registers the interrupt is set as the IST. + + Up to 31 separate interrupts can be registered to a single signal object, as determined by + the number of individual signals the object can store. QuRT reserves signal 31. Thus a + single IST can handle several different interrupts. + + QuRT reserves some interrupts for internal use -- the remainder are available for use by + applications, and thus are valid interrupt numbers. If the specified interrupt number is + outside the valid range, the register operation returns the status value QURT_EINT. 
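+
+  A typical IST loop, sketched for illustration (INT_NUM, the choice of
+  signal bit 0, and handle_device() are assumptions, not part of this API):
+
+  @code
+  qurt_anysignal_t sig;
+  qurt_anysignal_init(&sig);
+  if (qurt_interrupt_register(INT_NUM, &sig, 1 << 0) == QURT_EOK) {
+      for (;;) {
+          unsigned int got = qurt_anysignal_wait(&sig, (1 << 0) | SIG_INT_ABORT);
+          if (got & SIG_INT_ABORT) {
+              break;                            // kernel aborted the wait
+          }
+          qurt_anysignal_clear(&sig, 1 << 0);   // clear before acknowledging
+          handle_device();                      // service the device
+          (void)qurt_interrupt_acknowledge(INT_NUM);
+      }
+      (void)qurt_interrupt_deregister(INT_NUM);
+  }
+  @endcode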
+
+  Only one thread can be registered at a time to a specific interrupt. Attempting to register
+  an already-registered interrupt returns the status value QURT_EVAL.
+
+  Only one signal bit in a signal object can be registered at a time to a specific interrupt.
+  Attempting to register multiple signal bits to an interrupt returns the status value
+  QURT_ESIG.
+
+  When a signal object is registered to an interrupt, QuRT can only set its signal bits
+  when receiving the interrupt. The QuRT signal API from another
+  software thread cannot set the signal even for unused signal bits.
+
+  @note1hang The valid range for an interrupt number can differ on target execution
+             environments other than the simulator. For more information, see the
+             appropriate hardware document.
+
+  @datatypes
+  #qurt_anysignal_t
+
+  @param[in] int_num      L2VIC interrupt to register; valid range is 0 to 1023.
+  @param[in] int_signal   Any-signal object to wait on (Section @xref{dox:any_signals}).
+  @param[in] signal_mask  Signal mask value indicating the signal to receive the interrupt.
+
+  @return
+  #QURT_EOK -- Interrupt successfully registered.\n
+  #QURT_EINT -- Invalid interrupt number. \n
+  #QURT_ESIG -- Invalid signal bitmask (cannot set more than one
+                signal at a time). \n
+  #QURT_EVAL -- Interrupt already registered.
+
+  @dependencies
+  None.
+*/
+ unsigned int qurt_interrupt_register(int int_num, qurt_anysignal_t *int_signal, int signal_mask);
+
+/**@ingroup func_qurt_interrupt_register2
+  @xreflabel{sec:interrupt_register2}
+  Registers the interrupt.\n
+  Enables the specified interrupt, associates it with the specified QuRT signal object and
+  signal mask, and sets interrupt flags.
+
+  Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1
+  indicates that a signal must be waited on, and 0 indicates not to wait.
+
+  When the interrupt occurs, the signal specified in the signal mask is set in the signal
+  object. An IST conventionally waits on that signal to
+  handle the interrupt. The thread that registers the interrupt is set as the IST.
+
+  Up to 31 separate interrupts can be registered to a single signal object, as determined by
+  the number of individual signals that the object can store. QuRT reserves signal 31. Thus a
+  single IST can handle several different interrupts.
+
+  QuRT reserves some interrupts for internal use -- the remainder are available for use by
+  applications, and thus are valid interrupt numbers. If the specified interrupt number is
+  outside the valid range, the register operation returns the status value #QURT_EINT.
+
+  Only one thread can be registered at a time to a specific interrupt. Attempting to register
+  an already-registered interrupt returns the status value #QURT_EVAL.
+
+  Only one signal bit in a signal object can be registered at a time to a specific interrupt.
+  Attempting to register multiple signal bits to an interrupt returns the status value
+  #QURT_ESIG.
+
+  When a signal object is registered to an interrupt, QuRT can only set its signal bits
+  when receiving the interrupt. The QuRT signal API from another
+  software thread cannot set the signal even for unused signal bits.
+
+  @note1hang The valid range for an interrupt number can differ on target execution
+             environments other than the simulator. For more information, see the
+             appropriate hardware document.
+
+  @datatypes
+  #qurt_anysignal_t
+
+  @param[in] int_num     L2VIC interrupt to register; valid range is 0 to 1023.
+  @param[in] int_signal  Any-signal object to wait on (Section @xref{dox:any_signals}).
+ @param[in] signal_mask Signal mask value indicating signal to receive the interrupt. + @param[in] flags Defines interrupt property, supported property is interrupt lock enable/disable. + Possible values for flags: \n + - #QURT_INT_LOCK_ENABLE + - #QURT_INT_LOCK_DISABLE @tablebulletend + + @return + #QURT_EOK -- Interrupt successfully registered.\n + #QURT_EINT -- Invalid interrupt number. \n + #QURT_ESIG -- Invalid signal bitmask (cannot set more than one + signal at a time). \n + #QURT_EVAL -- Interrupt already registered. + + @dependencies + None. +*/ + unsigned int qurt_interrupt_register2(int int_num, qurt_anysignal_t *int_signal, int signal_mask, unsigned int flags); +/* + * Waits for registered interrupt signal + + * Suspend the current thread until one of its registered interrupts occurs. The second input mask, + * contains the interrupt signals the IST expects to receive. The interrupt signals are registered + * with interrupts via qurt_register_interrupt API. + * + * The signals returned in the signal variable indicate which interrupts occurred. Use function + * qurt_anysignal_get to read the signals. IST must locally maintain a table that maps a signal to + * a specific interrupt. IST also checks if signal #SIG_INT_ABORT is received. If so, the IST + * must quit from interrupt receiving loop. + * + * For detail information on this API, see QuRT User Manual Section 4.2.5 + * + * Prototype + * + * unsigned int qurt_anysignal_wait(qurt_anysignal_t *int_signal, unsigned int mask) + */ + +/**@ingroup func_qurt_interrupt_acknowledge + Acknowledges an interrupt after it has been processed.\n + Re-enables an interrupt and clears its pending status. This is done after an interrupt is + processed by an IST. + + Interrupts are automatically disabled after they occur. To re-enable an interrupt, an IST + performs the acknowledge operation after it has finished processing the interrupt and + just before suspending itself (such as by waiting on the interrupt signal). + + @note1hang To prevent losing or reprocessing subsequent occurrences of the interrupt, + an IST must clear the interrupt signal (Section @xref{sec:anysignal_clear}) before + acknowledging the interrupt. + + @param[in] int_num Interrupt that is being re-enabled. + + @return + #QURT_EOK -- Interrupt acknowledge was successful. \n + #QURT_EDEREGISTERED -- Interrupt is already de-registered. + + @dependencies + None. +*/ +int qurt_interrupt_acknowledge(int int_num); + +/**@ingroup func_qurt_interrupt_deregister + Disables the specified interrupt and disassociates it from a QuRT signal object. + If the specified interrupt was never registered (Section @xref{sec:interrupt_register}), the deregister operation + returns the status value #QURT_EINT. + + @note1hang If an interrupt is deregistered while an IST waits + to receive it, the IST might wait indefinitely for the interrupt to occur. To avoid + this problem, the QuRT kernel sends the signal #SIG_INT_ABORT to awaken an + IST after determining that it has no interrupts registered. + + @param[in] int_num L2VIC to deregister; valid range is 0 to 1023. + + @return + #QURT_EOK -- Success.\n + #QURT_EINT -- Invalid interrupt number (not registered). + + @dependencies + None. + +*/ +unsigned int qurt_interrupt_deregister(int int_num); +/** @endcond */ + +/**@ingroup func_qurt_interrupt_disable + Disables an interrupt with its interrupt number.\n + The interrupt must be registered prior to calling this function. 
+ After qurt_interrupt_disable() returns, the Hexagon subsystem + can no longer send the corresponding interrupt to the Hexagon + core, until qurt_interrupt_enable() is called + for the same interrupt. + + Avoid calling qurt_interrupt_disable() and qurt_interrupt_enable() frequently within + a short period of time.\n + - A pending interrupt can already be in the Hexagon core when qurt_interrupt_disable() + is called. Therefore, some time later, the pending interrupt is received on a Hexagon + hardware thread.\n + - After the Hexagon subsystem sends an interrupt to the Hexagon core, the Hexagon + hardware automatically disables the interrupt until kernel software re-enables the interrupt + at the interrupt acknowledgement stage. If qurt_interrupt_enable() is called from a certain + thread at an ealier time, the interrupt is re-enabled earlier and can trigger + sending a new interrupt to the Hexagon core while kernel software is still processing + the previous interrupt. + + @param[in] int_num Interrupt number. + + @return + #QURT_EOK -- Interrupt successfully disabled.\n + #QURT_EINT -- Invalid interrupt number.\n + #QURT_ENOTALLOWED -- Interrupt is locked. \n + #QURT_EVAL -- Interrupt is not registered. + + @dependencies + None. +*/ + unsigned int qurt_interrupt_disable(int int_num); + + +/**@ingroup func_qurt_interrupt_enable + Enables an interrupt with its interrupt number.\n + The interrupt must be registered prior to calling this function. + + @param[in] int_num Interrupt number. + + @return + #QURT_EOK -- Interrupt successfully enabled.\n + #QURT_EINT -- Invalid interrupt number.\n + #QURT_ENOTALLOWED -- Interrupt is locked. \n + #QURT_EVAL -- Interrupt is not registered. + + @dependencies + None. + +*/ + unsigned int qurt_interrupt_enable(int int_num); + + +/**@ingroup func_qurt_interrupt_status + Returns a value that indicates the pending status of the specified interrupt. + + @param[in] int_num Interrupt number that is being checked. + @param[out] status Interrupt status; 1 indicates that an interrupt is + pending, 0 indicates that an interrupt is not pending. + + @return + #QURT_EOK -- Success. \n + #QURT_EINT -- Failure; invalid interrupt number. + + @dependencies + None. + */ +unsigned int qurt_interrupt_status(int int_num, int *status); + + +/**@ingroup func_qurt_interrupt_get_status + Gets the status of the specified interrupt in L2VIC. + + @param[in] int_num Interrupt number that is being checked. + @param[in] status_type 0 -- interrupt pending status \n + 1 -- interrupt enabling status + @param[out] status 0 -- OFF \n + 1 -- ON + + @return + #QURT_EOK -- Success. \n + #QURT_EINT -- Failure; invalid interrupt number. + + @dependencies + None. + */ +unsigned int qurt_interrupt_get_status(int int_num, int status_type, int *status); + +/** @cond rest_reg_dist */ +/**@ingroup func_qurt_interrupt_clear + Clears the pending status of the specified interrupt. + + @note1hang This operation is intended for system-level use, and must be used with care. + + @param[in] int_num Interrupt that is being re-enabled. + + @return + #QURT_EOK -- Success.\n + #QURT_EINT -- Invalid interrupt number. + + @dependencies + None. + */ +unsigned int qurt_interrupt_clear(int int_num); + + +/**@ingroup func_qurt_interrupt_get_config + Gets the L2VIC interrupt configuration. \n + This function returns the type and polarity of the specified L2VIC interrupt. + + @param[in] int_num L2VIC interrupt that is being re-enabled. + @param[out] int_type Pointer to an interrupt type. 
\n + 0 -- Level-triggered interrupt \n + 1 -- Eedge-triggered interrupt + @param[out] int_polarity Pointer to interrupt polarity.\n + 0 -- Active-high interrupt \n + 1 -- Active-low interrupt. + + @return + #QURT_EOK -- Configuration successfully returned.\n + #QURT_EINT -- Invalid interrupt number. + + @dependencies + None. + */ +unsigned int qurt_interrupt_get_config(unsigned int int_num, unsigned int *int_type, unsigned int *int_polarity); + +/**@ingroup func_qurt_interrupt_set_config + Sets the type and polarity of the specified L2VIC interrupt. + + @note1hang Deregister L2VIC interrupts before reconfiguring them. + + @param[in] int_num L2VIC interrupt that is being re-enabled. + @param[in] int_type Interrupt type. \n + 0 -- Level-triggered interrupt\n + 1 -- Edge-triggered interrupt + @param[in] int_polarity Interrupt polarity. \n + 0 -- Active-high interrupt \n + 1 -- Active-low interrupt + + @return + #QURT_EOK -- Success. \n + #QURT_ENOTALLOWED -- Not allowed; the interrupt is being registered.\n + #QURT_EINT -- Invalid interrupt number. + + @dependencies + None. + */ +unsigned int qurt_interrupt_set_config(unsigned int int_num, unsigned int int_type, unsigned int int_polarity); + +/**@ingroup func_qurt_interrupt_set_config2 + Sets the type and polarity of the specified L2VIC interrupt. + + @note1hang L2VIC interrupts must be deregistered before they can be reconfigured. + + @param[in] int_num L2VIC interrupt that is being re-enabled. + @param[in] int_type Notified to the hardware configuration callback function and used to + modify the L2VIC type. Possible values: \n + - #QURT_INT_TRIGGER_USE_DEFAULT \n + - #QURT_INT_TRIGGER_LEVEL_HIGH \n + - #QURT_INT_TRIGGER_LEVEL_LOW \n + - #QURT_INT_TRIGGER_RISING_EDGE \n + - #QURT_INT_TRIGGER_FALLING_EDGE \n + - #QURT_INT_TRIGGER_DUAL_EDGE @tablebulletend + + @return + #QURT_EOK -- Success. \n + #QURT_ENOTALLOWED -- Not allowed; the interrupt is being registered.\n + #QURT_EINT -- Invalid interrupt number. + + @dependencies + None. + */ +unsigned int qurt_interrupt_set_config2(unsigned int int_num, unsigned int int_type); + +/**@ingroup func_ qurt_interrupt_set_config3 + Sets the specified configuration value for the specified property of the specified L2VIC interrupt. + + @note1hang L2VIC interrupts must be deregistered before they can be reconfigured for polarity. + + @param[in] int_num L2VIC interrupt to re-enable. + @param[in] config_id Property to configure: \n + - #QURT_INT_CONFIGID_POLARITY \n + - #QURT_INT_CONFIGID_LOCK @tablebulletend + @param[in] config_val Dependent on the second argument config_id, specifies the value to set. \n + Values for #QURT_INT_CONFIGID_POLARITY: \n + - #QURT_INT_TRIGGER_USE_DEFAULT \n + - #QURT_INT_TRIGGER_LEVEL_HIGH \n + - #QURT_INT_TRIGGER_LEVEL_LOW \n + - #QURT_INT_TRIGGER_RISING_EDGE \n + - #QURT_INT_TRIGGER_FALLING_EDGE \n + - #QURT_INT_TRIGGER_DUAL_EDGE \n + + Values for #QURT_INT_CONFIGID_LOCK: \n + - #QURT_INT_LOCK_ENABLE\n + - #QURT_INT_LOCK_DISABLE @tablebulletend + + @return + #QURT_EOK -- Success. \n + #QURT_ENOTALLOWED -- Not allowed; the interrupt is being registered or is locked for enable/disable.\n + #QURT_EINT -- Invalid interrupt number. + + @dependencies + None. +*/ +unsigned int qurt_interrupt_set_config3(unsigned int int_num, unsigned int config_id, unsigned int config_val); + + +/**@ingroup func_qurt_interrupt_raise + Raises the interrupt. \n + This function triggers a level-triggered L2VIC + interrupt, and accepts interrupt numbers in the range of 0 to 1023. 
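+
+  Usage sketch (illustrative only; MY_INT is an assumption):
+
+  @code
+  if (qurt_interrupt_raise(MY_INT) != QURT_EOK) {
+      // raising is not supported for this interrupt on this target
+  }
+  @endcode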
+ + @param[in] interrupt_num Interrupt number. + + @return + #QURT_EOK -- Success \n + -1 -- Failure; the interrupt is not supported. + + @dependencies + None. + */ +int qurt_interrupt_raise(unsigned int interrupt_num); + +/**@ingroup func_qurt_interrupt_raise2 + Raises the interrupt and returns the current pcycle value. + + @param[in] interrupt_num Interrupt number. + + @return + 0xFFFFFFFFFFFFFFFF -- Failure; the interrupt is not supported.\n + Other value -- pcycle count at the time the interrupt is raised. + + @dependencies + None. + */ +unsigned long long qurt_interrupt_raise2(unsigned int interrupt_num); +/** @endcond */ + +/** @cond internal_only */ +/**@ingroup func_qurt_isr_subcall + Indicates whether the current function is called from a callback procedure (either short or long). + + @return + #QURT_EOK -- TRUE \n + #QURT_EVAL -- FALSE. + + @dependencies + None. + */ +int qurt_isr_subcall(void); +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_INT_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_island.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_island.h new file mode 100755 index 0000000000000..f0c8ee27cf8b0 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_island.h @@ -0,0 +1,122 @@ +#ifndef QURT_ISLAND_H +#define QURT_ISLAND_H + +/** + @file qurt_island.h + @brief Prototypes of power API + The APIs allow entering and exiting island mode where the memory + accesses are limited to local memory. + + EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2018-2021,2023 by Qualcomm Technologies, Inc. All Rights Reserved. + +=============================================================================*/ + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup func_qurt_island_get_status + Gets Island mode status. + + Returns a value that indicates whether the QuRT system executes in Island mode. + + @return + 0 - Normal mode. \n + 1 - Island mode. + + @dependencies + None. +*/ +unsigned int qurt_island_get_status (void); + +/**@ingroup func_qurt_island_get_status2 + Gets Island mode status especially that differentiates between island partial exit and complete exit. + + Returns a value that indicates the current state. + + @note1hang Transition from NORMAL mode to ISLAND mode happens in single + threaded mode. Whereas transition from ISLAND mode to other modes + happen in multi-threaded mode. So, a thread that gets island mode + status as NORMAL can assume the same status till it continues to + run. A thread that gets island mode status as ISLAND should + assume that the status may change to EXITING or NORMAL while it + runs. A thread that gets island mode status as EXITING should + assume that the status may change to NORMAL while it runs. If + the thread goes to wait state in after reading the status, it should get + the island mode state again and not assume the previous state. + @note2hang This api returns more intrinsic states than qurt_island_get_status, + when qurt_island_get_status returns 0, this api could return + QURT_ISLAND_MODE_EXITING or QURT_ISLAND_MODE_ISLAND + + @param[in/out] data field is reserved for future use. If NULL pointer is passed, + the field will be ignored. If a valid pointer is passed, + QuRT will return back a bitmask which can be interpreted as follows: + data[31] - Valid bit. 
Set to 1 to indicate data[30:0] are valid.
+                 Otherwise set to 0.
+                 data[30:0] - Reserved for future definition.
+
+  @return
+  QURT_ISLAND_MODE_NORMAL - Normal mode \n
+  QURT_ISLAND_MODE_ISLAND - Island mode \n
+  QURT_ISLAND_MODE_EXITING - Exiting Island mode \n
+
+  @dependencies
+  None.
+*/
+unsigned int qurt_island_get_status2 (unsigned int *data);
+
+
+
+/**@ingroup func_qurt_island_get_exit_status
+  Gets the reason for the last Island mode exit.
+
+  @param[out] cause_code  Pointer that returns the cause code of the last
+                          island exit reason. \n
+              - #QURT_EISLANDUSEREXIT -- Island exit due to a user call for island exit.\n
+              - #QURT_ENOISLANDENTRY -- API called before exiting island. \n
+              - #QURT_EISLANDINVALIDINT -- Island exit due to an invalid interrupt in Island mode. @tablebulletend
+
+  @param[out] int_num  Pointer that holds the invalid interrupt number that caused
+                       the island exit when the cause code is #QURT_EISLANDINVALIDINT.
+                       For other cases, it is -1.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_island_get_exit_status(unsigned int *cause_code, int *int_num);
+
+/**@ingroup func_qurt_island_get_enter_timestamp
+  Gets the most recent timestamp at which the system exited STM during island entry.
+
+  @param[out] island_enter_timestamp  Returns a pointer to the most recent timestamp
+              recorded after the system exited STM during island entry. If the system never
+              attempted to enter island, the island_enter_timestamp return pointer holds a value
+              of zero.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_island_get_enter_timestamp(unsigned long long *island_enter_timestamp);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_ISLAND_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_isr.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_isr.h
new file mode 100755
index 0000000000000..db29ea2f265d7
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_isr.h
@@ -0,0 +1,177 @@
+#ifndef QURT_ISR_H
+#define QURT_ISR_H
+
+/*=====================================================================
+
+  @file qurt_isr.h
+
+  @brief Prototypes of QuRT ISR API functions
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2017, 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+ Functions
+=============================================================================*/
+
+
+/**@ingroup func_qurt_isr_set_hw_config_callback
+  Sets the callback function for the configuration related to interrupt hardware.
+  In a process, the callback function can only be set once.
+
+  @param[in] cb_addr  Address of the callback function.
+
+  @return
+  #QURT_EOK -- The callback function is set successfully. \n
+  #QURT_EFAILED -- Failure. The callback function has been set before.
+
+  @dependencies
+  None.
+ */
+int qurt_isr_set_hw_config_callback(unsigned int cb_addr);
+
+
+/**@ingroup func_qurt_isr_set_hw_enable_callback
+  Sets the callback function for enabling the configuration related to interrupt hardware.
+  In a process, the callback function can only be set once.
+
+  @param[in] cb_addr  Address of the callback function.
+
+  @return
+  #QURT_EOK -- The callback function is set successfully. \n
+  #QURT_EFAILED -- Failure. The callback function has been set before.
+
+  @dependencies
+  None.
+ */
+int qurt_isr_set_hw_enable_callback(unsigned int cb_addr);
+
+
+/**@ingroup func_qurt_isr_set_hw_disable_callback
+  Sets the callback function for disabling the configuration related to interrupt hardware.
+  In a process, the callback function can only be set once.
+
+  @param[in] cb_addr  Address of the callback function.
+
+  @return
+  #QURT_EOK -- The callback function is set successfully. \n
+  #QURT_EFAILED -- Failure. The callback function has been set before.
+
+  @dependencies
+  None.
+ */
+int qurt_isr_set_hw_disable_callback(unsigned int cb_addr);
+
+
+/**@ingroup func_qurt_isr_create
+  Creates an ISR thread with the specified attributes, and makes it executable.
+
+  @datatypes
+  #qurt_thread_t \n
+  #qurt_thread_attr_t
+
+  @param[out] thread_id  Returns a pointer to the thread identifier if the thread was
+                         successfully created.
+  @param[in]  pAttr      Pointer to the initialized thread attribute structure that specifies
+                         the attributes of the created thread.
+
+  @return
+  #QURT_EVAL -- Invalid arguments. \n
+  #QURT_EOK -- Thread created. \n
+  #QURT_EFAILED -- Thread not created.
+
+  @dependencies
+  None.
+ */
+int qurt_isr_create (qurt_thread_t *thread_id, qurt_thread_attr_t *pAttr);
+
+/**@ingroup func_qurt_isr_register2
+  Registers an interrupt service routine (ISR) callback with the specified attributes to an ISR thread.
+  The interrupt is enabled when this function returns success.
+
+  @datatypes
+  #qurt_thread_t
+
+  @param[in] isr_thread_id  ISR thread ID, returned from qurt_isr_create().
+  @param[in] int_num        The interrupt number.
+  @param[in] prio           Priority of the ISR.
+  @param[in] flags          Defines the ACK type. Values: \n
+                            QURT_INT_NON_DELAYED_ACK - ISR is acknowledged by the interrupt handling routine
+                            in the kernel. \n
+                            QURT_INT_DELAYED_ACK - Client chooses to acknowledge.
+  @param[in] int_type       Interrupt trigger type, notified to the registered function. Values: \n
+                            - QURT_INT_TRIGGER_USE_DEFAULT
+                            - QURT_INT_TRIGGER_LEVEL_HIGH
+                            - QURT_INT_TRIGGER_LEVEL_LOW
+                            - QURT_INT_TRIGGER_RISING_EDGE
+                            - QURT_INT_TRIGGER_FALLING_EDGE
+                            - QURT_INT_TRIGGER_DUAL_EDGE
+  @param[in] isr            Interrupt service routine with prototype void isr (void *arg, int int_num).
+  @param[in] arg            First argument of the ISR when it is called to service the interrupt.
+
+  @return
+  QURT_EOK -- Successfully registered the ISR for the interrupt. \n
+  QURT_EINT -- Interrupt not configured. \n
+  QURT_EINVALID -- Invalid thread ID. \n
+  QURT_EDISABLED -- The feature is disabled. \n
+  QURT_EDUPLICATE -- Interrupt is already registered.
+
+  @dependencies
+  The thread ID should be created using qurt_isr_create().
+ */
+int qurt_isr_register2 (qurt_thread_t isr_thread_id, int int_num, unsigned short prio, unsigned short flags, unsigned int int_type, void (*isr) (void *, int), void *arg);
+
+/**@ingroup func_qurt_isr_deregister2
+  Deregisters the ISR for the specified interrupt.
+  The interrupt is disabled when this function returns success.
+
+  @param[in] int_num  The interrupt number.
+
+  @return
+  QURT_EOK -- ISR deregistered successfully. \n
+  QURT_ENOREGISTERED -- Interrupt with int_num is not registered.
+
+  @dependencies
+  None.
+ */
+int qurt_isr_deregister2 (int int_num);
+
+/**@ingroup func_qurt_isr_delete
+  Makes the ISR thread exit and releases its kernel resources.
+
+  @note1hang The ISR thread must not be actively processing interrupts,
+             otherwise the call fails and returns an error.
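+
+  Lifecycle sketch (illustrative only; MY_INT, my_isr, and the default
+  attribute setup are assumptions):
+
+  @code
+  qurt_thread_t isr_tid;
+  qurt_thread_attr_t attr;
+  qurt_thread_attr_init(&attr);
+  if (qurt_isr_create(&isr_tid, &attr) == QURT_EOK) {
+      (void)qurt_isr_register2(isr_tid, MY_INT, 100, QURT_INT_NON_DELAYED_ACK,
+                               QURT_INT_TRIGGER_USE_DEFAULT, my_isr, NULL);
+      // ... interrupts are serviced by my_isr() here ...
+      (void)qurt_isr_deregister2(MY_INT);
+      (void)qurt_isr_delete(isr_tid);
+  }
+  @endcode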
+ + @param[in] thread-id of the ISR thread that needs to be deleted. + + @return + QURT_ENOTALLOWED -- ISR thread is processing an interrupt + QURT_EINVALID -- Invalid ISR thread ID + QURT_EOK -- Success + + @dependencies + Thread ID should be created using qurt_isr_create() + */ +int qurt_isr_delete (qurt_thread_t isr_tid); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ISR_H */ + + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_l2cfg.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_l2cfg.h new file mode 100755 index 0000000000000..7e26b30a580d9 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_l2cfg.h @@ -0,0 +1,98 @@ +#ifndef QURT_L2CFG_H +#define QURT_L2CFG_H +/** + @file qurt_l2cfg.h + @brief QuRT APIs for L2 configuration and system configuration + +EXTERNAL FUNCTIONS + qurt_l2cfg_set + qurt_l2cfg_get + qurt_system_config_get + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2019-2021 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ + +/* Definition for system configuration */ +/** @addtogroup l2cfg_macros +@{ */ +#define QURT_CORE_CFG_HMX_INT8_SPATIAL 0x78 /**< HMX fixed-point spatial size */ +#define QURT_CORE_CFG_HMX_INT8_DEPTH 0x7C /**< HMX fixed-point output depth */ +/** @} */ /* end_addtogroup l2cfg_macros */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ +/**@ingroup func_qurt_l2cfg_set + Sets the value of a L2 configuration register. A register can be set *IFF* its + initial value is configured. + + @param[in] offset Offset of L2 configuration register; must be multiple of 4. + @param[in] value Value to set the register to. + + @return + #QURT_EOK -- Success. \n + #QURT_EFAILED -- Internal mapping that covers L2CFG register file absent; likely + a configuration problem. \n + #QURT_EINVALID -- Argument error. \n + #QURT_ENOTALLOWED -- Setting this register is prohibited. + + @dependencies + None. + */ +int qurt_l2cfg_set (unsigned short offset, unsigned int value); + +/**@ingroup func_qurt_l2cfg_get + Gets the value of a L2 configuration register. + + @param[in] offset Offset of L2 configuration register; must be multiple of 4. + @param[out] value Pointer to value of the register. + + @return + #QURT_EOK -- Success. \n + #QURT_EFAILED -- Internal mapping that covers L2CFG register file absent; + likely a configuration problem. \n + #QURT_EINVALID -- Argument error. + + @dependencies + None. + + */ +int qurt_l2cfg_get (unsigned short offset, unsigned int * value); + + +/**@ingroup func_qurt_system_config_get + Gets the system configuration information. + + @param[in] index Index to system configuration. Values:\n + - #QURT_CORE_CFG_HMX_INT8_SPATIAL \n + - #QURT_CORE_CFG_HMX_INT8_DEPTH @tablebulletend + + @param[out] data Pointer to a word for returned data. + + @return + #QURT_EOK -- Get the configuration data successful. \n + Other values -- Failure (no such configuration available). + + @dependencies + None. 
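+
+  Usage sketch (illustrative only):
+
+  @code
+  unsigned int spatial = 0;
+  if (qurt_system_config_get(QURT_CORE_CFG_HMX_INT8_SPATIAL, &spatial) == QURT_EOK) {
+      // 'spatial' holds the HMX fixed-point spatial size
+  }
+  @endcode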
+ + */ +int qurt_system_config_get(int index, unsigned int *data); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_L2CFG_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_lifo.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_lifo.h new file mode 100755 index 0000000000000..dc399fccc5f0f --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_lifo.h @@ -0,0 +1,71 @@ +#ifndef QURT_LIFO_H +#define QURT_LIFO_H +/** + @file qurt_lifo.h + + @brief + Provide lock free LastInFirstOut algorithm, which can be used in a + variety of situations for allocation/free fixed size buffer + This implementation touches the first word of your FREED buffer. Even + though it does not matter how you use it when it is allocated, you might want + to be a bit careful not to put your MAGIC number as the first field. + Because it will not hold the magic value for "freed" + + EXTERNALIZED FUNCTIONS + None + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None + + Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + + /*===================================================================== + Functions + ======================================================================*/ + +/*======================================================================*/ +/** + Pops an element out of the LIFO. + + @param[in] freelist Pointer to the head of your list. + + @return + Top object from the list + + @dependencies + None. +*/ +/* ======================================================================*/ +void * qurt_lifo_pop(void *freelist); + + +/*======================================================================*/ +/** + Pushes an element into the LIFO. + + @param[in] freelist Pointer to the head of your list. + @param[in] buf Pointer to your buffer to push into the list. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_lifo_push(void *freelist, void *buf); + +void qurt_lifo_remove(void *freelist, void *buf); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_LIFO_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_mailbox.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_mailbox.h new file mode 100755 index 0000000000000..a6cd91c611782 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_mailbox.h @@ -0,0 +1,176 @@ +#ifndef QURT_MAILBOX_H +#define QURT_MAILBOX_H + +/** + @file qurt_mailbox.h + @brief Definitions, macros, and prototypes used for QuRT mailbox + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2015, 2021-2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. 
+ ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/* Definitions on typedef and return values */ + +#define QURT_MAILBOX_ID_NULL 0 +#define QURT_MAILBOX_ERROR -1 +#define QURT_MAILBOX_ID_ERROR -2 +#define QURT_MAILBOX_NON_VALID_DATA -3 +#define QURT_MAILBOX_FULL -4 +#define QURT_MAILBOX_DELETED -5 +#define QURT_MAILBOX_RECEIVE_HALTED -6 +#define QURT_MAILBOX_BANDWIDTH_LIMIT -7 + + +/*============================================================================= + FORWARD DECLARATIONS & TYPEDEFS +=============================================================================*/ + +#define QURT_MAILBOX_AT_QURTOS 0U // Receiver is QurtOS +#define QURT_MAILBOX_AT_ROOTPD 1U // Receiver is RootPD (ASID=0) +#define QURT_MAILBOX_AT_USERPD 2U // Receiver is User PD (ASID!=0) +#define QURT_MAILBOX_AT_SECUREPD 3U // Receiver is Secure PD + +typedef unsigned char qurt_mailbox_receiver_cfg_t; + +#define QURT_MAILBOX_SEND_OVERWRITE 0U // When there is already valid content, overwrite it +#define QURT_MAILBOX_SEND_NON_OVERWRITE 1U // When there is already valid content, return failure + +typedef unsigned char qurt_mailbox_send_option_t; + + +#define QURT_MAILBOX_RECV_WAITING 0U // When there is no valid content, wait for it +#define QURT_MAILBOX_RECV_NON_WAITING 1U // When there is no valid content, return failure immediately +#define QURT_MAILBOX_RECV_PEEK_NON_WAITING 2U // Read the content, but doesn't remove it from the mailbox. No waiting. + +typedef unsigned char qurt_mailbox_recv_option_t; + + +/*============================================================================= + EXTERNS & FUNCTIONS +=============================================================================*/ +/* Function prototype */ + +/**@ingroup qurt_mailbox_create + Creates a QuRT mailbox. + + @param name Mailbox name up to 8 characters. + @param recv_opt Configuration on the receiver process. + + @return + Mailbox ID -- Mailbox Identifier \n + #QURT_MAILBOX_ID_NULL -- NULL, failure at creating mailbox + + @dependencies + None. +*/ +unsigned long long qurt_mailbox_create(char *name, qurt_mailbox_receiver_cfg_t recv_opt); + + +/**@ingroup qurt_mailbox_get_id + Gets a QuRT mailbox identifier. + + @param name Mailbox name up to 8 characters. + + @return + Mailbox ID -- Mailbox identifier \n + #QURT_MAILBOX_ID_NULL -- NULL, failure at getting mailbox ID + + @dependencies + None. +*/ +unsigned long long qurt_mailbox_get_id(char *name); + + +/**@ingroup qurt_mailbox_send + Sends data to a QuRT mailbox. + + @param mailbox_id Mailbox identifier. + @param send_opt Option for mailbox send. + @param data Data to send. + + + @return + #QURT_EOK Success \n + #QURT_MAILBOX_ID_ERROR Mailbox ID error.\n + #QURT_MAILBOX_ERROR Other errors.\n + #QURT_MAILBOX_FULL Valid data already exists, non-overwriting.\n + #QURT_MAILBOX_BANDWIDTH_LIMIT Reached the bandwidth limitation. + + @dependencies + None. 
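+
+  Usage sketch (illustrative only; the mailbox name "mbx0" and the receiver
+  configuration are assumptions):
+
+  @code
+  unsigned long long id = qurt_mailbox_create("mbx0", QURT_MAILBOX_AT_ROOTPD);
+  if (id != QURT_MAILBOX_ID_NULL) {
+      (void)qurt_mailbox_send(id, QURT_MAILBOX_SEND_NON_OVERWRITE, 0x1234ULL);
+
+      unsigned long long data = 0;
+      (void)qurt_mailbox_receive(id, QURT_MAILBOX_RECV_WAITING, &data);
+      (void)qurt_mailbox_delete(id);
+  }
+  @endcode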
+*/
+int qurt_mailbox_send(unsigned long long mailbox_id, qurt_mailbox_send_option_t send_opt, unsigned long long data);
+
+
+/**@ingroup qurt_mailbox_receive
+   Receives data from a QuRT mailbox.
+
+   @param mailbox_id   Mailbox identifier.
+   @param recv_opt     Option for mailbox receive.
+   @param data         Pointer to the data buffer for receiving.
+
+   @return
+   #QURT_EOK                      Success. \n
+   #QURT_MAILBOX_ID_ERROR         Mailbox ID error. \n
+   #QURT_MAILBOX_ERROR            Other errors. \n
+   #QURT_MAILBOX_NON_VALID_DATA   No current valid data; the previous content is put in the buffer. \n
+   #QURT_MAILBOX_RECEIVE_HALTED   Receive halted; the waiting thread is woken up. \n
+   #QURT_MAILBOX_DELETED          Mailbox is deleted, and the waiting thread is woken up.
+
+   @dependencies
+   None.
+*/
+int qurt_mailbox_receive(unsigned long long mailbox_id, qurt_mailbox_recv_option_t recv_opt, unsigned long long *data);
+
+
+/**@ingroup qurt_mailbox_delete
+   Deletes a QuRT mailbox.
+
+   A mailbox can only be deleted from the process that created the mailbox.
+
+   @param mailbox_id   Mailbox identifier.
+
+   @return
+   #QURT_EOK                Success. \n
+   #QURT_MAILBOX_ID_ERROR   Mailbox ID error. \n
+   #QURT_MAILBOX_ERROR      Other errors.
+
+   @dependencies
+   None.
+*/
+int qurt_mailbox_delete(unsigned long long mailbox_id);
+
+
+/**@ingroup qurt_mailbox_receive_halt
+   Halts receiving on a QuRT mailbox and wakes up waiting threads.
+
+   @param mailbox_id   Mailbox identifier.
+
+   @return
+   #QURT_EOK                Success. \n
+   #QURT_MAILBOX_ID_ERROR   Mailbox ID error. \n
+   #QURT_MAILBOX_ERROR      Other errors.
+
+   @dependencies
+   None.
+*/
+int qurt_mailbox_receive_halt(unsigned long long mailbox_id);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif // QURT_MAILBOX_H
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_memory.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_memory.h
new file mode 100755
index 0000000000000..90ce2586fec50
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_memory.h
@@ -0,0 +1,1487 @@
+#ifndef QURT_MEMORY_H
+#define QURT_MEMORY_H
+/**
+   @file qurt_memory.h
+   @brief Prototypes of kernel memory API functions.
+
+   EXTERNALIZED FUNCTIONS
+   none
+
+   INITIALIZATION AND SEQUENCING REQUIREMENTS
+   none
+
+   Copyright (c) Qualcomm Technologies, Inc.
+   All Rights Reserved.
+   Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+
+#include 
+#include 
+//#include 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup memory_management_macros
+@{ */
+#define QURT_SYSTEM_ALLOC_VIRTUAL 1 /**< Allocates available virtual memory in the address space of all
+                                         processes.*/
+/** @} */ /* end_addtogroup memory_management_macros */
+/**@cond rest_reg_dist */
+/** @addtogroup memory_management_types
+@{ */
+/** @xreflabel{hdr:qurt_mem_default_pool} */
+extern qurt_mem_pool_t qurt_mem_default_pool __attribute__((section(".data"))); /**< Memory pool object.*/
+/** @} */ /* end_addtogroup memory_management_types */
+
+/** @cond rest_reg_dist */
+/** Mapping attribute information. */
+typedef struct {
+    qurt_paddr_64_t paddr;
+    qurt_size_t size;
+    qurt_mem_cache_mode_t cache_mode;
+    qurt_perm_t perms;
+} qurt_mapping_attr_t;
+/** @endcond */
+/** @} */ /* end_addtogroup mapping_attribute_types*/
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+/**@ingroup func_qurt_mem_cache_clean
+   Performs a cache clean operation on the data stored in the specified memory area.
+   Performs a syncht on all the data cache operations when the Hexagon processor version is V60 or greater.
+
+   @note1hang Perform the flush all operation only on the data cache.
+
+   @note1cont This operation flushes and invalidates the contents of all cache lines from start address
+              to end address (start address + size). The contents of the adjoining buffer can be
+              flushed and invalidated if they fall in any of the cache lines.
+
+   @datatypes
+   #qurt_addr_t \n
+   #qurt_size_t \n
+   #qurt_mem_cache_op_t \n
+   #qurt_mem_cache_type_t
+
+   @param[in] addr     Address of data to flush.
+   @param[in] size     Size (in bytes) of data to flush.
+   @param[in] opcode   Type of cache clean operation. Values:
+                       - #QURT_MEM_CACHE_FLUSH
+                       - #QURT_MEM_CACHE_INVALIDATE
+                       - #QURT_MEM_CACHE_FLUSH_INVALIDATE
+                       - #QURT_MEM_CACHE_FLUSH_ALL\n
+                       @note1 #QURT_MEM_CACHE_FLUSH_ALL is valid only when the type is #QURT_MEM_DCACHE @tablebulletend
+   @param[in] type     Cache type. Values:
+                       - #QURT_MEM_ICACHE
+                       - #QURT_MEM_DCACHE @tablebulletend
+
+   @return
+   #QURT_EOK -- Cache operation performed successfully.\n
+   #QURT_EVAL -- Invalid cache type.\n
+
+   @dependencies
+   None.
+*/
+int qurt_mem_cache_clean(qurt_addr_t addr, qurt_size_t size, qurt_mem_cache_op_t opcode, qurt_mem_cache_type_t type);
+
+/**@ingroup func_qurt_mem_cache_clean2
+   Performs a data cache clean operation on the data stored in the specified memory area.
+
+   This API only performs the following data cache operations:\n
+   - #QURT_MEM_CACHE_FLUSH\n
+   - #QURT_MEM_CACHE_INVALIDATE\n
+   - #QURT_MEM_CACHE_FLUSH_INVALIDATE -- flushes/invalidates the contents of all cache lines from start address
+     to end address (start address + size). The contents of the adjoining buffer can be
+     flushed/invalidated if they fall in any of the cache lines.
+
+   @datatypes
+   #qurt_addr_t \n
+   #qurt_size_t \n
+   #qurt_mem_cache_op_t \n
+   #qurt_mem_cache_type_t
+
+   @param[in] addr     Address of data to flush.
+   @param[in] size     Size (in bytes) of data to flush.
+   @param[in] opcode   Type of cache clean operation. Values:\n #QURT_MEM_CACHE_FLUSH\n #QURT_MEM_CACHE_INVALIDATE\n
+                       #QURT_MEM_CACHE_FLUSH_INVALIDATE
+   @param[in] type     Cache type. Values: \n #QURT_MEM_DCACHE
+
+   @return
+   #QURT_EOK -- Cache operation performed successfully.\n
+   #QURT_EVAL -- Invalid cache type.
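+
+   A minimal usage sketch (illustrative only; the buffer and its use by a
+   hypothetical DMA peer are assumptions). Flush a buffer after the CPU
+   writes it so that another bus master observes the data:
+   @code
+   static char dma_buf[256];   // hypothetical shared buffer
+   // ... CPU fills dma_buf ...
+   if (qurt_mem_cache_clean2((qurt_addr_t)dma_buf, sizeof(dma_buf),
+                             QURT_MEM_CACHE_FLUSH, QURT_MEM_DCACHE) != QURT_EOK) {
+       // QURT_EVAL: an invalid cache type was passed.
+   }
+   @endcode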
+
+   @dependencies
+   None.
+*/
+int qurt_mem_cache_clean2(qurt_addr_t addr, qurt_size_t size, qurt_mem_cache_op_t opcode, qurt_mem_cache_type_t type);
+
+/**@ingroup func_qurt_mem_cache_phys_clean
+   Performs a cache clean operation on the data stored in the specified memory area based on address match and mask.
+   Operates on a cache line when (LINE.PhysicalPageNumber & mask) == addrmatch.
+
+   @note1hang The addrmatch value should be the upper 24-bit physical address to match against.
+
+   @datatypes
+   #qurt_mem_cache_op_t \n
+
+   @param[in] mask        24-bit address mask.
+   @param[in] addrmatch   Physical page number (24 bits) of memory to use as an address match.
+   @param[in] opcode      Type of cache clean operation. Values:
+                          - #QURT_MEM_CACHE_FLUSH
+                          - #QURT_MEM_CACHE_INVALIDATE @tablebulletend
+
+   @return
+   #QURT_EOK -- Cache operation performed successfully.\n
+   #QURT_EVAL -- Invalid operation.
+
+   @dependencies
+   None.
+*/
+
+int qurt_mem_cache_phys_clean(unsigned int mask, unsigned int addrmatch, qurt_mem_cache_op_t opcode);
+
+/**@ingroup func_qurt_mem_l2cache_line_lock
+   Performs an L2 cache line locking operation. This function locks selective lines in the L2 cache memory.
+
+   @note1hang Perform the line lock operation only on a 32-byte aligned size and address.
+
+   @datatypes
+   #qurt_addr_t \n
+   #qurt_size_t
+
+   @param[in] addr   Address of the L2 cache memory line to lock; the address must be 32-byte aligned.
+   @param[in] size   Size (in bytes) of L2 cache memory to line lock; size must be a multiple of 32 bytes.
+
+   @return
+   #QURT_EOK -- Success.\n
+   #QURT_EALIGN -- Data alignment or address failure. \n
+   #QURT_EINVALID -- Improper addr and size passed (for example, integer overflow due to addr + size). \n
+   #QURT_EFAILED -- Failed to lock the cache line because all the ways were locked for the corresponding set of an
+                    address in the range of addr and addr+size, or the address range is not L2 cacheable.
+
+   @dependencies
+   None.
+*/
+int qurt_mem_l2cache_line_lock(qurt_addr_t addr, qurt_size_t size);
+
+/**@ingroup func_qurt_mem_l2cache_line_unlock
+   Performs an L2 cache line unlocking operation. This function unlocks selective lines in the L2 cache memory.
+
+   @note1hang Perform the line unlock operation only on a 32-byte aligned size and address.
+
+   @datatypes
+   #qurt_addr_t \n
+   #qurt_size_t
+
+   @param[in] addr   Address of the L2 cache memory line to unlock; the address must be 32-byte aligned.
+   @param[in] size   Size (in bytes) of the L2 cache memory line to unlock; size must be a multiple of 32 bytes.
+
+   @return
+   #QURT_EOK -- Success. \n
+   #QURT_EALIGN -- Data alignment or address failure. \n
+   #QURT_EFAILED -- Operation failed, cannot find the matching tag.
+
+   @dependencies
+   None.
+*/
+int qurt_mem_l2cache_line_unlock(qurt_addr_t addr, qurt_size_t size);
+
+/**@ingroup func_qurt_mem_region_attr_init
+   @xreflabel{sec:qurt_mem_region_attr_init}
+   Initializes the specified memory region attribute structure with default attribute values: \n
+   - Mapping -- #QURT_MEM_MAPPING_VIRTUAL \n
+   - Cache mode -- #QURT_MEM_CACHE_WRITEBACK \n
+   - Physical address -- -1 \n
+   - Virtual address -- -1 \n
+   - Memory type -- #QURT_MEM_REGION_LOCAL \n
+   - Size -- -1
+
+   @note1hang The memory physical address attribute must be explicitly set by calling the
+              qurt_mem_region_attr_set_physaddr() function. The size and pool attributes are set directly
+              as parameters in the memory region create operation.
+
+   @datatypes
+   #qurt_mem_region_attr_t
+
+   @param[in,out] attr   Pointer to the destination structure for the memory region attributes.
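+
+   A minimal usage sketch (illustrative only; the 64 KB size and writeback
+   cache mode are arbitrary choices for the example):
+   @code
+   qurt_mem_region_attr_t attr;
+   qurt_mem_region_t region;
+   qurt_mem_region_attr_init(&attr);   // start from defaults
+   qurt_mem_region_attr_set_cache_mode(&attr, QURT_MEM_CACHE_WRITEBACK);
+   if (qurt_mem_region_create(&region, 0x10000, qurt_mem_default_pool,
+                              &attr) != QURT_EOK) {
+       // QURT_EMEM or QURT_EINVALID: out of memory or bad attributes.
+   }
+   @endcode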
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+void qurt_mem_region_attr_init(qurt_mem_region_attr_t *attr);
+
+/**@ingroup func_qurt_mem_pool_attach
+   Initializes a memory pool object to attach to a pool predefined in the system
+   configuration file.
+
+   Memory pool objects assign memory regions to physical memory in different
+   Hexagon memory units. They are specified in memory region create operations
+   (Section @xref{sec:mem_region_create}).
+
+   @note1hang QuRT predefines the memory pool object #qurt_mem_default_pool
+              (Section @xref{dox:mem_management}) for allocating memory regions in SMI memory. The pool attach
+              operation is necessary only when allocating memory regions in nonstandard
+              memory units such as TCM.
+
+   @datatypes
+   #qurt_mem_pool_t
+
+   @param[in]  name   Pointer to the memory pool name.
+   @param[out] pool   Pointer to the memory pool object.
+
+   @return
+   #QURT_EOK -- Attach operation successful.
+
+   @dependencies
+   None.
+*/
+int qurt_mem_pool_attach(char *name, qurt_mem_pool_t *pool);
+
+/**@ingroup func_qurt_mem_pool_attach2
+   Gets the identifier that corresponds to a pool object created specifically for a client, for example, HLOS_PHYSPOOL.
+   The client_handle is used to look up the client-specific pool.
+
+   Memory pool objects assign memory regions to physical memory in different
+   Hexagon memory units. Memory pool objects are specified during mapping creation operations
+   (qurt_mem_mmap() and qurt_mem_region_create()).
+
+   @note1hang QuRT predefines the memory pool object #qurt_mem_default_pool
+              (Section @xref{dox:mem_management}) for allocating memory regions in SMI memory. The pool_attach2
+              operation is necessary only when allocating memory regions in memory units specific to the client.
+
+   @datatypes
+   #qurt_mem_pool_t
+
+   @param[in]  client_handle   Client identifier used by the OS to look up the identifier
+                               for the client-specific pool.
+   @param[in]  name            Pointer to the memory pool name.
+   @param[out] pool            Pointer to the memory pool object.
+
+   @return
+   #QURT_EOK -- Attach operation successful.
+
+   @dependencies
+   None.
+*/
+int qurt_mem_pool_attach2(int client_handle, char *name, qurt_mem_pool_t *pool);
+
+/**@ingroup func_qurt_mem_pool_create
+   @xreflabel{hdr:qurt_mem_pool_create}
+   Dynamically creates a memory pool object from a physical address range.
+
+   The pool is assigned a single memory region with the specified base address and size.
+
+   The base address and size values passed to this function must be aligned to 4K byte
+   boundaries, and must be expressed as the actual base address and size values divided by 4K.
+
+   For example, the function call:
+   @code
+   qurt_mem_pool_create ("TCM_PHYSPOOL", 0xd8020, 0x20, &pool)
+   @endcode
+   ... is equivalent to the following static pool definition in the QuRT system configuration file:
+   @code
+   <physical_pool name="TCM_PHYSPOOL">
+       <region base="0xd8020000" size="0x20000" />
+   </physical_pool>
+   @endcode
+
+   @cond rest_dist For more information on the system configuration file, see @xhyperref{80VB41979,80-VB419-79}. @endcond
+
+   @note1hang Dynamically created pools are not identical to static pools. In particular,
+              qurt_mem_pool_attr_get() is not valid with dynamically created pools.
+
+   @note1cont Dynamic pool creation permanently consumes system resources, and cannot be undone.
+
+   @datatypes
+   #qurt_mem_pool_t
+
+   @param[in]  name   Pointer to the memory pool name.
+   @param[in]  base   Base address of the memory region (divided by 4K).
+   @param[in]  size   Size (in bytes) of the memory region (divided by 4K).
+   @param[out] pool   Pointer to the memory pool object.
+
+   @return
+   #QURT_EOK -- Success.
+ + @dependencies + None. +*/ +int qurt_mem_pool_create(char *name, unsigned base, unsigned size, qurt_mem_pool_t *pool); + +/**@ingroup func_qurt_mem_pool_add_pages + Adds a physical address range to the specified memory pool object.\n + + @note1hang Call this operation only with root privileges (guest OS mode). + + @datatypes + #qurt_mem_pool_t + + @param[in] pool Memory pool object. + @param[in] first_pageno First page number of the physical address range (equivalent to address >> 12) + @param[in] size_in_pages Number of pages in the physical address range (equivalent to size >> 12) + + @return + #QURT_EOK -- Pages successfully added. + + @dependencies + None. +*/ +int qurt_mem_pool_add_pages(qurt_mem_pool_t pool, + unsigned first_pageno, + unsigned size_in_pages); + +/**@ingroup func_qurt_mem_pool_remove_pages + Removes a physical address range from the specified memory pool object. + + If any part of the address range is in use, this operation returns an + error without changing the state. + + @note1hang Call this operation only with root privileges (guest-OS mode). + + @note1cont In the future, this operation will support (via the flags parameter) the + removal of a physical address range when part of the range is in use. + + @datatypes + #qurt_mem_pool_t + + @param[in] pool Memory pool object. + @param[in] first_pageno First page number of the physical address range (equivalent to address >> 12) + @param[in] size_in_pages Number of pages in the physical address range (equivalent to size >> 12) + @param[in] flags Remove options. Values: \n + - 0 -- Skip holes in the range that are not part of the pool (default) \n + - #QURT_POOL_REMOVE_ALL_OR_NONE -- Pages are removed only if the specified + physical address range is entirely contained (with no holes) in the + pool free space. @tablebulletend + @param[in] callback Callback procedure called when pages were successfully removed. + Not called if the operation failed. Passing 0 as the parameter + value causes the callback to not be called. + @param[in] arg Value passed as an argument to the callback procedure. + + @return + #QURT_EOK -- Pages successfully removed. + + @dependencies + None. +*/ +int qurt_mem_pool_remove_pages(qurt_mem_pool_t pool, + unsigned first_pageno, + unsigned size_in_pages, + unsigned flags, + void (*callback)(void *), + void *arg); +/**@ingroup memory_management_types*/ +#define QURT_POOL_REMOVE_ALL_OR_NONE 1 /**< */ + +/**@ingroup func_qurt_mem_pool_attr_get + Gets the memory pool attributes. \n + Retrieves pool configurations based on the pool handle, and fills in + the attribute structure with configuration values. + + @datatypes + #qurt_mem_pool_t \n + #qurt_mem_pool_attr_t + + @param[in] pool Pool handle obtained from qurt_mem_pool_attach(). + @param[out] attr Pointer to the memory region attribute structure. + + @return + 0 -- Success. \n + #QURT_EINVALID -- Corrupt handle; pool handle is invalid. +*/ +int qurt_mem_pool_attr_get (qurt_mem_pool_t pool, qurt_mem_pool_attr_t *attr); + +/**@ingroup func_qurt_mem_pool_attr_get_size + Gets the size of the specified memory pool range. + + @datatypes + #qurt_mem_pool_attr_t \n + #qurt_size_t + + @param[in] attr Pointer to the memory pool attribute structure. + @param[in] range_id Memory pool range key. + @param[out] size Pointer to the destination variable for the range size. + + @return + 0 -- Success. \n + #QURT_EINVALID -- Range is invalid. + + @dependencies + None. 
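+
+   A minimal usage sketch (illustrative only; the pool name "DEFAULT_PHYSPOOL"
+   and range index 0 are assumptions):
+   @code
+   qurt_mem_pool_t pool;
+   qurt_mem_pool_attr_t pattr;
+   qurt_size_t range_size = 0;
+   if (qurt_mem_pool_attach("DEFAULT_PHYSPOOL", &pool) == QURT_EOK &&
+       qurt_mem_pool_attr_get(pool, &pattr) == 0) {
+       (void)qurt_mem_pool_attr_get_size(&pattr, 0, &range_size);
+   }
+   @endcode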
+*/
+static inline int qurt_mem_pool_attr_get_size (qurt_mem_pool_attr_t *attr, int range_id, qurt_size_t *size){
+    if ((range_id >= MAX_POOL_RANGES) || (range_id < 0)){
+        (*size) = 0;
+        return QURT_EINVALID;
+    }
+    else {
+        (*size) = attr->ranges[range_id].size;
+    }
+    return QURT_EOK;
+}
+
+/**@ingroup func_qurt_mem_pool_attr_get_addr
+   Gets the start address of the specified memory pool range.
+
+   @datatypes
+   #qurt_mem_pool_attr_t \n
+   #qurt_addr_t
+
+   @param[in]  attr       Pointer to the memory pool attribute structure.
+   @param[in]  range_id   Memory pool range key.
+   @param[out] addr       Pointer to the destination variable for the range start address.
+
+   @return
+   0 -- Success. \n
+   #QURT_EINVALID -- Range is invalid.
+
+   @dependencies
+   None.
+*/
+static inline int qurt_mem_pool_attr_get_addr (qurt_mem_pool_attr_t *attr, int range_id, qurt_addr_t *addr){
+    if ((range_id >= MAX_POOL_RANGES) || (range_id < 0)){
+        (*addr) = 0;
+        return QURT_EINVALID;
+    }
+    else {
+        (*addr) = (attr->ranges[range_id].start)<<12;
+    }
+    return QURT_EOK;
+}
+
+/**@ingroup func_qurt_mem_pool_attr_get_addr_64
+   Gets the 64-bit start address of the specified memory pool range.
+
+   @datatypes
+   #qurt_mem_pool_attr_t \n
+   #qurt_addr_64_t
+
+   @param[in]  attr       Pointer to the memory pool attribute structure.
+   @param[in]  range_id   Memory pool range key.
+   @param[out] addr       Pointer to the destination variable for the range start address.
+
+   @return
+   0 -- Success. \n
+   #QURT_EINVALID -- Range is invalid.
+
+   @dependencies
+   None.
+*/
+static inline int qurt_mem_pool_attr_get_addr_64 (qurt_mem_pool_attr_t *attr, int range_id, qurt_addr_64_t *addr){
+    if ((range_id >= MAX_POOL_RANGES) || (range_id < 0)){
+        (*addr) = 0;
+        return QURT_EINVALID;
+    }
+    else {
+        (*addr) = ((qurt_addr_64_t)attr->ranges[range_id].start)<<12;
+    }
+    return QURT_EOK;
+}
+
+
+/**@ingroup func_qurt_mem_pool_status_get
+   Gets the memory pool status. \n
+   Based on the pool handle, retrieves the largest contiguous free memory,
+   total free memory, and total memory declared for the pool, in bytes. Fills in
+   the memory status structure with the values.
+
+   @datatypes
+   #qurt_mem_pool_t \n
+   #qurt_mem_pool_status_t
+
+   @param[in]  pool     Pool handle.
+   @param[out] status   Pointer to the memory pool status structure.
+
+   @return
+   #QURT_EOK -- Success. \n
+   #QURT_EINVALID -- Corrupt handle; pool handle is invalid.
+*/
+int qurt_mem_pool_status_get (qurt_mem_pool_t pool, qurt_mem_pool_status_t *status);
+
+
+/**@ingroup func_qurt_mem_pool_is_available
+   Checks whether the number of pages that the page_count argument indicates
+   can be allocated from the specified pool.
+
+   @datatypes
+   #qurt_mem_pool_t \n
+   #qurt_mem_mapping_t \n
+
+   @param[in] pool           Pool handle obtained from qurt_mem_pool_attach().
+   @param[in] page_count     Number of 4K pages.
+   @param[in] mapping_type   Mapping type, of type #qurt_mem_mapping_t.
+
+   @return
+   0 -- Success. \n
+   #QURT_EINVALID -- Mapping type is invalid. \n
+   #QURT_EMEM -- Specified pages cannot be allocated from the pool.
+
+   @dependencies
+   None.
+*/
+int qurt_mem_pool_is_available(qurt_mem_pool_t pool, int page_count, qurt_mem_mapping_t mapping_type);
+
+
+/**@ingroup func_qurt_mem_region_create
+   @xreflabel{sec:mem_region_create}
+   Creates a memory region with the specified attributes.
+
+   The application initializes the memory region attribute structure with
+   qurt_mem_region_attr_init() and qurt_mem_region_attr_set_bus_attr().
+
+   If the virtual address attribute is set to its default value
+   (Section @xref{sec:qurt_mem_region_attr_init}), the virtual address of the memory region is
+   automatically assigned any available virtual address value.
+
+   If the memory mapping attribute is set to virtual mapping, the physical address of the memory region
+   is also automatically assigned.\n
+
+   @note1hang The physical address attribute is explicitly set in the attribute structure only
+              for memory regions with physical-contiguous mapping.
+
+   Memory regions are always assigned to memory pools. The pool value specifies the memory pool
+   that the memory region is assigned to.
+
+   @note1hang If attr is specified as NULL, the memory region is created with default
+              attribute values (Section @xref{sec:qurt_mem_region_attr_init}).
+              QuRT predefines the memory pool object #qurt_mem_default_pool
+              (Section @xref{dox:mem_management}), which allocates memory regions in SMI memory.
+
+   @datatypes
+   #qurt_mem_region_t \n
+   #qurt_size_t \n
+   #qurt_mem_pool_t \n
+   #qurt_mem_region_attr_t
+
+   @param[out] region   Pointer to the memory region object.
+   @param[in]  size     Memory region size (in bytes). If size is not an integral multiple of 4K,
+                        it is rounded up to a 4K boundary.
+   @param[in]  pool     Memory pool of the region.
+   @param[in]  attr     Pointer to the memory region attribute structure.
+
+   @return
+   #QURT_EOK -- Memory region successfully created.\n
+   #QURT_EMEM -- Not enough memory to create the region. \n
+   #QURT_EINVALID -- Invalid cache attributes / permissions provided in the attribute.
+
+   @dependencies
+   None.
+*/
+int qurt_mem_region_create(qurt_mem_region_t *region, qurt_size_t size, qurt_mem_pool_t pool, qurt_mem_region_attr_t *attr);
+
+/**@ingroup func_qurt_mem_region_delete
+   Deletes the specified memory region.
+
+   If the caller application created the memory region, it is removed and the system reclaims its
+   assigned memory.
+
+   If a different application created the memory region (and shared it with the caller
+   application), only the local memory mapping to the region is removed; the system does
+   not reclaim the memory.
+
+   @datatypes
+   #qurt_mem_region_t
+
+   @param[in] region   Memory region object.
+
+   @return
+   #QURT_EOK -- Region successfully deleted. \n
+   #QURT_ELOCKED -- Buffer is locked. Mapping delete failed.
+
+   @dependencies
+   None.
+*/
+int qurt_mem_region_delete(qurt_mem_region_t region);
+
+
+/**@ingroup func_qurt_mem_region_attr_get
+   @xreflabel{sec:mem_region_attr_get}
+   Gets the memory attributes of the specified memory region.
+   After a memory region is created, its attributes cannot be changed.
+
+   @datatypes
+   #qurt_mem_region_t \n
+   #qurt_mem_region_attr_t
+
+   @param[in]  region   Memory region object.
+   @param[out] attr     Pointer to the destination structure for memory region attributes.
+
+   @return
+   #QURT_EOK -- Operation successfully performed. \n
+   Error code -- Failure.
+
+   @dependencies
+   None.
+*/
+int qurt_mem_region_attr_get(qurt_mem_region_t region, qurt_mem_region_attr_t *attr);
+
+
+/**@ingroup func_qurt_mem_region_attr_set_type
+   Sets the memory type in the specified memory region attribute structure.
+
+   The type indicates whether the memory region is local to an application or shared between
+   applications.
+   @cond rest_dist For more information, see @xhyperref{80VB41992,80-VB419-92}. @endcond
+
+   @datatypes
+   #qurt_mem_region_attr_t \n
+   #qurt_mem_region_type_t
+
+   @param[in,out] attr   Pointer to the memory region attribute structure.
+   @param[in]     type   Memory type.
Values: \n + - #QURT_MEM_REGION_LOCAL \n + - #QURT_MEM_REGION_SHARED @tablebulletend + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_type(qurt_mem_region_attr_t *attr, qurt_mem_region_type_t type){ + attr->type = type; +} + +/**@ingroup func_qurt_mem_region_attr_get_size + Gets the memory region size from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_size_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] size Pointer to the destination variable for memory region size. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_size(qurt_mem_region_attr_t *attr, qurt_size_t *size){ + (*size) = attr->size; +} + +/**@ingroup func_qurt_mem_region_attr_get_type + Gets the memory type from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_region_type_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] type Pointer to the destination variable for the memory type. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_type(qurt_mem_region_attr_t *attr, qurt_mem_region_type_t *type){ + (*type) = attr->type; +} + +/**@ingroup func_qurt_mem_region_attr_set_physaddr + Sets the memory region 32-bit physical address in the specified memory attribute structure. + + @note1hang The physical address attribute is explicitly set only for memory regions with + physical contiguous mapping. Otherwise QuRT automatically sets it + when the memory region is created. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_paddr_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] addr Memory region physical address. + + @return + None. + */ +static inline void qurt_mem_region_attr_set_physaddr(qurt_mem_region_attr_t *attr, qurt_paddr_t addr){ + attr->ppn = (unsigned)(((unsigned)(addr))>>12); +} + +/**@ingroup func_qurt_mem_region_attr_get_physaddr + Gets the memory region physical address from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] addr Pointer to the destination variable for memory region physical address. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_physaddr(qurt_mem_region_attr_t *attr, unsigned int *addr){ + (*addr) = (unsigned)(((unsigned) (attr->ppn))<<12); +} + +/**@ingroup func_qurt_mem_region_attr_set_virtaddr + Sets the memory region virtual address in the specified memory attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_addr_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] addr Memory region virtual address. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_virtaddr(qurt_mem_region_attr_t *attr, qurt_addr_t addr){ + attr->virtaddr = addr; +} + +/**@ingroup func_qurt_mem_region_attr_get_virtaddr + Gets the memory region virtual address from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] addr Pointer to the destination variable for the memory region virtual address. + + @return + None. + + @dependencies + None. 
+ */ +static inline void qurt_mem_region_attr_get_virtaddr(qurt_mem_region_attr_t *attr, unsigned int *addr){ + (*addr) = (unsigned int)(attr->virtaddr); +} + +/**@ingroup func_qurt_mem_region_attr_set_mapping + Sets the memory mapping in the specified memory region attribute structure. + + The mapping value indicates how the memory region is mapped in virtual memory. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_mapping_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] mapping Mapping. Values: + - #QURT_MEM_MAPPING_VIRTUAL + - #QURT_MEM_MAPPING_PHYS_CONTIGUOUS + - #QURT_MEM_MAPPING_IDEMPOTENT + - #QURT_MEM_MAPPING_VIRTUAL_FIXED + - #QURT_MEM_MAPPING_NONE + - #QURT_MEM_MAPPING_VIRTUAL_RANDOM + - #QURT_MEM_MAPPING_INVALID @tablebulletend + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_mapping(qurt_mem_region_attr_t *attr, qurt_mem_mapping_t mapping){ + attr->mapping_type = mapping; +} + +/**@ingroup func_qurt_mem_region_attr_get_mapping + Gets the memory mapping from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_mapping_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] mapping Pointer to the destination variable for memory mapping. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_mapping(qurt_mem_region_attr_t *attr, qurt_mem_mapping_t *mapping){ + (*mapping) = attr->mapping_type; +} + +/**@ingroup func_qurt_mem_region_attr_set_cache_mode + Sets the cache operation mode in the specified memory region attribute structure. + + @cond rest_dist For more information on the cache, see @xhyperref{80VB41992,80-VB419-92}.@endcond + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_cache_mode_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] mode Cache mode. Values: \n + - #QURT_MEM_CACHE_WRITEBACK \n + - #QURT_MEM_CACHE_WRITETHROUGH\n + - #QURT_MEM_CACHE_WRITEBACK_NONL2CACHEABLE\n + - #QURT_MEM_CACHE_WRITETHROUGH_NONL2CACHEABLE\n + - #QURT_MEM_CACHE_WRITEBACK_L2CACHEABLE\n + - #QURT_MEM_CACHE_WRITETHROUGH_L2CACHEABLE\n + - #QURT_MEM_CACHE_NONE @tablebulletend + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_cache_mode(qurt_mem_region_attr_t *attr, qurt_mem_cache_mode_t mode){ + QURT_PGATTR_C_SET(attr->pga, (unsigned)mode); +} + +/**@ingroup func_qurt_mem_region_attr_get_cache_mode + Gets the cache operation mode from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_cache_mode_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] mode Pointer to the destination variable for cache mode. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_cache_mode(qurt_mem_region_attr_t *attr, qurt_mem_cache_mode_t *mode){ + unsigned int mode_temp = QURT_PGATTR_C_GET(attr->pga); + (*mode) = (qurt_mem_cache_mode_t)mode_temp; +} + +/**@ingroup func_qurt_mem_region_attr_set_bus_attr + Sets the (A1, A0) bus attribute bits in the specified memory region attribute structure. + + @cond rest_dist For more information on the bus attribute bits, see the @xhyperref{80VB41992,80-VB419-92}. @endcond + + @datatypes + #qurt_mem_region_attr_t + + @param[in,out] attr Pointer to the memory region attribute structure. 
+ @param[in] abits The (A1, A0) bits to use with the memory region, expressed as a 2-bit binary number. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_bus_attr(qurt_mem_region_attr_t *attr, unsigned abits){ + QURT_PGATTR_A_SET(attr->pga, abits); +} + +/**@ingroup func_qurt_mem_region_attr_get_bus_attr + Gets the (A1, A0) bus attribute bits from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] pbits Pointer to an unsigned integer that is filled in with + the (A1, A0) bits from the memory region attribute structure, expressed as a 2-bit binary number. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_bus_attr(qurt_mem_region_attr_t *attr, unsigned *pbits){ + (*pbits) = QURT_PGATTR_A_GET(attr->pga); +} + +void qurt_mem_region_attr_set_owner(qurt_mem_region_attr_t *attr, int handle); +void qurt_mem_region_attr_get_owner(qurt_mem_region_attr_t *attr, int *p_handle); +void qurt_mem_region_attr_set_perms(qurt_mem_region_attr_t *attr, unsigned perms); +void qurt_mem_region_attr_get_perms(qurt_mem_region_attr_t *attr, unsigned *p_perms); + +/**@ingroup func_qurt_mem_map_static_query + Determines whether a memory page is statically mapped. + Pages are specified by the following attributes: physical address, page size, cache mode, + and memory permissions. \n + - If the specified page is statically mapped, vaddr returns the virtual + address of the page. \n + - If the page is not statically mapped (or if it does not exist as specified), vaddr + returns -1 as the virtual address value.\n + The system configuration file defines QuRT memory maps. + + @datatypes + #qurt_addr_t \n + #qurt_mem_cache_mode_t \n + #qurt_perm_t + + @param[out] vaddr Virtual address corresponding to paddr. + @param[in] paddr Physical address. + @param[in] page_size Size of the mapped memory page. + @param[in] cache_attribs Cache mode (writeback, and so on). + @param[in] perm Access permissions. + + @return + #QURT_EOK -- Specified page is statically mapped, vaddr returns the virtual address. \n + #QURT_EMEM -- Specified page is not statically mapped, vaddr returns -1. \n + #QURT_EVAL -- Specified page does not exist. + + @dependencies + None. + */ +int qurt_mem_map_static_query(qurt_addr_t *vaddr, qurt_addr_t paddr, unsigned int page_size, qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perm); + + +/**@ingroup func_qurt_mem_region_query + Queries a memory region. \n + This function determines whether a dynamically-created memory region (Section @xref{sec:mem_region_create}) exists for the + specified virtual or physical address. + When a memory region has been determined to exist, its attributes are + accessible (Section @xref{sec:mem_region_attr_get}). + + @note1hang This function returns #QURT_EFATAL if #QURT_EINVALID is passed to both + vaddr and paddr (or to neither). + + @datatypes + #qurt_mem_region_t \n + #qurt_paddr_t + + @param[out] region_handle Pointer to the memory region object (if it exists). + @param[in] vaddr Virtual address to query; if vaddr is specified, paddr must be set to + the value #QURT_EINVALID. + @param[in] paddr Physical address to query; if paddr is specified, vaddr must be set to + the value #QURT_EINVALID. + + @return + #QURT_EOK -- Query successfully performed. \n + #QURT_EMEM -- Region not found for the specified address. \n + #QURT_EFATAL -- Invalid input parameters. 
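+
+   A minimal usage sketch (illustrative only; some_vaddr stands for a virtual
+   address obtained elsewhere). Query by virtual address, passing
+   #QURT_EINVALID for the unused physical address:
+   @code
+   qurt_mem_region_t region;
+   qurt_mem_region_attr_t rattr;
+   if (qurt_mem_region_query(&region, some_vaddr, QURT_EINVALID) == QURT_EOK) {
+       (void)qurt_mem_region_attr_get(region, &rattr);
+   }
+   @endcode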
+
+   @dependencies
+   None.
+ */
+int qurt_mem_region_query(qurt_mem_region_t *region_handle, qurt_addr_t vaddr, qurt_paddr_t paddr);
+
+
+/**@ingroup func_qurt_mapping_create
+   @xreflabel{hdr:qurt_mapping_create}
+   Creates a memory mapping in the page table.
+   Not supported if called from a user process; it always returns #QURT_EMEM in that case.
+
+   @datatypes
+   #qurt_addr_t \n
+   #qurt_size_t \n
+   #qurt_mem_cache_mode_t \n
+   #qurt_perm_t
+
+   @param[in] vaddr           Virtual address.
+   @param[in] paddr           Physical address.
+   @param[in] size            Size (4K-aligned) of the mapped memory page.
+   @param[in] cache_attribs   Cache mode (writeback, and so on).
+   @param[in] perm            Access permissions.
+
+   @return
+   #QURT_EOK -- Mapping created. \n
+   #QURT_EMEM -- Failed to create mapping. \n
+   #QURT_EINVALID -- Invalid cache attributes / permissions provided.
+
+   @dependencies
+   None.
+*/
+int qurt_mapping_create(qurt_addr_t vaddr, qurt_addr_t paddr, qurt_size_t size,
+                        qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perm);
+
+/**@ingroup func_qurt_mapping_remove
+   @xreflabel{hdr:qurt_mapping_remove}
+   Deletes the specified memory mapping from the page table.
+
+   @datatypes
+   #qurt_addr_t \n
+   #qurt_size_t
+
+   @param[in] vaddr   Virtual address.
+   @param[in] paddr   Physical address.
+   @param[in] size    Size of the mapped memory page (4K-aligned).
+
+   @return
+   #QURT_EOK -- Mapping removed. \n
+   #QURT_ELOCKED -- Buffer is locked. Mapping delete failed.
+
+   @dependencies
+   None.
+
+ */
+int qurt_mapping_remove(qurt_addr_t vaddr, qurt_addr_t paddr, qurt_size_t size);
+
+/**@ingroup func_qurt_lookup_physaddr
+   Translates a virtual memory address to the physical memory address to which it maps. \n
+   The lookup occurs in the process of the caller. Use qurt_lookup_physaddr2() to look up the
+   physical address of another process.
+
+
+   @datatypes
+   #qurt_addr_t \n
+   #qurt_paddr_t
+
+   @param[in] vaddr   Virtual address.
+
+   @return
+   Nonzero -- Physical address to which the virtual address is mapped.\n
+   0 -- Virtual address not mapped.
+
+   @dependencies
+   None.
+*/
+qurt_paddr_t qurt_lookup_physaddr (qurt_addr_t vaddr);
+
+/**@ingroup func_qurt_mem_region_attr_set_physaddr_64
+   Sets the memory region 64-bit physical address in the specified memory attribute structure.
+
+   @note1hang The physical address attribute is explicitly set only for memory regions with
+              physical contiguous mapping. Otherwise it is automatically set by
+              QuRT when the memory region is created.
+
+   @datatypes
+   #qurt_mem_region_attr_t \n
+   #qurt_paddr_64_t
+
+   @param[in,out] attr      Pointer to the memory region attribute structure.
+   @param[in]     addr_64   Memory region 64-bit physical address.
+
+   @return
+   None.
+ */
+static inline void qurt_mem_region_attr_set_physaddr_64(qurt_mem_region_attr_t *attr, qurt_paddr_64_t addr_64){
+    attr->ppn = (unsigned)(((unsigned long long)(addr_64))>>12);
+}
+
+/**@ingroup func_qurt_mem_region_attr_get_physaddr_64
+   Gets the memory region 64-bit physical address from the specified memory region attribute structure.
+
+   @datatypes
+   #qurt_mem_region_attr_t \n
+   #qurt_paddr_64_t
+
+   @param[in]  attr      Pointer to the memory region attribute structure.
+   @param[out] addr_64   Pointer to the destination variable for the memory region 64-bit physical address.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+static inline void qurt_mem_region_attr_get_physaddr_64(qurt_mem_region_attr_t *attr, qurt_paddr_64_t *addr_64){
+    (*addr_64) = (unsigned long long)(((unsigned long long)(attr->ppn))<<12);
+}
+
+/**@ingroup func_qurt_mem_map_static_query_64
+   Determines whether a memory page is statically mapped.
+   The following attributes specify pages: 64-bit physical address, page size, cache mode,
+   and memory permissions. \n
+   If the specified page is statically mapped, vaddr returns the virtual
+   address of the page.
+   If the page is not statically mapped (or if it does not exist as specified), vaddr
+   returns -1 as the virtual address value.\n
+   QuRT memory maps are defined in the system configuration file.
+
+   @datatypes
+   #qurt_addr_t \n
+   #qurt_paddr_64_t \n
+   #qurt_mem_cache_mode_t \n
+   #qurt_perm_t
+
+   @param[out] vaddr           Virtual address corresponding to paddr.
+   @param[in]  paddr_64        64-bit physical address.
+   @param[in]  page_size       Size of the mapped memory page.
+   @param[in]  cache_attribs   Cache mode (writeback, and so on).
+   @param[in]  perm            Access permissions.
+
+   @return
+   #QURT_EOK -- Specified page is statically mapped; a virtual address is returned in vaddr. \n
+   #QURT_EMEM -- Specified page is not statically mapped; -1 is returned in vaddr. \n
+   #QURT_EVAL -- Specified page does not exist.
+
+   @dependencies
+   None.
+ */
+int qurt_mem_map_static_query_64(qurt_addr_t *vaddr, qurt_paddr_64_t paddr_64, unsigned int page_size, qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perm);
+
+/**@ingroup func_qurt_mem_region_query_64
+   Determines whether a dynamically created memory region (Section @xref{sec:mem_region_create}) exists for the
+   specified virtual or physical address. When a memory region has been determined to exist, its attributes are
+   accessible (Section @xref{sec:mem_region_attr_get}).
+
+   @note1hang This function returns #QURT_EFATAL if #QURT_EINVALID is passed to both
+              vaddr and paddr (or to neither).
+
+   @datatypes
+   #qurt_mem_region_t \n
+   #qurt_addr_t \n
+   #qurt_paddr_64_t
+
+   @param[out] region_handle   Pointer to the memory region object (if it exists).
+   @param[in]  vaddr           Virtual address to query; if vaddr is specified, paddr must be set to
+                               the value #QURT_EINVALID.
+   @param[in]  paddr_64        64-bit physical address to query; if paddr is specified, vaddr must be set to
+                               the value #QURT_EINVALID.
+
+   @return
+   #QURT_EOK -- Success. \n
+   #QURT_EMEM -- Region not found for the specified address. \n
+   #QURT_EFATAL -- Invalid input parameters.
+
+   @dependencies
+   None.
+ */
+int qurt_mem_region_query_64(qurt_mem_region_t *region_handle, qurt_addr_t vaddr, qurt_paddr_64_t paddr_64);
+
+/**@ingroup func_qurt_mapping_create_64
+   @xreflabel{hdr:qurt_mapping_create_64}
+   Creates a memory mapping in the page table.
+   Not supported if called from a user process; it always returns #QURT_EMEM in that case.
+
+   @datatypes
+   #qurt_addr_t \n
+   #qurt_paddr_64_t \n
+   #qurt_size_t \n
+   #qurt_mem_cache_mode_t \n
+   #qurt_perm_t
+
+   @param[in] vaddr           Virtual address.
+   @param[in] paddr_64        64-bit physical address.
+   @param[in] size            Size (4K-aligned) of the mapped memory page.
+   @param[in] cache_attribs   Cache mode (writeback, and so on).
+   @param[in] perm            Access permissions.
+
+   @return
+   #QURT_EOK -- Success. \n
+   #QURT_EMEM -- Failure. \n
+   #QURT_EINVALID -- Invalid cache attributes / permissions provided.
+
+   @dependencies
+   None.
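+
+   A minimal usage sketch (illustrative only; the addresses and size are
+   hypothetical, the #QURT_PERM_READ / #QURT_PERM_WRITE macros are assumed
+   to come from the QuRT types header, and the call must run in a context
+   where mapping creation is supported):
+   @code
+   if (qurt_mapping_create_64(0x40000000u,       // vaddr
+                              0x180000000ull,    // 64-bit paddr
+                              0x1000u,           // one 4K page
+                              QURT_MEM_CACHE_WRITEBACK,
+                              QURT_PERM_READ | QURT_PERM_WRITE) == QURT_EOK) {
+       (void)qurt_mapping_remove_64(0x40000000u, 0x180000000ull, 0x1000u);
+   }
+   @endcode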
+*/ +int qurt_mapping_create_64(qurt_addr_t vaddr, qurt_paddr_64_t paddr_64, qurt_size_t size, + qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perm); + +/**@ingroup func_qurt_mapping_remove_64 + @xreflabel{hdr:qurt_mapping_remove_64} + Deletes the specified memory mapping from the page table. + + @datatypes + #qurt_addr_t \n + #qurt_paddr_64_t \n + #qurt_size_t + + @param[in] vaddr Virtual address. + @param[in] paddr_64 64-bit physical address. + @param[in] size Size of the mapped memory page (4K-aligned). + + @return + #QURT_EOK -- Success. + #QURT_ELOCKED -- Buffer is locked. Mapping delete failed. + + @dependencies + None. + + */ +int qurt_mapping_remove_64(qurt_addr_t vaddr, qurt_paddr_64_t paddr_64, qurt_size_t size); + +/**@ingroup func_qurt_lookup_physaddr_64 + Translates a virtual memory address to the 64-bit physical memory address it is mapped to. \n + The lookup occurs in the process of the caller. Use qurt_lookup_physaddr2() to lookup the physical + address of another process. + + @datatypes + #qurt_paddr_64_t \n + #qurt_addr_t + + @param[in] vaddr Virtual address. + + @return + Nonzero -- 64-bit physical address to which the virtual address is mapped. \n + 0 -- Virtual address has not been mapped. + + @dependencies + None. +*/ +qurt_paddr_64_t qurt_lookup_physaddr_64 (qurt_addr_t vaddr); +/** @endcond */ + +/** @cond internal_only */ +/**@ingroup func_qurt_mapping_reclaim + Deallocates all QuRT resources associated with the specified virtual + memory area, making it available for user memory management:\n + - The associated physical memory areas are freed and added to the + specified physical pool.\n + - The associated TLB entries are deleted and made available for TLB + management.\n + - The virtual memory area is not freed -- it is left in + place as allocated, but unmapped virtual memory. Access to this + memory area generates an exception.\n + + The virtual memory area must be statically allocated. + If no pool is specified, the freed physical memory is not added to any pool. + + @note1hang The virtual memory area is restricted to being filled with locked + TLB entries that are contiguous within the memory area, and contained by it. + + @datatypes + #qurt_addr_t \n + #qurt_size_t \n + #qurt_mem_pool_t + + @param[in] vaddr Virtual address of the memory area to free. + @param[in] vsize Size (in bytes) of the memory area to free. + @param[in] pool Handle to the physical pool where freed physical memory is added. + If set to 0, freed physical memory is not added to any pool. + + @return + 0 -- Success. \n + Nonzero -- Failure that indicates a partial success, or that the request was malformed. \n @note1hang The expected behavior is that + QuRT logs messages related to the failure, and callers are free to ignore the return value. + + @dependencies + None. +*/ +int qurt_mapping_reclaim(qurt_addr_t vaddr, qurt_size_t vsize, qurt_mem_pool_t pool); +/** @endcond */ +/** @cond rest_reg_dist */ +/**@ingroup func_qurt_mem_configure_cache_partition + Configures the Hexagon cache partition at the system level. + + A partition size value of #SEVEN_EIGHTHS_SIZE is applicable only to the L2 cache. + + The L1 cache partition is not supported in Hexagon processor version V60 or greater. + + @note1hang Call this operation only with QuRT OS privilege. + + @datatypes + #qurt_cache_type_t \n + #qurt_cache_partition_size_t + + @param[in] cache_type Cache type for partition configuration. 
Values: \n
+                            - #HEXAGON_L1_I_CACHE \n
+                            - #HEXAGON_L1_D_CACHE \n
+                            - #HEXAGON_L2_CACHE @tablebulletend
+
+   @param[in] partition_size   Cache partition size. Values: \n
+                               - #FULL_SIZE \n
+                               - #HALF_SIZE \n
+                               - #THREE_QUARTER_SIZE \n
+                               - #SEVEN_EIGHTHS_SIZE @tablebulletend
+
+   @return
+   #QURT_EOK -- Success. \n
+   #QURT_EVAL -- Error.
+
+   @dependencies
+   None.
+ */
+int qurt_mem_configure_cache_partition(qurt_cache_type_t cache_type, qurt_cache_partition_size_t partition_size);
+
+
+/**@ingroup func_qurt_mem_syncht
+   @xreflabel{hdr:qurt_mem_syncht}
+   Performs heavy-weight synchronization of memory transactions.
+
+   This operation does not return until all previous memory transactions (cached and uncached load/store,
+   mem_locked, and so on) that originated from the current thread are complete and globally observable.
+
+   @note1hang This operation is implemented as a wrapper for the Hexagon syncht instruction.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+static inline void qurt_mem_syncht(void){
+    #ifdef __HEXAGON_ARCH__
+    __asm__ __volatile__ (" SYNCHT \n");
+    #endif
+}
+
+/**@ingroup func_qurt_mem_barrier
+   @xreflabel{hdr:qurt_mem_barrier}
+   Creates a barrier for memory transactions.
+
+   This operation ensures that all previous memory transactions are globally observable before any
+   future memory transactions are globally observable.
+
+   @note1hang This operation is implemented as a wrapper for the Hexagon barrier instruction.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+static inline void qurt_mem_barrier(void){
+    #ifdef __HEXAGON_ARCH__
+    __asm__ __volatile__ (" BARRIER \n");
+    #endif
+}
+/** @endcond */
+
+/** @cond internal_only */
+/**@ingroup func_qurt_system_mem_alloc
+   Requests that the kernel allocate memory from the kernel-owned pool.
+
+   @param[in] size    Size in bytes (aligned to 4K) to allocate.
+   @param[in] align   Any alignment that must be considered for the allocation.
+   @param[in] flags   Supports the #QURT_SYSTEM_ALLOC_VIRTUAL flag; allocates
+                      available virtual memory in the address space of all processes.
+
+   @return
+   #QURT_EFATAL -- Allocation failed. \n
+   Start address of the successful allocation.
+
+   @dependencies
+   None.
+*/
+unsigned qurt_system_mem_alloc(unsigned size, unsigned align, unsigned flags);
+/** @endcond */
+/** @cond rest_reg_dist*/
+/**@ingroup func_qurt_lookup_physaddr2
+   Translates the virtual memory address of the specified process to the 64-bit
+   physical memory address to which it is mapped.
+
+   @datatypes
+   #qurt_addr_t \n
+   #qurt_paddr_64_t
+
+   @param[in] vaddr   Virtual address.
+   @param[in] pid     PID.
+
+   @return
+   Nonzero -- 64-bit physical address to which the virtual address is mapped. \n
+   0 -- Virtual address is not mapped.
+
+   @dependencies
+   None.
+*/
+qurt_paddr_64_t qurt_lookup_physaddr2(qurt_addr_t vaddr, unsigned int pid);
+/** @endcond */
+
+/**@ingroup func_qurt_mapping_attr_get
+   Gets the mapping attributes for a given virtual address and PID.
+
+   @datatypes
+   #qurt_addr_t \n
+   #qurt_mapping_attr_t
+
+   @param[in]  vaddr   Virtual address for which the attributes are required.
+   @param[in]  pid     Process ID of the target process.
+   @param[out] attr    Pointer to the mapping attribute structure.
+
+   @return
+   0 -- Success. \n
+   #QURT_EINVALID -- Incorrect virtual address or PID.
+*/
+int qurt_mapping_attr_get(qurt_addr_t vaddr, unsigned int pid, qurt_mapping_attr_t *attr);
+
+
+/**@ingroup func_qurt_mapping_attr_get_cache_mode
+   Gets the cache operation mode in the specified memory mapping attribute structure.
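+
+   A minimal usage sketch (illustrative only; some_vaddr and the PID value 0
+   are hypothetical):
+   @code
+   qurt_mapping_attr_t mattr;
+   qurt_mem_cache_mode_t cmode;
+   if (qurt_mapping_attr_get(some_vaddr, 0U, &mattr) == 0) {
+       qurt_mapping_attr_get_cache_mode(&mattr, &cmode);
+   }
+   @endcode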
+
+
+   @datatypes
+   #qurt_mapping_attr_t \n
+   #qurt_mem_cache_mode_t
+
+   @param[in]  attr         Pointer to the memory mapping attribute structure.
+   @param[out] cache_mode   Pointer to the destination variable for the cache mode.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+static inline void qurt_mapping_attr_get_cache_mode(qurt_mapping_attr_t *attr, qurt_mem_cache_mode_t *cache_mode)
+{
+    (*cache_mode) = attr->cache_mode;
+}
+
+/**@ingroup func_qurt_mapping_attr_get_physaddr
+   Gets the physical memory address in the specified memory mapping attribute structure.
+
+
+   @datatypes
+   #qurt_mapping_attr_t \n
+   #qurt_paddr_64_t
+
+   @param[in]  attr       Pointer to the memory mapping attribute structure.
+   @param[out] physaddr   Pointer to the destination variable for the physical address.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+static inline void qurt_mapping_attr_get_physaddr(qurt_mapping_attr_t *attr, qurt_paddr_64_t *physaddr)
+{
+    (*physaddr) = attr->paddr;
+}
+
+/**@ingroup func_qurt_mapping_attr_get_perms
+   Gets the permissions in the specified memory mapping attribute structure.
+
+
+   @datatypes
+   #qurt_mapping_attr_t \n
+   #qurt_perm_t
+
+   @param[in]  attr    Pointer to the memory mapping attribute structure.
+   @param[out] perms   Pointer to the destination variable for the permissions.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+static inline void qurt_mapping_attr_get_perms(qurt_mapping_attr_t *attr, qurt_perm_t *perms)
+{
+    (*perms) = attr->perms;
+}
+
+/**@ingroup func_qurt_mapping_attr_get_size
+   Gets the size in the specified memory mapping attribute structure. This represents the size of the
+   TLB entry that covers the virtual address.
+
+
+   @datatypes
+   #qurt_mapping_attr_t \n
+   #unsigned int
+
+   @param[in]  attr   Pointer to the memory mapping attribute structure.
+   @param[out] size   Pointer to the destination variable for the size.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline void qurt_mapping_attr_get_size(qurt_mapping_attr_t *attr, unsigned int *size)
+{
+    (*size) = attr->size;
+}
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_MEMORY_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_mmap.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_mmap.h
new file mode 100755
index 0000000000000..c3bd875910af7
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_mmap.h
@@ -0,0 +1,359 @@
+#ifndef QURT_MMAP_H
+#define QURT_MMAP_H
+/**
+   @file qurt_mmap.h
+   @brief Prototypes of memory mapping/unmapping APIs.
+          The APIs allow the user to map, unmap, and change permissions
+          on memory regions.
+
+   EXTERNAL FUNCTIONS
+   None.
+
+   INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+   Copyright (c) 2018-2021, 2022, 2023 Qualcomm Technologies, Inc.
+   All Rights Reserved.
+   Confidential and Proprietary - Qualcomm Technologies, Inc.
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup func_qurt_mem_mmap
+   Creates a memory mapping with the specified attributes.
+   This API allows the root process caller to create a mapping on behalf of a user
+   process. If the client_handle belongs to a valid user process, the resulting
+   mapping is created for that process.
+   If -1 is passed in place of client_handle, the API creates the mapping
+   for the underlying process of the caller.
+
+   @note1hang If the specified attributes are not valid, an error result is returned.
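+
+   A minimal usage sketch (illustrative only; the anonymous shared 4 KB
+   allocation from the default pool is an assumed flag/prot combination):
+   @code
+   void *va = qurt_mem_mmap(-1,                      // caller's own process
+                            qurt_mem_default_pool,   // allocate from default physpool
+                            NULL, NULL, 0x1000u,
+                            QURT_PROT_READ | QURT_PROT_WRITE,
+                            QURT_MAP_SHARED | QURT_MAP_ANON,
+                            -1, 0ULL);
+   if (va == QURT_MAP_FAILED) {
+       // Mapping creation failed.
+   }
+   @endcode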
+
+   @param[in] client_handle   Client handle to use for this mapping (optional).
+   @param[in] pool            Optional argument that specifies a pool handle
+                              if the user wants to allocate memory from a specific pool.
+                              The default value for this argument is NULL.
+   @param[in] pRegion         Map region. This argument is unused, and the default value is NULL.
+   @param[in] addr            Virtual memory address.
+   @param[in] length          Size of the mapping in bytes.
+   @param[in] prot            Mapping access permissions (R/W/X).
+   @param[in] flags           Mapping modes.\n
+                              - #QURT_MAP_NAMED_MEMSECTION \n
+                              - #QURT_MAP_FIXED \n
+                              - #QURT_MAP_NONPROCESS_VPOOL \n
+                              - #QURT_MAP_TRYFIXED \n
+                              - #QURT_MAP_ANON \n
+                              - #QURT_MAP_PHYSADDR \n
+                              - #QURT_MAP_VA_ONLY @tablebulletend
+   @param[in] fd              File designator.
+   @param[in] offset          Offset in the file.
+
+   @return
+   Valid virtual address -- Success.\n
+   #QURT_MAP_FAILED -- Mapping creation failed.
+ */
+void *qurt_mem_mmap(int client_handle,
+                    qurt_mem_pool_t pool,
+                    qurt_mem_region_t *pRegion,
+                    void *addr,
+                    size_t length,
+                    int prot,
+                    int flags,
+                    int fd,
+                    unsigned long long offset);
+
+/**@ingroup func_qurt_mem_mmap2
+   Creates a memory mapping with the specified attributes. Returns a more descriptive
+   error code in case of failure.
+   This API allows the root process caller to create a mapping on behalf of a user
+   process. If the client_handle belongs to a valid user process, the resulting
+   mapping is created for that process.
+   If -1 is passed in place of client_handle, the API creates the mapping
+   for the underlying process of the caller.
+
+   @note1hang If the specified attributes are not valid, an error result is returned.
+
+   @param[in] client_handle   Client handle to use for this mapping (optional).
+   @param[in] pool            Optional argument that allows the user to specify a pool handle
+                              when the user wants to allocate memory from a specific pool.
+                              The default value for this argument is NULL.
+   @param[in] pRegion         Map region (unused argument); the default value is NULL.
+   @param[in] addr            Virtual memory address.
+   @param[in] length          Size of the mapping in bytes.
+   @param[in] prot            Mapping access permissions (R/W/X),
+                              cache attributes, bus attributes, user mode.
+   @param[in] flags           Mapping modes:
+                              Shared, Private, or Anonymous.
+   @param[in] fd              File designator.
+   @param[in] offset          Offset in the file.
+
+   @return
+   Valid virtual address -- Success.\n
+   #QURT_EMEM -- Physical address is not available. \n
+   #QURT_EFAILED -- VA is not available or mapping failed.\n
+   #QURT_EINVALID -- Invalid argument was passed (for example, an unaligned VA/PA).
+ */
+void *qurt_mem_mmap2(int client_handle,
+                     qurt_mem_pool_t pool,
+                     qurt_mem_region_t *pRegion,
+                     void *addr,
+                     size_t length,
+                     int prot,
+                     int flags,
+                     int fd,
+                     unsigned long long offset);
+
+/**@ingroup func_qurt_mem_mmap_by_name
+   Creates a memory mapping for a named memsection using the specified attributes.
+   The named memsection should be specified in cust_config.xml.
+
+   @note1hang If the specified attributes are not valid or the named memsection is not found,
+              an error result is returned.
+
+   @param[in] name     Name of the memsection in cust_config.xml that specifies
+                       this mapping. Should be less than 25 characters.
+   @param[in] addr     Virtual memory address.
+   @param[in] length   Size of the mapping in bytes.
+   @param[in] prot     Mapping access permissions (R/W/X),
+                       cache attributes, bus attributes, user mode.
+   @param[in] flags    Mapping modes, such as
+                       Shared, Private, or Anonymous.
+   @param[in] offset   Offset relative to the physical address range specified in the memsection.
+                       If offset + length exceeds the size of the memsection, failure is
+                       returned.
+   @return
+   Valid virtual address -- Success.\n
+   #QURT_MAP_FAILED -- Mapping creation failed.
+ */
+void *qurt_mem_mmap_by_name(const char* name,
+                            void *addr,
+                            size_t length,
+                            int prot,
+                            int flags,
+                            unsigned long long offset);
+
+/**@ingroup func_qurt_mem_mprotect2
+   Changes access permissions and attributes on an existing mapping based on the client_handle argument.
+
+   @note1hang If the specified virtual address is not found or invalid attributes are passed,
+              an error code is returned.
+
+   @note2 When an error is returned, the attributes/permissions might have changed for part of the
+          mapping while remaining unchanged for the rest. Clients should not use such mappings further.
+
+   @param[in] client_handle   Obtained from the current invocation function (Section 3.4.1).
+   @param[in] addr            Virtual memory address.
+   @param[in] length          Size of the mapping in bytes.
+   @param[in] prot            Mapping access permissions (R/W/X),
+                              cache attributes, bus attributes, user mode.
+   @return
+   #QURT_EOK -- Successfully changes permissions on the mapping.\n
+   #QURT_EFATAL -- Failed to change permissions on the mapping. \n
+   #QURT_EINVALID -- Attributes / permissions requested are invalid.
+ */
+int qurt_mem_mprotect2(int client_handle, const void *addr,
+                       size_t length,
+                       int prot);
+
+/**@ingroup func_qurt_mem_mprotect
+   Changes access permissions and attributes on an existing mapping.
+
+   @note1hang If the specified virtual address is not found or invalid attributes are passed,
+              an error code is returned.\n
+
+   @note2 When an error is returned, the attributes/permissions might have changed for part of the
+          mapping while remaining unchanged for the rest. Clients should not use such mappings further.
+
+   @param[in] addr     Virtual memory address.
+   @param[in] length   Size of the mapping in bytes.
+   @param[in] prot     Mapping access permissions (R/W/X),
+                       cache attributes, bus attributes, user mode.
+   @return
+   #QURT_EOK -- Successfully changes permissions on the mapping. \n
+   #QURT_EFATAL -- Failed to change permissions on the mapping. \n
+   #QURT_EINVALID -- Attributes / permissions requested are invalid.
+ */
+int qurt_mem_mprotect(const void *addr,
+                      size_t length,
+                      int prot);
+
+/**@ingroup func_qurt_mem_munmap
+   Removes an existing mapping.
+
+   @note1hang If the specified mapping is not found in the context of the caller process
+              or invalid attributes are passed, an error code is returned.
+
+   @param[in] addr     Virtual memory address.
+   @param[in] length   Size of the mapping in bytes.
+
+   @return
+   #QURT_EOK -- Successfully removed the mapping. \n
+   #QURT_EFATAL -- Failed to remove the mapping. \n
+   #QURT_ELOCKED -- Buffer is locked. Mapping delete failed.
+ */
+int qurt_mem_munmap(void *addr,
+                    size_t length);
+
+/**@ingroup func_qurt_mem_munmap2
+   Removes an existing mapping for a specified process.
+
+   @note1hang This API allows a root process entity, such as a driver, to remove a mapping
+              that was created for a user process. If the specified mapping is not found in the context
+              of the client handle or invalid attributes are passed, an error code is returned.
+
+   @param[in] client_handle   Client handle of the user process that owns this mapping.
+   @param[in] addr            Virtual memory address.
+   @param[in] length          Size of the mapping in bytes.
+
+   @return
+   #QURT_EOK -- Successfully removed the mapping. \n
+   #QURT_EFATAL -- Failed to remove the mapping. \n
+
+/**@ingroup func_qurt_mem_munmap2
+   Removes an existing mapping for a specified process.
+
+   @note1hang This API allows a root process entity, such as a driver, to remove a mapping
+   that was created for a user process. If the specified mapping is not found in the context
+   of the client handle or invalid attributes are passed, an error code is returned.
+
+   @param[in]  client_handle  Client handle of the user process that owns this mapping.
+   @param[in]  addr      Virtual memory address.
+   @param[in]  length    Size of mapping in bytes.
+
+   @return
+   #QURT_EOK -- Successfully removed the mapping. \n
+   #QURT_EFATAL -- Failed to remove the mapping. \n
+   #QURT_ELOCKED -- Buffer is locked; mapping delete failed.
+ */
+int qurt_mem_munmap2(int client_handle,
+                     void *addr,
+                     size_t length);
+
+/**@ingroup func_qurt_mem_munmap3
+   Removes an existing mapping or reservation for a specified process.
+
+   @param[in]  client_handle  Client handle of the user process that owns this mapping.
+   @param[in]  addr      Pointer to a virtual memory address.
+   @param[in]  length    Size of mapping in bytes.
+   @param[in]  flags     Specifies the flags.
+
+   @return
+   #QURT_EOK -- Successfully removed the mapping or reservation. \n
+   #QURT_EFATAL -- Failed to remove the mapping. \n
+   #QURT_ELOCKED -- Buffer is locked; mapping delete failed.
+ */
+int qurt_mem_munmap3(int client_handle,
+                     void *addr,
+                     size_t length,
+                     int flags);
+
+/*
+|| The macros here follow the style of the standard mmap() macros, but with
+|| QURT_ prepended to avoid name conflicts, and to avoid having a dependency
+|| on sys/mman.h.
+||
+|| Wherever possible, any values here that are also present in sys/mman.h
+|| should have the same value in both places so that we can accept "mmap"
+|| calls without having to remap parameters to new values.
+||
+|| In the future, it would be desirable to have a regression test that
+|| checks, for instance, that these macros match. Example:
+||
+||    assert(QURT_MAP_FAILED == MAP_FAILED);
+||    ... repeat as needed ...
+*/
+
+/** @addtogroup memory_mapping_macros
+@{ */
+/** @cond */
+#define QURT_PROT_NONE       0x00U   /**< */
+#define QURT_PROT_READ       0x01U   /**< */
+#define QURT_PROT_WRITE      0x02U   /**< */
+#define QURT_PROT_EXEC       0x04U   /**< */
+#define QURT_PROT_NODUMP     0x08U   /**< Skip dumping the mapping. During PD dump, some mappings
+                                          on host memory must be skipped to avoid a race condition
+                                          where the memory is removed from the host and the DSP process
+                                          crashes before the mapping is removed. */
+#define QURT_PROT_ISLAND     0x10U   /**< Island mapping. */
+
+#define QURT_MAP_SHARED      0x0001U /**< Shared. */
+#define QURT_MAP_PRIVATE     0x0002U /**< Private. */
+/** @endcond */
+#define QURT_MAP_NAMED_MEMSECTION 0x0004U /**< Named memsection. */
+#define QURT_MAP_FIXED       0x0010U /**< Fixed virtual address. */
+#define QURT_MAP_RENAME      0x0020U /**< Rename. */
+#define QURT_MAP_NORESERVE   0x0040U /**< No reserve. */
+#define QURT_MAP_INHERIT     0x0080U /**< Inherit. */
+#define QURT_MAP_NONPROCESS_VPOOL 0x0100U /**< Use a virtual address outside of the default range of the
+                                          processes. This option is only supported in the root process,
+                                          and only when the virtual memory split is enabled in the XML.
+                                          The root process can use this flag to create a mapping for a
+                                          user process. For example, if the virtual address space is
+                                          configured for a 3G/1G split, the root process can use this flag
+                                          to create a mapping in the top 1 GB area for the user process or
+                                          the lower 3 GB area for the root process. This is useful for
+                                          shared buffer use cases. */
+#define QURT_MAP_HASSEMAPHORE 0x0200U /**< Has semaphore. */
+#define QURT_MAP_TRYFIXED    0x0400U /**< Try to create a mapping at the virtual address that was passed.
+                                          If the passed virtual address fails, use a random virtual address. */
+#define QURT_MAP_WIRED       0x0800U /**< Wired. */
+#define QURT_MAP_FILE        0x0000U /**< File. */
+#define QURT_MAP_ANON        0x1000U /**< Allocate physical memory from the pool that was passed.
+                                          By default, memory is allocated from the default physpool. */
+#define QURT_MAP_VA_ONLY     0x2000U /**< Reserve a virtual address without
+                                          mapping it.
*/
+
+/** @cond */
+#define QURT_MAP_ALIGNED(n)      ((n) << QURT_MAP_ALIGNMENT_SHIFT)
+#define QURT_MAP_ALIGNMENT_SHIFT 24
+
+
+#define QURT_MAP_ALIGNMENT_MASK  QURT_MAP_ALIGNED(0xff) /**< */
+#define QURT_MAP_ALIGNMENT_64KB  QURT_MAP_ALIGNED(16)   /**< */
+#define QURT_MAP_ALIGNMENT_16MB  QURT_MAP_ALIGNED(24)   /**< */
+#define QURT_MAP_ALIGNMENT_4GB   QURT_MAP_ALIGNED(32)   /**< */
+#define QURT_MAP_ALIGNMENT_1TB   QURT_MAP_ALIGNED(40)   /**< */
+#define QURT_MAP_ALIGNMENT_256TB QURT_MAP_ALIGNED(48)   /**< */
+#define QURT_MAP_ALIGNMENT_64PB  QURT_MAP_ALIGNED(56)   /**< */
+/** @endcond */
+#define QURT_MAP_FAILED ((void *) -1) /**< Mapping creation failed. */
+
+/*
+|| The macros below are extensions beyond the standard mmap flags, but follow
+|| the style of the mmap flags.
+*/
+/** @cond */
+// Describe bitfields in (prot)
+#define QURT_PROT_CACHE_BOUNDS 16U,19U,7U /**< Bits 16 through 19 are the cache attribute, default is 7. */
+#define QURT_PROT_BUS_BOUNDS   20U,21U,0U /**< Bits 20 through 21 are the bus attributes, default is 0. */
+#define QURT_PROT_USER_BOUNDS  22U,23U,3U /**< Bits 22 through 23 are the user mode, default is 3;
+                                               a default of 3 means to derive the user mode setting from the
+                                               default mode of the client. */
+
+// Describe bitfields in (flags)
+#define QURT_MAP_PHYSADDR_BOUNDS 15U,15U,0U /**< Bit 15 is physaddr, default is 0. */
+#define QURT_MAP_TYPE_BOUNDS     16U,19U,0U /**< Bits 16 through 19 are the mapping type, default is 0. */
+#define QURT_MAP_REGION_BOUNDS   20U,23U,0U /**< Bits 20 through 23 are the region type, default is 0. */
+/** @endcond */
+
+// These macros get OR'ed into (prot)
+#define QURT_PROT_CACHE_MODE(n) QURT_MMAP_BUILD(QURT_PROT_CACHE_BOUNDS,(n)) /**< */
+#define QURT_PROT_BUS_ATTR(n)   QURT_MMAP_BUILD(QURT_PROT_BUS_BOUNDS,(n))   /**< */
+#define QURT_PROT_USER_MODE(n)  QURT_MMAP_BUILD(QURT_PROT_USER_BOUNDS,(n))  /**< */
+// These macros get OR'ed into (flags)
+
+#define QURT_MAP_PHYSADDR   QURT_MMAP_BUILD(QURT_MAP_PHYSADDR_BOUNDS,1U) /**< Use the physical address that was passed in the offset field.
+                                               This is allowed only for the root process. */
+#define QURT_MAP_TYPE(n)    QURT_MMAP_BUILD(QURT_MAP_TYPE_BOUNDS,(n))   /**< */
+#define QURT_MAP_REGION(n)  QURT_MMAP_BUILD(QURT_MAP_REGION_BOUNDS,(n)) /**< */
+/** @} */ /* end_addtogroup memory_mapping_macros */
+/** @cond */
+// These macros extract fields from (prot)
+#define QURT_PROT_GET_CACHE_MODE(n) QURT_MMAP_EXTRACT(QURT_PROT_CACHE_BOUNDS,(n)) /**< */
+#define QURT_PROT_GET_BUS_ATTR(n)   QURT_MMAP_EXTRACT(QURT_PROT_BUS_BOUNDS,(n))   /**< */
+#define QURT_PROT_GET_USER_MODE(n)  QURT_MMAP_EXTRACT(QURT_PROT_USER_BOUNDS,(n))  /**< */
+
+// These macros extract fields from (flags)
+#define QURT_MAP_GET_TYPE(n)   QURT_MMAP_EXTRACT(QURT_MAP_TYPE_BOUNDS,(n))   /**< */
+#define QURT_MAP_GET_REGION(n) QURT_MMAP_EXTRACT(QURT_MAP_REGION_BOUNDS,(n)) /**< */
+
+// Macros for bitfield insertion and extraction
+#define QURT_MMAP_MASK(lo,hi) (~((~0u) << ((hi)-(lo)+1U))) /**< Mask of same size as [lo..hi].
*/
+#define QURT_MMAP_BUILD_(lo,hi,def,n)   ((((n)^(def))&QURT_MMAP_MASK((lo),(hi)))<<(lo)) /**< */
+#define QURT_MMAP_EXTRACT_(lo,hi,def,n) ((((n)>>(lo))&QURT_MMAP_MASK((lo),(hi)))^(def)) /**< */
+#define QURT_MMAP_BUILD(a,b)   QURT_MMAP_BUILD_(a,b)   /**< */
+#define QURT_MMAP_EXTRACT(a,b) QURT_MMAP_EXTRACT_(a,b) /**< */
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_mq.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_mq.h
new file mode 100755
index 0000000000000..580c83d3de41a
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_mq.h
@@ -0,0 +1,458 @@
+#ifndef QURT_MQ_H
+#define QURT_MQ_H
+/**
+  @file qurt_mq.h
+
+  @brief  Prototypes of secure message queues API functions.
+
+  EXTERNALIZED FUNCTIONS
+  None
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None
+
+  Copyright (c) 2019-2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+======================================================================*/
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+        CONSTANTS AND MACROS
+=============================================================================*/
+#define QURT_MQ_NAME_MAXLEN 16U /**< Maximum name length. */
+
+
+/*=============================================================================
+        FORWARD DECLARATIONS & TYPEDEFS
+=============================================================================*/
+/* This enum must be generated in accordance with process class numbers.
+   For now it is made to match the generated version; do not change this unless
+   there is a corresponding change in process_class.py. Indices start from 0;
+   basically: QURT_MQ_SECURITY_SCOPE_ = (1 << QURTK_process_class_index_)
+*/
+typedef enum {
+    QURT_MQ_SECURITY_SCOPE_KERNEL      = ( 1U << 0 ),
+    QURT_MQ_SECURITY_SCOPE_SRM         = ( 1U << 1 ),
+    QURT_MQ_SECURITY_SCOPE_SECURE      = ( 1U << 2 ),
+    QURT_MQ_SECURITY_SCOPE_CPZ         = ( 1U << 3 ),
+    QURT_MQ_SECURITY_SCOPE_ROOT        = ( 1U << 4 ),
+    QURT_MQ_SECURITY_SCOPE_SIGNED      = ( 1U << 5 ),
+    QURT_MQ_SECURITY_SCOPE_UNSIGNED    = ( 1U << 6 ),
+    QURT_MQ_SECURITY_SCOPE_SECURE_ROOT = ( 1U << 7 )
+} qurt_mq_security_scope_t;
+
+typedef enum {
+    QURT_MQ_CARDINALITY_PTP = (1U << 0),
+    QURT_MQ_CARDINALITY_MTO = (1U << 1)
+} qurt_mq_cardinality_t;
+
+typedef unsigned int qurt_mqd_t;
+
+typedef union{
+    struct {
+        unsigned int perms:2;
+        unsigned int cardinality:1;
+        unsigned int blocking:1;
+
+        qurt_mq_security_scope_t creator_scope: 8;
+        qurt_mq_security_scope_t allowed_scope: 8; /* can be a bitmask in case of MTO */
+        unsigned int queue_closed: 1;
+        unsigned int reserved: 11;
+    }; /* anonymous struct */
+    unsigned int raw;
+} qurt_mq_flags_t;
+
+
+/* Permissions are from qurt_types.h; X (execute) is blocked, though. */
+#if 0
+/** Memory access permission. */
+typedef enum {
+    QURT_PERM_READ=0x1U,  /**< */
+    QURT_PERM_WRITE=0x2U, /**< */
+    QURT_PERM_EXECUTE=0x4U, /**< */
+    QURT_PERM_FULL=QURT_PERM_READ|QURT_PERM_WRITE|QURT_PERM_EXECUTE, /**< */
+} qurt_perm_t;
+#endif
+
+struct qurt_mq_attr {
+    unsigned flags;      /**< Configured flags. Only meaningful with get_attr(); only used for qurt_mq_flags_t.perms. */
+    unsigned mq_maxmsg;  /**< Maximum number of messages. Used with create() and get_attr.
*/
+    unsigned short mq_send_msgsize; /**< Maximum size (bytes) of a message in the receiver-facing queue,
+                                         from sender to receiver. */
+    unsigned short mq_recv_msgsize; /**< Maximum size (bytes) of a message in the sender-facing queue,
+                                         from receiver to sender. */
+    unsigned client_pid;            /**< Process ID of the client that is allowed to open the message queue
+                                         that was created using qurt_mq_create(). */
+    qurt_mq_cardinality_t cardinality; /**< Cardinality of the message queue connection, see below. */
+    qurt_mq_security_scope_t scope; /**< Security scope of the senders to the queue. */
+};
+
+
+/*=============================================================================
+        EXTERNS & FUNCTIONS
+=============================================================================*/
+/**@ingroup func_qurt_mq_attr_init
+   Initializes attributes to the default values used for creating the queue.
+
+   The initialize operation sets the following default attribute values: \n
+   - flags -- QURT_PERM_READ | QURT_PERM_WRITE \n
+   - mq_maxmsg -- 1 \n
+   - mq_send_msgsize -- 8 \n
+   - mq_recv_msgsize -- 8 \n
+   - client_pid -- -1 \n
+   - cardinality -- QURT_MQ_CARDINALITY_PTP \n
+   - scope -- QURT_MQ_SECURITY_SCOPE_SIGNED \n
+
+   @datatypes
+   #qurt_mq_attr
+
+   @param[in,out] attr  Pointer to the message queue attribute object to initialize.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+void qurt_mq_attr_init(struct qurt_mq_attr * attr);
+
+/**@ingroup qurt_mq_attr_set_send_msgsize
+   Sets the maximum message size, in bytes, that the sender can send.
+   The maximum message length is configurable using the XML configuration, but is limited to a maximum of 62 bytes.
+
+   @datatypes
+   #qurt_mq_attr
+
+   @param[in,out] attr  Pointer to the message queue attribute object.
+   @param[in]     len   Length of message in bytes.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+void qurt_mq_attr_set_send_msgsize (struct qurt_mq_attr *attr, size_t len);
+
+/**@ingroup qurt_mq_attr_set_recv_msgsize
+   Sets the maximum message size, in bytes, that the receiver can read.
+   The maximum message length is configurable using the XML configuration, but is limited to a maximum of 62 bytes.
+
+   @datatypes
+   #qurt_mq_attr
+
+   @param[in,out] attr  Pointer to the message queue attribute object.
+   @param[in]     len   Length of message in bytes.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+void qurt_mq_attr_set_recv_msgsize (struct qurt_mq_attr *attr, size_t len);
+
+/**@ingroup qurt_mq_attr_set_maxmsg
+   Sets the maximum number of messages that can be queued in the message queue.
+   The message depth is configurable using the XML configuration.
+
+   @datatypes
+   #qurt_mq_attr
+
+   @param[in,out] attr   Pointer to the message queue attribute object.
+   @param[in]     depth  Maximum number of messages that can be queued.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+void qurt_mq_attr_set_maxmsg (struct qurt_mq_attr *attr, unsigned int depth);
+
+/**@ingroup qurt_mq_attr_set_scope
+   Sets the scope of the message queue. A message queue created with a security
+   scope allows only a process class of that scope to open the message queue.
+
+   @datatypes
+   #qurt_mq_attr \n
+   #qurt_mq_security_scope_t
+
+   @param[in,out] attr   Pointer to the message queue attribute object.
+   @param[in]     scope  Scope of the message queue: \n
+         #QURT_MQ_SECURITY_SCOPE_KERNEL \n
+         #QURT_MQ_SECURITY_SCOPE_SRM \n
+         #QURT_MQ_SECURITY_SCOPE_SECURE \n
+         #QURT_MQ_SECURITY_SCOPE_CPZ \n
+         #QURT_MQ_SECURITY_SCOPE_ROOT \n
+         #QURT_MQ_SECURITY_SCOPE_SIGNED \n
+         #QURT_MQ_SECURITY_SCOPE_UNSIGNED
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+void qurt_mq_attr_set_scope (struct qurt_mq_attr *attr, qurt_mq_security_scope_t scope);
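A sketch of the attribute flow these setters support (illustrative only; the queue name "srv_q" and the sizes below are made up, and `<qurt.h>` is an assumed umbrella include):

```c
#include <qurt.h>  /* assumed umbrella include */

/* Hypothetical server-side setup: a small point-to-point queue restricted
 * to signed processes. Names must fit QURT_MQ_NAME_MAXLEN (16) bytes. */
static int create_server_queue(qurt_mqd_t *mqd)
{
    struct qurt_mq_attr attr;

    qurt_mq_attr_init(&attr);                 /* defaults described above */
    qurt_mq_attr_set_maxmsg(&attr, 4);        /* up to 4 queued messages */
    qurt_mq_attr_set_send_msgsize(&attr, 32); /* within the 62-byte limit */
    qurt_mq_attr_set_recv_msgsize(&attr, 32);
    qurt_mq_attr_set_scope(&attr, QURT_MQ_SECURITY_SCOPE_SIGNED);

    return qurt_mq_create(mqd, "srv_q", &attr);
}
```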
+
+
+/**@ingroup qurt_mq_attr_set_client_pid
+   Sets the client_pid that can open this message queue.
+   If client_pid is set, the allowed scope is not considered when opening the message queue.
+
+   @datatypes
+   #qurt_mq_attr
+
+   @param[in,out] attr        Pointer to the message queue attribute object.
+   @param[in]     client_pid  Valid PID for the client process.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+void qurt_mq_attr_set_client_pid (struct qurt_mq_attr *attr, unsigned client_pid);
+
+/**@ingroup qurt_mq_attr_set_flags
+   Sets the properties of the message queue.
+   The current implementation only sets the permissions for the message queue through the flags attribute.
+   The default is #QURT_PERM_READ | #QURT_PERM_WRITE; explicit permissions are not implemented.
+
+   @datatypes
+   #qurt_mq_attr
+
+   @param[in,out] attr   Pointer to the message queue attribute object.
+   @param[in]     flags  Permissions for the message queue.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+void qurt_mq_attr_set_flags (struct qurt_mq_attr *attr, unsigned int flags);
+
+/**@ingroup qurt_mq_create
+   Creates a message queue with the provided name and attributes.
+   The calling process becomes the owner of the queue.
+   The name of the message queue is limited to 16 characters, including the NULL terminator.
+
+   @datatypes
+   #qurt_mq_attr \n
+   #qurt_mqd_t
+
+   @param[out] mqd   Returns the message queue descriptor if
+                     the message queue was successfully created.
+   @param[in]  name  String identifier of the message queue.
+   @param[in]  attr  Pointer to the initialized message queue attribute
+                     structure that specifies the attributes of the created message queue.
+
+   @return
+   #QURT_EOK -- Message queue created. \n
+   #QURT_EINVALID -- Invalid arguments. \n
+   #QURT_ENOSPC -- Maximum number of queues in the system is exceeded.
+
+   @dependencies
+   None.
+*/
+int qurt_mq_create(qurt_mqd_t *mqd, const char *name, struct qurt_mq_attr * attr);
+
+/**@ingroup qurt_mq_open
+   Opens a message queue connection between a process and a created message queue.
+
+   @datatypes
+   #qurt_mq_attr \n
+   #qurt_mqd_t
+
+   @param[out] mqd   Returns the message queue descriptor if the message
+                     queue was successfully opened.
+   @param[in]  name  String identifier of the message queue.
+   @param[in]  flags Flags that contain the properties that define the behavior of the message queue connection.
+                     Permissions:\n
+                     #QURT_PERM_READ \n
+                     #QURT_PERM_WRITE \n
+                     #QURT_PERM_READ | QURT_PERM_WRITE @tablebulletend
+                     Default is QURT_PERM_READ | QURT_PERM_WRITE; explicit permissions are not implemented. \n
+                     Cardinality: \n
+                     #QURT_MQ_CARDINALITY_PTP (default) \n
+                     #QURT_MQ_CARDINALITY_MTO (not implemented) \n
+                     Blocking: suspend the thread until the message queue with the specified name is created. \n
+                     Scope: security boundary to which the message queue and its users are constrained.
+                     It is coupled with the process privilege level/scope.\n
+                     #QURT_MQ_SECURITY_SCOPE_KERNEL \n
+                     #QURT_MQ_SECURITY_SCOPE_SRM \n
+                     #QURT_MQ_SECURITY_SCOPE_SECURE \n
+                     #QURT_MQ_SECURITY_SCOPE_CPZ \n
+                     #QURT_MQ_SECURITY_SCOPE_ROOT \n
+                     #QURT_MQ_SECURITY_SCOPE_SIGNED \n
+                     #QURT_MQ_SECURITY_SCOPE_UNSIGNED @tablebulletend
+
+   @return
+   #QURT_EOK -- Message queue connection successfully opened. \n
+   #QURT_EFAILED -- Message queue connection failed (for a non-blocking message queue). \n
+   #QURT_ENOTALLOWED -- Open failed due to security scope mismatch.
+
+   @dependencies
+   None.
+*/
+int qurt_mq_open (qurt_mqd_t *mqd, const char *name, qurt_mq_flags_t flags);
+
+/**@ingroup qurt_mq_send
+   Sends a message over the message queue.\n
+   - If the message queue is full, the calling thread shall be
+     suspended until space becomes available to enqueue the message. \n
+   - If a thread is suspended on an empty queue waiting
+     to receive a message, qurt_mq_send shall resume that thread.
+
+   @datatypes
+   #qurt_mqd_t
+
+   @param[in] mqd      Message queue descriptor.
+   @param[in] msg_ptr  Pointer to the message buffer.
+   @param[in] msg_len  Length of the message buffer in bytes.
+
+   @return
+   #QURT_EOK -- Message queue send was successful.\n
+   #QURT_EMSGSIZE -- Message size in the msg_len field is greater than the max_message_len specified during queue creation.\n
+   #QURT_ENOTALLOWED -- Send failed due to security scope mismatch.
+
+   @dependencies
+   None.
+*/
+int qurt_mq_send(qurt_mqd_t mqd, const char *msg_ptr, size_t msg_len);
+
+/**@ingroup qurt_mq_send_timed
+   Sends a message over the message queue.\n
+   - If the message queue is full, the calling thread shall be
+     suspended until space becomes available to enqueue the message or until the timeout is reached. \n
+   - If a thread is suspended on an empty queue waiting
+     to receive a message, qurt_mq_send_timed shall resume that thread.\n
+   - If the timeout is reached, qurt_mq_send_timed shall return #QURT_ETIMEDOUT.
+
+   @datatypes
+   #qurt_mqd_t
+
+   @param[in] mqd       Message queue descriptor.
+   @param[in] msg_ptr   Pointer to the message buffer.
+   @param[in] duration  Interval (in microseconds); the duration value must be
+                        between #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+   @param[in] msg_len   Length of the message buffer in bytes.
+
+   @return
+   #QURT_EOK -- Message queue send was successful. \n
+   #QURT_EMSGSIZE -- Message size in the msg_len field is greater than the max_message_len specified during queue creation.\n
+   #QURT_ENOTALLOWED -- Send failed due to security scope mismatch. \n
+   #QURT_ETIMEDOUT -- Timeout.
+
+   @dependencies
+   None.
+*/
+int qurt_mq_send_timed(qurt_mqd_t mqd, const char *msg_ptr, unsigned long long int duration, size_t msg_len);
+
+ /**@ingroup qurt_mq_recv
+   Receives a message from the message queue. \n
+   - If the message queue is empty, the calling thread shall be
+     suspended until a message is enqueued in the message queue. \n
+   - If a thread is suspended on a full queue waiting to
+     send a message, qurt_mq_recv shall resume that thread.
+
+   @datatypes
+   #qurt_mqd_t
+
+   @param[in]     mqd      Message queue descriptor.
+   @param[out]    msg_ptr  Pointer to the message buffer.
+   @param[in,out] msg_len  Pointer to the length of the message buffer.
+
+   @return
+   #QURT_EOK -- Message successfully received.\n
+   #QURT_EINVALID -- Message pointer or msg_len pointer is NULL. \n
+   #QURT_EBADR -- Message queue descriptor (mqd) is invalid. \n
+   #QURT_EBADF -- Sender closed the message queue.
+
+   @dependencies
+   None.
+*/
+int qurt_mq_recv(qurt_mqd_t mqd, unsigned char *msg_ptr, size_t *msg_len);
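A round-trip sketch pairing `qurt_mq_send()` with `qurt_mq_recv()` (illustrative; buffer sizes and the umbrella include `<qurt.h>` are assumptions):

```c
#include <stddef.h>
#include <qurt.h>  /* assumed umbrella include */

/* Hypothetical sender: post a short datagram. */
static int mq_ping(qurt_mqd_t mqd)
{
    static const char ping[] = "ping";
    return qurt_mq_send(mqd, ping, sizeof ping);
}

/* Hypothetical receiver: drain one message. On entry, len is the buffer
 * capacity; on return, it holds the received size. */
static int mq_drain_one(qurt_mqd_t mqd)
{
    unsigned char buf[64];
    size_t len = sizeof buf;
    return qurt_mq_recv(mqd, buf, &len);
}
```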
+
+ /**@ingroup qurt_mq_recv_timed
+   Receives a message from the message queue. \n
+   - If the message queue is empty, the calling thread shall be
+     suspended until a message is enqueued in the message queue or until the timeout is reached.\n
+   - If a thread is suspended on a full queue waiting to
+     send a message, qurt_mq_recv_timed shall resume that thread.\n
+   - If the timeout is reached, qurt_mq_recv_timed shall return #QURT_ETIMEDOUT.
+
+   @datatypes
+   #qurt_mqd_t
+
+   @param[in]     mqd       Message queue descriptor.
+   @param[out]    msg_ptr   Pointer to the message buffer.
+   @param[in]     duration  Interval (in microseconds); the duration value must be
+                            between #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+   @param[in,out] msg_len   Pointer to the length of the message buffer.
+
+   @return
+   #QURT_EOK -- Message successfully received.\n
+   #QURT_EINVALID -- Message pointer or msg_len pointer is NULL. \n
+   #QURT_EBADR -- Message queue descriptor (mqd) is invalid.\n
+   #QURT_EBADF -- Sender closed the message queue. \n
+   #QURT_ETIMEDOUT -- Timeout.
+
+   @dependencies
+   None.
+*/
+int qurt_mq_recv_timed(qurt_mqd_t mqd, unsigned char *msg_ptr, unsigned long long int duration, size_t *msg_len);
+
+ /**@ingroup qurt_mq_close
+   Closes the message queue and disassociates the calling process (client) from the message queue
+   under this descriptor. Marks the queue as closed for the receiver.
+   This function is expected to be called from the client side. If called
+   from the server side, the function reduces to a no-op and returns success.
+
+   @datatypes
+   #qurt_mqd_t
+
+   @param[in] mqd  Message queue descriptor.
+
+   @return
+   #QURT_EOK -- Message queue closed successfully.\n
+   #QURT_EBADR -- Invalid descriptor.\n
+   #QURT_ENOTALLOWED -- qurt_mq_close() was not called from the client side.
+
+   @dependencies
+   None.
+*/
+int qurt_mq_close(qurt_mqd_t mqd);
+
+ /**@ingroup qurt_mq_destroy
+   Destroys the message queue. This function must be
+   called from the process that called qurt_mq_create().
+
+   @datatypes
+   #qurt_mqd_t
+
+   @param[in] mqd  Message queue descriptor.
+
+   @return
+   #QURT_EOK -- Message queue destroyed successfully.\n
+   #QURT_EBADR -- Invalid descriptor.\n
+   #QURT_ENOTALLOWED -- qurt_mq_destroy() was not called from the creator process.
+
+   @dependencies
+   None.
+*/
+int qurt_mq_destroy(qurt_mqd_t mqd);
+
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+#endif //QURT_MQ_H
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_mutex.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_mutex.h
new file mode 100755
index 0000000000000..4ad6b270cdde6
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_mutex.h
@@ -0,0 +1,211 @@
+#ifndef QURT_MUTEX_H
+#define QURT_MUTEX_H
+/**
+  @file qurt_mutex.h
+  @brief Prototypes of the mutex API.
+  This is mostly a user space mutex, but it calls the
+  kernel to block if the mutex is taken.
+
+EXTERNAL FUNCTIONS
+   None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup mutex_types
+@{ */
+/*=============================================================================
+        TYPEDEFS
+=============================================================================*/
+
+/** QuRT mutex type.
+
+    Both non-recursive mutex lock and unlock, and recursive
+    mutex lock and unlock can be applied to this type.
+ */
+typedef union qurt_mutex_aligned8{
+    /** @cond */
+    struct {
+        unsigned int holder;
+        unsigned int count;
+        unsigned int queue;
+        unsigned int wait_count;
+    };
+    unsigned long long int raw;
+    /** @endcond */
+} qurt_mutex_t;
+/** @} */ /* end_addtogroup mutex_types */
+/*=============================================================================
+        CONSTANTS AND MACROS
+=============================================================================*/
+/** @addtogroup mutex_const_macros
+@{ */
+#define MUTEX_MAGIC 0xfe /**< */
+#define QURTK_FUTEX_FREE_MAGIC 0x1F // 11111 /**< */
+#define QURT_MUTEX_INIT {{MUTEX_MAGIC, 0, QURTK_FUTEX_FREE_MAGIC,0}} /**< Suitable as an initializer for a
+                                                                          variable of type qurt_mutex_t. */
+/** @} */ /* end_addtogroup mutex_const_macros */
+/*=============================================================================
+        FUNCTIONS
+=============================================================================*/
+/**@ingroup func_qurt_mutex_init
+   Initializes a mutex object.
+   The mutex is initially unlocked.
+
+   @note1hang Each mutex-based object has one or more kernel resources associated with it;
+   to prevent resource leaks, call qurt_mutex_destroy()
+   when this object is no longer used.
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[out] lock  Pointer to the mutex object. Returns the initialized object.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+
+ */
+void qurt_mutex_init(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_mutex_destroy
+   Destroys the specified mutex.
+
+   @note1hang Mutexes must be destroyed when they are no longer in use. Failure to do this
+   causes resource leaks in the QuRT kernel.\n
+   @note1cont Mutexes must not be destroyed while they are still in use. If this occurs, the
+   behavior of QuRT is undefined.
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[in] lock  Pointer to the mutex object to destroy.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+
+ */
+void qurt_mutex_destroy(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_mutex_lock
+   Locks the specified mutex.
+   If a thread performs a lock operation on a mutex that is not in use, the thread gains
+   access to the shared resource that is protected by the mutex, and continues executing.
+
+   If a thread performs a lock operation on a mutex that is already in use by another
+   thread, the thread is suspended. When the mutex becomes available again (because the
+   other thread has unlocked it), the thread is awakened and given access to the shared
+   resource.
+
+   @note1hang A thread is suspended indefinitely if it locks a mutex that it has already
+   locked. Avoid this by using recursive mutexes (Section @xref{dox:recursive_mutexes}).
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[in] lock  Pointer to the mutex object. Specifies the mutex to lock.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+void qurt_mutex_lock(qurt_mutex_t *lock); /* blocking */
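A minimal usage sketch for the functions above (illustrative; `<qurt.h>` is an assumed umbrella include):

```c
#include <qurt.h>  /* assumed umbrella include */

/* Sketch: a statically initialized mutex guarding a shared counter. */
static qurt_mutex_t g_lock = QURT_MUTEX_INIT;
static unsigned int g_count;

static void bump(void)
{
    qurt_mutex_lock(&g_lock);
    g_count++;                  /* critical section */
    qurt_mutex_unlock(&g_lock);
}
```

Per the notes above, a mutex set up with `qurt_mutex_init()` should be paired with `qurt_mutex_destroy()` once it is no longer in use.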
+
+/**@ingroup func_qurt_mutex_lock_timed
+   Locks the specified mutex.
+   When a thread performs a lock operation on a mutex that is not in use, the thread gains
+   access to the shared resource that is protected by the mutex, and continues executing.
+
+   When a thread performs a lock operation on a mutex that is already in use by another
+   thread, the thread is suspended. When the mutex becomes available again (because the
+   other thread has unlocked it), the thread is awakened and given access to the shared
+   resource. If the duration of the suspension exceeds the timeout duration, the wait is
+   terminated and no access to the mutex is granted.
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[in] lock      Pointer to the mutex object; specifies the mutex to lock.
+   @param[in] duration  Interval (in microseconds); the duration value must be between #QURT_TIMER_MIN_DURATION and
+                        #QURT_TIMER_MAX_DURATION.
+
+   @return
+   #QURT_EOK -- Success. \n
+   #QURT_ETIMEDOUT -- Timeout.
+
+   @dependencies
+   None.
+ */
+int qurt_mutex_lock_timed (qurt_mutex_t * lock, unsigned long long int duration);
+
+/**@ingroup func_qurt_mutex_unlock
+   Unlocks the specified mutex. \n
+   More than one thread can be suspended on a mutex. When the mutex is unlocked, only the
+   highest-priority thread waiting on the mutex is awakened. If the awakened thread has
+   higher priority than the current thread, a context switch occurs.
+
+   @note1hang The behavior of QuRT is undefined if a thread unlocks a mutex it did not first
+   lock.
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[in] lock  Pointer to the mutex object. Specifies the mutex to unlock.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+void qurt_mutex_unlock(qurt_mutex_t *lock); /* unlock */
+
+/**@ingroup func_qurt_mutex_try_lock
+   @xreflabel{hdr:qurt_mutex_try_lock}
+   Attempts to lock the specified mutex.
+   If a thread performs a try_lock operation on a mutex that is not in use, the thread gains
+   access to the shared resource that is protected by the mutex, and continues executing.
+
+   @note1hang If a thread performs a try_lock operation on a mutex that it has already locked
+   or that is in use by another thread, qurt_mutex_try_lock immediately returns with a
+   nonzero result value.
+
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[in] lock  Pointer to the mutex object. Specifies the mutex to lock.
+
+   @return
+   0 -- Success. \n
+   Nonzero -- Failure.
+
+   @dependencies
+   None.
+ */
+int qurt_mutex_try_lock(qurt_mutex_t *lock);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_MUTEX_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_os_services.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_os_services.h
new file mode 100755
index 0000000000000..cbc4c239e9620
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_os_services.h
@@ -0,0 +1,24 @@
+/*=============================================================================
+
+                 qurt_os_services.h
+
+GENERAL DESCRIPTION
+
+EXTERNAL FUNCTIONS
+        None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+        None.
+
+  Copyright (c) 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+=============================================================================*/
+
+#define QURT_OS_SERVICE_THREAD     "/os/thread"      /**< Thread service. */
+#define QURT_OS_SERVICE_FS_HUB     "/os/fs_hub"      /**< File system hub service. */
+#define QURT_OS_SERVICE_CALLBACK   "/os/callback"    /**< QDI callback service. */
+#define QURT_OS_SERVICE_INTERRUPTS "/os/interrupt"   /**< Interrupt service. */
+#define QURT_OS_SERVICE_PROXY      "/os/proxy"       /**< QDI proxy service. */
+#define QURT_OS_SERVICE_MEMORY     "/os/memory"      /**< Memory management service. */
+#define QURT_OS_SERVICE_MEMPOOL    "/os/mempool"     /**< Pool management service. */
+#define QURT_OS_SERVICE_PROCESS    "/os/process"     /**< Process management service. */
+#define QURT_OS_SERVICE_MMAP       "/os/mem_mapper"  /**< Memory mapper service. */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_pimutex.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_pimutex.h
new file mode 100755
index 0000000000000..61aee5cba7ce8
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_pimutex.h
@@ -0,0 +1,200 @@
+#ifndef QURT_PIMUTEX_H
+#define QURT_PIMUTEX_H 1
+/**
+  @file qurt_pimutex.h
+  @brief Prototypes of the qurt_pimutex API.
+
+EXTERNAL FUNCTIONS
+   None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+        FUNCTIONS
+=============================================================================*/
+/**@ingroup func_qurt_pimutex_init
+   Initializes a priority inheritance mutex object.
+   The priority inheritance mutex is initially unlocked.
+
+   This function works the same as qurt_mutex_init().
+
+   @note1hang Each pimutex-based object has one or more kernel resources associated with it;
+   to prevent resource leaks, call qurt_pimutex_destroy()
+   when this object is no longer used.
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[out] lock  Pointer to the priority inheritance mutex object.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+
+ */
+void qurt_pimutex_init(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_pimutex_destroy
+   Destroys the specified priority inheritance mutex.
+
+   @note1hang Priority inheritance mutexes must be destroyed when they are no longer in
+   use. Failure to do this causes resource leaks in the QuRT kernel.\n
+   @note1cont Priority inheritance mutexes must not be destroyed while they are still in use.
+   If this occurs, the behavior of QuRT is undefined.
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[in] lock  Pointer to the priority inheritance mutex object to destroy.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+
+ */
+void qurt_pimutex_destroy(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_pimutex_lock
+   Requests access to a shared resource. If a thread performs a lock operation on a mutex
+   that is not in use, the thread gains access to the shared resource that the mutex protects,
+   and continues executing.
+
+   If a thread performs a lock operation on a mutex that is already in use by another
+   thread, the thread is suspended. When the mutex becomes available again (because the
+   other thread has unlocked it), the thread is awakened and given access to the shared resource.
+
+   If a thread is suspended on a priority inheritance mutex, and the priority of the suspended
+   thread is higher than the priority of the thread that has locked the mutex, the thread
+   with the mutex acquires the higher priority of the suspended thread. The locker thread blocks
+   until the lock is available.
+
+   @note1hang A thread is not suspended if it locks a priority inheritance mutex that it has
+   already locked. However, the mutex does not become available to other
+   threads until the thread performs a balanced number of unlocks on the mutex.\n
+   @note1cont When multiple threads compete for a mutex, the lock operation for a priority
+   inheritance mutex is slower than it is for a recursive mutex.
+   In particular, it is about 10 times slower when the mutex is available for locking,
+   and slower (with greatly varying times) when the mutex is already locked.
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[in] lock  Pointer to the priority inheritance mutex object to lock.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+
+ */
+void qurt_pimutex_lock(qurt_mutex_t *lock);
+
+
+/**@ingroup func_qurt_pimutex_lock_timed
+   Locks a priority inheritance mutex with a timeout.
+
+   A thread can lock a priority inheritance mutex multiple times. The mutex is not
+   available to other threads until the thread performs the same number of mutex unlock
+   operations.
+
+   If a thread performs a lock operation on a mutex that is already locked by another thread,
+   the thread is moved to the waiting state. When the mutex becomes available again (because the
+   other thread has unlocked the mutex), the thread is awakened and tries to lock the mutex.
+
+   If a thread is waiting on a priority inheritance mutex, and the priority of the waiting thread
+   is higher than the priority of the thread that has locked the mutex, the priority of the thread
+   that has locked the mutex is raised to the priority of the waiting thread.
+
+   If the duration of the wait exceeds the timeout duration, the wait is terminated, and
+   the function returns #QURT_ETIMEDOUT as a failure of the mutex lock.
+
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[in] lock      Pointer to the mutex object to lock.
+   @param[in] duration  Duration (in microseconds) to wait. The duration value must be between
+                        #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+
+   @return
+   #QURT_EOK -- Success. \n
+   #QURT_ETIMEDOUT -- Timeout. \n
+   #QURT_EINVALID -- Duration is out of range.
+
+   @dependencies
+   None.
+
+ */
+int qurt_pimutex_lock_timed(qurt_mutex_t *lock, unsigned long long int duration);
+
+
+/**@ingroup func_qurt_pimutex_unlock
+   Releases access to a shared resource; unlocks the specified priority inheritance mutex. \n
+   More than one thread can be suspended on a priority inheritance mutex. When the mutex
+   is unlocked, only the highest-priority thread waiting on the mutex is awakened. If the
+   awakened thread has higher priority than the current thread, a context switch occurs.
+
+   When a thread unlocks a priority inheritance mutex, its thread priority is restored to its
+   original value from any higher priority value that it acquired from another thread
+   suspended on the mutex.
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[in] lock  Pointer to the priority inheritance mutex object to unlock.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+
+ */
+void qurt_pimutex_unlock(qurt_mutex_t *lock);
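A bounded-wait sketch for the timed variant above (illustrative; the 5 ms deadline is arbitrary and `<qurt.h>` is an assumed umbrella include):

```c
#include <qurt.h>  /* assumed umbrella include */

/* Sketch: bounded wait on a PI mutex. The 5000 us deadline must lie within
 * QURT_TIMER_MIN_DURATION..QURT_TIMER_MAX_DURATION. */
static int with_deadline(qurt_mutex_t *m)
{
    int rc = qurt_pimutex_lock_timed(m, 5000ULL);
    if (rc == QURT_EOK) {
        /* ...critical section... */
        qurt_pimutex_unlock(m);
    }
    return rc;  /* QURT_ETIMEDOUT if the mutex never became available */
}
```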
+
+/**@ingroup func_qurt_pimutex_try_lock
+   Requests access to a shared resource (without suspending).
+   Attempts to lock the specified priority inheritance mutex.\n
+   If a thread performs a try_lock operation on a priority inheritance mutex that is not in
+   use, the thread gains access to the shared resource that is protected by the mutex, and
+   continues executing.
+   If a thread performs a try_lock operation on a priority inheritance mutex that is already
+   in use by another thread, qurt_pimutex_try_lock immediately returns with a
+   nonzero result value.
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[in] lock  Pointer to the priority inheritance mutex object to lock.
+
+   @return
+   0 -- Success. \n
+   Nonzero -- Failure.
+
+   @dependencies
+   None.
+ */
+int qurt_pimutex_try_lock(qurt_mutex_t *lock);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PIMUTEX_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_pimutex2.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_pimutex2.h
new file mode 100755
index 0000000000000..b809f163cbfd2
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_pimutex2.h
@@ -0,0 +1,162 @@
+#ifndef QURT_PIMUTEX2_H
+#define QURT_PIMUTEX2_H
+/**
+  @file qurt_pimutex2.h
+  @brief Prototypes of the pimutex2 API.
+
+EXTERNAL FUNCTIONS
+   None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2013, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+#include
+#include
+#include
+
+/*=============================================================================
+        FUNCTIONS
+=============================================================================*/
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup func_qurt_pimutex2_init
+   Initializes a recursive mutex object.
+
+   @deprecated Use #qurt_pimutex_init instead.
+
+   The recursive mutex is initially unlocked.
+
+   Objects of type pimutex2 solve a potential race condition between
+   unlock() and destroy() operations.
+
+   @datatypes
+   #qurt_rmutex2_t
+
+   @param[out] lock  Pointer to the recursive mutex object.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+
+ */
+void qurt_pimutex2_init(qurt_rmutex2_t *lock);
+
+/**@ingroup func_qurt_pimutex2_destroy
+
+   @deprecated Use #qurt_pimutex_destroy instead.
+
+   Destroys the specified recursive mutex. \n
+   @note1cont Recursive mutexes must not be destroyed while they are still in use. If this
+   occurs, the behavior of QuRT is undefined.
+   @note1cont In general, application code should destroy a pimutex2 object prior to
+   deallocating it; calling qurt_pimutex2_destroy() before deallocating it ensures
+   that all qurt_pimutex2_unlock() calls complete.
+
+   @datatypes
+   #qurt_rmutex2_t
+
+   @param[in] lock  Pointer to the recursive mutex object to destroy.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+
+ */
+void qurt_pimutex2_destroy(qurt_rmutex2_t *lock);
+
+/**@ingroup func_qurt_pimutex2_lock
+
+   @deprecated Use #qurt_pimutex_lock instead.
+
+   Locks the specified recursive mutex. \n
+
+   If a thread performs a lock operation on a recursive mutex that is not being used, the
+   thread gains access to the shared resource that is protected by the mutex, and continues
+   executing.
+
+   If a thread performs a lock operation on a recursive mutex that is already being used by
+   another thread, the thread is suspended.
When the mutex becomes available again
+   (because the other thread has unlocked it), the thread is awakened and given access to the
+   shared resource.
+
+   @note1hang A thread is not suspended if it locks a recursive mutex that it has already
+   locked, but the mutex does not become available until the thread performs a
+   balanced number of unlocks on the mutex.
+
+   @datatypes
+   #qurt_rmutex2_t
+
+   @param[in] lock  Pointer to the recursive mutex object to lock.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+
+ */
+void qurt_pimutex2_lock(qurt_rmutex2_t *lock);
+
+/**@ingroup func_qurt_pimutex2_unlock
+
+   @deprecated Use #qurt_pimutex_unlock instead.
+
+   Unlocks the specified recursive mutex. \n
+   More than one thread can be suspended on a recursive mutex. When the mutex is
+   unlocked, only the highest-priority thread waiting on the mutex is awakened. If the
+   awakened thread has higher priority than the current thread, a context switch occurs.
+
+   @datatypes
+   #qurt_rmutex2_t
+
+   @param[in] lock  Pointer to the recursive mutex object to unlock.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+
+ */
+void qurt_pimutex2_unlock(qurt_rmutex2_t *lock);
+
+/**@ingroup func_qurt_pimutex2_try_lock
+
+   @deprecated Use #qurt_pimutex_try_lock instead.
+
+   Attempts to lock the specified recursive mutex.\n
+
+   Non-blocking version of qurt_pimutex2_lock(). If a call to qurt_pimutex2_lock() would
+   succeed immediately, this function behaves similarly, and returns 0 for success.
+   If a call to qurt_pimutex2_lock() would not succeed immediately, this function has
+   no effect and returns non-zero for failure.
+
+   @datatypes
+   #qurt_rmutex2_t
+
+   @param[in] lock  Pointer to the recursive mutex object to lock.
+
+   @return
+   0 -- Success. \n
+   Nonzero -- Failure.
+
+ */
+int qurt_pimutex2_try_lock(qurt_rmutex2_t *lock);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PIMUTEX2_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_pipe.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_pipe.h
new file mode 100755
index 0000000000000..6bdaa044f8640
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_pipe.h
@@ -0,0 +1,479 @@
+#ifndef QURT_PIPE_H
+#define QURT_PIPE_H
+/**
+  @file qurt_pipe.h
+  @brief Prototypes of the pipe interface API.
+  This is a pipe or message queue.
+  It blocks when too full (send) or empty (receive).
+  Unless a nonblocking option is used, all datagrams are 64 bits.
+
+EXTERNAL FUNCTIONS
+   None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+  Copyright (c) 2021,2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#include
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup pipe_types
+@{ */
+/*=============================================================================
+        CONSTANTS AND MACROS
+=============================================================================*/
+#define QURT_PIPE_MAGIC 0xF1FEF1FE           /**< Magic. */
+#define QURT_PIPE_ATTR_MEM_PARTITION_RAM 0   /**< RAM. */
+#define QURT_PIPE_ATTR_MEM_PARTITION_TCM 1   /**< TCM. */
+
+/*=============================================================================
+        TYPEDEFS
+=============================================================================*/
+/** QuRT pipe data values type.
*/ +typedef unsigned long long int qurt_pipe_data_t; + +/** QuRT pipe type.*/ +typedef struct { + /** @cond */ + qurt_mutex_t pipe_lock; + qurt_sem_t senders; + qurt_sem_t receiver; + unsigned int size; + unsigned int sendidx; + unsigned int recvidx; + void (*lock_func)(qurt_mutex_t *); + void (*unlock_func)(qurt_mutex_t *); + int (*try_lock_func)(qurt_mutex_t *); + void (*destroy_lock_func)(qurt_mutex_t *); + unsigned int magic; + qurt_pipe_data_t *data; + /** @endcond */ +} qurt_pipe_t; + +/** QuRT pipe attributes type. */ +typedef struct { + /** @cond */ + qurt_pipe_data_t *buffer; + unsigned int elements; + unsigned char mem_partition; + /** @endcond */ +} qurt_pipe_attr_t; + +/** @} */ /* end_addtogroup pipe_types */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ +/**@ingroup func_qurt_pipe_attr_init + @xreflabel{hdr:qurt_pipe_attr_init} + Initializes the structure that sets the pipe attributes when a pipe is created. + + After an attribute structure is initialized, the individual attributes in the structure are + explicitly set using the pipe attribute operations. + + The attribute structure is assigned the following default values: \n + - buffer -- 0 \n + - elements -- 0 \n + - mem_partition -- #QURT_PIPE_ATTR_MEM_PARTITION_RAM + + @datatypes + #qurt_pipe_attr_t + + @param[in,out] attr Pointer to the pipe attribute structure. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_pipe_attr_init(qurt_pipe_attr_t *attr) +{ + attr->buffer = NULL; + attr->elements = 0; + attr->mem_partition = QURT_PIPE_ATTR_MEM_PARTITION_RAM; +} + +/**@ingroup func_qurt_pipe_attr_set_buffer + @xreflabel{sec:qurt_pipe_attr_set_buffer} + Sets the pipe buffer address attribute.\n + Specifies the base address of the memory area to use for the data buffer of a pipe. + + The base address and size (Section @xref{sec:qurt_pipe_attr_set_elements}) specify the + memory area used as a pipe data buffer. The user is responsible for allocating the + memory area used for the buffer. + + @datatypes + #qurt_pipe_attr_t \n + #qurt_pipe_data_t + + @param[in,out] attr Pointer to the pipe attribute structure. + @param[in] buffer Pointer to the buffer base address. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_pipe_attr_set_buffer(qurt_pipe_attr_t *attr, qurt_pipe_data_t *buffer) +{ + attr->buffer = buffer; +} + +/**@ingroup func_qurt_pipe_attr_set_elements + @xreflabel{sec:qurt_pipe_attr_set_elements} + Specifies the length of the memory area to use for the data buffer of a pipe. + + The length is expressed in terms of the number of 64-bit data elements that + can be stored in the buffer. + + The base address (Section @xref{sec:qurt_pipe_attr_set_buffer}) and size specify + the memory area used as a pipe data buffer. The user is responsible for + allocating the memory area used for the buffer. + + @datatypes + #qurt_pipe_attr_t + + @param[in,out] attr Pointer to the pipe attribute structure. + @param[in] elements Pipe length (64-bit elements). + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_pipe_attr_set_elements(qurt_pipe_attr_t *attr, unsigned int elements) +{ + attr->elements = elements; +} + +/**@ingroup func_qurt_pipe_attr_set_buffer_partition + @xreflabel{sec:qurt_pipe_attr_set_buffer_partition} + Specifies the memory type where a pipe's buffer is allocated. + Allocate pipes in RAM or TCM/LPM. 
+
+   @note1hang If a pipe is specified as allocated in TCM/LPM, it must be created
+   with the qurt_pipe_init() operation. The qurt_pipe_create() operation results in an error.
+
+   @datatypes
+   #qurt_pipe_attr_t
+
+   @param[in,out] attr           Pointer to the pipe attribute structure.
+   @param[in]     mem_partition  Pipe memory partition. Values: \n
+         - #QURT_PIPE_ATTR_MEM_PARTITION_RAM -- Pipe resides in RAM \n
+         - #QURT_PIPE_ATTR_MEM_PARTITION_TCM -- Pipe resides in TCM/LPM @tablebulletend
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline void qurt_pipe_attr_set_buffer_partition(qurt_pipe_attr_t *attr, unsigned char mem_partition)
+{
+    attr->mem_partition = mem_partition;
+}
+
+/**@ingroup func_qurt_pipe_create
+   Creates a pipe.\n
+   Allocates a pipe object and its associated data buffer, and initializes the pipe object.
+
+   @note1hang The buffer address and size stored in the attribute structure specify how the
+   pipe data buffer is allocated.
+
+   @note1cont If a pipe is specified as allocated in TCM/LPM, it must be created
+   using the qurt_pipe_init() operation. The qurt_pipe_create() operation results in an error.
+
+   @datatypes
+   #qurt_pipe_t \n
+   #qurt_pipe_attr_t
+
+   @param[out] pipe  Returns a pointer to the created pipe object.
+   @param[in]  attr  Pointer to the attribute structure used to create the pipe.
+
+   @return
+   #QURT_EOK -- Pipe created. \n
+   #QURT_EFAILED -- Pipe not created. \n
+   #QURT_ENOTALLOWED -- Pipe cannot be created in TCM/LPM.
+
+   @dependencies
+   None.
+ */
+int qurt_pipe_create(qurt_pipe_t **pipe, qurt_pipe_attr_t *attr);
+
+/**@ingroup func_qurt_pipe_init
+   Initializes a pipe object using an existing data buffer.
+
+   @note1hang The buffer address and size stored in the attribute structure must
+   specify a data buffer that the user has already allocated.
+
+   @datatypes
+   #qurt_pipe_t \n
+   #qurt_pipe_attr_t
+
+   @param[out] pipe  Pointer to the pipe object to initialize.
+   @param[in]  attr  Pointer to the pipe attribute structure used to initialize the pipe.
+
+   @return
+   #QURT_EOK -- Success. \n
+   #QURT_EFAILED -- Failure.
+
+   @dependencies
+   None.
+ */
+int qurt_pipe_init(qurt_pipe_t *pipe, qurt_pipe_attr_t *attr);
+
+/**@ingroup func_qurt_pipe_destroy
+   @xreflabel{sec:qurt_pipe_destroy}
+   Destroys the specified pipe.
+
+   @note1hang Pipes must be destroyed when they are no longer in use. Failure
+   to do this causes resource leaks in the QuRT kernel.
+   Pipes must not be destroyed while they are still in use. If this
+   occurs, the behavior of QuRT is undefined.
+
+   @datatypes
+   #qurt_pipe_t
+
+   @param[in] pipe  Pointer to the pipe object to destroy.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+void qurt_pipe_destroy(qurt_pipe_t *pipe);
+
+/**@ingroup func_qurt_pipe_delete
+   Deletes the pipe.\n
+   Destroys the specified pipe (Section @xref{sec:qurt_pipe_destroy}) and deallocates the pipe object and its
+   associated data buffer.
+
+   @note1hang Delete pipes only if they were created using qurt_pipe_create
+   (and not qurt_pipe_init). Otherwise the behavior of QuRT is undefined. \n
+   @note1cont Pipes must be deleted when they are no longer in use. Failure to do this
+   causes resource leaks in the QuRT kernel.\n
+   @note1cont Pipes must not be deleted while they are still in use. If this occurs, the
+   behavior of QuRT is undefined.
+
+   @datatypes
+   #qurt_pipe_t
+
+   @param[in] pipe  Pointer to the pipe object to destroy.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+void qurt_pipe_delete(qurt_pipe_t *pipe);
+
+/**@ingroup func_qurt_pipe_send
+   Writes a data item to the specified pipe. \n
+   If a thread writes to a full pipe, it is suspended on the pipe. When another thread reads
+   from the pipe, the suspended thread is awakened and can then write data to the pipe.
+
+   Pipe data items are defined as 64-bit values. Pipe writes are limited to transferring a single
+   64-bit data item per operation.
+
+   @note1hang Transfer data items larger than 64 bits by reading and writing
+   pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+   @datatypes
+   #qurt_pipe_t \n
+   #qurt_pipe_data_t
+
+   @param[in] pipe  Pointer to the pipe object to write to.
+   @param[in] data  Data item to write.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+void qurt_pipe_send(qurt_pipe_t *pipe, qurt_pipe_data_t data);
+
+/**@ingroup func_qurt_pipe_receive
+   Reads a data item from the specified pipe.
+
+   If a thread reads from an empty pipe, it is suspended on the pipe. When another thread
+   writes to the pipe, the suspended thread is awakened and can then read data from the pipe.
+   Pipe data items are defined as 64-bit values. Pipe reads are limited to transferring a single
+   64-bit data item per operation.
+
+   @note1hang Transfer data items larger than 64 bits by reading and writing
+   pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+   @datatypes
+   #qurt_pipe_t
+
+   @param[in] pipe  Pointer to the pipe object to read from.
+
+   @return
+   Integer containing the 64-bit data item from the pipe.
+
+   @dependencies
+   None.
+*/
+qurt_pipe_data_t qurt_pipe_receive(qurt_pipe_t *pipe);
+
+/**@ingroup func_qurt_pipe_try_send
+   Writes a data item to the specified pipe (without suspending the thread if the pipe is full).\n
+
+   If a thread writes to a full pipe, the operation returns -1 immediately.
+   Otherwise, it returns 0 to indicate a successful write operation.
+
+   Pipe data items are defined as 64-bit values. Pipe writes are limited to transferring a single
+   64-bit data item per operation.
+
+   @note1hang Transfer data items larger than 64 bits by reading and writing
+   pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+   @datatypes
+   #qurt_pipe_t \n
+   #qurt_pipe_data_t
+
+   @param[in] pipe  Pointer to the pipe object to write to.
+   @param[in] data  Data item to write.
+
+   @return
+   0 -- Success. \n
+   -1 -- Failure (pipe full).
+
+   @dependencies
+   None.
+*/
+int qurt_pipe_try_send(qurt_pipe_t *pipe, qurt_pipe_data_t data);
+
+/**@ingroup func_qurt_pipe_try_receive
+   Reads a data item from the specified pipe (without suspending the thread if the pipe is
+   empty).\n
+   If a thread reads from an empty pipe, the operation returns immediately with success set
+   to -1. Otherwise, success is always set to 0 to indicate a successful read operation.\n
+
+   Pipe data items are defined as 64-bit values. Pipe reads are limited to transferring a single
+   64-bit data item per operation.
+
+   @note1hang Transfer data items larger than 64 bits by reading and writing
+   pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+   @datatypes
+   #qurt_pipe_t
+
+   @param[in]  pipe     Pointer to the pipe object to read from.
+   @param[out] success  Pointer to the operation status result.
+
+   @return
+   Integer containing a 64-bit data item from the pipe.
+
+   @dependencies
+   None.
+*/
+qurt_pipe_data_t qurt_pipe_try_receive(qurt_pipe_t *pipe, int *success);
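A non-blocking producer/consumer sketch for the two try variants above (illustrative; `<qurt.h>` is an assumed umbrella include and the datagram value is made up):

```c
#include <qurt.h>  /* assumed umbrella include */

/* Sketch: poll a pipe of 64-bit datagrams without ever suspending. */
static void pipe_poll(qurt_pipe_t *p)
{
    int ok;
    qurt_pipe_data_t d;

    if (qurt_pipe_try_send(p, 0x1234ULL) != 0) {
        /* pipe full: drop, or retry later */
    }

    d = qurt_pipe_try_receive(p, &ok);
    if (ok == 0) {
        (void)d;  /* d holds one 64-bit datagram */
    }
}
```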
+
+/**@ingroup func_qurt_pipe_receive_cancellable
+   Reads a data item from the specified pipe (with suspend), cancellable.
+
+   If a thread reads from an empty pipe, it is suspended on the pipe. When another thread
+   writes to the pipe, the suspended thread is awakened and can then read data from the pipe.
+   The operation is canceled if the user process of the calling thread is killed,
+   or if the calling thread must finish its current QDI invocation and return to user space.
+   A root PD thread can use this API to wait on the pipe for receiving; it is resumed with
+   #QURT_EDESTROY if the pipe is destroyed.
+   Pipe data items are defined as 64-bit values. Pipe reads are limited to transferring a single
+   64-bit data item per operation.
+
+   @note1hang Transfer data items larger than 64 bits by reading and writing
+   pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+   @datatypes
+   #qurt_pipe_t \n
+   #qurt_pipe_data_t
+
+   @param[in]  pipe    Pointer to the pipe object to read from.
+   @param[out] result  Pointer to the location where the received 64-bit data item is stored.
+
+   @return
+   #QURT_EOK -- Receive completed. \n
+   #QURT_ECANCEL -- Receive canceled. \n
+   #QURT_EDESTROY -- Pipe was destroyed. \n
+   #QURT_ENOTALLOWED -- Pipe is not initialized.
+
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+int qurt_pipe_receive_cancellable(qurt_pipe_t *pipe, qurt_pipe_data_t *result);
+
+/**@ingroup func_qurt_pipe_send_cancellable
+   @xreflabel{hdr:qurt_pipe_send_cancellable}
+   Writes a data item to the specified pipe (with suspend), cancellable. \n
+   If a thread writes to a full pipe, it is suspended on the pipe. When another thread reads
+   from the pipe, the suspended thread is awakened and can then write data to the pipe.
+   The operation is canceled if the user process of the calling thread is killed, or if the
+   calling thread must finish its current QDI invocation and return to user space.
+   A root PD thread can use this API to wait on the pipe for sending; it is resumed with
+   #QURT_EDESTROY if the pipe is destroyed.
+
+   Pipe data items are defined as 64-bit values. Pipe writes are limited to transferring a single
+   64-bit data item per operation.
+
+   @note1hang Transfer data items larger than 64 bits by reading and writing
+   pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+   @datatypes
+   #qurt_pipe_t \n
+   #qurt_pipe_data_t
+
+   @param[in] pipe  Pointer to the pipe object to write to.
+   @param[in] data  Data item to write.
+
+   @return
+   #QURT_EOK -- Send completed. \n
+   #QURT_ECANCEL -- Send canceled. \n
+   #QURT_EDESTROY -- Pipe was destroyed. \n
+   #QURT_ENOTALLOWED -- Pipe is not initialized.
+
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+int qurt_pipe_send_cancellable(qurt_pipe_t *pipe, qurt_pipe_data_t data);
+
+/**@ingroup func_qurt_pipe_is_empty
+   Returns a value indicating whether the specified pipe contains any data.
+
+   @datatypes
+   #qurt_pipe_t
+
+   @param[in] pipe  Pointer to the pipe object to read from.
+
+   @return
+   1 -- Pipe contains no data. \n
+   0 -- Pipe contains data.
+
+   @dependencies
+   None.
+*/
+int qurt_pipe_is_empty(qurt_pipe_t *pipe);
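A service-loop sketch for the cancellable receive above (illustrative; the teardown handling reflects the documented return codes, and `<qurt.h>` is an assumed umbrella include):

```c
#include <qurt.h>  /* assumed umbrella include */

/* Sketch: a loop that survives process teardown while blocked on the pipe. */
static void pipe_service(qurt_pipe_t *p)
{
    for (;;) {
        qurt_pipe_data_t d;
        int rc = qurt_pipe_receive_cancellable(p, &d);
        if (rc != QURT_EOK) {
            break;  /* QURT_ECANCEL or QURT_EDESTROY: exit cleanly */
        }
        /* ...handle d... */
    }
}
```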
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_pmem_manager.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_pmem_manager.h
new file mode 100755
index 0000000000000..8c8da985228b9
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_pmem_manager.h
@@ -0,0 +1,82 @@
+#ifndef QURT_PMEM_MANAGER_H
+#define QURT_PMEM_MANAGER_H
+/**
+  @file qurt_pmem_manager.h
+  Prototypes of kernel physical memory manager APIs
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=====================================================================
+ Constants and macros
+ ======================================================================*/
+
+/* physical memory API return error code */
+#define QURT_PMEM_SUCCESS               0
+#define QURT_PMEM_NO_PRIV               1
+#define QURT_PMEM_RETRY                 2
+#define QURT_PMEM_OVERLAP               3
+#define QURT_PMEM_NOT_EXIST             4
+#define QURT_PMEM_INIT_FAILURE          5
+#define QURT_PMEM_OUTSTANDING_MAPPING   6
+#define QURT_PMEM_GENERIC_FAILURE       7
+#define QURT_PMEM_ENTRY_FOUND           8
+#define QURT_PMEM_REACH_END             9
+#define QURT_PMEM_UNCLAIMED             10
+#define QURT_PMEM_ALREADY_CLAIMED       11
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+/**@ingroup func_qurt_pmem_acquire
+  Acquires ownership of a specific physical memory region.
+
+  @note1hang Ownership is assigned to the caller.
+
+  @param[in] ppage  Starting physical page number
+  @param[in] pnum   Number of physical pages
+
+  @return
+  #QURT_PMEM_NO_PRIV -- Caller has no privilege to claim ownership. \n
+  #QURT_PMEM_OVERLAP -- All or part of the range is already owned. \n
+  #QURT_PMEM_SUCCESS -- Ownership claimed successfully.
+
+  @dependencies
+  None.
+*/
+int qurt_pmem_acquire(unsigned int ppage, unsigned int pnum);
+
+/**@ingroup func_qurt_pmem_release
+  Releases ownership of a specific physical memory region.
+
+  @param[in] ppage  Starting physical page number
+  @param[in] pnum   Number of physical pages
+
+  @return
+  #QURT_PMEM_NO_PRIV -- Caller has no privilege to release ownership. \n
+  #QURT_PMEM_NOT_EXIST -- The physical memory range is not usable. \n
+  #QURT_PMEM_OUTSTANDING_MAPPING -- There is an outstanding mapping in this range. \n
+  #QURT_PMEM_SUCCESS -- Ownership released successfully.
+
+  @dependencies
+  None.
+ */
+int qurt_pmem_release(unsigned int ppage, unsigned int pnum);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PMEM_MANAGER_H */
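[Editor's usage sketch, not part of the patch: claim a physical range, use it, and release it, checking the documented return codes. The ppage/pnum values are caller-supplied and purely illustrative.]

    #include "qurt_pmem_manager.h"

    int claim_use_release(unsigned int ppage, unsigned int pnum)
    {
        int rc = qurt_pmem_acquire(ppage, pnum);
        if (rc != QURT_PMEM_SUCCESS) {
            return rc;   /* e.g. QURT_PMEM_NO_PRIV or QURT_PMEM_OVERLAP */
        }

        /* ... create mappings and use the memory here ... */

        /* Release fails with QURT_PMEM_OUTSTANDING_MAPPING while any
           mapping into this range still exists. */
        return qurt_pmem_release(ppage, pnum);
    }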
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_pmu.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_pmu.h
new file mode 100755
index 0000000000000..73ea8eba04abf
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_pmu.h
@@ -0,0 +1,121 @@
+#ifndef QURT_PMU_H
+#define QURT_PMU_H
+/**
+  @file qurt_pmu.h
+  Prototypes of the performance monitoring unit (PMU) API.
+
+  EXTERNAL FUNCTIONS
+  None.
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+  Copyright (c) 2021 Qualcomm Technologies, Inc.
+  All rights reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+  FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_pmu_set
+   Sets the value of the specified PMU register.
+
+   @note1hang Setting PMUEVTCFG automatically clears the PMU registers PMUCNT0
+              through PMUCNT3.
+
+   @param[in] reg_id  PMU register. Values:
+                      - #QURT_PMUCNT0
+                      - #QURT_PMUCNT1
+                      - #QURT_PMUCNT2
+                      - #QURT_PMUCNT3
+                      - #QURT_PMUCFG
+                      - #QURT_PMUEVTCFG
+                      - #QURT_PMUCNT4
+                      - #QURT_PMUCNT5
+                      - #QURT_PMUCNT6
+                      - #QURT_PMUCNT7
+                      - #QURT_PMUEVTCFG1 @tablebulletend
+
+   @param[in] reg_value  Register value.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+void qurt_pmu_set (int reg_id, unsigned int reg_value);
+
+/**@ingroup func_qurt_pmu_get
+   Gets the PMU register.\n
+   Returns the current value of the specified PMU register.
+
+   @param[in] reg_id  PMU register. Values:
+                      - #QURT_PMUCNT0
+                      - #QURT_PMUCNT1
+                      - #QURT_PMUCNT2
+                      - #QURT_PMUCNT3
+                      - #QURT_PMUCFG
+                      - #QURT_PMUEVTCFG
+                      - #QURT_PMUCNT4
+                      - #QURT_PMUCNT5
+                      - #QURT_PMUCNT6
+                      - #QURT_PMUCNT7
+                      - #QURT_PMUEVTCFG1 @tablebulletend
+
+   @return
+   Integer -- Current value of the specified PMU register.
+
+   @dependencies
+   None.
+ */
+unsigned int qurt_pmu_get (int reg_id);
+
+/**@ingroup func_qurt_pmu_enable
+   Enables or disables the Hexagon processor PMU.
+   Profiling is disabled by default.
+
+   @note1hang Enabling profiling does not automatically reset the count registers -- this must
+              be done explicitly before starting event counting.
+
+   @param[in] enable  Performance monitor. Values: \n
+                      - 0 -- Disable performance monitor \n
+                      - 1 -- Enable performance monitor @tablebulletend
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+void qurt_pmu_enable (int enable);
+
+/**@ingroup func_qurt_pmu_get_pmucnt
+   Reads PMU counters in a single trap.
+
+   @param[out] buf  Pointer to a buffer to save values read from PMU counters.
+                    Buffer size must be at least 32 bytes to read all eight PMU counters.
+
+   @return
+   #QURT_EOK -- Successful read.\n
+   #QURT_EFATAL -- Failure.
+
+   @dependencies
+   None.
 */
+int qurt_pmu_get_pmucnt (void * buf);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PMU_H */
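[Editor's usage sketch, not part of the patch: program an event, count a workload, then read all eight counters in one trap. The event_cfg value is a placeholder; valid encodings are chip-specific.]

    #include "qurt_pmu.h"

    void sample_pmu(unsigned int event_cfg)
    {
        unsigned int counters[8];   /* 32 bytes: all eight PMU counters */

        /* Per the note above, writing PMUEVTCFG clears PMUCNT0..PMUCNT3. */
        qurt_pmu_set(QURT_PMUEVTCFG, event_cfg);
        qurt_pmu_enable(1);

        /* ... run the workload under measurement ... */

        qurt_pmu_enable(0);
        (void)qurt_pmu_get_pmucnt(counters);   /* QURT_EOK on success */
    }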
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_power.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_power.h
new file mode 100755
index 0000000000000..2ee4d29a73976
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_power.h
@@ -0,0 +1,140 @@
+#ifndef QURT_POWER_H
+#define QURT_POWER_H
+/**
+  @file qurt_power.h
+  @brief Prototypes of power API
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2018-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+/*=============================================================================
+
+                        EDIT HISTORY FOR MODULE
+
+ This section contains comments describing changes made to the module.
+ Notice that changes are listed in reverse chronological order.
+
+
+when     who what, where, why
+-------- --- ------------------------------------------------------------
+03/03/11 op  Add header file
+12/12/12 cm  (Tech Pubs) Edited/added Doxygen comments and markup.
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @cond */
+/**@ingroup func_qurt_power_shutdown_fail_exit
+   Returns from Power Collapse mode when power collapse cannot proceed.
+
+   This function unmasks the global interrupt. This operation is used only when the thread is
+   recovering from a failed power collapse operation (Section @xref{sec:powerShutdownEnter}).
+
+   @return
+   #QURT_EOK -- Operation was successfully performed.
+
+   @dependencies
+   None.
+ */
+#define qurt_power_shutdown_fail_exit qurt_power_exit
+
+/**@ingroup func_qurt_power_shutdown_exit
+   Undoes state changes made preparing for power collapse.\n
+   This function unmasks the global interrupts.
+
+   @return
+   #QURT_EOK -- Operation was successfully performed.
+
+   @dependencies
+   None.
+ */
+#define qurt_power_shutdown_exit qurt_power_exit
+/**@endcond */
+
+/**@ingroup func_qurt_system_ipend_get
+   Gets the IPEND register.\n
+
+   @note1hang Returns the current value of the Hexagon processor IPEND register. The return value
+   is a mask value that identifies the individual interrupts that are pending. \n
+
+   @note1hang The bit order of the mask value is identical to the order defined for the IPEND register. A
+   mask bit value of 1 indicates that the corresponding interrupt is pending, and 0 indicates that the
+   corresponding interrupt is not pending. \n
+
+   @return
+   Returns the IPEND register value.
+
+   @dependencies
+   None.
 */
+unsigned int qurt_system_ipend_get (void);
+
+
+/**@ingroup func_qurt_system_vid_get
+   Gets the VID register. \n
+
+   @note1hang Returns the current value of the Hexagon processor VID register. The return value is
+   the vector number of a second-level interrupt that has been accepted by the Hexagon
+   processor core.\n
+
+   @return
+   Returns the VID register value, that is, the L2 VIC interrupt number accepted by the processor.
+   Valid range is 0 to 1023.
+
+   @dependencies
+   None.
 */
+unsigned int qurt_system_vid_get(void);
+
+/**@ingroup func_qurt_power_shutdown_get_pcycles
+   Gets the number of power collapses and processor cycles for entering and exiting the most recent
+   power collapse.
+
+   @note1hang If no power collapse has occurred yet, processor cycle numbers are zero.
+
+   @param[out] enter_pcycles  Number of processor cycles for entering the most
+                              recent power collapse.
+   @param[out] exit_pcycles   Number of processor cycles for exiting the most
+                              recent power collapse.
+   @return
+   Zero -- No power collapses have occurred. \n
+   Nonzero -- Number of power collapses that have occurred since
+              the processor was reset.
+
+   @dependencies
+   None.
 */
+int qurt_power_shutdown_get_pcycles( unsigned long long *enter_pcycles, unsigned long long *exit_pcycles );
+
+/**@ingroup func_qurt_system_tcm_set_size
+   Sets the size of TCM to save during full power collapse.
+
+   @note1hang The size is aligned to 32 bytes. If the size passed is greater than the maximum size
+              defined in the XML, the size is truncated to the size defined in the XML.
+
+   @param[in] new_size  Size of TCM to save.
+
+   @return
+   Zero -- Size successfully set \n
+   -1 -- Size of 0 passed
+
+   @dependencies
+   None.
 */
+int qurt_system_tcm_set_size(unsigned int new_size);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_POWER_H */
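[Editor's usage sketch, not part of the patch: count the pending interrupts reported by the IPEND mask, using only the getter documented above.]

    #include "qurt_power.h"

    unsigned int count_pending_interrupts(void)
    {
        unsigned int ipend = qurt_system_ipend_get();
        unsigned int n = 0;

        /* One mask bit per pending interrupt; clear the lowest set bit
           each iteration. */
        while (ipend != 0) {
            ipend &= ipend - 1;
            n++;
        }
        return n;
    }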
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_printf.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_printf.h
new file mode 100755
index 0000000000000..a775d8a815918
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_printf.h
@@ -0,0 +1,44 @@
+#ifndef QURT_PRINTF_H
+#define QURT_PRINTF_H
+
+#include <stdarg.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+  @file qurt_printf.h
+  Prototypes of printf API.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+
+/*=============================================================================
+  CONSTANTS AND MACROS
+=============================================================================*/
+/** @addtogroup chapter_function_tracing
+@{ */
+
+int qurt_printf(const char* format, ...);
+
+int qurt_vprintf(const char* format, va_list args);
+
+/** @} */ /* end_addtogroup chapter_function_tracing */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PRINTF_H */
+
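[Editor's usage sketch, not part of the patch: a variadic logging wrapper forwarding to qurt_vprintf, which is why this header needs <stdarg.h> (restored above, since qurt_vprintf takes a va_list). The "LOG: " prefix and function name are illustrative.]

    #include <stdarg.h>
    #include "qurt_printf.h"

    int log_msg(const char *fmt, ...)
    {
        va_list args;
        int n;

        qurt_printf("LOG: ");
        va_start(args, fmt);
        n = qurt_vprintf(fmt, args);   /* forward the caller's arguments */
        va_end(args);
        return n;
    }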
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_process.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_process.h
new file mode 100755
index 0000000000000..0df9ddc2d4a70
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_process.h
@@ -0,0 +1,995 @@
+#ifndef QURT_PROCESS_H
+#define QURT_PROCESS_H
+/**
+  @file qurt_process.h
+  @brief Prototypes of QuRT process control APIs.
+
+  EXTERNALIZED FUNCTIONS
+  None
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None
+
+  Copyright (c) 2009-2013, 2021-2023 Qualcomm Technologies, Inc.
+  All rights reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+#include "qurt_callback.h"
+#include "qurt_consts.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup process_types
+@{ */
+#define QURT_PROCESS_ATTR_NAME_MAXLEN QURT_MAX_NAME_LEN /**< Maximum length of the process name. */
+#define QURT_PROCESS_ATTR_BIN_PATH_MAXLEN 128 /**< Maximum length of the path of binary/ELF for this process. */
+#define QURT_PROCESS_ATTR_CAP_MAXLEN 128 /**< Maximum length for a resource name. */
+
+/** QuRT process capability wildcard strings */
+#define QURT_PROCESS_ATTR_CAP_ALLOW_ALL "ALLOW_ALL" /**< Capability wild-card for full access */
+#define QURT_PROCESS_ATTR_CAP_ALLOW_NONE "ALLOW_NONE" /**< Capability wild-card for no access */
+
+/** QuRT process capability states */
+#define QURT_PROCESS_ATTR_CAP_ENABLED 0x1 /**< Capability enabled*/
+#define QURT_PROCESS_ATTR_CAP_DISABLED 0x0 /**< Capability disabled*/
+
+/* QuRT process thread attributes. */
+#define QURT_PROCESS_DEFAULT_CEILING_PRIO 0 /**< Default ceiling priority of the threads in the new process. */
+#define QURT_PROCESS_DEFAULT_MAX_THREADS -1 /**< Default number of threads in the new process.
+                                                 -1 indicates that the limit is set to the maximum supported by the system. */
+
+/* QuRT process flags. */
+#define QURT_PROCESS_SUSPEND_ON_STARTUP (1U) /**< Suspends the new process just before calling main(). */
+#define QURT_PROCESS_NON_SYSTEM_CRITICAL (1u << 1) /**< Starts the new process as non system-critical. */
+#define QURT_PROCESS_ISLAND_RESIDENT (1u << 2) /**< Process is island resident. */
+#define QURT_PROCESS_RESTARTABLE (1u << 3) /**< Indicates that the process is restartable. */
+#define QURT_PROCESS_UNTRUSTED (1u << 7) /**< Starts the new process as an unsigned process. */
+
+/* QuRT process debugging session status.*/
+#define QURT_DEBUG_NOT_START 0 /**< Debug is not started. */
+#define QURT_DEBUG_START 1 /**< Debug has started. */
+
+/** Process Suspend Options */
+#define QURT_PROCESS_SUSPEND_DEFAULT 0
+
+/** Process Resume Options */
+#define QURT_PROCESS_RESUME_DEFAULT 0
+
+
+/* QuRT process types. */
+typedef enum {
+    QURT_PROCESS_TYPE_RESERVED, /**< Process type is reserved. \n */
+    QURT_PROCESS_TYPE_KERNEL, /**< Kernel process. \n*/
+    QURT_PROCESS_TYPE_SRM, /**< SRM process. \n*/
+    QURT_PROCESS_TYPE_SECURE, /**< Secure process. \n*/
+    QURT_PROCESS_TYPE_ROOT, /**< Root process. \n*/
+    QURT_PROCESS_TYPE_USER, /**< User process. */
+}qurt_process_type_t;
+
+/** QuRT process callback types. */
+typedef enum {
+    QURT_PROCESS_DUMP_CB_ROOT, /**< Register the callback that executes in the
+                                    root process context. \n */
+    QURT_PROCESS_DUMP_CB_ERROR, /**< Register the user process callback that is
+                                     called after threads in the process are frozen. \n */
+    QURT_PROCESS_DUMP_CB_PRESTM, /**< Register the user process callback that is
+                                      called before threads in the process are frozen. \n*/
+    QURT_PROCESS_DUMP_CB_MAX /**< Reserved for error checking. */
+}qurt_process_dump_cb_type_t;
+
+/** QuRT process dump attributes. */
+typedef struct _qurt_pd_dump_attr{
+    /** @cond */
+    unsigned int enabled; /**< Process dump is enabled. */
+    const char *path; /**< Process dump path. */
+    unsigned int path_len; /**< Length of process dump path. */
+    /** @endcond */
+}qurt_pd_dump_attr_t;
+
+/** QuRT process capability resource type */
+enum qurt_process_cap_type_t {
+    QURT_PROCESS_CAP_TYPE_NUM_ENTRIES=0, /**< Number of entries in the capability structure*/
+    QURT_PROCESS_CAP_TYPE_DRIVER=1, /**< Driver resource */
+    QURT_PROCESS_CAP_TYPE_MAX /**< Maximum identifier */
+};
+
+/** QuRT process capability structure */
+typedef struct _qurt_capability {
+    enum qurt_process_cap_type_t type; /**< Resource type */
+    char name[QURT_PROCESS_ATTR_CAP_MAXLEN]; /**< Resource name*/
+    unsigned long long cap; /**< Capabilities allowed for this resource */
+}qurt_capability_t;
+
+/** QuRT process attributes. */
+typedef struct _qurt_process_attr {
+    /** @cond */
+    char name[QURT_PROCESS_ATTR_NAME_MAXLEN]; /**< Name of the new process. */
+    char path[QURT_PROCESS_ATTR_BIN_PATH_MAXLEN]; /**< Path of the binary for the new process. */
+    char dtb_path[QURT_PROCESS_ATTR_BIN_PATH_MAXLEN]; /**< Path of the DTB ELF for the new process. */
+    int flags; /**< Flags as indicated by QuRT process flags. */
+    unsigned int sw_id; /**< Software ID of the process to be loaded. */
+    unsigned sid; /**< Stream ID of the process being spawned. */
+    unsigned max_threads; /**< Maximum number of threads that the new process can create. */
+    unsigned short ceiling_prio; /**< Maximum priority at which threads can be
+                                      created by the new process. */
+    qurt_process_type_t type; /**< Process type as indicated by
+                                   #qurt_process_type_t. */
+    qurt_pd_dump_attr_t dump_attr; /**< Process dump attributes for the new process
+                                        as indicated by #qurt_pd_dump_attr_t. */
+    qurt_capability_t *capabilities; /**< Pointer to array of structures of type
+                                          qurt_capability_t */
+    /** @endcond */
+} qurt_process_attr_t;
+
+/** @} */ /* end_addtogroup process_types */
+
+/*=============================================================================
+FUNCTIONS
+=============================================================================*/
+ /** @cond rest_reg_dist */
+/**@ingroup func_qurt_process_create
+   Creates a process with the specified attributes, and starts the process.
+
+   The process executes the code in the specified executable ELF file.
+
+   @datatypes
+   #qurt_process_attr_t
+
+   @param[in] attr  Pointer to an initialized process attribute structure, which specifies
+                    the attributes of the created process.
+
+   @return
+   Positive return value -- Process ID. \n
+   Negative return value -- One of the following errors: \n
+   #-QURT_EPRIVILEGE -- Caller does not have privilege for this operation \n
+   #-QURT_EMEM -- Not enough memory to perform the operation \n
+   #-QURT_EFAILED -- Operation failed \n
+   #-QURT_ENOTALLOWED -- Operation not allowed \n
+   #-QURT_ENOREGISTERED -- Not registered \n
+   #-QURT_ENORESOURCE -- Resource exhaustion \n
+   #-QURT_EINVALID -- Invalid argument value \n
+   #QURT_EFATAL -- attr is NULL
+
+   @dependencies
+   None.
+*/
+int qurt_process_create (qurt_process_attr_t *attr);
+
+/**@ingroup func_qurt_process_get_id
+   Returns the process identifier for the current thread.
+
+   @return
+   Process identifier for the current thread.
+
+   @dependencies
+   None.
+*/
+int qurt_process_get_id (void);
+/** @endcond */
+
+/** @cond internal_only*/
+/**@ingroup func_qurt_process_get_uid
+   Returns the user identifier for the current thread.
+
+   @return
+   User identifier for the current thread.
+
+   @dependencies
+   None.
+*/
+int qurt_process_get_uid (void);
+/** @endcond */
+/** @cond rest_reg_dist */
+/**@ingroup func_qurt_process_attr_init
+   Initializes the structure that sets the process attributes when a thread is created.
+
+   After an attribute structure is initialized, the individual attributes in the structure can
+   be explicitly set using the process attribute operations.
+
+   Table @xref{tbl:processAttrDefaults} lists the default attribute values set by the initialize
+   operation.
+
+   @inputov{table_process_attribute_defaults}
+
+   @datatypes
+   #qurt_process_attr_t
+
+   @param[out] attr  Pointer to the structure to initialize.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline void qurt_process_attr_init (qurt_process_attr_t *attr)
+{
+    attr->name[0] = '\0';
+    attr->path[0] = '\0';
+    attr->dtb_path[0] = '\0';
+    attr->flags = 0;
+    attr->sw_id = 0;
+    attr->sid = 0;
+    attr->max_threads = (unsigned)QURT_PROCESS_DEFAULT_MAX_THREADS;
+    attr->ceiling_prio = QURT_PROCESS_DEFAULT_CEILING_PRIO;
+    attr->type = QURT_PROCESS_TYPE_RESERVED;
+    attr->dump_attr.enabled = 0;
+    attr->dump_attr.path = NULL;
+    attr->dump_attr.path_len = 0;
+    attr->capabilities = NULL;
+}
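[Editor's usage sketch, not part of the patch: create a user process that starts suspended before main(), using qurt_process_attr_init and the attribute setters documented below. The ELF name and software ID are placeholders.]

    #include "qurt_process.h"

    int spawn_suspended(void)
    {
        qurt_process_attr_t attr;
        int pid;

        qurt_process_attr_init(&attr);
        qurt_process_attr_set_executable(&attr, "my_app.elf");     /* placeholder name */
        qurt_process_attr_set_flags(&attr, QURT_PROCESS_SUSPEND_ON_STARTUP);
        qurt_process_attr_set_sw_id(&attr, 0x1234u);                /* placeholder ID */

        pid = qurt_process_create(&attr);
        return pid;   /* >= 0: process ID; < 0: negated QURT_E* error code */
    }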
+
+/**@ingroup func_qurt_process_attr_set_executable
+   Sets the process name in the specified process attribute structure.
+
+   Process names identify process objects that are already
+   loaded in memory as part of the QuRT system.
+
+   @note1hang Process objects are incorporated into the QuRT system at build time.
+
+   @note1hang Maximum length of the name string is limited to QURT_PROCESS_ATTR_NAME_MAXLEN - 1.
+
+   @datatypes
+   #qurt_process_attr_t
+
+   @param[in] attr  Pointer to the process attribute structure.
+   @param[in] name  Pointer to the process name.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+void qurt_process_attr_set_executable (qurt_process_attr_t *attr, const char *name);
+
+/**@ingroup func_qurt_process_attr_set_binary_path
+   Sets the binary path for the process loading in the specified process attribute structure.
+
+   Path specifies the binary to load for this process.
+
+   @note1hang Maximum length of the path string is limited to QURT_PROCESS_ATTR_BIN_PATH_MAXLEN-1.
+
+   @datatypes
+   #qurt_process_attr_t
+
+   @param[in] attr  Pointer to the process attribute structure.
+   @param[in] path  Pointer to the binary path.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+void qurt_process_attr_set_binary_path(qurt_process_attr_t *attr, char *path);
+
+/**@ingroup func_qurt_process_attr_set_dtb_path
+   Sets the DTB binary path for the process loading in the specified process attribute structure.
+
+   Path specifies the DTB binary to load for this process.
+
+   @note1hang Maximum length of the path string is limited to QURT_PROCESS_ATTR_BIN_PATH_MAXLEN-1.
+
+   @datatypes
+   #qurt_process_attr_t
+
+   @param[in] attr  Pointer to the process attribute structure.
+   @param[in] path  Pointer to the binary path.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+void qurt_process_attr_set_dtb_path(qurt_process_attr_t *attr, char *path);
+
+/**@ingroup func_qurt_process_attr_set_flags
+Sets the process properties in the specified process attribute structure.
+Process properties are represented as defined symbols that map into bits
+0 through 31 of the 32-bit flag value. Multiple properties are specified by OR'ing
+together the individual property symbols.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr  Pointer to the process attribute structure.
+@param[in] flags QURT_PROCESS_NON_SYSTEM_CRITICAL -- Process is considered non system-critical.
+                     Error services use this attribute to decide whether to kill the
+                     user PD or the whole subsystem.
+                 QURT_PROCESS_ISLAND_RESIDENT -- Process is marked as island resident.
+                 QURT_PROCESS_RESTARTABLE -- Process is marked as restartable.
+                 QURT_PROCESS_UNTRUSTED -- Process is marked as an unsigned process.
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_flags (qurt_process_attr_t *attr, int flags)
+{
+    attr->flags = flags;
+}
+/** @endcond */
+/** @cond internal_only*/
+/**@ingroup func_qurt_process_attr_set_sid
+Sets the process streamID in the specified process attribute structure.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr  Pointer to the process attribute structure.
+@param[in] sid   streamID to set for this process.
+
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_sid (qurt_process_attr_t *attr, unsigned sid)
+{
+    attr->sid = sid;
+}
+/** @endcond */
+/** @cond rest_reg_dist */
+/**@ingroup func_qurt_process_attr_set_max_threads
+Sets the maximum number of threads allowed in the specified process attribute structure.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr         Pointer to the process attribute structure.
+@param[in] max_threads  Maximum number of threads allowed for this process.
+
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_max_threads (qurt_process_attr_t *attr, unsigned max_threads)
+{
+    attr->max_threads = max_threads;
+}
+
+/**@ingroup func_qurt_process_attr_set_sw_id
+Sets the software ID of the process to load in the specified process attribute structure.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr   Pointer to the process attribute structure.
+@param[in] sw_id  Software ID of the process, used in authentication.
+
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_sw_id(qurt_process_attr_t *attr, unsigned int sw_id)
+{
+    attr->sw_id = sw_id;
+}
+
+/**@ingroup func_qurt_process_attr_set_ceiling_prio
+Sets the highest thread priority allowed in the specified process attribute structure.
+Refer to qurt_thread.h for priority ranges.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr  Pointer to the process attribute structure.
+@param[in] prio  Priority.
+
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_ceiling_prio (qurt_process_attr_t *attr, unsigned short prio)
+{
+    attr->ceiling_prio = prio;
+}
+/** @endcond */
+
+/** @cond internal_only*/
+/**@ingroup func_qurt_process_attr_set_dump_status
+Sets the process domain dump-enabled field in the process domain dump attributes.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr     Pointer to the process attribute structure.
+@param[in] enabled  1 -- Process domain dump is collected \n
+                    0 -- Process domain dump is not collected
+
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_dump_status(qurt_process_attr_t *attr, unsigned int enabled)
+{
+    attr->dump_attr.enabled = enabled;
+}
+
+/**@ingroup func_qurt_process_attr_set_dump_path
+Sets the process domain dump path and type.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr      Pointer to the process attribute structure.
+@param[in] path      Path where the process domain dumps must be saved.
+@param[in] path_len  Length of the path string.
+
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_dump_path(qurt_process_attr_t *attr, const char *path, int path_len)
+{
+    attr->dump_attr.path = path;
+    attr->dump_attr.path_len = (unsigned int)path_len;
+}
+
+/**@ingroup func_qurt_process_attr_set_capabilities
+Sets the list of capabilities available to this process.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr          Pointer to the process attribute structure.
+@param[in] capabilities  Pointer to array of structures of type qurt_capability_t defining
+                         resources and capabilities
+
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_capabilities(qurt_process_attr_t *attr, qurt_capability_t *capabilities)
+{
+    attr->capabilities = capabilities;
+}
+
+/** @endcond */
+/** @cond rest_reg_dist */
+/**@ingroup func_qurt_process_cmdline_get
+Gets the command line string associated with the current process.
+This function retrieves the Hexagon simulator command line arguments,
+provided that the call is made in the process of the QuRT installation
+and that the program runs in a simulation environment.
+
+If the function modifies the provided buffer, it zero-terminates
+the string. It is possible that the function does not modify the
+provided buffer, so the caller must set buf[0] to a NULL
+byte before making the call. A truncated command line is returned when
+the command line is longer than the provided buffer.
+
+@param[in] buf      Pointer to a character buffer that must be filled in.
+@param[in] buf_siz  Size (in bytes) of the buffer pointed to by the buf argument.
+
+@return
+None.
+
+@dependencies
+None.
+*/
+void qurt_process_cmdline_get(char *buf, unsigned buf_siz);
+
+/**@ingroup func_qurt_process_get_thread_count
+Gets the number of threads present in the process indicated by the PID.
+
+@param[in] pid  PID of the process for which the information is required.
+
+@return
+Positive value -- Number of threads in the process indicated by the PID. \n
+Negative error code on failure: \n
+ -QURT_EFATAL -- Invalid PID \n
+ -QURT_ENOTALLOWED -- The current process does not have access to the target process indicated by the PID
+
+@dependencies
+None.
+*/
+int qurt_process_get_thread_count(unsigned int pid);
+
+/**@ingroup func_qurt_process_get_thread_ids
+Gets the thread IDs for a process indicated by PID.
+
+@param[in] pid         PID of the process for which the information is required.
+@param[in] ptr         Pointer to a user passed buffer that must be filled in with thread IDs.
+@param[in] thread_num  Number of thread IDs requested.
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EFATAL -- Failed, ptr is NULL
+
+@dependencies
+None.
+ */
+int qurt_process_get_thread_ids(unsigned int pid, unsigned int *ptr, unsigned thread_num);
+/** @endcond */
+/** @cond internal_only*/
+/**@ingroup func_qurt_process_dump_get_mem_mappings_count
+Gets the number of mappings present in the process indicated by the PID.
+
+@param[in] pid  PID of the process for which the information is required.
+
+@return
+Number of mappings for the process indicated by the PID.
+
+@dependencies
+None.
+*/
+int qurt_process_dump_get_mem_mappings_count(unsigned int pid);
+
+/**@ingroup func_qurt_process_dump_get_mappings
+Gets the mappings for a specified PID.
+
+@note1hang This API skips device type mappings or mappings created by setting the #QURT_PERM_NODUMP attribute.
+
+@param[in] pid    PID of the process for which the information is required.
+@param[in] ptr    Pointer to a buffer that must be filled in with mappings.
+@param[in] count  Count of mappings requested.
+
+@return
+Number of mappings filled in the buffer passed by the user.
+
+@dependencies
+None.
+*/
+int qurt_process_dump_get_mappings(unsigned int pid, unsigned int *ptr, unsigned count);
+/** @endcond */
+/** @cond rest_reg_dist */
+/**@ingroup func_qurt_process_attr_get
+Gets the attributes with which the process was created.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] pid       PID of the process for which the information is required.
+@param[in,out] attr  Pointer to the user allocated attribute structure.
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EINVALID -- Invalid PID \n
+#QURT_EFATAL -- attr is NULL
+
+@dependencies
+None.
+*/
+int qurt_process_attr_get(unsigned int pid, qurt_process_attr_t *attr);
+
+/**@ingroup func_qurt_process_dump_register_cb
+Registers the process domain dump callback.
+
+@datatypes
+#qurt_cb_data_t \n
+#qurt_process_dump_cb_type_t
+
+@param[in] cb_data   Pointer to the callback information.
+@param[in] type      Callback type; these callbacks are called in the context of the user process domain: \n
+                     #QURT_PROCESS_DUMP_CB_PRESTM -- Before threads of the exiting process are frozen. \n
+                     #QURT_PROCESS_DUMP_CB_ERROR -- After threads are frozen and captured. \n
+                     #QURT_PROCESS_DUMP_CB_ROOT -- After threads are frozen and captured, and CB_ERROR type of callbacks
+                     are called.
+@param[in] priority  Priority.
+ +@return +#QURT_EOK -- Success \n +Other values -- Failure + QURT_EFATAL if cb_data is NULL + QURT_EINVALID If invalid cb_type + QURT_EFAILED If invalid cb_data + +@dependencies +None. +*/ +int qurt_process_dump_register_cb(qurt_cb_data_t *cb_data, qurt_process_dump_cb_type_t type, unsigned short priority); + +/**@ingroup func_qurt_process_dump_deregister_cb +Deregisters the process domain dump callback. + +@datatypes +#qurt_cb_data_t \n +#qurt_process_dump_cb_type_t + +@param[in] cb_data Pointer to the callback information to deregister. +@param[in] type Callback type. + +@return +#QURT_EOK -- Success.\n +Other values -- Failure. + QURT_EFATAL if cb_data is NULL + QURT_EINVALID If invalid cb_type + QURT_EFAILED If invalid cb_data + +@dependencies +None. +*/ +int qurt_process_dump_deregister_cb(qurt_cb_data_t *cb_data,qurt_process_dump_cb_type_t type); + +/** @endcond */ +/** @cond internal_only*/ +/**@ingroup func_qurt_process_set_rtld_debug +Sets rtld_debug for a process. + +@param[in] pid PID of the process for which rtld_debug must be set. +@param[in] address rtld_debug address. + +@return +#QURT_EOK - Success +#QURT_EINVALID - Invalid PID +#QURT_EFATAL - Invalid address + +@dependencies +None. +*/ +int qurt_process_set_rtld_debug(unsigned int pid,unsigned int address); + +/**@ingroup func_qurt_process_get_rtld_debug +Gets rtld_debug for a process. + +@param[in] pid PID of the process for which rtld_debug must be set. +@param[in,out] address Pointer to the user passed address in which the rtld_debug address must be returned. + +@return +#QURT_EOK - Success +#QURT_EINVALID - Invalid PID +#QURT_EFATAL - Invalid address + +@dependencies +None. +*/ +int qurt_process_get_rtld_debug(unsigned int pid,unsigned int *address); +/** @endcond */ +/**@ingroup func_qurt_process_exit +Exits the current user process with an exit code. + +@param[in] exitcode Exit code. + +@return +#QURT_EFATAL -- No client found with the specified PID value \n +#QURT_EINVALID -- Invalid client \n +#QURT_ENOTALLOWED -- User does not have permission to perform this operation \n +#QURT_EOK -- Success + +@dependencies +None. +*/ +int qurt_process_exit(int exitcode); + +/**@ingroup func_qurt_process_kill +Kills the process represented by the PID with the exit code. + +@param[in] pid PID of the process to kill. +@param[in] exitcode Exit code. + +@return +#QURT_EFATAL -- No client found with the specified PID value \n +#QURT_EINVALID -- Invalid client \n +#QURT_ENOTALLOWED -- User does not have permission to perform this operation \n +#QURT_EOK -- Success + +@dependencies +None. +*/ +int qurt_process_kill(int pid, int exitcode); + + +/**@ingroup func_qurt_debugger_register_process +Registers the process indicated by the PID with the debug monitor. + +@param[in] pid PID of the process. +@param[in] adr Address. + +@return +#QURT_EOK -- Success + +@dependencies +None. +*/ +int qurt_debugger_register_process(int pid, unsigned int adr); + + +/**@ingroup func_qurt_debugger_deregister_process +Deregister the process indicated by the PID with the debug monitor. + +@param[in] pid PID of the process. + +@return +#QURT_EOK -- Success + +@dependencies +None. +*/ +int qurt_debugger_deregister_process(int pid); + +/**@ingroup func_qurt_process_exec_callback +Executes callbacks in the user process as indicated by the client_handle argument. + +@param[in] client_handle Client handle obtained from the current invocation function (Section 3.4.1). +@param[in] callback_fn Callback function to execute. 
+@param[in] stack_base     Stack address to use.
+@param[in] stack_size     Stack size.
+
+@return
+#QURT_EOK -- Success
+
+@dependencies
+None.
+*/
+int qurt_process_exec_callback(int client_handle,
+                               unsigned callback_fn,
+                               unsigned stack_base,
+                               unsigned stack_size);
+
+/**@ingroup func_qurt_process_get_pid
+Gets the process ID of the process that the client_handle argument represents.
+
+@note1hang This API is not supported for an unsigned PD. For an unsigned PD, use qurt_process_get_id().
+
+@param[in] client_handle  Client handle obtained from the current invocation function (Section 3.4.1).
+@param[in] pid            Pointer to the address to store the PID.
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EFATAL -- pid pointer passed as NULL
+
+@dependencies
+None.
+*/
+int qurt_process_get_pid(int client_handle, int * pid);
+
+/**@ingroup func_qurt_process_get_dm_status
+Gets the debugging session status on the process represented by the pid argument.
+
+@param[in] pid         Process ID
+@param[in,out] status  Address to store the status: \n
+                       #QURT_DEBUG_NOT_START \n
+                       #QURT_DEBUG_START
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EINVALID -- Error
+
+@dependencies
+None.
+*/
+int qurt_process_get_dm_status( unsigned int pid, unsigned int *status);
+
+
+/**@ingroup func_qurt_process_suspend_threads
+   Suspends user threads in a user process with its process identifier.
+   The target user process can be a signed user process or an unsigned user process.
+   The caller is from a thread in the GuestOS/root process.
+   After the user threads in the target user process are suspended, they cannot be scheduled to run by the kernel
+   until they resume later.
+
+   This function has one optional argument with one default option.
+   #QURT_PROCESS_SUSPEND_DEFAULT suspends user threads in the target user process.
+
+   This is a synchronous call; the function returns after the relevant threads are
+   completely suspended.
+
+   If some user threads in the target user process are set as non-suspendable, this function call does
+   not suspend these threads.
+
+   If the target user process is already suspended, this function call returns success as the
+   confirmation on the user process suspending.
+
+   QuRT debugger monitor threads in the target user process are non-suspendable; this function call does
+   not suspend those threads.
+
+   If the target user process is a secure user process, or a CPZ process, this function call returns an error
+   without suspending the target user process.
+
+   If a user thread in the target user process runs in the guest OS/root process via a QDI call, this function call
+   does not suspend the thread in the guest OS, but instead marks the thread as pending-suspend. The thread is suspended
+   when it exits the guest OS, before executing the first instruction in the user process.
+   In this case, the function returns success while the user thread can be running in the GuestOS, and is suspended
+   when exiting the guest OS.
+
+   @param[in] process_id  Process identifier.
+   @param[in] option      Default option #QURT_PROCESS_SUSPEND_DEFAULT suspends user threads in the target user process.
+
+   @return
+   #QURT_EOK -- Success \n
+   #QURT_EINVALID -- Failure because of invalid process_id input \n
+   #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, on a secure process/CPZ process.
+
+   @dependencies
+   None.
+ */
+int qurt_process_suspend_threads (unsigned int process_id, unsigned int option);
+
+
+/**@ingroup func_qurt_process_resume_threads
+   Resumes a user process with its process identifier.
+   The target user process can be a signed user process or an unsigned user process.
+   The caller is from a thread in the guest OS/root process.
+   After the user threads in the target user process resume, the kernel scheduler
+   can schedule the user threads to run based on their thread priorities.
+
+   This function has an optional argument, #QURT_PROCESS_RESUME_DEFAULT, which
+   resumes user threads in the target user process.
+
+   This is an asynchronous function; it returns after the kernel moves the user threads from
+   suspended state to runnable state. The threads are scheduled to run based on their thread priorities.
+
+   This function call does not resume threads in the target user process that have been set as non-resumable.
+
+   If the target user process has already resumed, this function call returns success as confirmation
+   that the user process is resumed.
+
+   If the target user process is a secure user process or a CPZ process, this function call returns an error without
+   a resuming operation.
+
+   If user threads in the target user process run in the guest OS/root process via a QDI call, this function
+   call clears the mark of suspend-pending on these threads, so that the threads are not suspended when they exit
+   the guest OS.
+
+   @param[in] process_id  Process identifier.
+   @param[in] option      Default option #QURT_PROCESS_RESUME_DEFAULT resumes user threads in the target user process.
+
+   @return
+   #QURT_EOK -- Success \n
+   #QURT_EINVALID -- Failure because of invalid process_id input. \n
+   #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, on a secure process/CPZ process.
+
+   @dependencies
+   None.
+ */
+int qurt_process_resume_threads (unsigned int process_id, unsigned int option);
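[Editor's usage sketch, not part of the patch: freeze and thaw a user process from the root PD using the two calls above; pid is assumed to come from qurt_process_create(). Error constants are from the QuRT error headers.]

    #include "qurt_process.h"

    int freeze_then_thaw(unsigned int pid)
    {
        /* Synchronous: returns only after suspendable threads are frozen. */
        int rc = qurt_process_suspend_threads(pid, QURT_PROCESS_SUSPEND_DEFAULT);
        if (rc != QURT_EOK) {
            return rc;   /* QURT_EINVALID or QURT_ENOTALLOWED */
        }

        /* ... inspect or checkpoint the frozen process here ... */

        /* Asynchronous: threads become runnable and are scheduled by priority. */
        return qurt_process_resume_threads(pid, QURT_PROCESS_RESUME_DEFAULT);
    }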
+
+/**@ingroup func_qurt_process_vtcm_window_set
+   Sets a VTCM access window for a process.
+   The calling thread must be in the SRM process.
+
+   This is a synchronous function; it ensures that all running threads of the process have the requested
+   window in effect. The requested view for all non-running threads takes effect when they are
+   scheduled.
+
+   @param[in] pid          Process identifier.
+   @param[in] enable       QURT_VTCM_WINDOW_ENABLE enforces the VTCM access window defined by the high and low offsets.
+                           QURT_VTCM_WINDOW_DISABLE ignores the high and low offsets and fully disables VTCM access
+                           for the process.
+   @param[in] high_offset  Specifies the high window offset, in 4K increments, from the base address of the VTCM.
+                           QURT_VTCM_WINDOW_HI_OFFSET_DEFAULT restores the high offset to its reset value.
+   @param[in] low_offset   Specifies the low window offset, in 4K increments, from the base address of the VTCM.
+                           QURT_VTCM_WINDOW_LO_OFFSET_DEFAULT restores the low offset to its reset value.
+
+   @note1hang
+   When high_offset is set to QURT_VTCM_WINDOW_HI_OFFSET_DEFAULT and low_offset is set to
+   QURT_VTCM_WINDOW_LO_OFFSET_DEFAULT, the full VTCM range is accessible. Access to VTCM is controlled
+   via the MMU mapping for the process.
+
+   @return
+   #QURT_EOK -- Success \n
+   #QURT_EVAL -- Failure because of invalid inputs. \n
+   #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation. \n
+   #QURT_ENOTSUPPORTED -- Failure because the operation is not supported due to limitations in HW capabilities
+
+   @dependencies
+   None.
+ */
+int qurt_process_vtcm_window_set(int pid, unsigned int enable, unsigned int high_offset, unsigned int low_offset);
+
+/**@ingroup func_qurt_process_vtcm_window_get
+   Gets the VTCM window for a process.
+   The calling thread must be in the SRM process.
+
+   @param[in] pid           Process identifier.
+   @param[out] enable       Address to store the enable status.
+   @param[out] high_offset  Address to return the high window offset, in 4K increments, from the base address of the VTCM.
+   @param[out] low_offset   Address to return the low window offset, in 4K increments, from the base address of the VTCM.
+
+   @note1hang
+   The user must first check the returned enable value before checking the high and low offsets.
+
+   @return
+   #QURT_EOK -- Success \n
+   #QURT_EVAL -- Failure because of invalid inputs. \n
+   #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation. \n
+   #QURT_ENOTSUPPORTED -- Failure because the operation is not supported due to limitations in HW capabilities
+
+   @dependencies
+   None.
+ */
+int qurt_process_vtcm_window_get(int pid, unsigned int *enable, unsigned int *high_offset, unsigned int *low_offset);
+
+/**@ingroup func_qurt_process_set_group_config
+   Enables thread groups in the process with the specified ceiling priorities.
+
+   @param[in] process_id          Process identifier.
+   @param[in] group_bitmask       64-bit mask of active thread groups
+   @param[in] ceiling_priorities  Array of ceiling priorities for the thread groups
+
+   @note1hang
+   This API can only be called by the root PD and can only be called once for each process; otherwise it is
+   rejected. Group 0 must be enabled in group_bitmask, otherwise QuRT returns an error. After this API, all
+   existing threads are moved to group 0, and any thread whose priority is higher than the ceiling
+   priority of group 0 is lowered to the ceiling value.
+   Example 1:
+     group_bitmask = 0xD7; //'b11010111
+     ceiling_priorities[] = {100, 128, 200, 0, 196, 0, 240, 20}; // 0 - does not care
+   Example 2:
+     group_bitmask = 0x5; //'b101
+     ceiling_priorities[] = {240, 0, 20}; // 0 - does not care
+
+
+   @return
+   #QURT_EOK -- Success. \n
+   #QURT_EVAL -- Failure because of invalid inputs. \n
+   #QURT_ENOTALLOWED -- The group has been configured already.
+
+   @dependencies
+   None.
+ */
+int qurt_process_set_group_config(unsigned int process_id, unsigned long long group_bitmask,
+                                  unsigned char *ceiling_priorities);
+
+
+/**@ingroup func_qurt_process_stid_set
+   Sets the specified stid for a process or for a thread group within a process.
+
+   @param[in] pid       Process identifier.
+   @param[in] group_id  Group identifier
+   @param[in] stid      stid to be set
+
+   @note1hang
+   The user can pass the default group_id (QURT_THREAD_DEFAULT_GROUP_ID) if the stid must be set at the process level.
+   All threads within a process that have the default stid (QURT_STID_DEFAULT) inherit the stid set for the process.
+   When a non-default group_id is specified, the stid is set only for that thread group.
+
+   @return
+   #QURT_EOK -- Success \n
+   #QURT_EFATAL -- Invalid PID \n
+   #QURT_EVAL -- Failure because of invalid inputs. \n
+   #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation.
+
+   @dependencies
+   None.
+ */
+int qurt_process_stid_set(unsigned int pid, unsigned int group_id, unsigned int stid);
+
+/**@ingroup func_qurt_process_stid_get
+   Gets the stid for a process or for a thread group within a process.
+
+   @param[in] pid       Process identifier.
+   @param[in] group_id  Group identifier
+   @param[out] stid     Pointer to a variable to return the stid
+
+   @note1hang
+   The user can pass the default group_id (QURT_THREAD_DEFAULT_GROUP_ID) to return the process-level stid.
+   When a non-default group_id is specified, the stid is returned only for that thread group.
+
+   @return
+   #QURT_EOK -- Success \n
+   #QURT_EFATAL -- Invalid PID \n
+   #QURT_EVAL -- Failure because of invalid inputs. \n
+   #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation.
+
+   @dependencies
+   None.
+ */
+int qurt_process_stid_get(unsigned int pid, unsigned int group_id, unsigned int *stid);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_profile.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_profile.h
new file mode 100755
index 0000000000000..2a50c461440f6
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_profile.h
@@ -0,0 +1,98 @@
+#ifndef QURT_PROFILE_H
+#define QURT_PROFILE_H
+/**
+  @file qurt_profile.h
+  QuRT profiling support.
+
+EXTERNAL FUNCTIONS
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2018, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+==============================================================================*/
+#include "qurt_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup profiling_macros
+@{ */
+#define QURT_PROFILE_DISABLE 0 /**< Disable profiling. */
+#define QURT_PROFILE_ENABLE  1 /**< Enable profiling. */
+
+typedef unsigned int qurt_profile_param_t;
+
+#define QURT_PROFILE_PARAM_THREAD_READY_TIME 0U /**< Profile thread ready time. */
+
+/** @} */ /* end_addtogroup profiling_macros */
+
+/** @addtogroup profiling_types
+  @{ */
+/** Profiling results. */
+typedef union
+{
+    /** Result associated with #QURT_PROFILE_PARAM_THREAD_READY_TIME. */
+    struct
+    {
+        unsigned int ticks; /**< Cumulative ticks the thread was ready. */
+    } thread_ready_time;
+
+} qurt_profile_result_t;
+/** @} */ /* end_addtogroup profiling_types */
+
+/**@ingroup func_qurt_profile_enable2
+ * Starts profiling of a specific parameter on a specific thread (as applicable).
+ *
+ * @param[in] param      Profiling parameter.
+ * @param[in] thread_id  ID of the thread (if applicable) for which the specified
+ *                       parameter must be profiled.
+ * @param[in] enable     #QURT_PROFILE_DISABLE -- disable \n #QURT_PROFILE_ENABLE --
+ *                       enable
+ *
+ * @return
+ * #QURT_EOK -- Success \n
+ * #QURT_EALREADY -- Measurement already in progress or already stopped \n
+ * #QURT_ENOTHREAD -- Thread does not exist \n
+ * #QURT_EINVALID -- Invalid profiling parameter \n
+ *
+ * @dependencies
+ * None.
+ */
+extern int qurt_profile_enable2 (
+    qurt_profile_param_t param,
+    qurt_thread_t thread_id,
+    int enable
+);
+
+/**@ingroup func_qurt_profile_get
+ * Gets the value of the profiling parameter that was previously enabled.
+ *
+ * @param[in] param      Profiling parameter.
+ * @param[in] thread_id  ID of thread (if applicable) for which the specified
+ *                       profiling parameter must be retrieved.
+ * @param[out] result    Profiling result associated with the parameter for the specified
+ *                       thread (if applicable).
+ *
+ * @return
+ * #QURT_EOK -- Success \n
+ * #QURT_EFAILED -- Operation failed; profiling was not enabled \n
+ * #QURT_ENOTHREAD -- Thread does not exist \n
+ * #QURT_EINVALID -- Invalid profiling parameter \n
+ *
+ * @dependencies
+ * None.
+ */
+extern int qurt_profile_get (
+    qurt_profile_param_t param,
+    qurt_thread_t thread_id,
+    qurt_profile_result_t * result
+);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+#endif
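[Editor's usage sketch, not part of the patch: measure how long a thread sits in the ready state over an interval of interest. thread_id is assumed to come from a QuRT thread API such as qurt_thread_get_id().]

    #include "qurt_profile.h"

    int ready_time_ticks(qurt_thread_t thread_id, unsigned int *ticks)
    {
        qurt_profile_result_t result;
        int rc;

        rc = qurt_profile_enable2(QURT_PROFILE_PARAM_THREAD_READY_TIME,
                                  thread_id, QURT_PROFILE_ENABLE);
        if (rc != QURT_EOK) return rc;

        /* ... let the thread run for the interval of interest ... */

        rc = qurt_profile_get(QURT_PROFILE_PARAM_THREAD_READY_TIME,
                              thread_id, &result);
        if (rc == QURT_EOK) *ticks = result.thread_ready_time.ticks;

        (void)qurt_profile_enable2(QURT_PROFILE_PARAM_THREAD_READY_TIME,
                                   thread_id, QURT_PROFILE_DISABLE);
        return rc;
    }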
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_ptrace.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_ptrace.h
new file mode 100755
index 0000000000000..622304dd92865
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_ptrace.h
@@ -0,0 +1,37 @@
+/*=============================================================================
+
+                 qurt_ptrace.h
+
+GENERAL DESCRIPTION
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+  Copyright (c) 2013 by Qualcomm Technologies, Inc. All Rights Reserved.
+=============================================================================*/
+#ifndef __SYS_PTRACE_H__
+#define __SYS_PTRACE_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum __ptrace_request
+{
+    /**
+      Indicates that the process making this request is requesting to be traced.
+    */
+    PTRACE_TRACEME = 0,
+    PTRACE_EXT_IS_DEBUG_PERMITTED = 500
+};
+
+long ptrace(enum __ptrace_request request, unsigned int pid, void *addr, void *data);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //__SYS_PTRACE_H__
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_qdi.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_qdi.h
new file mode 100755
index 0000000000000..705408e5cfc6f
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_qdi.h
@@ -0,0 +1,185 @@
+#ifndef QDI_H
+#define QDI_H
+
+/**
+  @file qurt_qdi.h
+  @brief Prototypes of QuRT Driver Invocation API functions
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2013, 2021, 2023 Qualcomm Technologies, Inc.
+  All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+
+#include "qurt_qdi_constants.h"
+#include "qurt_qdi_imacros.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup func_qurt_qdi_open
+   Opens the specified driver for subsequent operations.
+   qurt_qdi_open() is the primary mechanism by which a driver user can
+   obtain a QDI handle. The user provides the name of the driver to the
+   qurt_qdi_open call, and gets back a handle referencing
+   the named driver. \n
+   @note1hang For reasons related to the Hexagon standard for varargs functions, the
+              qurt_qdi_open function prototype is not actually defined as a varargs.
+
+
+   @param[in] p    Driver name.
+   @param[in] ...  Up to nine additional device-specific arguments can be passed as parameters,
+                   and should follow the POSIX open() convention. \n
+                   - flags -- Optional second parameter (POSIX flags), the handle
+                              access requested (read-only, write-only, or read-write,
+                              for instance) and other flags such as whether the call
+                              should create a new device or only open an existing
+                              device. \n
+                   - mode -- Optional third parameter (POSIX mode); permissions to
+                             configure when a new device is created. @tablebulletend
+
+   @return
+   Negative value -- Error. \n
+   Non-negative value -- Success, this result value serves as a handle to the
+                         opened driver.
+   @dependencies
+   None.
+ */
+// int qurt_qdi_open();
+#define qurt_qdi_open(p,...) \
+        qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC,QDI_OPEN,(p),##__VA_ARGS__)
+
+#define qurt_qdi_open_dt(p,q,...) \
+        qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC,QDI_OPEN_FROM_DT,(p),(q),##__VA_ARGS__)
+
+/**@ingroup func_qurt_qdi_handle_invoke
+   Performs a generic driver operation, which (depending on the specified operation) can
+   either be one of the predefined operations listed in @xhyperref{tbl:functionMapping,QDI function mapping}
+   or a driver-specific operation.
+   The user provides a QDI handle and an integer
+   method number, along with 0 to 8 optional 32-bit arguments.
+   The device driver invocation function is invoked with the
+   same method number and 0 to 8 optional arguments. The
+   return value from the invocation function is passed back to
+   the user as the return value of qurt_qdi_handle_invoke.
+
+   @note1hang For reasons related to the Hexagon standard for varargs functions, the
+              qurt_qdi_handle_invoke() function prototype is not actually defined as a
+              varargs function (and would break if it were defined this way).
+
+   @param[in] h    Driver handle.
+   @param[in] m    Integer number for the operation to perform.
+   @param[in] ...  Up to eight optional arguments can be passed to the device driver as operation-specific parameters: \n
+                   arg1 -- First parameter \n
+                   arg2 -- Second parameter \n
+                   arg3 -- Third parameter \n
+                   arg4 -- Fourth parameter \n
+                   arg5 -- Fifth parameter \n
+                   arg6 -- Sixth parameter \n
+                   arg7 -- Seventh parameter \n
+                   arg8 -- Eighth parameter
+
+   @return
+   Integer value defined by the device driver. \n
+   -1 -- Error.
+
+   @dependencies
+   None.
+ */
+// int qurt_qdi_handle_invoke();
+#define qurt_qdi_handle_invoke(h,m,...) \
+        _QDMPASTE(_QDMHI,_QDMCNT(QDI_HANDLE_LOCAL_CLIENT,h,m,##__VA_ARGS__))(QDI_HANDLE_LOCAL_CLIENT,h,m,##__VA_ARGS__)
+#define _QDMHI3(a,b,c) qurt_qdi_qhi3(0,b,c)
+#define _QDMHI4(a,b,c,d) qurt_qdi_qhi4(0,b,c,(int)(d))
+#define _QDMHI5(a,b,c,d,e) qurt_qdi_qhi5(0,b,c,(int)(d),(int)(e))
+#define _QDMHI6(a,b,c,d,e,f) qurt_qdi_qhi6(0,b,c,(int)(d),(int)(e),(int)(f))
+#define _QDMHI7(a,b,c,d,e,f,g) qurt_qdi_qhi7(8,b,c,(int)(d),(int)(e),(int)(f),(int)(g))
+#define _QDMHI8(a,b,c,d,e,f,g,h) qurt_qdi_qhi8(8,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h))
+#define _QDMHI9(a,b,c,d,e,f,g,h,i) qurt_qdi_qhi9(16,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i))
+#define _QDMHI10(a,b,c,d,e,f,g,h,i,j) qurt_qdi_qhi10(16,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j))
+#define _QDMHI11(a,b,c,d,e,f,g,h,i,j,k) qurt_qdi_qhi11(24,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j),(int)(k))
+#define _QDMHI12(a,b,c,d,e,f,g,h,i,j,k,l) qurt_qdi_qhi12(24,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j),(int)(k),(int)(l))
+int qurt_qdi_qhi3(int,int,int);
+int qurt_qdi_qhi4(int,int,int,int);
+int qurt_qdi_qhi5(int,int,int,int,int);
+int qurt_qdi_qhi6(int,int,int,int,int,int);
+int qurt_qdi_qhi7(int,int,int,int,int,int,int);
+int qurt_qdi_qhi8(int,int,int,int,int,int,int,int);
+int qurt_qdi_qhi9(int,int,int,int,int,int,int,int,int);
+int qurt_qdi_qhi10(int,int,int,int,int,int,int,int,int,int);
+int qurt_qdi_qhi11(int,int,int,int,int,int,int,int,int,int,int);
+int qurt_qdi_qhi12(int,int,int,int,int,int,int,int,int,int,int,int);
+
+/**@ingroup func_qurt_qdi_write
+   Writes data to the specified driver.
+   A predefined invocation routine for drivers that
+   support a POSIX-like write functionality.
+   qurt_qdi_write(handle, buf, len) is equivalent to
+   qurt_qdi_handle_invoke(handle, QDI_WRITE, handle, buf, len);
+
+   @param[in] handle  Driver handle.
+   @param[in] buf     Pointer to the memory address where the data to write is stored.
+   @param[in] len     Number of bytes of data to write.
+
+   @return
+   Non-negative integer -- Number of bytes written. \n
+   Negative error code -- Write could not take place.
+
+   @dependencies
+   None.
+ */
+int qurt_qdi_write(int handle, const void *buf, unsigned len);
+
+/**@ingroup func_qurt_qdi_read
+   User-visible API to read data from a QDI handle.
+   A predefined invocation routine for drivers that
+   support a POSIX-like read functionality.
+   qurt_qdi_read(handle, buf, len) is equivalent to:
+   qurt_qdi_handle_invoke(handle, QDI_READ, handle, buf, len);
+
+   @param[in] handle  Driver handle.
+   @param[in] buf     Pointer to the memory address where the data read is stored.
+   @param[in] len     Number of bytes of data to read.
+
+   @return
+   Non-negative integer number -- Bytes read. \n
+   Negative error code -- Read could not take place.
+
+   @dependencies
+   None.
 */
+int qurt_qdi_read(int handle, void *buf, unsigned len);
+
+/**@ingroup func_qurt_qdi_close
+   Closes the specified driver, releasing any resources associated with the open driver.
+   User-visible API to close a QDI handle.
+
+   This API should be called when the user is done using a
+   QDI-based handle. When this function is called, the driver can release
+   any resources held and perform other necessary cleanup
+   operations. qurt_qdi_close(handle) is equivalent to
+   qurt_qdi_handle_invoke(handle, QDI_CLOSE, handle)
+
+   @param[in] handle  Driver handle.
+
+   @return
+   0 -- Success.\n
+   Negative error code -- Failure.
+
+   @dependencies
+   None.
 */
+int qurt_qdi_close(int handle);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_qdi_constants.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_qdi_constants.h
new file mode 100755
index 0000000000000..4866fada067f0
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_qdi_constants.h
@@ -0,0 +1,193 @@
+#ifndef QDI_CONSTANTS_H
+#define QDI_CONSTANTS_H
+
+/**
+  @file qurt_qdi_constants.h
+  @brief Predefined invocation methods for drivers.
+
+  EXTERNALIZED FUNCTIONS
+  None
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None
+
+  Copyright (c) 2013-2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+|| Method numbers used for QDI.
+||
+|| Intended grouping of method numbers for QDI
+|| including future usage:
+||
+|| Method 0 should always be unused and not responded to by
+|| any driver.
+|| Methods 1 and 2 are reserved for name registration and
+|| name lookup.
+|| Methods 3 through 31 are reserved for POSIX-type operations
+|| on open handles.
+|| Methods 32 through 127 are reserved for the QDI infrastructure
+|| and may be extended in the future to provide standard
+|| driver debug services, management services, and system
+|| notifications.
+|| Methods 128 through 255 are reserved for the use of automatically
+|| generated methods such as might be generated by an IDL (interface
+|| definition language).
The infrastructure may be extended to +|| perform services on these methods based on information provided +|| by the IDL, such as automatic buffer validation, etc. These +|| method numbers should not be used for any "ad hoc" methods. +|| Methods with number >= 256 are "private" method numbers that are +|| outside the scope of the QDI infrastructure. Drivers that want +|| to generate and consume their own "ad hoc" methods are free to +|| use these method numbers as they wish. The infrastructure does +|| not generate these method numbers or respond to them, but +|| passes them on unmolested. +|| +|| All driver implementations *should* return a value of +|| -1 when called with an unsupported method. The standard error +|| return value for POSIX APIs is -1, so we emulate that behavior +|| here. +*/ +/** @cond */ +#define QDI_UNUSED 0 +#define QDI_DEVNAME_REGISTER 1 +#define QDI_OPEN 2 +#define QDI_CLOSE 3 +#define QDI_READ 4 +#define QDI_WRITE 5 +#define QDI_IOCTL 6 +#define QDI_MMAP 7 +#define QDI_OS_FILEOPEN 8 +#define QDI_FLEN 9 +#define QDI_UNLINK 10 +#define QDI_FTELL 22 +#define QDI_SEEK 23 +#define QDI_FSTAT 24 + +#define QDI_FSNAME_REGISTER 150 +#define QDI_FS_OPEN 151 +#define QDI_MMAP2 153 +#define QDI_MPROTECT2 154 +#define QDI_MUNMAP2 155 + +#define QDI_CLIENT_HANDLE_OBJREF_GET 10 + +#define QDI_OS_PROCESS_LOAD 12 +#define QDI_OS_PROCESS_CHOOSE_ASID 13 + +#define QDI_OS_SET_GP 26 +#define QDI_CLIENT_HANDLE_CALLBACK 27 + +#define QDI_CLIENT_HANDLE_ISLAND_HANDLE_CREATE_FROM_OBJ_T 19 //reused +#define QDI_CLIENT_HANDLE_HANDLE_CREATE_FROM_OBJ_T 80 +#define QDI_CLIENT_HANDLE_HANDLE_RELEASE 81 +#define QDI_CLIENT_HANDLE_COPY_FROM_USER 82 +#define QDI_CLIENT_HANDLE_COPY_TO_USER 83 +#define QDI_CLIENT_HANDLE_SIGNAL_GROUP_CREATE 86 +#define QDI_CLIENT_HANDLE_SAFE_CACHE_OPS 87 + +#define QDI_CLIENT_HANDLE_BUFFER_LOCK 41 +#define QDI_CLIENT_HLOSPOOL_INFO_GET 90 +#define QDI_CLIENT_HLOSPOOL2_INFO_GET 96 + +#define QDI_CLIENT_PID 44 +#define QDI_CLIENT_ASID QDI_CLIENT_PID + +#define QDI_OS_CLIENT_INFO_GET 48 + +#define QDI_OS_MEM_LOOKUP_PHYSADDR 57 + +#define QDI_OS_THREAD_ITERATOR_CREATE 68 +#define QDI_OS_THREAD_ITERATOR_NEXT 69 + +#define QDI_OS_SYSENV 78 + +#define QDI_REGION_USERMALLOC_INIT 180 // This method is for generic handle + + +#define QDI_CLIENT_HANDLE_USER_MALLOC 84 +#define QDI_CLIENT_HANDLE_USER_FREE 85 + +#define QDI_SIGNAL_GROUP_SIGNAL_CREATE 96 +#define QDI_SIGNAL_GROUP_WAIT 98 +#define QDI_SIGNAL_GROUP_POLL 99 +#define QDI_SIGNAL_SET 96 +#define QDI_SIGNAL_CLEAR 97 +#define QDI_SIGNAL_WAIT 98 +#define QDI_SIGNAL_POLL 99 + +#define QDI_OS_WAIT_FOR_MAIN_REAPER 104 + +#define QDI_CLIENT_HANDLE_REFPROXY_INSTALL 105 +#define QDI_CLIENT_HANDLE_REFPROXY_ADD 106 +#define QDI_CLIENT_HANDLE_REFPROXY_REMOVE 107 + +#define QDI_CLIENT_HANDLE_DETACH 116 + +#define QDI_OS_RESERVED1 139 + +#define QDI_CLIENT_HANDLE_BUFFER_LOCK2 142 + +#define QDI_DT_REGISTER 158 +#define QDI_OPEN_DEVICE 159 +#define QDI_OPEN_FROM_DT 160 + +#define QDI_PRIVATE 256 /* Method numbers beginning at 256 + are private method numbers, which + are device-specific and available + for use by device implementors. */ +/* +|| Permission bitmasks for use with qurt_qdi_lock_buffer(). +|| +|| Make sure these match with permission values from qurt_perm_t. +*/ +/** @endcond */ + +/** @addtogroup driver_support_constants +@{ */ +#define QDI_PERM_W 2 /**< Write access. */ +#define QDI_PERM_R 1 /**< Read access. */ +#define QDI_PERM_RW (QDI_PERM_R | QDI_PERM_W) /**< Read/write access. 
*/ + +#define QDI_HANDLE_LOCAL_CLIENT 3 /**< Local client. */ +#define QDI_HANDLE_GENERIC 4 /**< Generic. */ + +#define QDI_REFCNT_BASE 0x510000 /**< */ +#define QDI_REFCNT_MAXED 0x51FFFD /**< */ +#define QDI_REFCNT_INIT 0x51FFFE /**< Driver object is temporary and is eventually deleted.*/ +#define QDI_REFCNT_PERM 0x51FFFF /**< Driver object is permanent and is never deleted. */ +/** @} */ /* end_addtogroup driver_support_constants */ + +/** @cond */ +/* +|| Flags used by process loaders. +*/ + +#define QDI_OS_PROCESS_FLAGS_ISLAND_RESIDENT 0x1 /* Set this flag to request the loaded process + to have island residency. */ +#define QDI_OS_PROCESS_FLAGS_ROOT_RESIDENT 0x2 /* Set this flag to request the loaded process + to have root residency, for example, DL Pager. */ +/* +|| Constants used for qurt_event register API, type field. +*/ + +#define QURT_PROCESS_EXIT 1 + +/* +|| Constants used by QDI extensions. +*/ + +#define QURT_QDI_SINGLETON_TYPE_TRUE 0 +#define QURT_QDI_SINGLETON_TYPE_FALSE 1 +#define QURT_QDI_SINGLETON_TYPE_PER_PROCESS 2 +/** @endcond */ +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QDI_CONSTANTS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_qdi_driver.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_qdi_driver.h new file mode 100755 index 0000000000000..e044e25f1bb72 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_qdi_driver.h @@ -0,0 +1,868 @@ +#ifndef QURT_QDI_DRIVER_H +#define QURT_QDI_DRIVER_H + +/** + @file qurt_qdi_driver.h + @brief Definitions, macros, and prototypes used when writing a + QDI driver. + + EXTERNALIZED FUNCTIONS + None + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None + + Copyright (c) 2018, 2019-2021, 2023 Qualcomm Technologies, Inc. + All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#include "stddef.h" +#include "qurt_qdi.h" +#include "qurt_types.h" +#include "qurt_callback.h" +#include "qurt_qdi_constants.h" +#include "qurt_qdi_imacros.h" +#include "qurt_mutex.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* +|| This gives the canonical form for the arguments to a QDI +|| driver invocation function. The arguments are as follows: +|| +|| int client_handle (R0) QDI handle that represents the client +|| that made this QDI request. If the +|| client is remote, this is a +|| variable handle; if the client is local +|| (same thread and process), this is +|| set to QDI_HANDLE_LOCAL_CLIENT. +|| +|| qurt_qdi_obj_t *obj (R1) Points at the qdi_object_t structure +|| on which this QDI request is being made. +|| The qdi_object_t structure is usually +|| the first element of a larger structure +|| that contains state associated with the +|| object; because it is usually the first +|| element, the object pointers can be freely +|| interchanged through casts. +|| +|| int method (R2) Integer QDI method that represents +|| the request type. +|| +|| qurt_qdi_arg_t arg1 (R3) First three general purpose arguments +|| qurt_qdi_arg_t arg2 (R4) to the invocation function are passed in +|| qurt_qdi_arg_t arg3 (R5) these slots. +|| +|| qurt_qdi_arg_t arg4 (SP+0) Arguments beyond the first three are +|| qurt_qdi_arg_t arg5 (SP+4) passed on the stack. 
+|| qurt_qdi_arg_t arg6 (SP+8) +|| qurt_qdi_arg_t arg7 (SP+12) +|| qurt_qdi_arg_t arg8 (SP+16) +|| qurt_qdi_arg_t arg9 (SP+20) +|| +|| The canonical form of the invocation function takes a +|| total of 12 arguments, but not all of them are used. In general, +|| the QDI infrastructure only passes those arguments provided by +|| the caller; if the invocation function accesses additional +|| arguments beyond those provided by the caller, the values are not +|| useful. +*/ +/** @cond */ +#define QDI_INVOKE_ARGS \ + int, struct qdiobj *, int, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t + +#define QDI_EXT_INVOKE_ARGS \ + int, qurt_qdi_man_obj_t*, int, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t + +#define BUFFER_LOCK 1 +#define BUFFER_UNLOCK 0 + +struct qdiobj; +/** @endcond */ +/** @addtogroup driver_support_types +@{ */ +typedef union { + void *ptr; /**< Pointer to the driver handle. */ + int num; /**< Method number. */ +} qurt_qdi_arg_t; +/** @} */ /* end_addtogroup driver_support_types */ +/** @cond */ +/** QuRT QDI driver version */ +typedef union { + int num; + struct { + short major; /** Driver major version number. */ + short minor; /** Driver minor version number. */ + }; +} qurt_qdi_version_t; + +typedef int (*qurt_qdi_pfn_invoke_t)(QDI_INVOKE_ARGS); +typedef void (*qurt_qdi_pfn_release_t)(struct qdiobj *); +/** @endcond */ +/** @addtogroup driver_support_types +@{ */ +typedef struct qdiobj { + qurt_qdi_pfn_invoke_t invoke; /**< Invocation function that implements the driver methods.*/ + int refcnt; /**< Reference count, an integer value maintained by the QDI infrastructure that tracks the number of + references to a driver instance. 
*/
+   qurt_qdi_pfn_release_t release;  /**< Release function that performs the cleanup associated with deleting an instance
+                                         of the driver object.*/
+} qurt_qdi_obj_t;
+/** @} */ /* end_addtogroup driver_support_types */
+/** @cond */
+/** QuRT QDI managed object */
+typedef struct qurt_qdi_man_obj
+{
+   qurt_qdi_obj_t qdi_obj;
+   union
+   {
+      struct qurt_qdi_ext_driver * opener_obj;
+      struct qurt_qdi_ext_device * device_obj;
+   };
+}qurt_qdi_man_obj_t;
+
+typedef int (*qurt_qdi_ext_pfn_create_t)(int client_id, const char *name, qurt_qdi_version_t version, qurt_qdi_man_obj_t **qdi_obj);
+typedef int (*qurt_qdi_ext_pfn_create_device_t)(int client_id, const char *name, qurt_qdi_version_t version, struct qurt_qdi_ext_device * device, qurt_qdi_man_obj_t **qdi_obj);
+typedef int (*qurt_qdi_ext_pfn_invoke_t)(QDI_EXT_INVOKE_ARGS);
+typedef void (*qurt_qdi_ext_pfn_destroy_t)(qurt_qdi_man_obj_t *qdi_obj);
+typedef int (*qurt_qdi_ext_pfn_probe_t)(void *handle, struct qurt_qdi_ext_device **device);
+
+typedef struct qurt_qdi_ext_obj_info{
+   qurt_qdi_man_obj_t *obj;
+   int qdi_client_id;
+   struct qurt_qdi_ext_obj_info *next;
+}qurt_qdi_ext_obj_info_t;
+typedef struct qurt_qdi_ext_obj_info *qurt_qdi_ext_obj_info_ptr;
+
+/** QuRT QDI device */
+//temporarily add this back while there are still drivers that statically define this structure
+struct qurt_qdi_device {
+   qurt_qdi_obj_t opener_obj;
+   const char* name;
+   char island_resident;
+   unsigned char singleton;
+   qurt_qdi_ext_pfn_create_t create;
+   qurt_qdi_ext_pfn_invoke_t invoke;
+   qurt_qdi_ext_pfn_destroy_t destroy;
+   qurt_mutex_t qurt_qdi_ext_list_lock;
+   qurt_qdi_ext_obj_info_ptr qurt_qdi_ext_obj_info_head;
+};
+typedef struct qurt_qdi_device qurt_qdi_man_device;
+
+struct qurt_qdi_ext_driver {
+   qurt_qdi_obj_t opener_obj;
+   const char* name;
+   char island_resident;
+   unsigned char singleton;
+   qurt_qdi_ext_pfn_create_t create;
+   qurt_qdi_ext_pfn_invoke_t invoke;
+   qurt_qdi_ext_pfn_destroy_t destroy;
+   qurt_mutex_t qurt_qdi_ext_list_lock;
+   qurt_qdi_ext_obj_info_ptr qurt_qdi_ext_obj_info_head;
+   qurt_qdi_ext_pfn_create_device_t create_device;
+   qurt_qdi_version_t version;
+   qurt_qdi_ext_pfn_probe_t probe;
+   const char* compatible;
+   struct qurt_qdi_ext_device * device_list;
+   //qurt_qdi_ext_device_ptr device_list;
+};
+typedef struct qurt_qdi_ext_driver qurt_qdi_ext_driver_t;
+//above replaces qurt_qdi_man_device
+
+extern int qurt_qdi_obj_ref_inc(qurt_qdi_obj_t *);
+extern int qurt_qdi_obj_ref_dec(qurt_qdi_obj_t *);
+
+extern int qurt_qdi_ext_opener (QDI_INVOKE_ARGS);
+/** @endcond */
+/**@ingroup func_qurt_qdi_method_default
+   Processes a method that is unrecognized or unsupported in the driver invocation function.
+   All arguments passed to the current invocation function (Section @xref{sec:invocationFunction}) must be forwarded
+   to this function.
+
+   @note1hang Invocation functions must process all unrecognized or unsupported methods
+              by calling this function.
+
+   @return
+   Integer result of the default method processing; the invocation function must return this value.
+
+   @dependencies
+   None.
+*/
+extern int qurt_qdi_method_default(QDI_INVOKE_ARGS);
+
+/**@ingroup func_qurt_qdi_handle_create_from_obj_t
+   Allocates a new device handle for use with the specified driver object.
+
+   @param[in] client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+   @param[in] obj Pointer to the driver object.
+
+   @return
+   Non-negative integer -- Success; this value is the new handle. \n
+   Negative value -- Error.
+
+   @dependencies
+   None.
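+
+   For illustration only (not part of the SDK header), an open handler might
+   allocate a driver object whose first member is the qurt_qdi_obj_t and return
+   a new handle for it to the client; the my_dev* names are hypothetical:
+   @code
+   struct my_dev {
+       qurt_qdi_obj_t qdiobj;   // first member, so object pointers can be cast freely
+       int state;
+   };
+
+   // Inside the opener's invocation function, handling QDI_OPEN:
+   struct my_dev *dev = malloc(sizeof(*dev));   // any driver-private allocator works
+   if (dev == NULL) return -1;
+   dev->qdiobj.invoke  = my_dev_invoke;         // the driver's invocation function
+   dev->qdiobj.refcnt  = QDI_REFCNT_INIT;
+   dev->qdiobj.release = my_dev_release;        // runs when the last reference is gone
+   dev->state = 0;
+   return qurt_qdi_handle_create_from_obj_t(client_handle, &dev->qdiobj);
+   @endcode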
+*/ +static __inline int qurt_qdi_handle_create_from_obj_t(int client_handle, qurt_qdi_obj_t *obj) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_HANDLE_CREATE_FROM_OBJ_T, + obj); +} + +/**@ingroup func_qurt_qdi_handle_invoke + Allocates a new island device handle for use with the specified driver object. + + @param[in] client_handle Client handle obtained from the current invocation function (Section 3.4.1). + @param[in] obj Pointer. + + @return + Non-negative integer value that is the new handle -- Success. \n + Negative return value -- Error. + + @dependencies + None. +*/ +static __inline int qurt_qdi_island_handle_create_from_obj_t(int client_handle, qurt_qdi_obj_t *obj) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_ISLAND_HANDLE_CREATE_FROM_OBJ_T, + obj); +} + +/**@ingroup func_qurt_qdi_handle_release + Deallocates the specified device handle. + + @param[in] client_handle Obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param[in] handle_to_release Handle to release. + + @return + 0 -- Success. \n + Negative value -- Error. + + @dependencies + None. +*/ +static __inline int qurt_qdi_handle_release(int client_handle, int handle_to_release) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_HANDLE_RELEASE, + handle_to_release); +} + +static __inline qurt_qdi_obj_t * +qurt_qdi_objref_get_from_handle(int client_handle, int object_handle) +{ + qurt_qdi_obj_t *ret; + + ret = NULL; + + qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_OBJREF_GET, + object_handle, + &ret); + + return ret; +} + +/**@ingroup func_qurt_client_add_memory + Adds a physical address range to the HLOS physpool of the caller user PD. + + @param[in] client_handle Obtained from the current invocation function (Section 3.4.1). + @param[in] phys_addr Starting address of the physical address range. + @param[in] size Size. + + @return + #QURT_EOK -- Pages successfully added. + + @dependencies + None. +*/ +int qurt_client_add_memory(int client_handle, qurt_addr_t phys_addr, qurt_size_t size); + +/**@ingroup func_qurt_client_add_memory2 + Adds a physical address range to the HLOS physpool of the caller user PD. + + @param[in] client_handle Obtained from the current invocation function (Section 3.4.1). + @param[in] phys_addr Starting 36-bit address of the physical address range. + @param[in] size Size. + + @return + #QURT_EOK -- Pages successfully added. + + @dependencies + None. +*/ +int qurt_client_add_memory2(int user_client_handle, qurt_paddr_64_t phys_addr, qurt_size_t size); + +static __inline qurt_qdi_obj_t * +qurt_qdi_objref_get_from_pointer(qurt_qdi_obj_t *objptr) +{ + qurt_qdi_obj_t * ret = NULL; + + if (qurt_qdi_obj_ref_inc(objptr) < 0) { + ret = NULL; + } else { + ret = objptr; + } + + return ret; +} + +static __inline void +qurt_qdi_objref_release(qurt_qdi_obj_t *objptr) +{ + if (qurt_qdi_obj_ref_dec(objptr) == 1) { + (*objptr->release)(objptr); + } +} + +/**@ingroup func_qurt_qdi_copy_from_user + Copies the contents of a user memory buffer into the current driver. + + @note1hang User buffer addresses are valid only for the duration of the current driver + invocation. + + @param[in] client_handle Obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param[in] dest Base address of the driver buffer. + @param[in] src Base address of the user buffer. + @param[in] len Number of bytes to copy. 
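+
+   For illustration only (not part of the SDK header), a driver method that
+   receives a pointer to a request structure in its first argument might copy
+   it in before use; struct my_req is hypothetical, and a1 follows the
+   invocation-function argument convention described earlier in this header:
+   @code
+   struct my_req req;
+   if (qurt_qdi_copy_from_user(client_handle, &req, a1.ptr, sizeof(req)) < 0) {
+       return -1;   // the client has no right to read that memory
+   }
+   @endcode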
+
+   @return
+   Negative value -- Indicates a privilege or security violation; the copy operation
+   has crossed a privilege boundary.
+
+   @dependencies
+   None.
+*/
+static __inline int qurt_qdi_copy_from_user(int client_handle, void *dest, const void *src, unsigned len)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_COPY_FROM_USER,
+                                 dest, src, len);
+}
+
+/**@ingroup func_qurt_qdi_copy_string_from_user
+   Copies the contents of a user memory buffer into the current driver.
+
+   @note1hang User buffer addresses are valid only for the duration of the current driver
+              invocation.
+
+   @param client_handle Obtained from the current invocation function (Section 3.4.1).
+   @param dest          Base address of the driver buffer.
+   @param src           Base address of the user buffer.
+   @param len           Number of bytes to copy. NOTE: This is the destination buffer length.
+
+   @return
+   Negative error result -- Privilege or security violation; the copy operation
+   has crossed a privilege boundary.
+
+   @dependencies
+   None.
+*/
+int qurt_qdi_copy_string_from_user(int client_handle, char *dest, const char *src, unsigned len);
+
+/**@ingroup func_qurt_qdi_copy_to_user
+   Copies the contents of a driver memory buffer to user memory.
+
+   @note1hang User buffer addresses are valid only for the duration of the current driver
+              invocation.
+
+   @param[in] client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+   @param[in] dest          Base address of the user buffer.
+   @param[in] src           Base address of the driver buffer.
+   @param[in] len           Number of bytes to copy.
+
+   @return
+   Negative value -- Indicates a privilege or security violation; the copy operation has crossed a
+   privilege boundary.
+
+   @dependencies
+   None.
+*/
+static __inline int qurt_qdi_copy_to_user(int client_handle, void *dest, const void *src, unsigned len)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_COPY_TO_USER,
+                                 dest, src, len);
+}
+
+/**@ingroup func_qurt_qdi_safe_cache_ops
+   Performs cache operations on user memory.
+
+   @note1hang User buffer addresses are valid only for the duration of the current driver
+              invocation.
+
+   @param[in] client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+   @param[in] addr          Base address of the user memory.
+   @param[in] size          Size of the user memory.
+   @param[in] opcode        Cache operation (QURT_MEM_CACHE_FLUSH, QURT_MEM_CACHE_INVALIDATE...).
+   @param[in] type          Cache type (QURT_MEM_ICACHE, QURT_MEM_DCACHE).
+
+   @return
+   Negative value -- Indicates a privilege or security violation; the operation has crossed a
+   privilege boundary.
+
+   @dependencies
+   None.
+*/
+static __inline int qurt_qdi_safe_cache_ops(int client_handle, qurt_addr_t addr, qurt_size_t size,
+                                            qurt_mem_cache_op_t opcode, qurt_mem_cache_type_t type)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_SAFE_CACHE_OPS,
+                                 addr, size, opcode, type);
+}
+
+
+/**@ingroup func_qurt_qdi_buffer_lock
+   Prepares for the direct manipulation of a potentially untrusted buffer provided by a QDI
+   client.
+
+   This function is used to permit a trusted driver to safely access memory that is
+   provided by a potentially untrusted client. A driver calls this function to obtain a safe buffer
+   pointer for accessing the memory.
+
+   This function performs the following security checks: \n
+   - Verifies that the entire buffer is accessible to the client.
\n + - Ensures that the pointer remains valid for the remainder of the QDI driver + operation. \n + + @note1hang User buffer addresses are valid only for the duration of the current driver + invocation. + + @param[in] client_handle Obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param[in] buf Pointer to the base address of the client buffer address. + @param[in] len Buffer length (in bytes). + @param[in] perms Bitmask value that specifies the read or write access to perform on the + client buffer: \n + - #QDI_PERM_R -- Read access \n + - #QDI_PERM_W -- Write access \n + - #QDI_PERM_RW -- Read/write access @tablebulletend + @param[out] obuf Pointer to the buffer address that the driver must use to access the buffer. + + @return + Negative value -- Error; the operation crosses a privilege boundary, indicating a privilege or security violation. \n + Nonzero value -- User passed a buffer that does not fulfill the requested read/write access permission. + In this case the QDI driver call must be terminated cleanly, with an appropriate error code + returned to the client. \n + Zero -- Success; when this occurs the QDI driver must use the pointer at *obuf to access memory, and not the + pointer passed in as buf -- even if the user process changes the mapping of memory at buf, + the mapping of memory at *obuf remains valid until the driver invocation completes. + + @dependencies + None. +*/ +static __inline int qurt_qdi_buffer_lock(int client_handle, void *buf, unsigned len, + unsigned perms, void **obuf) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_BUFFER_LOCK, + buf, len, perms, obuf); +} + +/**@ingroup func_qurt_qdi_buffer_lock2 + Prepares for the direct manipulation of a possibly-untrusted buffer provided by a QDI + client. + This API permits a trusted driver to safely access memory + provided by a possibly-untrusted client. A driver calls this function to obtain a safe buffer + pointer for accessing the memory. + This function performs the following security checks: \n + -- Entire buffer is accessible to the client. \n + -- Entire buffer is mapped with permissions passed in perms field \n + -- Entire buffer is physically contiguous \n + In addition to the security checks, the API also locks the client mapping such that the client + cannot remove the mapping while the physical memory is used by the trusted + driver. \n + + @note1 Drivers are responsible for calling qurt_qdi_buffer_unlock() at appropriate time. Not + pairing qurt_qdi_buffer_unlock() with this API leads to resource leakages and + process exit failures. Drivers can keep track of which buffers are locked for + a particular client. If the client exits abruptly, the buffers can be + unlocked on driver release invocation for the exiting client. + + @note2 This API is supported in limited capacity when called from Island mode. Safe buffer + unmapping or user buffer unlock is not supported in Island mode. + + @param client_handle Obtained from the current invocation function (Section 3.4.1). + @param buf Pointer to the base address of the client buffer address. + @param len Buffer length (in bytes). + @param perms Bitmask value that specifies the read or write access to perform on the + client buffer: \n + -- #QDI_PERM_R -- Read access \n + -- #QDI_PERM_W -- Write access \n + -- #QDI_PERM_RW -- Read/write access \n + @param obuf Optional parameter that returns a pointer to the buffer address that + the driver must use to access the buffer. 
If NULL is passed, the API
+               only performs security checks and does not create a mapping to access the user buffer in
+               a safe way.
+
+   @return
+   QURT_EINVALID -- Arguments passed to the API are invalid. User buffer pointer is NULL or length of the
+                    buffer is 0. \n
+   QURT_EPRIVILEGE -- One of the security checks on the user buffer failed. \n
+   QURT_EFAILED -- Mapping cannot be created for the trusted driver. \n
+   QURT_EOK -- Lock operation was successful. When this occurs, the QDI driver must use the
+               pointer at *obuf to perform its memory accesses, and not the
+               pointer passed in as buf.
+
+   @dependencies
+   None.
+*/
+static __inline int qurt_qdi_buffer_lock2(int client_handle, void *buf, unsigned len,
+                                          unsigned perms, void **obuf)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_BUFFER_LOCK2,
+                                 BUFFER_LOCK, buf, len, perms, obuf);
+}
+
+/**@ingroup func_qurt_qdi_buffer_unlock
+   This API is paired with qurt_qdi_buffer_lock2(). A temporary overlapping mapping
+   created for the driver is removed. Client mapping for the user buffer is
+   unlocked.
+
+   @note1 Drivers are responsible for pairing this with qurt_qdi_buffer_lock2(). Not
+          pairing qurt_qdi_buffer_lock2() with this API leads to resource leakages and
+          process exit failures. Drivers can keep track of which buffers are locked for
+          a particular client, and if the client exits abruptly, all the buffers can be
+          unlocked on driver release invocation for the exiting client.
+
+   @note2 This API is supported in limited capacity when called from Island mode. Actual
+          unmapping of driver accessible memory or unlocking of the buffer is not
+          supported in Island mode.
+
+   @param client_handle Obtained from the current invocation function (Section 3.4.1).
+   @param buf           Pointer to the base address of the client buffer address.
+   @param len           Buffer length (in bytes).
+   @param obuf          Safe buffer address that was returned in the obuf field after calling
+                        qurt_qdi_buffer_lock2().
+
+   @return
+   QURT_EINVALID -- Arguments passed to the API are invalid. User buffer pointer is NULL or length of the
+                    buffer is 0. \n
+   QURT_EOK -- Unlock operation was successful. \n
+   Other results -- Safe buffer unmapping failed or unlocking of the user buffer failed. \n
+
+   @dependencies
+   None.
+*/
+static __inline int qurt_qdi_buffer_unlock(int client_handle, void *buf, unsigned len,
+                                           void *obuf)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_BUFFER_LOCK2,
+                                 BUFFER_UNLOCK, buf, len, obuf);
+}
+
+/**@ingroup func_qurt_qdi_user_malloc
+   Allocates memory area in the QDI heap that is read/write accessible to both the driver and
+   the client. \n
+   @note1hang The QDI heap has a limited amount of memory available, and only the
+              device driver can free the allocated memory.
+
+   @param client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+   @param size          Size (in bytes) of the memory area to allocate.
+
+   @return
+   Non-zero -- Success; this returned value points to the allocated memory area. \n
+   Zero -- Error.
+
+   @dependencies
+   None.
+*/
+void *qurt_qdi_user_malloc(int client_handle, unsigned size);
+
+/**@ingroup func_qurt_qdi_user_free
+   Deallocates memory area in the QDI heap.
+
+   @param client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+   @param ptr           Pointer to the memory area to deallocate.
+
+   @dependencies
+   None.
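+
+   For illustration only (not part of the SDK header), a driver might use the
+   shared QDI heap for a result area that the client reads back:
+   @code
+   char *area = qurt_qdi_user_malloc(client_handle, 256);   // visible to driver and client
+   if (area != NULL) {
+       // ... fill in results for the client ...
+       qurt_qdi_user_free(client_handle, area);   // only the driver can free this memory
+   }
+   @endcode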
+*/
+void qurt_qdi_user_free(int client_handle, void *ptr);
+
+/**@ingroup func_qurt_qdi_client_detach
+   Detaches a client (a process), indicating that the client does not
+   participate in the qurt_wait() mechanism. This behavior
+   is opt-in and irrevocable. When a client is detached, it cannot
+   be un-detached.
+
+   @param client_handle Handle of the client to detach.
+
+   @return
+   Zero -- Success; detachable clients always return success. \n
+   Nonzero value -- client_handle did not refer to a
+                    detachable user client.
+
+   @dependencies
+   None.
+*/
+static __inline int qurt_qdi_client_detach(int client_handle)
+{
+   return qurt_qdi_handle_invoke(client_handle, QDI_CLIENT_HANDLE_DETACH);
+}
+
+/**@ingroup func_qurt_qdi_signal_group_create
+   Creates a new signal group for use in a device driver.
+   A QDI signal group contains up to 32 signals, which can be operated on either
+   individually (using the qurt_qdi_signal_* functions) or as a group (using the
+   qurt_qdi_signal_group_* functions). \n
+   @note1hang Driver implementation is responsible for using the proper signal group
+              handle in any given situation. \n
+   For more information on signals, see the Hexagon QuRT RTOS User Guide (80-VB419-78).
+
+   @param client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+   @param p_signal_group_handle_local Returns a handle intended for use by code that
+          resides in the same context and process as the created signal group
+          (for example, the device driver implementation that allocated the
+          signal group).
+   @param p_signal_group_handle_remote Returns a handle intended for use by code
+          that resides in a different context and process than the created signal group
+          (for example, the user-mode client of an OS driver).
+
+   @return
+   Zero return value indicates success.\n
+   Negative return value indicates that the signal group could not be created.
+
+   @dependencies
+   None.
+*/
+static __inline int qurt_qdi_signal_group_create(int client_handle,
+                                                 int *p_signal_group_handle_local,
+                                                 int *p_signal_group_handle_remote)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_SIGNAL_GROUP_CREATE,
+                                 p_signal_group_handle_local,
+                                 p_signal_group_handle_remote);
+}
+
+/**@ingroup func_qurt_qdi_signal_group_wait
+   Suspends the current thread until any of the signals are set in the specified signal group.
+
+   If a signal is set in a signal group object, and a thread waits on the signal group object,
+   the thread is awakened. If the awakened thread has higher priority than the current
+   thread, a context switch can occur.
+
+   @param signal_group_handle Handle of the signal group.
+
+   @return
+   If the client is remote:
+   QURT_EOK -- Wait complete. \n
+   QURT_ECANCEL -- Wait cancelled.\n
+   If the client is local, returns a 32-bit word with current signals.
+
+   @dependencies
+   None.
+*/
+static __inline int qurt_qdi_signal_group_wait(int signal_group_handle)
+{
+   return qurt_qdi_handle_invoke(signal_group_handle,
+                                 QDI_SIGNAL_GROUP_WAIT);
+}
+
+/**@ingroup func_qurt_qdi_signal_group_poll
+   Returns a value that indicates if any of the signals are set in the specified signal group.
+
+   @param signal_group_handle Handle of the signal group.
+
+   @return
+   1 -- At least one of the signals in the signal group is set.\n
+   0 -- None of the signals are set.
+
+   @dependencies
+   None.
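+
+   For illustration only (not part of the SDK header), a driver might wire a
+   signal group up as follows; error handling is omitted:
+   @code
+   int grp_local, grp_remote, sig_local, sig_remote;
+   qurt_qdi_signal_group_create(client_handle, &grp_local, &grp_remote);
+   qurt_qdi_signal_create(grp_local, &sig_local, &sig_remote);
+   qurt_qdi_signal_set(sig_local);                // driver side raises the signal
+   if (qurt_qdi_signal_group_poll(grp_local)) {   // nonblocking check
+       qurt_qdi_signal_clear(sig_local);
+   }
+   @endcode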
+*/
+static __inline int qurt_qdi_signal_group_poll(int signal_group_handle)
+{
+   return qurt_qdi_handle_invoke(signal_group_handle,
+                                 QDI_SIGNAL_GROUP_POLL);
+}
+
+
+/**@ingroup func_qurt_qdi_signal_create
+   Creates a new signal in the specified signal group.
+   For more information on signals, see the Hexagon QuRT RTOS User Guide (80-VB419-78).
+
+   @note1hang Driver implementation is responsible for using the proper signal handle in
+              any given situation.
+
+   @param signal_group_handle Handle of an existing signal group.
+   @param p_signal_handle_local Returns a handle intended for use by code that resides in
+          the same context and process as the created signal (for example,
+          the device driver implementation that allocated the signal).
+   @param p_signal_handle_remote Returns a handle intended for use by code that resides in
+          a different context and process than the created signal (for
+          example, the user-mode client of an OS driver).
+
+   @return
+   0 -- Success. \n
+   Nonzero value -- No more signals can be created in the specified
+                    signal group.
+
+   @dependencies
+   None.
+*/
+static __inline int qurt_qdi_signal_create(int signal_group_handle,
+                                           int *p_signal_handle_local,
+                                           int *p_signal_handle_remote)
+{
+   return qurt_qdi_handle_invoke(signal_group_handle,
+                                 QDI_SIGNAL_GROUP_SIGNAL_CREATE,
+                                 p_signal_handle_local,
+                                 p_signal_handle_remote);
+}
+
+/**@ingroup func_qurt_qdi_signal_set
+   Sets the signal in the specified signal object.
+
+   @param signal_handle Handle of the signal.
+
+   @return
+   Always returns 0.
+
+   @dependencies
+   None.
+*/
+static __inline int qurt_qdi_signal_set(int signal_handle)
+{
+   return qurt_qdi_handle_invoke(signal_handle,
+                                 QDI_SIGNAL_SET);
+}
+
+/**@ingroup func_qurt_qdi_signal_clear
+   Clears the signal in the specified signal object.
+
+   @param signal_handle Handle of the signal.
+
+   @return
+   Always returns 0.
+
+   @dependencies
+   None.
+*/
+static __inline int qurt_qdi_signal_clear(int signal_handle)
+{
+   return qurt_qdi_handle_invoke(signal_handle,
+                                 QDI_SIGNAL_CLEAR);
+}
+
+/**@ingroup func_qurt_qdi_signal_wait
+   Suspends the current thread until the specified signal is set.
+   If a signal is set in a signal object, and a thread waits on the signal object, the
+   thread is awakened. If the awakened thread has higher priority than the current thread, a
+   context switch may occur.
+
+   @param signal_handle Handle of the signal.
+
+   @return
+   If the client is remote:
+   QURT_EOK -- Wait complete. \n
+   QURT_ECANCEL -- Wait cancelled.\n
+   If the client is local, returns a 32-bit word with current signals.
+
+   @dependencies
+   None.
+*/
+static __inline int qurt_qdi_signal_wait(int signal_handle)
+{
+   return qurt_qdi_handle_invoke(signal_handle,
+                                 QDI_SIGNAL_WAIT);
+}
+
+/**@ingroup func_qurt_qdi_signal_poll
+   Returns a value that indicates if the specified signal is set.
+
+   @param signal_handle Handle of the signal.
+
+   @return
+   1 -- Signal is set. \n
+   0 -- Signal is not set.
+
+   @dependencies
+   None.
+*/
+static __inline int qurt_qdi_signal_poll(int signal_handle)
+{
+   return qurt_qdi_handle_invoke(signal_handle,
+                                 QDI_SIGNAL_POLL);
+}
+
+/**@ingroup func_qurt_qdi_devname_register
+   Registers a QDI device with the generic QDI object in the
+   current QDI context.
+
+   This function registers an exact name or a directory prefix with a QDI opener object.
+   Future invocations of qurt_qdi_open() in the context of the caller invoke the
+   opener object if a match is detected.
+
+   Directory prefix names are specified by ending the name with a forward slash character.
+
+   Example of an exact name:
+   @code qurt_qdi_devname_register("/dev/foobar", foobar_opener);@endcode
+
+   Example of a directory prefix:
+   @code qurt_qdi_devname_register("/pipedev/", pipedev_opener);@endcode
+
+   Given the two registrations shown above, the only qurt_qdi_open() requests
+   directed to the foobar_opener object are requests for the exact name
+   "/dev/foobar". Any request beginning with "/pipedev/" is directed to the
+   pipedev_opener object.
+
+   The pipedev invocation function presumably examines the name argument to
+   determine exactly how to handle the request. The name is passed to the invocation
+   function in the a1.ptr argument (Section @xref{sec:invocationFunction}).
+
+   @param name   Device name or device name prefix.
+   @param opener Pointer to the opener object for the device.
+
+   @return
+   0 -- Device was successfully registered. \n
+   Negative error code -- Device was not registered.
+
+   @dependencies
+   None.
+ */
+static __inline int qurt_qdi_devname_register(const char *name,
+                                              qurt_qdi_obj_t *opener)
+{
+   return qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC,
+                                 QDI_DEVNAME_REGISTER,
+                                 name,
+                                 opener);
+}
+
+// Macros for backward compatibility with deprecated APIs
+// (These will go away soon)
+
+#define qurt_qdi_register_devname(name, opener) \
+   qurt_qdi_devname_register((name), (void *)(opener))
+#define qurt_qdi_new_handle_from_obj_t(handle, obj) \
+   qurt_qdi_handle_create_from_obj_t((handle), (obj))
+#define qurt_qdi_release_handle(client_handle, handle) \
+   qurt_qdi_handle_release((client_handle), (handle))
+#define qurt_qdi_lock_buffer(handle, buf, len, perms, obuf) \
+   qurt_qdi_buffer_lock((handle), (buf), (len), (perms), (obuf))
+#define qurt_qdi_usermalloc(handle, size) \
+   qurt_qdi_user_malloc((handle), (size))
+#define qurt_qdi_userfree(handle, ptr) \
+   qurt_qdi_user_free((handle), (ptr))
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_qdi_ext.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_qdi_ext.h
new file mode 100755
index 0000000000000..383e1799a15d6
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_qdi_ext.h
@@ -0,0 +1,58 @@
+#ifndef QURT_QDI_EXT_H
+#define QURT_QDI_EXT_H
+
+/**
+  @file qurt_qdi_ext.h
+  @brief Definitions, macros, and prototypes used when writing a
+  QDI driver
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2018, 2019-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+#include "qurt_qdi_driver.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct qurt_qdi_ext_device {
+   qurt_qdi_ext_obj_info_ptr qurt_qdi_ext_obj_info_head;
+   struct qurt_qdi_ext_device * next;
+   char * instance;
+   fdt_node_handle context;
+};
+typedef struct qurt_qdi_ext_device *qurt_qdi_ext_device_ptr;
+
+/**@ingroup func_qurt_qdi_dt_register
+   Registers a QDI device with the generic QDI object in the current QDI context,
+   if and only if a compatible device node is found in the device tree. This
+   function serves as a device tree aware wrapper for qurt_qdi_devname_register().
+
+   @param name   Device name or device name prefix.
+   @param opener Pointer to QDI ext specialized opener object for the driver.
+
+   @return
+   0 -- Device was successfully registered.
\n + Negative error code -- Device was not registered. +*/ +static __inline int qurt_qdi_dt_register(const char *name, qurt_qdi_obj_t *opener) +{ + return qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC, QDI_DT_REGISTER, name, opener); +} + +static inline void qurt_qdi_ext_deviceobj_set_name (struct qurt_qdi_ext_device * device, char * name) +{ + device->instance = name; +} + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_qdi_imacros.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_qdi_imacros.h new file mode 100755 index 0000000000000..c0a8448ac87f8 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_qdi_imacros.h @@ -0,0 +1,34 @@ +#ifndef QURT_QDI_IMACROS_H +#define QURT_QDI_IMACROS_H + +/** + @file qurt_qdi_imacros.h + @brief Internal macros used for QDI. Mostly consists of tricky (and ugly) + preprocessor hacks that permit us to do varargs function invocations + where we pass optional arguments in registers and where we can do + type casting and checking automatically. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +#define _QDMPASTE(a,b) _QDMPASTE_(a,b) +#define _QDMPASTE_(a,b) a##b +#define _QDMCNT(...) _QDMCNT_(__VA_ARGS__,12,11,10,9,8,7,6,5,4,3,2,1,0) +#define _QDMCNT_(a,b,c,d,e,f,g,h,i,j,k,l,cnt,...) cnt + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_qdi_proxy.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_qdi_proxy.h new file mode 100755 index 0000000000000..f1d8992ea8811 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_qdi_proxy.h @@ -0,0 +1,55 @@ +/*============================================================================= + + qurt_qdi_proxy.h + +GENERAL DESCRIPTION + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved. 
+=============================================================================*/
+#ifndef _QURT_QDI_PROXY_H
+#define _QURT_QDI_PROXY_H
+
+#include "qurt_qdi_driver.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* APIs allowing operation on the proxy object directly */
+int qurt_qdi_proxy_ref_create(void);
+
+/* APIs that operate on a proxy given a known proxy handle
+ * 1) using the QDI handle of the object
+ *    successful return: QURT_EOK, anything else -- failure
+ */
+int qurt_qdi_proxy_ref_add_by_handle(int proxy_handle, int qdi_handle);
+int qurt_qdi_proxy_ref_sub_by_handle(int proxy_handle, int qdi_handle);
+
+/* 2) using an object reference
+ *    successful return: QURT_EOK, anything else -- failure
+ */
+int qurt_qdi_proxy_ref_add_by_object(int proxy_handle, qurt_qdi_obj_t *obj_ptr);
+int qurt_qdi_proxy_ref_sub_by_object(int proxy_handle, qurt_qdi_obj_t *obj_ptr);
+
+/* API that associates a proxy object with a particular client, given a client handle
+ * successful return: QURT_EOK, anything else -- failure
+ */
+int qurt_client_proxy_ref_install (int client_handle, int proxy_handle);
+
+/* APIs that operate on a proxy object from a user client
+ * successful return: QURT_EOK, anything else -- failure
+ */
+int qurt_client_proxy_ref_add(int qdi_handle);
+int qurt_client_proxy_ref_remove(int qdi_handle);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_QDI_PROXY_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_rmutex.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_rmutex.h
new file mode 100755
index 0000000000000..a013a0bbddb1d
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_rmutex.h
@@ -0,0 +1,200 @@
+#ifndef QURT_RMUTEX_H
+#define QURT_RMUTEX_H
+/**
+  @file qurt_rmutex.h
+  Prototypes of rmutex API.
+
+EXTERNAL FUNCTIONS
+   None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2013 - 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+#include
+#include
+
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup func_qurt_rmutex_init
+   Initializes a recursive mutex object.
+   The recursive mutex is initialized in the unlocked state.
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[out] lock Pointer to the recursive mutex object.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+void qurt_rmutex_init(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_rmutex_destroy
+   Destroys the specified recursive mutex. \n
+   @note1hang Recursive mutexes must not be destroyed while they are still in use. If this
+              occurs, the behavior of QuRT is undefined.
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[in] lock Pointer to the recursive mutex object to destroy.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+
+ */
+void qurt_rmutex_destroy(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_rmutex_lock
+   Locks the specified recursive mutex. \n
+
+   If a thread performs a lock operation on a mutex that is not in use, the thread
+   gains access to the shared resource that the mutex protects, and continues executing.
+
+   If a thread performs a lock operation on a mutex that is already in use by another
+   thread, the thread is suspended.
When the mutex becomes available again (because the
+   other thread has unlocked it), the thread is awakened and given access to the shared resource.
+
+   @note1hang A thread is not suspended if it locks a recursive mutex that it has already
+              locked. However, the mutex does not become available to other threads until the
+              thread performs a balanced number of unlocks on the mutex.
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[in] lock Pointer to the recursive mutex object to lock.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+
+ */
+void qurt_rmutex_lock(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_rmutex_lock_timed
+   Locks the specified recursive mutex. The wait is terminated when the specified timeout expires.\n
+
+   If a thread performs a lock operation on a mutex that is not in use, the thread
+   gains access to the shared resource that the mutex is protecting, and continues executing.
+
+   If a thread performs a lock operation on a mutex that is already in use by another
+   thread, the thread is suspended. When the mutex becomes available again (because the
+   other thread has unlocked it), the thread is awakened and given access to the shared resource.
+
+   @note1hang A thread is not suspended if it locks a recursive mutex that it has already
+              locked. However, the mutex does not become available to other threads until the
+              thread performs a balanced number of unlocks on the mutex.
+              If the timeout expires, the wait is terminated and no access to the mutex is granted.
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[in] lock     Pointer to the recursive mutex object to lock.
+   @param[in] duration Interval (in microseconds); the duration value must be between #QURT_TIMER_MIN_DURATION and
+                       #QURT_TIMER_MAX_DURATION.
+
+   @return
+   #QURT_EOK -- Success. \n
+   #QURT_ETIMEDOUT -- Timeout.
+
+   @dependencies
+   None.
+
+ */
+int qurt_rmutex_lock_timed(qurt_mutex_t *lock, unsigned long long int duration);
+
+/**@ingroup func_qurt_rmutex_unlock
+   Unlocks the specified recursive mutex. \n
+   More than one thread can be suspended on a mutex. When the mutex is
+   unlocked, the thread waiting on the mutex awakens. If the awakened
+   thread has higher priority than the current thread, a context switch occurs.
+
+   @note1hang When a thread unlocks a recursive mutex, the mutex is not available until
+              the balanced number of locks and unlocks has been performed on the mutex.
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[in] lock Pointer to the recursive mutex object to unlock.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+
+ */
+void qurt_rmutex_unlock(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_rmutex_try_lock
+   Attempts to lock the specified recursive mutex.\n
+
+   If a thread performs a try_lock operation on a recursive mutex that is not in use, the
+   thread gains access to the shared resource that is protected by the mutex, and continues
+   executing.\n
+   If a thread performs a try_lock operation on a recursive mutex that another thread has
+   already locked, qurt_rmutex_try_lock immediately returns with a nonzero result
+   value.
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[in] lock Pointer to the recursive mutex object to lock.
+
+   @return
+   0 -- Success. \n
+   Nonzero -- Failure.
+
+ */
+int qurt_rmutex_try_lock(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_rmutex_try_lock_block_once
+   Attempts to lock a mutex object recursively. If the mutex is available,
+   it locks the mutex. If the mutex is held by the current thread,
+   it increases the internal counter and returns 0. If not, it returns a
+   nonzero value.
+   If the mutex is already locked by another thread, the caller thread is
+   suspended. When the mutex becomes available again (because the other
+   thread has unlocked it), the caller thread is awakened and tries to lock
+   the mutex; if that attempt fails, this function returns a nonzero value,
+   and if it succeeds, it returns zero.
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[in] lock Pointer to the qurt_mutex_t object.
+
+   @return
+   0 -- Success. \n
+   Nonzero -- Failure.
+
+   @dependencies
+   None.
+ */
+int qurt_rmutex_try_lock_block_once(qurt_mutex_t *lock);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_RMUTEX_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_rmutex2.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_rmutex2.h
new file mode 100755
index 0000000000000..a37e7e4458c4b
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_rmutex2.h
@@ -0,0 +1,183 @@
+#ifndef QURT_RMUTEX2_H
+#define QURT_RMUTEX2_H
+/**
+  @file qurt_rmutex2.h
+  @brief Prototypes of rmutex2 API
+
+EXTERNAL FUNCTIONS
+   None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2013, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup mutex_types
+@{ */
+/*=============================================================================
+                        TYPEDEFS
+=============================================================================*/
+
+/** QuRT rmutex2 type.
+    Mutex type used with rmutex2 APIs.
+ */
+typedef struct {
+   /** @cond */
+   unsigned int holder __attribute__((aligned(8)));  /* UGP value of the mutex holder. */
+   unsigned short waiters;      /* Number of waiting threads. */
+   unsigned short refs;         /* Number of references to this mutex. */
+   unsigned int queue;          /* Kernel-maintained futex queue value. */
+   unsigned int excess_locks;   /* Number of excess times the holder has locked the mutex. */
+   /** @endcond */
+} qurt_rmutex2_t;
+/** @} */ /* end_addtogroup mutex_types */
+/** @cond internal_only*/
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_rmutex2_init
+
+   @deprecated use #qurt_rmutex_init instead.
+
+   Initializes a recursive mutex object.
+
+   The recursive mutex is initially unlocked.
+
+   Objects of type rmutex2 solve a potential race condition between
+   unlock() and destroy() operations.
+
+   @datatypes
+   #qurt_rmutex2_t
+
+   @param[out] lock Pointer to the recursive mutex object.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+void qurt_rmutex2_init(qurt_rmutex2_t *lock);
+
+/**@ingroup func_qurt_rmutex2_destroy
+
+   @deprecated use #qurt_rmutex_destroy instead.
+
+   Destroys the specified recursive mutex. \n
+   @note1hang Recursive mutexes must not be destroyed while they are still in use. If this
+              occurs, the behavior of QuRT is undefined.
+   @note1cont In general, application code must destroy an rmutex2 object prior to
+              deallocating it; calling qurt_rmutex2_destroy() before deallocating it ensures
+              that all qurt_rmutex2_unlock() calls complete.
+
+   @datatypes
+   #qurt_rmutex2_t
+
+   @param[in] lock Pointer to the recursive mutex object to destroy.
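+
+   For illustration only (not part of the SDK header), the typical lifecycle,
+   including the balanced lock/unlock rule for recursion, looks like this:
+   @code
+   qurt_rmutex2_t m;
+   qurt_rmutex2_init(&m);
+   qurt_rmutex2_lock(&m);      // first lock: thread owns the mutex
+   qurt_rmutex2_lock(&m);      // same thread: does not suspend, counts an excess lock
+   qurt_rmutex2_unlock(&m);    // still held; unlock count not yet balanced
+   qurt_rmutex2_unlock(&m);    // now released to other threads
+   qurt_rmutex2_destroy(&m);   // only after all users are done
+   @endcode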
+ + @return + None. + + @dependencies + None. + + */ +void qurt_rmutex2_destroy(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_rmutex2_lock + + @deprecated use #qurt_rmutex_lock instead. + + Locks the specified recursive mutex. \n + + If a thread performs a lock operation on a recursive mutex that is not in use, the + thread gains access to the shared resource that the mutex protects, and continues + to execute. + + If a thread performs a lock operation on a recursive mutex that another thread is using, + the thread is suspended. When the mutex becomes available again + (because the other thread has unlocked it), the thread is awakened and given access to the + shared resource. + + @note1hang A thread is not suspended if it locks a recursive mutex that it has already + locked, but the mutex does not become available until the thread performs a + balanced number of unlocks on the mutex. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to lock. + + @return + None. + + @dependencies + None. + + */ +void qurt_rmutex2_lock(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_rmutex2_unlock + + @deprecated use #qurt_rmutex_unlock instead. + + Unlocks the specified recursive mutex. \n + More than one thread can be suspended on a recursive mutex. When the mutex is + unlocked, only the highest-priority thread waiting on the mutex awakens. If the + awakened thread has higher priority than the current thread, a context switch occurs. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to unlock. + + @return + None. + + @dependencies + None. + + */ +void qurt_rmutex2_unlock(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_rmutex2_try_lock + + @deprecated use #qurt_rmutex_try_lock instead. + + Attempts to lock the specified recursive mutex.\n + + Non-blocking version of qurt_rmutex2_lock(). When a call to qurt_rmutex2_lock() + succeeds immediately, this function behaves similarly, returning 0 for success. + When a call to qurt_rmutex2_lock() does not succeed immediately, this function has + no effect and returns nonzero for failure. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to lock. + + @return + 0 -- Success. \n + Nonzero -- Failure. + + */ +int qurt_rmutex2_try_lock(qurt_rmutex2_t *lock); +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_RMUTEX2_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_sclk.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_sclk.h new file mode 100755 index 0000000000000..a83cf5f1db889 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_sclk.h @@ -0,0 +1,145 @@ +#ifndef QURT_SCLK_H +#define QURT_SCLK_H +/** + @file qurt_sclk.h + @brief Header file describing the APIs supported by QuRT system SCLK + feature. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2018-2021, 2023 Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. 
+
+=============================================================================*/
+
+
+
+
+/*=============================================================================
+
+                        INCLUDE FILES
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+
+/**
+   Conversion from microseconds to sleep timer ticks (19.2 ticks per microsecond).
+ */
+#define QURT_SYSCLOCK_TIMETICK_FROM_US(us) ((us) * 192ULL / 10UL)
+#define qurt_sysclock_timetick_from_us(us) QURT_SYSCLOCK_TIMETICK_FROM_US(us)
+
+
+/**
+   Conversion from timer ticks to microseconds at the nominal frequency.
+*/
+#define QURT_SYSCLOCK_TIMETICK_TO_US(ticks) qurt_timer_timetick_to_us(ticks)
+
+/**
+   Maximum Qtimer duration, 1,042,499 hours, expressed in microseconds.
+*/
+#define QURT_SYSCLOCK_MAX_DURATION (1042499uLL * 3600uLL * 1000uLL * 1000uLL)
+#define qurt_sysclock_max_duration() QURT_SYSCLOCK_MAX_DURATION
+/**
+   Maximum Qtimer duration expressed in ticks; the Qtimer clock runs at 19.2 MHz.
+*/
+#define QURT_SYSCLOCK_MAX_DURATION_TICKS (1042499uLL * 3600uLL * 19200000uLL)
+#define qurt_sysclock_max_duration_ticks() QURT_SYSCLOCK_MAX_DURATION_TICKS
+/**
+   Sleep timer error margin for Qtimer: 192 ticks, approximately 10 us.
+*/
+#define QURT_SYSCLOCK_ERROR_MARGIN 192U //QURT_TIMER_MIN_DURATION*timer_freq;
+#define qurt_sysclock_error_margin() QURT_SYSCLOCK_ERROR_MARGIN
+
+/*=============================================================================
+
+                        DATA DECLARATIONS
+
+=============================================================================*/
+
+/**@ingroup func_qurt_sysclock_get_hw_ticks
+   @xreflabel{sec:qurt_sysclock_get_hw_ticks}
+   Gets the hardware tick count.\n
+   Returns the current value of a 64-bit hardware counter. The value wraps around to zero
+   when it exceeds the maximum value.
+
+   @note1hang This operation must be used with care because of the wrap-around behavior.
+
+   @return
+   Integer -- Current value of the 64-bit hardware counter.
+
+   @dependencies
+   None.
+ */
+unsigned long long qurt_sysclock_get_hw_ticks (void);
+
+
+/**@ingroup func_qurt_sysclock_get_hw_ticks_32
+   @xreflabel{sec:qurt_sysclock_get_hw_ticks_32}
+   Gets the hardware tick count in 32 bits.\n
+   Returns the current value of a 32-bit hardware counter. The value wraps around to zero
+   when it exceeds the maximum value.
+
+   @note1hang This operation is implemented as an inline C function, and should be called from a C/C++ program.
+              The returned 32 bits are the lower 32 bits of the Qtimer counter.
+
+   @return
+   Integer -- Current value of the 32-bit timer counter.
+
+   @dependencies
+   None.
+ */
+static inline unsigned long qurt_sysclock_get_hw_ticks_32 (void)
+{
+    //Beginning with v61 there is a HW register that can be read directly.
+    unsigned long count;
+    __asm__ __volatile__ (" %0 = c30 " : "=r"(count));
+    return count;
+}
+
+
+/**@ingroup func_qurt_sysclock_get_hw_ticks_16
+   @xreflabel{sec:qurt_sysclock_get_hw_ticks_16}
+   Gets the hardware tick count in 16 bits.\n
+   Returns the current value of a 16-bit timer counter. The value wraps around to zero
+   when it exceeds the maximum value.
+
+   @note1hang This operation is implemented as an inline C function, and should be called from a C/C++ program.
+              The returned 16 bits are based on the value of the lower 32 bits in the Qtimer
+              counter, right shifted by 16 bits.
+
+   @return
+   Integer -- Current value of the 16-bit timer counter, calculated from the lower 32 bits in the
+              Qtimer counter, right shifted by 16 bits.
+
+   @dependencies
+   None.
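+
+   For illustration only (not part of the SDK header), elapsed time over a code
+   region can be measured with the 64-bit counter and converted to microseconds:
+   @code
+   unsigned long long t0 = qurt_sysclock_get_hw_ticks();
+   // ... work being timed ...
+   unsigned long long t1 = qurt_sysclock_get_hw_ticks();
+   unsigned long long us = qurt_sysclock_timetick_to_us(t1 - t0);   // 19.2 MHz ticks to us
+   @endcode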
+ */
+
+
+static inline unsigned short qurt_sysclock_get_hw_ticks_16 (void)
+{
+    unsigned long ticks;
+
+    //Beginning with v61 there is a HW register that can be read directly.
+    __asm__ __volatile__ (" %0 = c30 " : "=r"(ticks));
+    __asm__ __volatile__ ( "%0 = lsr(%0, #16) \n" :"+r"(ticks));
+
+    return (unsigned short)ticks;
+}
+unsigned long long qurt_timer_timetick_to_us(unsigned long long ticks);
+#define qurt_sysclock_timetick_to_us(ticks) qurt_timer_timetick_to_us(ticks)
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif /* __cplusplus */
+
+#endif /* QURT_SCLK_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_secure_proc.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_secure_proc.h
new file mode 100755
index 0000000000000..f40c7deb9bca1
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_secure_proc.h
@@ -0,0 +1,53 @@
+#ifndef QURT_SECURE_PROC_H
+#define QURT_SECURE_PROC_H
+
+/**
+  @file qurt_secure_proc.h
+  @brief Definitions, macros, and prototypes used for handling a secure process
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2015, 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup qurt_process_migrate_secure_process
+   Migrates the user process to a QuRT secure process.
+
+   @param secure_phy_address Physical starting address of the secure memory.
+   @param secure_memory_size Size of the secure memory.
+   @param entry              Entry function of the secure process.
+
+   @return
+   EOK -- Success. \n
+   Negative return value -- Error.
+
+   @dependencies
+   None.
+*/
+int qurt_process_migrate_secure_process(unsigned long long secure_phy_address, unsigned int secure_memory_size, void entry(unsigned));
+
+/**@ingroup qurt_process_get_migration_mem_size
+   Gets the size of all writable memory regions in a user PD, in preparation for
+   secure process migration.
+
+   @return
+   Size of all writable memory regions in a user PD.
+
+   @dependencies
+   None.
+*/
+int qurt_process_get_migration_mem_size(void);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_sem.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_sem.h
new file mode 100755
index 0000000000000..ee5ce4b2d94ab
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_sem.h
@@ -0,0 +1,252 @@
+#ifndef QURT_SEM_H
+#define QURT_SEM_H
+/**
+  @file qurt_sem.h
+  Prototypes of semaphore API.
+
+EXTERNAL FUNCTIONS
+   None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+                        TYPEDEFS
+=============================================================================*/
+/** @addtogroup semaphore_types
+@{ */
+
+/** QuRT semaphore type.
*/ +typedef union { + /** @cond */ + unsigned int raw[2] __attribute__((aligned(8))); + struct { + unsigned short val; /**< */ + unsigned short n_waiting; /**< */ + unsigned int reserved1; /**< */ + unsigned int queue; /**< */ + unsigned int reserved2; /**< */ + }X; /** @endcond */ +} qurt_sem_t; +/** @} */ /* end_addtogroup semaphore_types */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_sem_add + Releases access to a shared resource (the specified amount increments the semaphore count value).\n + When a thread performs an add operation on a semaphore, the specified value increments the semaphore count. + The result depends on the number of threads waiting + on the semaphore: \n + - When no threads are waiting, the current thread releases access to the shared resource + and continues executing. \n + - When one or more threads are waiting and the semaphore count value is nonzero, + the kernel repeatedly awakens the highest-priority waiting thread and decrements + the semaphore count value until either no waiting threads remain or the + semaphore count value is zero. If any of the awakened threads has higher priority + than the current thread, a context switch can occur. + + @datatypes + #qurt_sem_t + + @param[in] sem Pointer to the semaphore object to access. + @param[in] amt Amount to increment the semaphore count value. + + @return + Unused integer value. + + @dependencies + None. + + */ +int qurt_sem_add(qurt_sem_t *sem, unsigned int amt); + +/**@ingroup func_qurt_sem_up + Releases access to a shared resource. When a thread performs an up operation on a semaphore, + the semaphore count value increments. The result depends on the number of threads waiting + on the semaphore: \n + - When no threads are waiting, the current thread releases access to the shared resource + and continues executing.\n + - When one or more threads are waiting and the semaphore count value is nonzero, + the kernel awakens the highest-priority waiting thread and decrements the + semaphore count value. If the awakened thread has higher priority than the current + thread, a context switch can occur. + + @datatypes + #qurt_sem_t + + @param[in] sem Pointer to the semaphore object to access. + + @return + Unused integer value. + + @dependencies + None. + */ +static inline int qurt_sem_up(qurt_sem_t *sem) { return qurt_sem_add(sem,1); } + +/**@ingroup func_qurt_sem_down + Requests access to a shared resource. When a thread performs a down operation on a + semaphore, the result depends on the semaphore count value: \n + - When the count value is nonzero, it is decremented, and the thread gains access to the + shared resource and continues executing.\n + - When the count value is zero, it is not decremented, and the thread is suspended on the + semaphore. When the count value becomes nonzero (because another thread + released the semaphore) it is decremented, and the suspended thread is awakened + and gains access to the shared resource. + + @datatypes + #qurt_sem_t + + @param[in] sem Pointer to the semaphore object to access. + + @return + Unused integer value. + + @dependencies + None. 
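+
+  A minimal producer/consumer sketch (illustrative only; assumes the
+  semaphore is shared between two threads, and error handling is omitted):
+
+  @code
+  qurt_sem_t work_ready;
+
+  // Initialization (once): start with a count of 0 so the consumer blocks.
+  qurt_sem_init_val(&work_ready, 0);
+
+  // Producer thread: publish one work item, then release the semaphore.
+  qurt_sem_up(&work_ready);
+
+  // Consumer thread: block until at least one work item is available.
+  qurt_sem_down(&work_ready);
+
+  // Teardown (once both threads are done with the semaphore).
+  qurt_sem_destroy(&work_ready);
+  @endcode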
+ */
+int qurt_sem_down(qurt_sem_t *sem);
+
+/**@ingroup func_qurt_sem_down_timed
+  When a thread performs a down operation on a semaphore, the result depends on the
+  semaphore count value: \n
+  - When the count value is nonzero, it is decremented, and the thread gains access to the
+    shared resource and continues executing.\n
+  - When the count value is zero, it is not decremented, and the thread is suspended on the
+    semaphore. When the count value becomes nonzero (because another thread
+    released the semaphore) it is decremented, and the suspended thread is awakened
+    and gains access to the shared resource. The wait is terminated when the specified
+    timeout expires; in that case the thread is awakened without gaining access to the
+    shared resource.
+
+  @datatypes
+  #qurt_sem_t
+
+  @param[in] sem       Pointer to the semaphore object to access.
+  @param[in] duration  Interval (in microseconds); the duration value must be between
+                       #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+
+  @return
+  #QURT_EOK -- Success \n
+  #QURT_ETIMEDOUT -- Timeout
+
+  @dependencies
+  None.
+ */
+int qurt_sem_down_timed(qurt_sem_t *sem, unsigned long long int duration);
+
+/**@ingroup func_qurt_sem_try_down
+  @xreflabel{hdr:qurt_sem_try_down}
+  Requests access to a shared resource (without suspend). When a thread performs a try down
+  operation on a semaphore, the result depends on the semaphore count value: \n
+  - The count value is decremented when it is nonzero. The down operation returns 0 as
+    the function result, and the thread gains access to the shared resource and is free to
+    continue executing.\n
+  - The count value is not decremented when it is zero. The down operation returns -1
+    as the function result, and the thread does not gain access to the shared resource
+    and should not continue executing.
+
+  @datatypes
+  #qurt_sem_t
+
+  @param[in] sem Pointer to the semaphore object to access.
+
+  @return
+  0 -- Success. \n
+  -1 -- Failure.
+
+  @dependencies
+  None.
+
+ */
+int qurt_sem_try_down(qurt_sem_t *sem);
+
+/**@ingroup func_qurt_sem_init
+  Initializes a semaphore object.
+  The default initial value of the semaphore count value is 1.
+
+  @param[out] sem Pointer to the initialized semaphore object.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_sem_init(qurt_sem_t *sem);
+
+/**@ingroup func_qurt_sem_destroy
+  Destroys the specified semaphore.\n
+  @note1hang Semaphores must be destroyed when they are no longer in use. Failure to do
+             this causes resource leaks in the QuRT kernel.\n
+  @note1cont Semaphores must not be destroyed while they are still in use. If this occurs,
+             the behavior of QuRT is undefined.
+
+  @datatypes
+  #qurt_sem_t
+
+  @param[in] sem Pointer to the semaphore object to destroy.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_sem_destroy(qurt_sem_t *sem);
+
+/**@ingroup func_qurt_sem_init_val
+  Initializes a semaphore object with the specified value.
+
+  @datatypes
+  #qurt_sem_t
+
+  @param[out] sem Pointer to the initialized semaphore object.
+  @param[in]  val Initial value of the semaphore count value.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_sem_init_val(qurt_sem_t *sem, unsigned short val);
+
+/**@ingroup func_qurt_sem_get_val
+  Gets the semaphore count value.\n
+  Returns the current count value of the specified semaphore.
+
+  @datatypes
+  #qurt_sem_t
+
+  @param[in] sem Pointer to the semaphore object to access.
+
+  @return
+  Integer semaphore count value.
+
+  @dependencies
+  None.
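+
+  The non-blocking and timed down operations declared above follow the same
+  pattern; a combined sketch (illustrative only; the duration must respect the
+  #QURT_TIMER_MIN_DURATION/#QURT_TIMER_MAX_DURATION range, and #QURT_ETIMEDOUT
+  is assumed to come from the QuRT error headers):
+
+  @code
+  qurt_sem_t sem;
+  qurt_sem_init(&sem);                     // count defaults to 1
+
+  if (qurt_sem_try_down(&sem) == 0) {
+      // Acquired without blocking.
+  }
+
+  // Wait up to 1000 microseconds for the resource.
+  if (qurt_sem_down_timed(&sem, 1000ULL) == QURT_ETIMEDOUT) {
+      // Timed out; the semaphore was not acquired.
+  }
+  @endcode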
+ */
+static inline unsigned short qurt_sem_get_val(qurt_sem_t *sem){return sem->X.val;}
+int qurt_sem_down_cancellable(qurt_sem_t *sem);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_SEM_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_shmem.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_shmem.h
new file mode 100755
index 0000000000000..980557323708a
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_shmem.h
@@ -0,0 +1,89 @@
+#ifndef QURT_SHMEM_H
+#define QURT_SHMEM_H
+
+/**
+  @file qurt_shmem.h
+
+  @brief
+  Prototypes of QuRT inter-process shared memory APIs
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef MODE_T
+#define MODE_T
+typedef unsigned int mode_t;
+#endif //MODE_T
+
+/**
+ * The shm_open() function establishes a connection between a shared memory object and a file descriptor.
+ * The file descriptor is used by other functions such as mmap() to refer to that shared memory object.
+ *
+ *
+ * @param name  Pointer to a string naming the shared memory object. The name must start with "/shm/".
+ * @param oflag File status flags and file access modes of the open file description. The following
+ *              flags are defined and supported:
+ *              O_RDONLY: Open for read access only
+ *              O_RDWR: Open for read or write access
+ *              O_CREAT: If the shared memory object does not exist, create it.
+ * @param mode  Permission flags (currently ignored)
+ *
+ * @return      File descriptor (positive number) if the operation is successful.
+ *              Negative error code if failed
+ *
+*/
+
+int shm_open(const char * name, int oflag, mode_t mode);
+
+/**
+ * The shm_mmap() function creates a shared memory mapping in the virtual address space of
+ * the calling process.
+ *
+ * @param addr   The starting address for the new mapping is specified in addr.
+ * @param len    Specifies the length of the shared memory region.
+ * @param prot   Describes the desired memory protection of the mapping. Same as in POSIX mmap().
+ * @param flags  Determines whether updates to the mapping are visible to other processes. Same as
+ *               in POSIX mmap().
+ * @param fd     File descriptor of the shared memory object to map.
+ * @param offset Unused.
+ *
+ * @return       The starting address for the new mapping is returned.
+ *               Negative error code if failed
+ *
+*/
+
+void *shm_mmap(void *addr, unsigned int len, int prot, int flags, int fd, unsigned int offset);
+
+/**
+ * The shm_close() function removes a connection between a shared memory object and a file descriptor.
+ * If no file descriptor remains connected to a shared memory object, the shared memory object is
+ * deleted automatically. A shared memory object has the same virtual address in every process; this
+ * is a restriction of the single virtual address space.
+ *
+ *
+ * @param fd File descriptor of shared memory object
+ *
+ * @return 0 if operation successful.
+ * negative error code if failed + * +*/ + + +int shm_close(int fd); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_signal.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_signal.h new file mode 100755 index 0000000000000..3a89c53394ad5 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_signal.h @@ -0,0 +1,518 @@ +#ifndef QURT_SIGNAL_H +#define QURT_SIGNAL_H + +/** + @file qurt_signal.h + @brief Prototypes of kernel signal API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup signals_types +@{ */ +#define QURT_SIGNAL_ATTR_WAIT_ANY 0x00000000 /**< Wait any. */ +#define QURT_SIGNAL_ATTR_WAIT_ALL 0x00000001 /**< Wait all. */ + +/*===================================================================== + Typedefs + ======================================================================*/ + + +/** QuRT signal type. + */ +typedef union { + /** @cond */ + unsigned long long int raw; + struct { + unsigned int signals; + unsigned int waiting; + unsigned int queue; + unsigned int attribute; + }X; + /** @endcond */ +} qurt_signal_t; + + +/** QuRT 64-bit signal type. + */ +typedef struct { + /** @cond */ + qurt_signal_t signal_sum; + unsigned long long signals; + unsigned long long waiting; + /** @endcond */ +} qurt_signal_64_t; +/** @} */ /* end_addtogroup signals_types */ +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_signal_init + Initializes a signal object. + Signal returns the initialized object. + The signal object is initially cleared. + + @note1hang Each signal-based object has one or more kernel resources associated with it; + to prevent resource leaks, call qurt_signal_destroy() + when this object is not used anymore + @datatypes + #qurt_signal_t + + @param[in] *signal Pointer to the initialized object. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_init(qurt_signal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_destroy + Destroys the specified signal object. + + @note1hang Signal objects must be destroyed when they are no longer in use. Failure + to do this causes resource leaks in the QuRT kernel.\n + @note1cont Signal objects must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + + @datatypes + #qurt_signal_t + + @param[in] *signal Pointer to the signal object to destroy. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_destroy(qurt_signal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_wait + @xreflabel{hdr:qurt_signal_wait} + Suspends the current thread until the specified signals are set. 
+ + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates + waiting on a signal, and 0 indicates not waiting on the signal. + + If a thread is waiting on a signal object for any of the specified set of signals to set, + and one or more of those signals is set in the signal object, the thread is awakened. + + If a thread is waiting on a signal object for all of the specified set of signals to be set, + and all of those signals are set in the signal object, the thread is awakened. + + The specified set of signals can be cleared when the signal is set. + + @note1hang At most, one thread can wait on a signal object at any given time. + + @datatypes + #qurt_signal_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to + wait on. + @param[in] attribute Indicates whether the thread waits to set any of the signals, or to set all of + them. \n + @note1hang The wait-any and wait-all types are mutually exclusive.\n Values:\n + - #QURT_SIGNAL_ATTR_WAIT_ANY \n + - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend + + @return + A 32-bit word with current signals. + + @dependencies + None. +*/ +/* ======================================================================*/ +unsigned int qurt_signal_wait(qurt_signal_t *signal, unsigned int mask, + unsigned int attribute); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_wait_timed + @xreflabel{hdr:qurt_signal_wait} + Suspends the current thread until the specified signals are set or until timeout. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates + waiting on a signal, and 0 indicates not waiting. + + If a thread is waiting on a signal object for any of the specified set of signals to be set, + and one or more of those signals is set in the signal object, the thread is awakened. + + If a thread is waiting on a signal object for all of the specified set of signals to be set, + and all of those signals are set in the signal object, the thread is awakened. + + The specified set of signals can be cleared after the signal is set. + + @note1hang At most, one thread can wait on a signal object at any given time. + + @datatypes + #qurt_signal_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value that identifies the individual signals in the signal object to wait on. + @param[in] attribute Indicates whether the thread must wait until any of the signals are set, or until all of + them are set. \n + @note1hang The wait-any and wait-all types are mutually exclusive.\n Values:\n + - #QURT_SIGNAL_ATTR_WAIT_ANY \n + - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend + @param[out] signals Bitmask of signals that are set + @param[in] duration Duration (microseconds) to wait. Must be in the range + [#QURT_TIMER_MIN_DURATION ... #QURT_TIMER_MAX_DURATION] + + @return + #QURT_EOK -- Success; one or more signals were set \n + #QURT_ETIMEDOUT -- Timed-out \n + #QURT_EINVALID -- Duration out of range + + @dependencies + Timed-waiting support in the kernel. 
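+
+   A usage sketch for the timed wait (illustrative only; assumes EVENT_RX is
+   an application-defined bit and that the duration is within the documented
+   range):
+
+   @code
+   #define EVENT_RX (1u << 0)
+
+   qurt_signal_t sig;
+   unsigned int fired;
+   int rc;
+
+   qurt_signal_init(&sig);
+
+   // Wait up to 5000 microseconds for EVENT_RX, which another thread
+   // sets with qurt_signal_set(&sig, EVENT_RX).
+   rc = qurt_signal_wait_timed(&sig, EVENT_RX, QURT_SIGNAL_ATTR_WAIT_ANY,
+                               &fired, 5000ULL);
+   if (rc == QURT_EOK) {
+       qurt_signal_clear(&sig, fired);   // waits do not clear the bits
+   } else if (rc == QURT_ETIMEDOUT) {
+       // No signal arrived within the timeout.
+   }
+   @endcode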
+*/
+/* ======================================================================*/
+int qurt_signal_wait_timed(qurt_signal_t *signal, unsigned int mask,
+                unsigned int attribute, unsigned int *signals, unsigned long long int duration);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_wait_any
+   Suspends the current thread until any of the specified signals are set.
+
+   Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates
+   to wait on a signal, and 0 indicates not to wait on it.
+
+   If a thread is waiting on a signal object for any of the specified set of signals to be set,
+   and one or more of those signals is set in the signal object, the thread is awakened.
+
+   @note1hang At most, one thread can wait on a signal object at any given time.
+
+   @datatypes
+   #qurt_signal_t
+
+   @param[in] signal Pointer to the signal object to wait on.
+   @param[in] mask   Mask value identifying the individual signals in the signal object to
+                     wait on.
+
+   @return
+   A 32-bit word with current signals.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+static inline unsigned int qurt_signal_wait_any(qurt_signal_t *signal, unsigned int mask)
+{
+    return qurt_signal_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ANY);
+}
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_wait_all
+   Suspends the current thread until all of the specified signals are set.
+
+   Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates
+   to wait on a signal, and 0 indicates not to wait on it.
+
+   If a thread is waiting on a signal object for all of the specified set of signals to be set,
+   and all of those signals are set in the signal object, the thread is awakened.
+
+   @note1hang At most, one thread can wait on a signal object at any given time.
+
+   @datatypes
+   #qurt_signal_t
+
+   @param[in] signal Pointer to the signal object to wait on.
+   @param[in] mask   Mask value identifying the individual signals in the signal object to
+                     wait on.
+
+   @return
+   A 32-bit word with current signals.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+static inline unsigned int qurt_signal_wait_all(qurt_signal_t *signal, unsigned int mask)
+{
+    return qurt_signal_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ALL);
+}
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_set
+   Sets signals in the specified signal object.
+
+   Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates
+   to set the signal, and 0 indicates not to set it.
+
+   @datatypes
+   #qurt_signal_t
+
+   @param[in] signal Pointer to the signal object to modify.
+   @param[in] mask   Mask value identifying the individual signals to set in the signal
+                     object.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+void qurt_signal_set(qurt_signal_t *signal, unsigned int mask);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_get
+   Gets a signal from a signal object.
+
+   Returns the current signal values of the specified signal object.
+
+   @datatypes
+   #qurt_signal_t
+
+   @param[in] *signal Pointer to the signal object to access.
+ + @return + A 32-bit word with current signals + + @dependencies + None. +*/ +/* ======================================================================*/ +unsigned int qurt_signal_get(qurt_signal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_clear + Clear signals in the specified signal object. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be cleared, and 0 indicates not to clear it. + + @note1hang Signals must be explicitly cleared by a thread when it is awakened -- the wait + operations do not automatically clear them. + + @datatypes + #qurt_signal_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to clear in the signal object. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_clear(qurt_signal_t *signal, unsigned int mask); + +/**@ingroup func_qurt_signal_wait_cancellable + @xreflabel{hdr:qurt_signal_wait_cancellable} + Suspends the current thread until either the specified signals are set or the wait operation is cancelled. + The operation is cancelled if the user process of the calling thread is killed, or if the calling thread + must finish its current QDI invocation and return to user space. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates + that a signal must be waited on, and 0 indicates not to wait on it. + + If a thread is waiting on a signal object for any of the specified set of signals to be set, and one or + more of those signals is set in the signal object, the thread is awakened. + + If a thread is waiting on a signal object for all of the specified set of signals to be set, and all of + those signals are set in the signal object, the thread is awakened. + + @note1hang At most, one thread can wait on a signal object at any given time. + + @note1cont When the operation is cancelled, the caller must assume that the signal is never set. + + @datatypes + #qurt_signal_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to + wait on. + @param[in] attribute Indicates whether the thread must wait until any of the signals are set, or until all of + them are set. Values:\n + - #QURT_SIGNAL_ATTR_WAIT_ANY \n + - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend + @param[out] return_mask Pointer to the 32-bit mask value that was originally passed to the function. + + + @return + #QURT_EOK -- Wait completed. \n + #QURT_ECANCEL -- Wait cancelled. + + + @dependencies + None. +*/ +/* ======================================================================*/ +int qurt_signal_wait_cancellable(qurt_signal_t *signal, unsigned int mask, + unsigned int attribute, + unsigned int *return_mask); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_64_init + Initializes a 64-bit signal object.\n + The signal argument returns the initialized object. + The signal object is initially cleared. + + @note1hang Each signal-based object has one or more kernel resources associated with it; + to prevent resource leaks, call qurt_signal_destroy() + when this object is not used anymore. + @datatypes + #qurt_signal_64_t + + @param[in] signal Pointer to the initialized object. 
+
+   @return
+   None.
+
+   @dependencies
+   None.
+  */
+/* ======================================================================*/
+void qurt_signal_64_init(qurt_signal_64_t *signal);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_64_destroy
+   Destroys the specified signal object.
+
+   @note1hang 64-bit signal objects must be destroyed when they are no longer in use. Failure
+              to do this causes resource leaks in the QuRT kernel.\n
+   @note1cont Signal objects must not be destroyed while they are still in use. If this
+              occurs, the behavior of QuRT is undefined.
+
+   @datatypes
+   #qurt_signal_64_t
+
+   @param[in] signal Pointer to the signal object to destroy.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+  */
+/* ======================================================================*/
+void qurt_signal_64_destroy(qurt_signal_64_t *signal);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_64_wait
+   Suspends the current thread until the specified signals are set.
+
+   Signals are represented as bits 0 through 63 in the 64-bit mask value. A mask bit value of 1 indicates
+   that a signal must be waited on, and 0 indicates not to wait on it.
+
+   If a thread is waiting on a signal object for any of the specified set of signals to be set,
+   and one or more of those signals is set in the signal object, the thread is awakened.
+
+   If a thread is waiting on a signal object for all of the specified set of signals to be set,
+   and all of those signals are set in the signal object, the thread is awakened.
+
+   @note1hang At most, one thread can wait on a signal object at any given time.
+
+   @datatypes
+   #qurt_signal_64_t
+
+   @param[in] signal    Pointer to the signal object to wait on.
+   @param[in] mask      Mask value, which identifies the individual signals in the signal object to
+                        wait on.
+   @param[in] attribute Indicates whether the thread must wait until any of the signals are set, or until all of
+                        them are set. \n
+                        @note1hang The wait-any and wait-all types are mutually exclusive.\n Values:\n
+                        - #QURT_SIGNAL_ATTR_WAIT_ANY \n
+                        - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend
+
+   @return
+   A 64-bit double word with current signals.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+unsigned long long qurt_signal_64_wait(qurt_signal_64_t *signal, unsigned long long mask,
+                unsigned int attribute);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_64_set
+   Sets signals in the specified signal object.
+
+   Signals are represented as bits 0 through 63 in the 64-bit mask value. A mask bit value of 1 indicates
+   that a signal must be set, and 0 indicates not to set it.
+
+   @datatypes
+   #qurt_signal_64_t
+
+   @param[in] signal Pointer to the signal object to modify.
+   @param[in] mask   Mask value identifying the individual signals to set in the signal
+                     object.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+void qurt_signal_64_set(qurt_signal_64_t *signal, unsigned long long mask);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_64_get
+   Gets a signal from a signal object.
+
+   Returns the current signal values of the specified signal object.
+
+   @datatypes
+   #qurt_signal_64_t
+
+   @param[in] *signal Pointer to the signal object to access.
+
+   @return
+   A 64-bit double word with current signals.
+
+   @dependencies
+   None.
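+
+   The 64-bit variants follow the same pattern as the 32-bit API, with mask
+   bits 0 through 63; a sketch (illustrative only, not from the SDK docs):
+
+   @code
+   qurt_signal_64_t sig64;
+   qurt_signal_64_init(&sig64);
+
+   // Use a bit above position 31, which a 32-bit signal cannot express.
+   qurt_signal_64_set(&sig64, 1ULL << 40);
+
+   unsigned long long cur = qurt_signal_64_wait(&sig64, 1ULL << 40,
+                                                QURT_SIGNAL_ATTR_WAIT_ANY);
+   qurt_signal_64_clear(&sig64, cur);   // waits do not clear the bits
+   qurt_signal_64_destroy(&sig64);
+   @endcode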
+*/ +/* ======================================================================*/ +unsigned long long qurt_signal_64_get(qurt_signal_64_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_64_clear + Clears signals in the specified signal object. + + Signals are represented as bits 0 through 63 in the 64-bit mask value. A mask bit value of 1 + indicates that a signal must be cleared, and 0 indicates not to clear it. + + @note1hang Signals must be explicitly cleared by a thread when it is awakened -- the wait + operations do not automatically clear them. + + @datatypes + #qurt_signal_64_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to clear in the signal object. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_64_clear(qurt_signal_64_t *signal, unsigned long long mask); + +#ifdef __cplusplus +} +#endif + +#endif /* QURT_SIGNAL_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_signal2.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_signal2.h new file mode 100755 index 0000000000000..43975100cbf75 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_signal2.h @@ -0,0 +1,340 @@ +#ifndef QURT_SIGNAL2_H +#define QURT_SIGNAL2_H + +/** + @file qurt_signal2.h + @brief Prototypes of kernel signal2 API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +#define QURT_SIGNAL_ATTR_WAIT_ANY 0x00000000 +#define QURT_SIGNAL_ATTR_WAIT_ALL 0x00000001 + +/*===================================================================== + Typedefs + ======================================================================*/ + +/** @addtogroup signals2_types +@{ */ +/** qurt_signal2 type. + */ +typedef union { + /** @cond */ + struct{ + unsigned int cur_mask; /* Current set of signal bits that are set. */ + unsigned int sig_state; /* Current state. */ + /* Bit 0 -- in anysignal wait. */ + /* Bit 1 -- in allsignal wait. */ + /* Bit 2 -- in interrupt wait. */ + /* Bits 31-3 -- reference count field. */ + unsigned int queue; /* Kernel-maintained futex queue value. */ + unsigned int wait_mask; /* When sig_state indicates a waiter is present, this is the wait mask. */ + }; + unsigned long long int raw; + /** @endcond */ +} qurt_signal2_t; +/* @} */ /* end_addtogroup signals2_types */ + +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_init + + @deprecated use #qurt_signal_init instead. + + Initializes a signal2 object. + Signal returns the initialized object. + The signal object is initially cleared. + + Objects of type signal2 solve a potential race condition between + set() and destroy() operations. + + @datatypes + #qurt_signal2_t + + @param[in] *signal Pointer to the initialized object. + + @return + None. 
+
+   @dependencies
+   Each signal-based object has an associated
+   kernel resource(s), therefore users must call qurt_signal2_destroy()
+   when this object is no longer in use.
+ */
+/* ======================================================================*/
+void qurt_signal2_init(qurt_signal2_t *signal);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal2_destroy
+
+   @deprecated use #qurt_signal_destroy instead.
+
+   Destroys the specified signal object.
+
+   @note1cont Signal objects must not be destroyed while they are still in use. If this
+              occurs, the behavior of QuRT is undefined.
+   @note1cont Application code should destroy a signal2 object prior to deallocating it.
+              Calling qurt_signal2_destroy() before deallocating a
+              signal2 object ensures completion of all qurt_signal2_set() calls.
+
+   @datatypes
+   #qurt_signal2_t
+
+   @param[in] signal Pointer to the signal object to destroy.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+/* ======================================================================*/
+void qurt_signal2_destroy(qurt_signal2_t *signal);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal2_wait
+
+   @deprecated use #qurt_signal_wait instead.
+
+   Suspends the current thread until the specified signals are set.
+
+   Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 indicates
+   a signal to wait on.
+
+   If a thread calls this API with QURT_SIGNAL_ATTR_WAIT_ANY, the thread will be awakened when
+   any of the signals specified in the mask are set.
+
+   If a thread calls this API with QURT_SIGNAL_ATTR_WAIT_ALL, the thread will be awakened only
+   when all the signals specified in the mask are set.
+
+   @note1hang At most, one thread can wait on a signal object at any given time.
+
+   @datatypes
+   #qurt_signal2_t
+
+   @param[in] signal    Pointer to the signal object to wait on.
+   @param[in] mask      Mask value identifying the individual signals in the signal object to wait on.
+   @param[in] attribute Specifies whether the thread waits for any of the signals to be set, or for all of
+                        them to be set. Values:\n
+                        - QURT_SIGNAL_ATTR_WAIT_ANY \n
+                        - QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend
+   @return
+   A 32-bit word with current signals.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+unsigned int qurt_signal2_wait(qurt_signal2_t *signal, unsigned int mask,
+                unsigned int attribute);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal2_wait_any
+
+   @deprecated use #qurt_signal_wait_any instead.
+
+   Suspends the current thread until any of the specified signals are set.
+
+   Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 indicates
+   a signal to wait on.
+
+   The thread will be awakened when any of the signals specified in the mask are set.
+
+   @note1hang At most, one thread can wait on a signal object at any given time.
+
+   @datatypes
+   #qurt_signal2_t
+
+   @param[in] signal Pointer to the signal object to wait on.
+   @param[in] mask   Mask value identifying the individual signals in the signal object to
+                     wait on.
+
+   @return
+   A 32-bit word with current signals.
+
+   @dependencies
+   None.
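+
+   Since this wrapper is deprecated, new code should call the qurt_signal
+   equivalent directly; the substitution is one-for-one (a sketch, assuming
+   qurt_signal.h is included):
+
+   @code
+   unsigned int mask = 1u << 0;
+
+   // Deprecated:
+   // cur = qurt_signal2_wait_any(&sig2, mask);
+
+   // Preferred replacement:
+   qurt_signal_t sig;
+   qurt_signal_init(&sig);
+   unsigned int cur = qurt_signal_wait_any(&sig, mask);
+   @endcode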
+*/ +/* ======================================================================*/ +static inline unsigned int qurt_signal2_wait_any(qurt_signal2_t *signal, unsigned int mask) +{ + return qurt_signal2_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ANY); +} + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_wait_all + + @deprecated use #qurt_signal_wait_all instead. + + Suspends the current thread until all of the specified signals are set. + + Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 indicates + a signal to wait on. + + The thread will be awakened only when all the signals specified in the mask are set. + + @note1hang At most one thread can wait on a signal object at any given time. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to + wait on. + + @return + 32-bit word with current signals. + + @dependencies + None. +*/ +/* ======================================================================*/ +static inline unsigned int qurt_signal2_wait_all(qurt_signal2_t *signal, unsigned int mask) +{ + return qurt_signal2_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ALL); +} + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_set + + @deprecated use #qurt_signal_set instead. + + Sets signals in the specified signal object. + + Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 indicates + that a signal must be set, and 0 indicates not to set the signal. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to set in the signal + object. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_signal2_set(qurt_signal2_t *signal, unsigned int mask); + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_get + + @deprecated use #qurt_signal_get instead. + + Gets a signal from a signal object. + + Returns the current signal values of the specified signal object. + + @datatypes + #qurt_signal2_t + + @param[in] *signal Pointer to the signal object to access. + + @return + 32-bit word with current signals. + + @dependencies + None. +*/ +/* ======================================================================*/ +unsigned int qurt_signal2_get(qurt_signal2_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_clear + + @deprecated use #qurt_signal_clear instead. + + Clear signals in the specified signal object. + + Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be cleared, and 0 indicates not to clear the signal. + + @note1hang Signals must be explicitly cleared by a thread when it is awakened -- the wait + operations do not automatically clear them. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to clear in the signal object. + + @return + None. + + @dependencies + None. 
+ */
+/* ======================================================================*/
+void qurt_signal2_clear(qurt_signal2_t *signal, unsigned int mask);
+
+/**@ingroup func_qurt_signal2_wait_cancellable
+
+   @deprecated use #qurt_signal_wait_cancellable instead.
+
+   Suspends the current thread until either the specified signals are set or the wait operation is cancelled.
+   The operation is cancelled if the user process of the calling thread is killed, or if the calling thread
+   must finish its current QDI invocation and return to user space.
+
+   Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates
+   that a signal must be waited on, and 0 indicates not to wait on it.
+
+   If a thread is waiting on a signal object for any of the specified set of signals to be set, and one or
+   more of those signals is set in the signal object, the thread is awakened.
+
+   If a thread is waiting on a signal object for all of the specified set of signals to be set, and all of
+   those signals are set in the signal object, the thread is awakened.
+
+   @note1hang At most, one thread can wait on a signal object at any given time.
+
+   @note1cont When the operation is cancelled, the caller must assume that the signal is never set.
+
+   @datatypes
+   #qurt_signal2_t
+
+   @param[in]  signal       Pointer to the signal object to wait on.
+   @param[in]  mask         Mask value identifying the individual signals in the signal object to
+                            wait on.
+   @param[in]  attribute    Indicates whether the thread must wait until any of the signals are set, or until all of
+                            them are set. Values:\n
+                            - #QURT_SIGNAL_ATTR_WAIT_ANY \n
+                            - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend
+   @param[out] p_returnmask Pointer to the 32-bit mask value that was originally passed to the function.
+
+
+   @return
+   #QURT_EOK -- Wait completed. \n
+   #QURT_ECANCEL -- Wait cancelled.
+
+
+   @dependencies
+   None.
+*/
+int qurt_signal2_wait_cancellable(qurt_signal2_t *signal,
+                                  unsigned int mask,
+                                  unsigned int attribute,
+                                  unsigned int *p_returnmask);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_SIGNAL2_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_space.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_space.h
new file mode 100755
index 0000000000000..2c3f9e4496697
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_space.h
@@ -0,0 +1,230 @@
+#ifndef QURT_SPACE_H
+#define QURT_SPACE_H
+/**
+  @file qurt_space.h
+  @brief Prototypes of QuRT process control APIs
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2013, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#include
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** This flag is a request to the OS to suspend the process just before calling main().
+It is deprecated and replaced by QURT_PROCESS_SUSPEND_ON_STARTUP. */
+#define SPAWNN_FLAG_SUSPEND_ON_STARTUP QURT_PROCESS_SUSPEND_ON_STARTUP
+
+/**
+ * Creates and starts a process from an ELF of the specified name. The slash symbols
+ * "/" or "\" are ignored. Do not include the directory name in the input. This function
+ * accepts the SPAWN flags. Multiple SPAWN flags can be specified by OR'ing the flags.
+ *
+ * @param name  ELF name of the executable.
Name shall not contain directories,
+ *              use "dsp2.elf", instead of "/prj/qct/.../dsp2.elf".
+ *
+ * @return
+     Process ID -- Success \n
+     Negative error code -- Failure \n
+     #QURT_EPRIVILEGE -- Caller does not have enough privilege for this operation\n
+     #QURT_EMEM -- Not enough memory to perform the operation \n
+     #QURT_EFAILED -- Operation failed \n
+     #QURT_ENOTALLOWED -- Operation not allowed \n
+     #QURT_ENOREGISTERED -- Not registered \n
+     #QURT_ENORESOURCE -- Resource exhaustion \n
+     #QURT_EINVALID -- Invalid argument value
+*/
+
+int qurt_spawn_flags(const char * name, int flags);
+
+/**
+   Creates and starts a process from an ELF of the specified name. The slash symbols
+   "/" or "\" are ignored. Do not include the directory name in the input.
+
+   @param name ELF name of the executable. Name shall not contain directories,
+               use "dsp2.elf", instead of "/prj/qct/.../dsp2.elf".
+
+   @return
+   Process ID -- Success. \n
+   Negative error code -- Failure.
+
+*/
+static inline int qurt_spawn(const char *name)
+{
+    return qurt_spawn_flags(name,0);
+}
+
+/**
+ * Returns the process ID of the current process.
+ *
+ * @return
+ * Process ID
+ *
+*/
+#define qurt_getpid qurt_process_get_id
+
+/**
+ * The qurt_wait() function waits for a status change in a child process. A parent
+ * process can use it to block until any child process terminates.
+ *
+ * This API returns an error if there are no user processes or if all user processes are detached.
+ *
+ * @param status Pointer to the status variable. The variable receives the status value of the child
+ *               process. The value comes from the exit() system call made by the child process.
+ *
+ * @return
+       Process ID of the child process that changes status -- Success \n
+ *     Negative error code -- Failure
+ *
+*/
+
+int qurt_wait(int *status);
+
+
+/** @cond */
+/* APIs that allow registering callbacks on spawn of user pd */
+typedef void (*QURT_SPAWN_PFN)(int client_handle, void *data_ptr); //no return, since we won't be error checking it in spawn
+typedef int (*QURT_CB_PFN)(int client_handle, void *user_data, void *info);
+typedef union {
+    QURT_SPAWN_PFN spawn_pfn;
+    QURT_CB_PFN cb_pfn;
+} qurt_process_callback_pfn_t;
+/** @endcond */
+
+/** @cond internal_only */
+
+/**@ingroup func_qurt_event_register
+Sets the bits specified by mask in the signal passed by the caller. The signal is set
+when the client handle indicated by value goes away (at process exit). Multiple clients can register for the signal
+to be set.
+
+@datatypes
+#qurt_signal_t
+
+@param[in]  type      QURT_PROCESS_EXIT is the only event that can be registered for.
+@param[in]  value     Indicates the client handle of the process for which the event is registered.
+@param[in]  psig      Pointer to the signal object to set when the event occurs.
+@param[in]  mask      Mask bits to set in the signal.
+@param[out] data      Pointer to the variable that receives the exit code of the exiting process.
+@param[in]  data_size Size of the data variable.
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EMEM -- Not enough memory to allocate resources \n
+#QURT_EVAL -- Invalid values passed to the API
+
+@dependencies
+None.
+*/
+int qurt_event_register(int type, int value, qurt_signal_t *psig, unsigned int mask, void *data, unsigned int data_size);
+
+/**@ingroup func_qurt_callback_register_onspawn
+Allows registering for a callback on spawn of any user process.
+
+@datatypes
+#QURT_SPAWN_PFN
+
+@param[in] pFn       Callback function to call when any user process is spawned.
+@param[in] user_data Pointer to the argument that the callback must be called with.
+
+
+@return A positive value is the handle to use when deregistering the callback.
+        Multiple clients can register for the callback on spawn, and some clients might choose to deregister.
+
+        On failure, QURT_EFATAL is returned.
+
+@dependencies
+None.
+*/
+int qurt_callback_register_onspawn(QURT_SPAWN_PFN pFn, void *user_data);
+
+/**@ingroup func_qurt_callback_deregister_onspawn
+Allows deregistering the callback on spawn.
+
+@param[in] callback_handle Handle returned by qurt_callback_register_onspawn.
+
+@return
+#QURT_EOK -- Deregistering was successful
+
+@dependencies
+None.
+*/
+int qurt_callback_deregister_onspawn(int callback_handle);
+
+/**@ingroup func_qurt_process_callback_register
+Allows registering for a callback during or after image loading.
+Generic callback types:
+    Function similarly to qurt_callback_register_onspawn(). The callback is called after the process is
+    loaded, before the process thread starts. The callback has no return value and receives no info
+    from the OS.
+    pFn  - QURT_SPAWN_PFN
+    type - QURT_PROCESS_CB_GENERIC
+    arg1 - not used
+    arg2 - not used
+    arg3 - not used
+Note callback types:
+    The callback is called during process loading: before segment loading (QURT_PROCESS_NOTE_CB_PRE_MAP),
+    or after segment loading (QURT_PROCESS_NOTE_CB_POST_MAP). The OS provides info to the callback. The info
+    argument in the callback is populated with a pointer to the mapped note corresponding to the callback.
+    The callback has a return value; the loader fails if the callback returns a value that is not QURT_EOK.
+    pFn  - QURT_CB_PFN
+    type - QURT_PROCESS_NOTE_CB_PRE_MAP or QURT_PROCESS_NOTE_CB_POST_MAP
+    arg1 - note type (ex: NOTE_TYPE_POOL_INFO, NOTE_TYPE_SEGMENT_INFO, NOTE_TYPE_ARB_INFO)
+    arg2 - note name
+    arg3 - not used
+
+@datatypes
+
+@param[in] pFn       Callback function to call
+@param[in] type      Callback type
+@param[in] user_data Pointer to the argument that the callback must be called with.
+@param[in] arg1      Argument interpreted by the OS based on callback type
+@param[in] arg2      Argument interpreted by the OS based on callback type
+@param[in] arg3      Argument interpreted by the OS based on callback type (currently not used)
+
+
+@return A positive value is the handle to use when deregistering the callback.
+        Multiple clients can register for the callback, and some clients might choose to deregister.
+
+        On failure, QURT_EFATAL is returned.
+
+@dependencies
+None.
+*/
+int qurt_process_callback_register(qurt_process_callback_pfn_t pFn,
+                                   qurt_process_cb_type_t type,
+                                   void *user_data,
+                                   qurt_process_callback_arg_t arg1,
+                                   qurt_process_callback_arg_t arg2,
+                                   qurt_process_callback_arg_t arg3);
+
+
+
+/**@ingroup func_qurt_process_callback_deregister
+Allows deregistering a callback for image loading.
+@param[in] callback_handle Handle returned by qurt_process_callback_register.
+
+@return
+#QURT_EOK -- Deregistering was successful
+
+@dependencies
+None.
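+
+A registration/deregistration sketch for the spawn callback (illustrative
+only; the callback body is a placeholder):
+
+@code
+static void on_spawn(int client_handle, void *data_ptr)
+{
+    // Called when a user process is spawned; no return value is expected.
+    (void)client_handle;
+    (void)data_ptr;
+}
+
+int cb_handle = qurt_callback_register_onspawn(on_spawn, NULL);
+if (cb_handle > 0) {
+    // ... later, when notifications are no longer needed:
+    (void)qurt_callback_deregister_onspawn(cb_handle);
+}
+@endcode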
+*/ +int qurt_process_callback_deregister(int callback_handle); +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_SPACE_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_srm_consts.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_srm_consts.h new file mode 100755 index 0000000000000..48a8b6a38c402 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_srm_consts.h @@ -0,0 +1,32 @@ +#ifndef QURT_SRM_CONSTS_H +#define QURT_SRM_CONSTS_H +/** + @file qurt_srm_consts.h + @brief Type definitions for srm + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2020-2021, 2022 by Qualcomm Technologies, Inc. All Rights Reserved +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** @cond */ +#define QURT_SRM_WAKEUP_REQUEST 1U << 0 /**< Value = 1: Send wakeup request to the SRM server. */ +#define QURT_SRM_SET_HANDLE 1U << 1 /**< Value = 2: Set the client handle for a new SRM client. */ +#define QURT_SRM_ALLOC_KERNEL_PAGES 1U << 2 /**< Value = 4: Allocate pages from the kernel VA space. */ +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_SRM_CONSTS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_srm_driver.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_srm_driver.h new file mode 100755 index 0000000000000..5489e3dddbcca --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_srm_driver.h @@ -0,0 +1,140 @@ +#ifndef QURT_SRM_DRIVER_H +#define QURT_SRM_DRIVER_H +/** + @file qurt_srm_driver.h + @brief Definitions, macros, and prototypes used by SRM drivers. + + EXTERNAL FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2021-2023 by Qualcomm Technologies, Inc. All Rights Reserved. + + =============================================================================*/ +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* +|| Define qurt_srm_driver_t structure, which represents +|| the "registration" object for an SRM driver. +*/ +/** @cond internal_only */ +struct _qurt_srm_driver { + const char *name; + qurt_qdi_obj_t *obj; +}; + +typedef struct _qurt_srm_driver qurt_srm_driver_t; + +/* +|| qurt_srm_object_invoke() is an internal equivalent to qurt_qdi_handle_invoke(). +|| It behaves the same, but it takes a QDI object pointer instead of a handle. +*/ + +#define qurt_srm_object_invoke(o,m,...) 
\ + _QDMPASTE(_QDMSOI,_QDMCNT(QDI_HANDLE_LOCAL_CLIENT,o,m,##__VA_ARGS__))(QDI_HANDLE_LOCAL_CLIENT,o,m,##__VA_ARGS__) +#define _QDMSOI3(a,b,c) qurt_srm_oi3(a,b,c) +#define _QDMSOI4(a,b,c,d) qurt_srm_oi4(a,b,c,(int)(d)) +#define _QDMSOI5(a,b,c,d,e) qurt_srm_oi5(a,b,c,(int)(d),(int)(e)) +#define _QDMSOI6(a,b,c,d,e,f) qurt_srm_oi6(a,b,c,(int)(d),(int)(e),(int)(f)) +#define _QDMSOI7(a,b,c,d,e,f,g) qurt_srm_oi7(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g)) +#define _QDMSOI8(a,b,c,d,e,f,g,h) qurt_srm_oi8(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h)) +#define _QDMSOI9(a,b,c,d,e,f,g,h,i) qurt_srm_oi9(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i)) +#define _QDMSOI10(a,b,c,d,e,f,g,h,i,j) qurt_srm_oi10(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j)) +#define _QDMSOI11(a,b,c,d,e,f,g,h,i,j,k) qurt_srm_oi11(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j),(int)(k)) +#define _QDMSOI12(a,b,c,d,e,f,g,h,i,j,k,l) qurt_srm_oi12(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j),(int)(k),(int)(l)) + +int qurt_srm_oi3(int, qurt_qdi_obj_t *, int); +int qurt_srm_oi4(int, qurt_qdi_obj_t *, int, int); +int qurt_srm_oi5(int, qurt_qdi_obj_t *, int, int, int); +int qurt_srm_oi6(int, qurt_qdi_obj_t *, int, int, int, int); +int qurt_srm_oi7(int, qurt_qdi_obj_t *, int, int, int, int, int); +int qurt_srm_oi8(int, qurt_qdi_obj_t *, int, int, int, int, int, int); +int qurt_srm_oi9(int, qurt_qdi_obj_t *, int, int, int, int, int, int, int); +int qurt_srm_oi10(int, qurt_qdi_obj_t *, int, int, int, int, int, int, int, int); +int qurt_srm_oi11(int, qurt_qdi_obj_t *, int, int, int, int, int, int, int, int, int); +int qurt_srm_oi12(int, qurt_qdi_obj_t *, int, int, int, int, int, int, int, int, int, int); + +#define QDI_SRM_INIT 192 + +/* +|| QURT_SRM_DECLARE_DRIVER() declares an SRM driver to the SRM infrastructure. +|| +|| The three arguments are: +|| unique_id -- Unique C identifier, unused but must be a unique global symbol. +|| name -- Name of the driver by which an SRM client attempts to open it. +|| obj -- Pointer to the singleton object of the driver, which handles things such as +|| initialization and QDI_OPEN requests. +*/ + +#define QURT_SRM_DECLARE_DRIVER(unique_id, xname, xobj) \ + __attribute__((section(".srm.rodata.user.main.DECL"))) const qurt_srm_driver_t unique_id = \ + { .name = xname, .obj = xobj } + + +/*@ingroup func_qurt_srm_mapping_create + Creates a memory mapping in pagetable with specified attributes + + @param[in] client_handle Client handle representing the process for which + mapping would be created. + @param[in] pageno_virt pointer to the virtual page. NULL indicates SRM + would indicate the virtual memory. + @param[in] pageno_phys physical page to be used for the mapping + @param[in] page_count number of 4k pages to be mapped + @param[in] cache_attr cache attributes for the mapping + @param[in] perm permissions to be used for the mapping + + @return value greater than 0 indicates a handle which can be passed to + qdi_close() to remove the mapping. Negative value indicates + an error. + + @dependencies + None. +*/ +int qurt_srm_mapping_create(int client_handle, + unsigned *pageno_virt, + unsigned pageno_phys, + unsigned page_count, + qurt_mem_cache_mode_t cache_attr, + qurt_perm_t perm); + + +/**@ingroup func_qurt_srm_get_pid + Gets the PID for the client_handle that is passed. + + @param[in] client_handle Client handle for which PID is required. 
+
+   @return PID of the client. \n
+           A negative PID value of -1 is returned in case of error.
+
+   @dependencies
+   None.
+*/
+unsigned qurt_srm_get_pid(int client_handle);
+
+
+/*@ingroup func_qurt_srm_get_thread_id
+   Gets the thread ID of the client requesting a service from the SRM.
+
+   @param[in] None.
+
+   @return Thread ID of the client thread.
+
+   @dependencies
+   None.
+*/
+qurt_thread_t qurt_srm_get_client_thread_id(void);
+
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_SRM_DRIVER_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_stid.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_stid.h
new file mode 100755
index 0000000000000..379f46aaa4b80
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_stid.h
@@ -0,0 +1,73 @@
+#ifndef QURT_STID_H
+#define QURT_STID_H
+/**
+  @file qurt_stid.h
+  Prototypes of the software thread identifier (stid) interface APIs.
+  An stid is an 8-bit identifier that can be assigned to a software thread.
+  The performance monitor logic uses the stid as a counting match criterion
+  for maskable events. The stid is also used by the hardware debugger
+  (ISDB) to match breakpoints.
+
+  EXTERNAL FUNCTIONS
+  None.
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+  Copyright (c) 2024 Qualcomm Technologies, Inc.
+  All rights reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_stid_alloc
+  Allocates a unique stid.
+
+  @param[in]  pid  Process identifier
+  @param[out] stid Pointer to a variable in which to return the stid
+
+  @return
+  QURT_EOK -- Allocation success \n
+  QURT_ENORESOURCE -- No stid available for allocation \n
+  QURT_EINVALID -- Invalid input
+
+  @dependencies
+  None.
+ */
+int qurt_stid_alloc(unsigned int pid, unsigned int *stid);
+
+/**@ingroup func_qurt_stid_release
+  Releases the stid.
+
+
+  @param[in] pid  Process identifier
+  @param[in] stid The stid to release
+
+  @note1hang
+  Users must clear the released stid from the process or thread(s), that is,
+  reset it to the default value (QURT_STID_DEFAULT), before releasing that stid.
+
+  @return
+  QURT_EOK -- Release success \n
+  QURT_ENOTALLOWED -- Operation not allowed for a pid \n
+  QURT_EINVALID -- Invalid stid
+
+  @dependencies
+  None.
+ */
+int qurt_stid_release(unsigned int pid, unsigned int stid);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_STID_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_thread.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_thread.h
new file mode 100755
index 0000000000000..499699e7c72e2
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_thread.h
@@ -0,0 +1,1260 @@
+#ifndef QURT_THREAD_H
+#define QURT_THREAD_H
+/**
+  @file qurt_thread.h
+  @brief Prototypes of Thread API
+
+  EXTERNAL FUNCTIONS
+  None.
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2018, 2020-2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+/* The following are for C code only */
+#ifndef __ASSEMBLER__
+#include <string.h>
+#include "qurt_pmu.h"
+#include "qurt_api_version.h"
+#endif /* __ASSEMBLER__ */
+#include "qurt_consts.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+        CONSTANTS AND MACROS
+=============================================================================*/
+
+
+/*
+   Bitmask configuration to select DSP hardware threads.
+   To select all the hardware threads, use #QURT_THREAD_CFG_BITMASK_ALL
+   and the following: \n
+   - For QDSP6 V2/V3, all six hardware threads are selected \n
+   - For QDSP6 V3L, all four hardware threads are selected \n
+   - For QDSP6 V4, all three hardware threads are selected
+ */
+
+#define QURT_THREAD_CFG_BITMASK_HT0    0x00000001   /**< HT0. */
+#define QURT_THREAD_CFG_BITMASK_HT1    0x00000002   /**< HT1. */
+#define QURT_THREAD_CFG_BITMASK_HT2    0x00000004   /**< HT2. */
+#define QURT_THREAD_CFG_BITMASK_HT3    0x00000008   /**< HT3. */
+#define QURT_THREAD_CFG_BITMASK_HT4    0x00000010   /**< HT4. */
+#define QURT_THREAD_CFG_BITMASK_HT5    0x00000020   /**< HT5. */
+/** @cond rest_reg_dist */
+/** @addtogroup thread_macros
+@{ */
+/** @xreflabel{sec:qurt_thread_cfg} */
+
+#define QURT_THREAD_CFG_BITMASK_ALL    0x000000ffU  /**< Select all the hardware threads. */
+/** @} */ /* end_addtogroup thread_macros */
+/** @endcond */
+
+#define QURT_THREAD_CFG_USE_RAM        0x00000000   /**< Use RAM. */
+#define QURT_THREAD_CFG_USE_TCM        0x00000100   /**< Use TCM. */
+/** @cond rest_reg_dist */
+/** @addtogroup thread_macros
+@{ */
+#define QURT_THREAD_BUS_PRIO_DISABLED  0   /**< Thread internal bus priority disabled. */
+#define QURT_THREAD_BUS_PRIO_ENABLED   1   /**< Thread internal bus priority enabled. */
+/** @} */ /* end_addtogroup thread_macros */
+/** @endcond */
+
+#define QURT_THREAD_AUTOSTACK_DISABLED 0   /**< Thread has the autostack v2 feature disabled. */
+#define QURT_THREAD_AUTOSTACK_ENABLED  1   /**< Thread has the autostack v2 feature enabled. */
+
+/*
+   Macros for QuRT thread attributes.
+ */
+#define QURT_HTHREAD_L1I_PREFETCH      0x1     /**< Enables hardware L1 instruction cache prefetching. */
+#define QURT_HTHREAD_L1D_PREFETCH      0x2     /**< Enables hardware L1 data cache prefetching. */
+#define QURT_HTHREAD_L2I_PREFETCH      0x4     /**< Enables hardware L2 instruction cache prefetching. */
+#define QURT_HTHREAD_L2D_PREFETCH      0x8     /**< Enables hardware L2 data cache prefetching. */
+#define QURT_HTHREAD_DCFETCH           0x10    /**< Enables DC fetch to the provided virtual address.
+                                                    DC fetch indicates to the hardware that a data memory access is likely.
+                                                    Instructions are dropped when there is high bus utilization. */
+/** @addtogroup thread_macros
+@{ */
+/** @xreflabel{hdr:partition_tcm} */
+/*
+   The value below is used to create legacy QuRT threads by default.
+   If a thread has this as the detach_state, the thread can be joined
+   on until it exits. When we are able to change the default behavior of all
+   QuRT threads to JOINABLE (POSIX default), we can remove this legacy
+   behavior.
+*/
+#define QURT_THREAD_ATTR_CREATE_LEGACY 0U /**< Create a legacy QuRT thread by default. If a thread has this as a detach state, the thread can be joined on until it exits. */
+#define QURT_THREAD_ATTR_CREATE_JOINABLE 1U /**< Create a joinable thread. */
+#define QURT_THREAD_ATTR_CREATE_DETACHED 2U /**< Create a detached thread.
*/
+/** @} */ /* end_addtogroup thread_macros */
+
+
+#define QURT_THREAD_ATTR_NAME_MAXLEN           16   /**< Maximum name length. */
+#define QURT_THREAD_ATTR_TCB_PARTITION_RAM     0    /**< Creates threads in RAM/DDR. */
+#define QURT_THREAD_ATTR_TCB_PARTITION_TCM     1    /**< Creates threads in TCM. */
+/** @cond rest_reg_dist */
+/** @addtogroup thread_macros
+@{ */
+#define QURT_THREAD_ATTR_TCB_PARTITION_DEFAULT QURT_THREAD_ATTR_TCB_PARTITION_RAM /**< Backward compatibility. */
+#define QURT_THREAD_ATTR_PRIORITY_DEFAULT      254   /**< Priority.*/
+#define QURT_THREAD_ATTR_ASID_DEFAULT          0     /**< ASID. */
+#define QURT_THREAD_ATTR_AFFINITY_DEFAULT      (-1)  /**< Affinity. */
+#define QURT_THREAD_ATTR_BUS_PRIO_DEFAULT      255   /**< Bus priority. */
+#define QURT_THREAD_ATTR_AUTOSTACK_DEFAULT     0     /**< Default autostack v2 disabled thread. */
+#define QURT_THREAD_ATTR_TIMETEST_ID_DEFAULT   (-2)  /**< Timetest ID. */
+#define QURT_THREAD_ATTR_STID_DEFAULT          QURT_STID_DEFAULT /**< STID. */
+#define QURT_THREAD_ATTR_STID_ENABLE           1     /**< Indicates to allocate an STID during thread creation. */
+
+#define QURT_PRIORITY_FLOOR_DEFAULT            255U  /**< Default floor. */
+/** @} */ /* end_addtogroup thread_macros */
+
+// Option for suspending thread
+#define QURT_THREAD_SUSPEND_SYNCHRONOUS   0x0U  // bit#0
+#define QURT_THREAD_SUSPEND_ASYNCHRONOUS  0x1U  // bit#0
+#define QURT_THREAD_SUSPEND_KEEP_HMX      0x0U  // bit#1
+#define QURT_THREAD_SUSPEND_DETACH_HMX    0x2U  // bit#1
+
+// Option for resuming thread
+#define QURT_THREAD_RESUME_DEFAULT        0x0
+
+// Thread property IDs
+#define QURT_THREAD_PROPERTY_SUSPENDABLE  0x0U
+#define QURT_THREAD_PROPERTY_RESUMABLE    0x1
+
+// Thread group
+#define QURT_THREAD_DEFAULT_GROUP_ID      0x0U
+#define QURT_THREAD_GROUP_ID_MASK         0x3FU
+
+/** @endcond*/
+
+
+/* The following are for C code only */
+#ifndef __ASSEMBLER__
+/*=============================================================================
+        TYPEDEFS
+=============================================================================*/
+/** @addtogroup thread_types
+@{ */
+/** @cond rest_reg_dist */
+typedef unsigned int qurt_cache_partition_t; /**< QuRT cache partition type. */
+
+#define CCCC_PARTITION    0U  /**< Use the CCCC page attribute bits to determine the main or auxiliary partition. */
+#define MAIN_PARTITION    1U  /**< Use the main partition. */
+#define AUX_PARTITION     2U  /**< Use the auxiliary partition. */
+#define MINIMUM_PARTITION 3U  /**< Use the minimum. Allocates the least amount of cache (no-allocate policy possible) for this thread. */
+/** @endcond */
+
+/** Thread ID type. */
+typedef unsigned int qurt_thread_t;
+
+/** @cond rest_reg_dist */
+/** Thread attributes. */
+typedef struct _qurt_thread_attr {
+
+    char name[QURT_THREAD_ATTR_NAME_MAXLEN]; /**< Thread name. */
+    unsigned char tcb_partition;  /**< Indicates whether the thread TCB resides in RAM or
+                                       on-chip memory (TCM). */
+    unsigned char stid;           /**< Software thread ID used to configure the stid register
+                                       for profiling purposes. */
+    unsigned short priority;      /**< Thread priority. */
+    unsigned char autostack:1;    /**< Autostack v2 enabled thread. */
+    unsigned char group_id:6;     /**< Group ID. */
+    unsigned char reserved:1;    /**< Reserved bits. */
+    unsigned char bus_priority;   /**< Internal bus priority. */
+    unsigned short timetest_id;   /**< Timetest ID. */
+    unsigned int stack_size;      /**< Thread stack size. */
+    void *stack_addr;             /**< Pointer to the stack address base. The range of the stack is
+                                       (stack_addr, stack_addr+stack_size-1). */
+    unsigned short detach_state;  /**< Detach state of the thread.
*/
+
+} qurt_thread_attr_t;
+/** @endcond */
+
+/** @cond rest_reg_dist */
+/** Dynamic TLS attributes. */
+typedef struct qurt_tls_info {
+    unsigned int module_id;      /**< Module ID of the loaded dynamic linked library. */
+    unsigned int tls_start;      /**< Start address of the TLS data. */
+    unsigned int tls_data_end;   /**< End address of the TLS RW data. */
+    unsigned int tls_end;        /**< End address of the TLS data. */
+}qurt_tls_info;
+/** @endcond */
+
+/** @} */ /* end_addtogroup thread_types */
+
+/*=============================================================================
+        FUNCTIONS
+=============================================================================*/
+/**@ingroup func_qurt_thread_attr_init
+  Initializes the structure used to set the thread attributes when a thread is created.
+  After an attribute structure is initialized, explicitly set the individual attributes in the structure
+  using the thread attribute operations.
+
+  The initialize operation sets the following default attribute values: \n
+  - Name -- NULL string \n
+  - TCB partition -- QURT_THREAD_ATTR_TCB_PARTITION_DEFAULT
+  - Priority -- QURT_THREAD_ATTR_PRIORITY_DEFAULT \n
+  - Autostack -- QURT_THREAD_ATTR_AUTOSTACK_DEFAULT \n
+  - Bus priority -- QURT_THREAD_ATTR_BUS_PRIO_DEFAULT \n
+  - Timetest ID -- QURT_THREAD_ATTR_TIMETEST_ID_DEFAULT \n
+  - stack_size -- 0 \n
+  - stack_addr -- NULL \n
+  - detach state -- #QURT_THREAD_ATTR_CREATE_LEGACY \n
+  - STID -- #QURT_THREAD_ATTR_STID_DEFAULT
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in,out] attr Pointer to the thread attribute structure.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_init (qurt_thread_attr_t *attr)
+{
+
+    attr->name[0] = '\0';
+    attr->tcb_partition = QURT_THREAD_ATTR_TCB_PARTITION_DEFAULT;
+    attr->priority = QURT_THREAD_ATTR_PRIORITY_DEFAULT;
+    attr->autostack = QURT_THREAD_ATTR_AUTOSTACK_DEFAULT; /* Default attribute for autostack v2 */
+    attr->bus_priority = QURT_THREAD_ATTR_BUS_PRIO_DEFAULT;
+    attr->timetest_id = (unsigned short)QURT_THREAD_ATTR_TIMETEST_ID_DEFAULT;
+    attr->stack_size = 0;
+    attr->stack_addr = NULL;
+    attr->detach_state = QURT_THREAD_ATTR_CREATE_LEGACY;
+    attr->stid = QURT_THREAD_ATTR_STID_DEFAULT;
+    attr->group_id = QURT_THREAD_DEFAULT_GROUP_ID;
+}
+
+/**@ingroup func_qurt_thread_attr_set_name
+  Sets the thread name attribute.\n
+  This function specifies the name used by a thread.
+  Thread names identify a thread during debugging or profiling.
+  The maximum name length is 16 characters. \n
+  @note1hang Thread names differ from the kernel-generated thread identifiers used to
+             specify threads in the API thread operations.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in,out] attr Pointer to the thread attribute structure.
+  @param[in] name Pointer to the character string containing the thread name.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_set_name (qurt_thread_attr_t *attr, const char *name)
+{
+    strlcpy (attr->name, name, QURT_THREAD_ATTR_NAME_MAXLEN);
+    attr->name[QURT_THREAD_ATTR_NAME_MAXLEN - 1] = '\0';
+}
+
+
+/**@ingroup func_qurt_thread_attr_set_tcb_partition
+  Sets the thread TCB partition attribute.
+  Specifies the memory type in which the TCB of a thread is allocated.
+  Allocates TCBs in RAM or TCM/LPM.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in,out] attr Pointer to the thread attribute structure.
+  @param[in] tcb_partition TCB partition.
Values:\n
+         - 0 -- TCB resides in RAM \n
+         - 1 -- TCB resides in TCM/LPM @tablebulletend
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_set_tcb_partition (qurt_thread_attr_t *attr, unsigned char tcb_partition)
+{
+    attr->tcb_partition = tcb_partition;
+}
+
+/**@ingroup func_qurt_thread_attr_set_priority
+  Sets the thread priority to assign to a thread.
+  Thread priorities are specified as numeric values in the range 1 to 254, with 1 representing
+  the highest priority.
+  Priorities 0 and 255 are used internally by the kernel for special purposes.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in,out] attr Pointer to the thread attribute structure.
+  @param[in] priority Thread priority.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_set_priority (qurt_thread_attr_t *attr, unsigned short priority)
+{
+    attr->priority = priority;
+}
+
+/**@ingroup func_qurt_thread_attr_set_detachstate
+  Sets the detach state with which the thread is created.
+  The thread detach state is either joinable or detached; it is specified by the following values:
+  - #QURT_THREAD_ATTR_CREATE_JOINABLE \n
+  - #QURT_THREAD_ATTR_CREATE_DETACHED \n
+
+  When a detached thread is created (QURT_THREAD_ATTR_CREATE_DETACHED), its thread
+  ID and other resources are reclaimed as soon as the thread exits. When a joinable thread
+  is created (QURT_THREAD_ATTR_CREATE_JOINABLE), it is assumed that some
+  thread waits to join on it using a qurt_thread_join() call.
+  By default, the detach state is QURT_THREAD_ATTR_CREATE_LEGACY.
+  If the detach state is QURT_THREAD_ATTR_CREATE_LEGACY, another thread
+  can join on it before it exits, but the exiting thread does not wait for another thread to join.
+
+  @note1hang For a joinable thread (QURT_THREAD_ATTR_CREATE_JOINABLE), it is very
+             important that some thread joins on it after it terminates, otherwise
+             the resources of that thread are not reclaimed, causing memory leaks.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in,out] attr Pointer to the thread attribute structure.
+  @param[in] detachstate Thread detach state.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_set_detachstate (qurt_thread_attr_t *attr, unsigned short detachstate)
+{
+    if(detachstate == QURT_THREAD_ATTR_CREATE_JOINABLE || detachstate == QURT_THREAD_ATTR_CREATE_DETACHED){
+        attr->detach_state = detachstate;
+    }
+}
+
+
+/**@ingroup func_qurt_thread_attr_set_timetest_id
+  Sets the thread timetest attribute.\n
+  Specifies the timetest identifier used by a thread.
+
+  Timetest identifiers are used to identify a thread during debugging or profiling. \n
+  @note1hang Timetest identifiers differ from the kernel-generated thread identifiers used to
+             specify threads in the API thread operations.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in,out] attr Pointer to the thread attribute structure.
+  @param[in] timetest_id Timetest identifier value.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+static inline void qurt_thread_attr_set_timetest_id (qurt_thread_attr_t *attr, unsigned short timetest_id)
+{
+    attr->timetest_id = timetest_id;
+}
+
+/**@ingroup func_qurt_thread_attr_set_stack_size
+  @xreflabel{sec:set_stack_size}
+  Sets the thread stack size attribute.\n
+  Specifies the size of the memory area to use for the call stack of a thread.
+
+  The thread stack address (Section @xref{sec:set_stack_addr}) and stack size specify the memory area used as a
+  call stack for the thread.
The user is responsible for allocating the memory area used for + the stack. + + @datatypes + #qurt_thread_attr_t + + @param[in,out] attr Pointer to the thread attribute structure. + @param[in] stack_size Size (in bytes) of the thread stack. + + @return + None. + + @dependencies + None. +*/ + +static inline void qurt_thread_attr_set_stack_size (qurt_thread_attr_t *attr, unsigned int stack_size) +{ + attr->stack_size = stack_size; +} + +/**@ingroup func_qurt_thread_attr_set_stack_size2 + @xreflabel{sec:set_stack_size} + Sets the thread stack size attribute for island threads that require a higher guest OS stack size than the stack size + defined in the configuration XML.\n + Specifies the size of the memory area to use for a call stack of an island thread in User and Guest mode. + + The thread stack address (Section @xref{sec:set_stack_addr}) and stack size specify the memory area used as a + call stack for the thread. The user is responsible for allocating the memory area used for + the stack. + + @datatypes + #qurt_thread_attr_t + + @param[in,out] attr Pointer to the thread attribute structure. + @param[in] user_stack_size Size (in bytes) of the stack usage in User mode. + @param[in] root_stack_size Size (in bytes) of the stack usage in Guest mode. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_thread_attr_set_stack_size2 (qurt_thread_attr_t *attr, unsigned short user_stack_size, unsigned short root_stack_size) +{ + union qurt_thread_stack_info{ + unsigned int raw_size; + struct{ + unsigned short user_stack; + unsigned short root_stack; + }; + }user_root_stack_size; + user_root_stack_size.user_stack = user_stack_size; + user_root_stack_size.root_stack = root_stack_size; + + attr->stack_size = user_root_stack_size.raw_size; +} + +/**@ingroup func_qurt_thread_attr_set_stack_addr + @xreflabel{sec:set_stack_addr} + Sets the thread stack address attribute. \n + Specifies the base address of the memory area to use for a call stack of a thread. + + stack_addr must contain an address value that is 8-byte aligned. + + The thread stack address and stack size (Section @xref{sec:set_stack_size}) specify the memory area used as a + call stack for the thread. \n + @note1hang The user is responsible for allocating the memory area used for the thread + stack. The memory area must be large enough to contain the stack that the thread + creates. + + @datatypes + #qurt_thread_attr_t + + @param[in,out] attr Pointer to the thread attribute structure. + @param[in] stack_addr Pointer to the 8-byte aligned address of the thread stack. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_thread_attr_set_stack_addr (qurt_thread_attr_t *attr, void *stack_addr) +{ + attr->stack_addr = stack_addr; +} + +/**@ingroup func_qurt_thread_attr_set_bus_priority + Sets the internal bus priority state in the Hexagon core for this software thread attribute. + Memory requests generated by the thread with bus priority enabled are + given priority over requests generated by the thread with bus priority disabled. + The default value of bus priority is disabled. + + @note1hang Sets the internal bus priority for Hexagon processor version V60 or greater. + The priority is not propagated to the bus fabric. + + @datatypes + #qurt_thread_attr_t + + @param[in] attr Pointer to the thread attribute structure. + + @param[in] bus_priority Enabling flag. Values: \n + - #QURT_THREAD_BUS_PRIO_DISABLED \n + - #QURT_THREAD_BUS_PRIO_ENABLED @tablebulletend + + @return + None + + @dependencies + None. 
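+
+  An illustrative usage sketch (not part of the SDK documentation; the stack
+  buffer and entry function are hypothetical placeholders):
+  @code
+  static char stack[4096] __attribute__((aligned(8)));
+  extern void worker_entry(void *arg);   /* hypothetical thread entry point */
+
+  qurt_thread_attr_t attr;
+  qurt_thread_t tid;
+  qurt_thread_attr_init(&attr);
+  qurt_thread_attr_set_name(&attr, "worker");
+  qurt_thread_attr_set_stack_addr(&attr, stack);
+  qurt_thread_attr_set_stack_size(&attr, sizeof(stack));
+  qurt_thread_attr_set_bus_priority(&attr, QURT_THREAD_BUS_PRIO_ENABLED);
+  (void)qurt_thread_create(&tid, &attr, worker_entry, NULL);
+  @endcode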
+*/
+static inline void qurt_thread_attr_set_bus_priority ( qurt_thread_attr_t *attr, unsigned short bus_priority)
+{
+    attr->bus_priority = (unsigned char)bus_priority;
+}
+
+/**@ingroup func_qurt_thread_attr_set_autostack
+  Enables the autostack v2 feature in the thread attributes.
+
+  When autostack is enabled by the subsystem and an autostack-enabled
+  thread takes a framelimit exception, the kernel allocates more stack
+  for the thread and returns it to normal execution.
+
+  If autostack is not enabled by the subsystem, or it is not enabled
+  for the thread, the framelimit exception is fatal.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in] attr Pointer to the thread attribute structure.
+  @param[in] autostack Autostack enable or disable flag. Values: \n
+         - #QURT_THREAD_AUTOSTACK_DISABLED \n
+         - #QURT_THREAD_AUTOSTACK_ENABLED @tablebulletend
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_set_autostack ( qurt_thread_attr_t *attr, unsigned short autostack)
+{
+    attr->autostack = (unsigned char)autostack;
+}
+/**@ingroup qurt_thread_attr_enable_stid
+  Sets the STID in the thread attributes.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in] attr Pointer to the thread attribute structure.
+  @param[in] enable_stid STID to set. Values: \n
+         - #QURT_THREAD_ATTR_STID_DEFAULT (0): Default STID. \n
+         - #QURT_THREAD_ATTR_STID_ENABLE (1): QuRT assigns an STID that is not already in use \n
+         - #2 through #255 : User-provided STID. @tablebulletend
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_enable_stid ( qurt_thread_attr_t *attr, char enable_stid)
+{
+    if (enable_stid != '\0') {
+        attr->stid = enable_stid;
+    }
+    else
+    {
+        attr->stid = QURT_THREAD_ATTR_STID_DEFAULT;
+    }
+}
+
+/**@ingroup func_qurt_thread_attr_set_stid
+  Sets the stid thread attribute.
+  The default stid value is QURT_THREAD_ATTR_STID_DEFAULT.
+
+  @note1hang When a thread is created with a non-default stid,
+             the stid set in the thread attribute is assigned to the thread.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in] attr Pointer to the thread attribute structure.
+  @param[in] stid Stid to set for a thread.
+
+  @return
+  None
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_set_stid( qurt_thread_attr_t *attr, unsigned int stid){
+    attr->stid = stid;
+}
+
+/**@ingroup func_qurt_thread_attr_set_group_id
+  Sets the group ID in the thread attributes.
+  The primordial/first thread has group ID 0.
+  If a new thread is created without assigning a group_id, it
+  inherits the group ID from its parent thread.
+
+  @note1hang
+  1) The group ID can only be set before creating a thread. It cannot be
+     changed after the thread is created.
+  2) If a non-activated group_id is passed, thread creation fails.
+  3) Only a thread with group ID #0 can set the group ID for its child threads.
+  4) If a thread with a non-zero group ID sets the group ID for its child threads,
+     QuRT ignores this parameter and the child threads inherit the parent
+     thread's group ID. However, if the passed group ID is not activated, thread creation
+     still fails.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in] attr Pointer to the thread attribute structure.
+  @param[in] group_id Group identifier. Its valid range is 0 ~ 63.
+
+  @return
+  None.
+
+  @dependencies
+  None.
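+
+  An illustrative sketch (not part of the SDK documentation; assumes group
+  ID 5 has been activated in the system configuration and that the caller
+  has group ID 0):
+  @code
+  qurt_thread_attr_t attr;
+  qurt_thread_attr_init(&attr);
+  qurt_thread_attr_set_group_id(&attr, 5U);  /* threads created with this
+                                                attr get group ID 5 */
+  @endcode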
+*/
+static inline void qurt_thread_attr_set_group_id(qurt_thread_attr_t *attr, unsigned int group_id)
+{
+    attr->group_id = group_id & QURT_THREAD_GROUP_ID_MASK;
+}
+
+/**@ingroup func_qurt_thread_set_autostack
+  Sets autostack enable in the TCB.
+
+  @param[in] Pointer to UGP
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+
+void qurt_thread_set_autostack(void *);
+
+
+/**@ingroup func_qurt_thread_get_name
+  Gets the thread name of the current thread.\n
+  Returns the thread name of the current thread.
+  Thread names are assigned to threads as thread attributes, see qurt_thread_attr_set_name(). Thread names
+  identify a thread during debugging or profiling.
+
+  @param[out] name Pointer to a character string, which specifies the address where the returned thread name is stored.
+  @param[in] max_len Maximum length of the character string that can be returned.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_thread_get_name (char *name, unsigned char max_len);
+
+/**@ingroup func_qurt_thread_create
+  @xreflabel{hdr:qurt_thread_create}
+  Creates a thread with the specified attributes, and makes it executable.
+
+  @datatypes
+  #qurt_thread_t \n
+  #qurt_thread_attr_t
+
+  @param[out] thread_id Returns a pointer to the thread identifier if the thread was
+                        successfully created.
+  @param[in] attr Pointer to the initialized thread attribute structure that specifies
+                  the attributes of the created thread.
+  @param[in] entrypoint C function pointer, which specifies the main function of a thread.
+  @param[in] arg Pointer to a thread-specific argument structure.
+
+
+  @return
+  #QURT_EOK -- Thread created. \n
+  #QURT_EFAILED -- Thread not created.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_create (qurt_thread_t *thread_id, qurt_thread_attr_t *attr, void (*entrypoint) (void *), void *arg);
+
+/**@ingroup func_qurt_thread_stop
+  Stops the current thread, frees the kernel TCB, and yields to the next highest ready thread.
+
+  @return
+  void
+
+  @dependencies
+  None.
+ */
+void qurt_thread_stop(void);
+
+/** @cond internal_only */
+/**@ingroup func_qurt_thread_resume
+  When a demand-loading paging solution is enabled, this function
+  resumes the execution of a thread that was suspended due to
+  a page miss.
+
+  @param[in] thread_id Thread identifier.
+
+  @return
+  #QURT_EOK -- Thread successfully resumed. \n
+  #QURT_EFATAL -- Resume operation failed.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_resume(unsigned int thread_id);
+/** @endcond */
+
+/**@ingroup func_qurt_thread_get_id
+  Gets the identifier of the current thread.\n
+  Returns the thread identifier for the current thread.
+
+  @return
+  Thread identifier -- Identifier of the current thread.
+
+  @dependencies
+  None.
+ */
+qurt_thread_t qurt_thread_get_id (void);
+
+
+/**@ingroup func_qurt_thread_get_l2cache_partition
+  Returns the current value of the L2 cache partition assigned to the caller thread.\n
+
+  @return
+  Value of the #qurt_cache_partition_t data type.
+
+  @dependencies
+  None.
+ */
+qurt_cache_partition_t qurt_thread_get_l2cache_partition (void);
+
+/**@ingroup func_qurt_thread_set_timetest_id
+  Sets the timetest identifier of the current thread.
+  Timetest identifiers are used to identify a thread during debugging or profiling.\n
+  @note1hang Timetest identifiers differ from the kernel-generated thread identifiers used to
+             specify threads in the API thread operations.
+
+  @param[in] tid Timetest identifier.
+
+  @return
+  None.
+
+  @dependencies
+  None.
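+
+  An illustrative sketch (not part of the SDK documentation):
+  @code
+  qurt_thread_set_timetest_id(0x1234);               /* tag the current thread */
+  unsigned short tt = qurt_thread_get_timetest_id(); /* reads back 0x1234 */
+  @endcode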
+ */
+void qurt_thread_set_timetest_id (unsigned short tid);
+
+/**@ingroup func_qurt_thread_set_cache_partition
+  Sets the cache partition for the current thread. This function uses the qurt_cache_partition_t type
+  to select the cache partition of the current thread for the L1 Icache, L1 Dcache, and L2 cache.
+
+  @datatypes
+  #qurt_cache_partition_t
+
+  @param[in] l1_icache L1 I cache partition.
+  @param[in] l1_dcache L1 D cache partition.
+  @param[in] l2_cache  L2 cache partition.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_thread_set_cache_partition(qurt_cache_partition_t l1_icache, qurt_cache_partition_t l1_dcache, qurt_cache_partition_t l2_cache);
+
+
+/**@ingroup func_qurt_thread_get_timetest_id
+  Gets the timetest identifier of the current thread.\n
+  Returns the timetest identifier of the current thread.\n
+  Timetest identifiers are used to identify a thread during debugging or profiling. \n
+  @note1hang Timetest identifiers differ from the kernel-generated thread identifiers used to
+             specify threads in the API thread operations.
+
+  @return
+  Integer -- Timetest identifier.
+
+  @dependencies
+  None.
+ */
+unsigned short qurt_thread_get_timetest_id (void);
+
+/**@ingroup func_qurt_thread_exit
+  @xreflabel{sec:qurt_thread_exit}
+  Stops the current thread, awakens threads joined to it, then destroys the stopped
+  thread.
+
+  Threads that are suspended on the current thread (by performing a thread join,
+  Section @xref{sec:thread_join}) are awakened and passed a user-defined status value
+  that indicates the status of the stopped thread.
+
+  @note1hang Exit must be called in the context of the thread to stop.
+
+  @param[in] status User-defined thread exit status value.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_thread_exit(int status);
+
+/**@ingroup func_qurt_thread_join
+  @xreflabel{sec:thread_join}
+  Waits for a specified thread to finish; the specified thread is another thread within
+  the same process.
+  The caller thread is suspended until the specified thread exits. When the specified thread
+  exits, the caller thread is awakened. \n
+  @note1hang If the specified thread has already exited, this function returns immediately
+             with the result value #QURT_ENOTHREAD. \n
+  @note1cont Two threads cannot call qurt_thread_join to wait for the same thread to finish.
+             If this occurs, QuRT generates an exception (see Section @xref{sec:exceptionHandling}).
+
+  @param[in] tid Thread identifier.
+  @param[out] status Destination variable for thread exit status. Returns an application-defined
+                     value that indicates the termination status of the specified thread.
+
+  @return
+  #QURT_ENOTHREAD -- Thread has already exited. \n
+  #QURT_EOK -- Thread successfully joined with valid status value.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_join(unsigned int tid, int *status);
+
+/**@ingroup qurt_thread_detach
+  @xreflabel{sec:thread_detach}
+  Detaches a joinable thread. The specified thread is another thread within the
+  same process. Create the thread as a joinable thread; only joinable threads
+  can be detached.
+  If a joinable thread is detached, it finishes execution and exits.
+
+  @param[in] tid Thread identifier.
+
+  @return
+  #QURT_ENOTHREAD -- Thread specified by TID does not exist. \n
+  #QURT_EOK -- Thread successfully detached.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_detach(unsigned int tid);
+
+
+/**@ingroup func_qurt_thread_get_priority
+  Gets the priority of the specified thread.
\n + Returns the thread priority of the specified thread.\n + Thread priorities are specified as numeric values in a range as large as 1 through 254, with lower + values representing higher priorities. 1 represents the highest possible thread priority. \n + Priority 0 and 255 are internally used by the kernel for special purposes. + + @note1hang QuRT can be configured to have different priority ranges. + + @datatypes + #qurt_thread_t + + @param[in] threadid Thread identifier. + + @return + -1 -- Invalid thread identifier. \n + 1 through 254 -- Thread priority value. + + @dependencies + None. + */ +int qurt_thread_get_priority (qurt_thread_t threadid); + +/**@ingroup func_qurt_thread_set_priority + Sets the priority of the specified thread.\n + Thread priorities are specified as numeric values in a range as large as 1 through 254, with lower + values representing higher priorities. 1 represents the highest possible thread priority. + Priority 0 and 255 are internally used by the kernel for special purposes. + + @note1hang QuRT can be configured to have different priority ranges. For more + information, see Section @xref{sec:AppDev}. + + @datatypes + #qurt_thread_t + + @param[in] threadid Thread identifier. + @param[in] newprio New thread priority value. + + @return + 0 -- Priority successfully set. \n + -1 -- Invalid thread identifier. \n + + @dependencies + None. + */ +int qurt_thread_set_priority (qurt_thread_t threadid, unsigned short newprio); + + + +/**@ingroup func_qurt_thread_attr_get + Gets the attributes of the specified thread. + + @datatypes + #qurt_thread_t \n + #qurt_thread_attr_t + + @param[in] thread_id Thread identifier. + @param[out] attr Pointer to the destination structure for thread attributes. + + @return + #QURT_EOK -- Success. \n + #QURT_EINVALID -- Invalid argument. + + @dependencies + None. + */ +int qurt_thread_attr_get (qurt_thread_t thread_id, qurt_thread_attr_t *attr); + + + +/**@ingroup func_qurt_thread_get_tls_base + Gets the base address of thread local storage (TLS) of a dynamically loaded module + for the current thread. + + @datatypes + #qurt_tls_info + + @param[in] info Pointer to the TLS information for a module. + + @return + Pointer to the TLS object for the dynamically loaded module.\n + NULL -- TLS information is invalid. + + @dependencies + None. + */ +void * qurt_thread_get_tls_base(qurt_tls_info* info); + +/**@ingroup func_qurt_thread_pktcount_get + Gets the PKTCOUNT of a specified thread. + + @datatypes + #qurt_thread_t + + @param[in] thread_id Thread identifier. + + @return + PKTCOUNT + + @dependencies + None. + */ + +long long int qurt_thread_pktcount_get (qurt_thread_t thread_id); + +/**@ingroup func_qurt_thread_pktcount_set + Sets the PKTCOUNT for the current QuRT thread. + + @return + Value to which pktcount is set. + + @dependencies + None. + */ + +long long int qurt_thread_pktcount_set (long long int); + +/**@ingroup func_qurt_thread_stid_get + Gets the STID for a specified thread. + + @datatypes + #qurt_thread_t + + @param[in] thread_id Thread identifier. + + @return + STID + + @dependencies + None. + */ + +char qurt_thread_stid_get(qurt_thread_t thread_id); + +/**@ingroup func_qurt_thread_stid_get2 + Returns the set stid for a thread + + @param[in] thread_id thread identifier + @param[out] stid Pointer to a variable to return stid + + @return + QURT_EOK - success + QURT_ENOTALLOWED - operation not allowed for a thread + QURT_EINVALID - Invalid input + + @dependencies + None. 
+ */
+int qurt_thread_stid_get2(unsigned int thread_id, unsigned int *stid);
+
+/**@ingroup func_qurt_thread_stid_set
+  Sets the STID for the current thread.
+
+  @datatypes
+  #qurt_thread_t
+
+  @param[in] stid STID to set for the thread.
+
+  @return
+  #QURT_EOK -- STID successfully set. \n
+  #QURT_EFAILED -- STID not set.
+
+  @dependencies
+  None.
+ */
+
+int qurt_thread_stid_set(char stid);
+
+/**@ingroup qurt_thread_stid_set2
+  Sets the stid for a specified thread.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in] thread_id Thread identifier.
+  @param[in] stid Stid to set for a thread.
+
+  @return
+  #QURT_EOK -- Success \n
+  #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation. \n
+  #QURT_EVAL -- Failure because of invalid inputs.
+
+  @dependencies
+  None.
+*/
+int qurt_thread_stid_set2(unsigned int thread_id, unsigned int stid);
+
+/** @cond internal_only */
+/**@ingroup func_qurt_thread_get_running_ids
+  Returns the thread IDs of the running threads in the system; use only during fatal error handling.
+
+  @datatypes
+  #qurt_thread_t
+
+  @param[in,out] * Array of thread identifiers of size #QURT_MAX_HTHREAD_LIMIT + 1.
+
+  @return
+  #QURT_EINVALID -- Incorrect argument \n
+  #QURT_ENOTALLOWED -- API not called during error handling \n
+  #QURT_EOK -- Success, returns a NULL-terminated array of thread_id
+
+  @dependencies
+  None.
+ */
+int qurt_thread_get_running_ids(qurt_thread_t *);
+/** @endcond */
+
+
+/**@ingroup func_qurt_thread_get_thread_id
+  Gets the thread identifier of the thread with the matching name in the same process
+  as the caller.
+
+  @datatypes
+  #qurt_thread_t
+
+  @param[out] thread_id Pointer to the thread identifier.
+  @param[in] name Pointer to the name of the thread.
+
+  @return
+  #QURT_EINVALID -- No thread with matching name in the process of the caller \n
+  #QURT_EOK -- Success
+
+  @dependencies
+  None.
+ */
+int qurt_thread_get_thread_id (qurt_thread_t *thread_id, char *name);
+
+/**@ingroup func_qurt_sleep
+  Suspends the current thread for the specified amount of time.
+
+  @note1hang Because QuRT timers are deferrable, this call is guaranteed to block
+             at least for the specified amount of time. If power collapse is
+             enabled, the maximum amount of time this call can block depends on
+             the earliest wakeup from power collapse past the specified duration.
+
+  @param[in] duration Duration (in microseconds) for which the thread is suspended.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_sleep (unsigned long long int duration);
+
+
+/**@ingroup func_qurt_system_set_priority_floor
+  Sets a priority floor to move threads with thread priority lower than the floor out of the running state.
+  Running threads with thread priority lower than the priority floor are moved into the kernel ready queue, and they
+  are not scheduled to run while their thread priority is lower than the floor.
+  Later, the caller should reset the priority floor back to the default value of QURT_PRIORITY_FLOOR_DEFAULT.
+  Threads in the kernel ready queue are scheduled to run when their thread priority is higher than the floor.
+
+  The priority floor is set and associated with the user process of the caller. When the caller gets into QuRTOS and
+  sets a new floor, the new floor is associated with its original user process, not the QuRTOS process.
+  The floor associated with the user process is reset when the user process exits or is killed, but not at the time
+  when the user thread of the caller exits.
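+
+  An illustrative usage sketch (not part of the SDK documentation; the floor
+  value 100 is a placeholder, subject to the constraints that follow):
+  @code
+  if (qurt_system_set_priority_floor(100U) == QURT_EOK) {
+      /* ... latency-critical work ... */
+      (void)qurt_system_set_priority_floor(QURT_PRIORITY_FLOOR_DEFAULT);
+  }
+  @endcode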
+
+  The priority floor cannot be set to a priority higher than the thread priority of the caller.
+
+  The priority floor cannot be set to a priority lower than the default #QURT_PRIORITY_FLOOR_DEFAULT system floor.
+
+  This function is not supported in Island mode.
+
+  After the system floor is set above QURT_PRIORITY_FLOOR_DEFAULT, power collapse is skipped, and the sleep task
+  is not scheduled to run.
+
+  @param[in] priority_floor Priority floor.
+
+  @return
+  #QURT_EOK -- Success \n
+  #QURT_ENOTALLOWED -- Floor setting is not allowed
+
+  @dependencies
+  None.
+ */
+int qurt_system_set_priority_floor (unsigned int priority_floor);
+
+
+/**@ingroup func_qurt_thread_suspend_thread
+  Suspends a QuRT thread with its thread identifier.
+  The target thread can be in a signed user process or an unsigned user process.
+  The caller thread can be a thread from the same user process as the target thread, or from its parent process.
+  After the target thread is suspended, the kernel does not schedule it to run until it is resumed later.
+
+  If the target thread is set as non-suspendable, this function call returns an error without suspending
+  the target thread.
+
+  If the target thread is already suspended, this function call returns success to confirm
+  the target thread suspend.
+
+  If the target thread is in a secure user process, or CPZ process, this function call returns an error without
+  suspending the target thread.
+
+  If the target thread is running in the guest OS/root process via a QDI call, this function call does not suspend
+  the target thread in the guest OS, but marks the target thread as suspend-pending. The target thread is
+  suspended when it exits the guest OS, before executing the first instruction in the user process.
+  In this case, the function returns success even with the #QURT_THREAD_SUSPEND_SYNCHRONOUS option, while the target
+  thread can run in the guest OS, and is suspended when exiting the guest OS.
+
+  QuRT debug monitor threads that are in a user process are non-suspendable. This function does not suspend
+  those threads.
+
+  @param[in] thread_id Thread identifier.
+  @param[in] option  Optional argument, multiple options can be ORed. \n
+         #QURT_THREAD_SUSPEND_SYNCHRONOUS (default) -- set to synchronous function call,
+         the function returns after the thread is completely suspended.\n
+         #QURT_THREAD_SUSPEND_ASYNCHRONOUS -- set to asynchronous function call, the function returns
+         after the kernel acts to suspend the target thread. The target thread
+         might still be running before it is completely suspended. \n
+         #QURT_THREAD_SUSPEND_KEEP_HMX (default) -- keep the HMX attachment on the target thread
+         if it locks the HMX with qurt_hmx_lock(). In this case, the HMX cannot be re-used by other threads. \n
+         #QURT_THREAD_SUSPEND_DETACH_HMX -- detach HMX from the target thread if it locks the HMX with qurt_hmx_lock().
+         Later, when the target thread resumes, the HMX is re-attached to the thread. Note that this option is only
+         supported for a caller from the same user process as the target thread, not for a caller from the parent
+         process of the target thread, or other processes. With the HMX detach option, QuRT does not save the HMX
+         context. Thus, the HMX context state is lost. It is the responsibility of the caller to ensure HMX operations
+         and its context state saving when calling qurt_thread_suspend_thread() with the HMX detach option.
+         If a thread from another process uses this detach option, QURT_EHMXNOTDETACHABLE is returned; in this
+         case, if the caller is qualified to suspend the target thread, the target thread is moved to the suspended
+         state without the HMX detached.
+
+  @return
+  #QURT_EOK -- Success \n
+  #QURT_EINVALID -- Failure because of invalid thread_id input \n
+  #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, in a secure process/CPZ process. \n
+  #QURT_EHMXNOTDETACHABLE -- Failure because HMX is not detachable from the target thread.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_suspend_thread (unsigned int thread_id, unsigned int option);
+
+
+/**@ingroup func_qurt_thread_resume_thread
+  Resumes a QuRT thread with its thread identifier.
+  The target thread can be in a signed user process or an unsigned user process.
+  The caller thread can be a thread from the same user process as the target thread, or from its parent
+  process. After the target thread resumes, the kernel scheduler can schedule the thread to run based on
+  the thread priority.
+
+  There is an option argument in this function, with only one default option as of now,
+  QURT_THREAD_RESUME_DEFAULT: resume the target thread in the default way.
+
+  By default, this is an asynchronous function. The function returns after the kernel moves the
+  target thread from the suspended state to the runnable state. The thread is scheduled to run based on its
+  thread priority.
+
+  If the target thread is set as non-resumable, this function call does not resume the target thread.
+
+  If the target thread has already resumed, this function confirms that the target thread is resumed
+  by returning success.
+
+  If the target thread is in a secure user process or CPZ process, this function call returns an error without
+  resuming the target thread.
+
+  If the target thread runs in the guest OS/root process via a QDI call, this function call clears the mark of
+  suspend-pending on the target thread, and the target thread is not suspended when it exits the
+  guest OS.
+
+  @param[in] thread_id Thread identifier.
+  @param[in] option Optional argument, #QURT_THREAD_RESUME_DEFAULT, which resumes the target thread.
+
+  @return
+  #QURT_EOK -- Success \n
+  #QURT_EINVALID -- Failure because of invalid thread_id input \n
+  #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, in a secure process/CPZ process. \n
+  #QURT_EHMXNOTAVAIL -- Failure because, when resuming an HMX thread, the HMX is not available/free for the HMX thread to resume.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_resume_thread (unsigned int thread_id, unsigned int option);
+
+
+/**@ingroup func_qurt_thread_set_thread_property
+  Sets a QuRT thread property with its thread identifier.
+  The target thread can be in a signed user process or an unsigned user process.
+  The caller thread can be from the same user process as the target thread, or from its parent process.
+
+  If the target thread is in a secure user process, or CPZ process, this function call returns an error without
+  changing the property of the target thread.
+
+  @param[in] thread_id Thread identifier \n
+  @param[in] property_id Thread property identifier \n
+         #QURT_THREAD_PROPERTY_SUSPENDABLE -- thread is suspendable. Default is TRUE. \n
+         #QURT_THREAD_PROPERTY_RESUMABLE -- thread is resumable.
Default is TRUE
+  @param[in] value Property value: \n
+         TRUE(1) -- TRUE for the property \n
+         FALSE(0) -- FALSE for the property
+
+  @return
+  #QURT_EOK -- Success \n
+  #QURT_EINVALID -- Failure because of invalid thread_id input \n
+  #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, in a secure process/CPZ process.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_set_thread_property( unsigned int thread_id, unsigned int property_id, unsigned int value );
+
+/**@ingroup func_qurt_thread_get_group_id
+  Gets the group ID of the thread specified by thread_id.\n
+
+  @param[in] thread_id Thread identifier
+  @param[out] group_id Pointer to the variable for the group identifier
+
+  @return
+  #QURT_EOK -- Success \n
+  #QURT_EINVALID -- Thread ID is invalid, or the process has no groups enabled \n
+  #QURT_ENOTALLOWED -- Operation is not allowed \n
+
+  @dependencies
+  None.
+*/
+int qurt_thread_get_group_id(qurt_thread_t thread_id, unsigned int* group_id);
+
+#endif /* __ASSEMBLER__ */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_THREAD_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_thread_context.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_thread_context.h
new file mode 100755
index 0000000000000..bab09deec8889
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_thread_context.h
@@ -0,0 +1,234 @@
+#ifndef QURT_THREAD_CONTEXT_H
+#define QURT_THREAD_CONTEXT_H
+/**
+  @file qurt_thread_context.h
+  @brief Kernel thread context structure
+
+EXTERNAL FUNCTIONS
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2018-2022 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @cond internal_only */
+
+#define THREAD_ITERATOR_END ((qurt_thread_t)(-1)) /**< Thread iterator is complete. */
+
+
+/**@ingroup func_qurt_thread_iterator_create
+Enables the caller to enumerate the threads in the system.
+
+@return
+Handle of the newly created iterator; this handle must be passed to
+subsequent operations on the iterator.
+
+@dependencies
+None.
+*/
+static inline int qurt_thread_iterator_create(void)
+{
+    return qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC, QDI_OS_THREAD_ITERATOR_CREATE);
+}
+
+/**@ingroup func_qurt_thread_iterator_next
+Iterates over the list of threads in the system.
+
+@datatypes
+#qurt_thread_t
+
+@param[in] iter Iterator handle returned by qurt_thread_iterator_create().
+
+@return
+#THREAD_ITERATOR_END -- iterator has reached the end of the thread list. \n
+Other values indicate a valid thread_id.
+
+@dependencies
+None.
+*/
+static inline qurt_thread_t qurt_thread_iterator_next(int iter)
+{
+    return (qurt_thread_t)qurt_qdi_handle_invoke(iter, QDI_OS_THREAD_ITERATOR_NEXT);
+}
+
+/**@ingroup func_qurt_thread_iterator_destroy
+Cleans up thread iterator resources.
+
+@param[in] iter Iterator handle returned by qurt_thread_iterator_create().
+
+@return
+#QURT_EOK -- Successful completion of operation \n
+#QURT_EFATAL -- Invalid handle passed
+
+@dependencies
+None.
+*/
+static inline int qurt_thread_iterator_destroy(int iter)
+{
+    return qurt_qdi_close(iter);
+}
+
+/**@ingroup func_qurt_thread_context_get_tname
+Gets the name of the thread from the specified thread ID.
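+
+An illustrative sketch combining this call with the thread iterator above
+(not part of the SDK documentation):
+@code
+int it = qurt_thread_iterator_create();
+qurt_thread_t t;
+char name[16];
+while ((t = qurt_thread_iterator_next(it)) != THREAD_ITERATOR_END) {
+    if (qurt_thread_context_get_tname((unsigned int)t, name, sizeof(name)) == QURT_EOK) {
+        /* name holds the NUL-terminated thread name */
+    }
+}
+(void)qurt_thread_iterator_destroy(it);
+@endcode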
+ +@param[in] thread_id Thread for which name is returned. +@param[in,out] name Pointer to the local buffer where name is copied back. +@param[in] max_len Size of the local buffer. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_tname(unsigned int thread_id, char *name, unsigned char max_len); + +/**@ingroup func_qurt_thread_context_get_prio +Gets the priority for the specified thread. + +@param[in] thread_id Thread for which priority is returned. +@param[in,out] prio Pointer to the local variable where priority is written. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_prio(unsigned int thread_id, unsigned char *prio); + +/**@ingroup func_qurt_thread_context_get_pcycles +Gets pcycles for the specified thread. + +@param[in] thread_id Thread for which processor cycles are returned. +@param[in,out] pcycles Pointer to the local variable where processor cycles are written. + +@return +#QURT_EOK -- Success \n +Failure otherwise. + +@dependencies +None. +*/ +int qurt_thread_context_get_pcycles(unsigned int thread_id, unsigned long long int *pcycles); + +/**@ingroup func_qurt_thread_context_get_stack_base +Gets the stack base address for the specified thread. + +@param[in] thread_id Thread for which stack base address is returned. +@param[in,out] sbase Pointer to the local variable where stack base address is written. + +@return +QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_stack_base(unsigned int thread_id, unsigned int *sbase); + +/**@ingroup func_qurt_thread_context_get_stack_size +Gets the stack size for the specified thread. + +@param[in] thread_id Thread for which stack size is returned. +@param[in,out] ssize Pointer to the local variable where stack size is written. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_stack_size(unsigned int thread_id, unsigned int *ssize); + +/**@ingroup func_qurt_thread_context_get_pid +Gets the process ID for the specified thread. + +@param[in] thread_id Thread for which process ID is returned. +@param[in,out] pid Pointer to the local variable where process id is written. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_pid(unsigned int thread_id, unsigned int *pid); + +/**@ingroup func_qurt_thread_context_get_pname +Gets the process name for the specified thread. + +@param[in] thread_id Represents the thread for which process name is returned. +@param[in, out] name Pointer to the local buffer where process name is copied back. +@param[in] len Length allocated to the local buffer. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_pname(unsigned int thread_id, char *name, unsigned int len); + +/** @addtogroup thread_types +@{ */ +/** Structure that defines how TCB is interpreted to crash dump tools.*/ +/* Keys are defined in consts.h */ +struct qurt_debug_thread_info { +/** @cond */ + char name[QURT_MAX_NAME_LEN]; /**< Name of the thread. */ + struct { + unsigned key; + unsigned val; + } os_info[40]; + unsigned gen_regs[32]; /**< General mode registers. */ + unsigned user_cregs[32]; /**< User mode registers. */ + unsigned guest_cregs[32]; /**< Guest mode registers. */ + unsigned monitor_cregs[64]; /**< Monitor mode registers. 
*/
+/** @endcond */
+}; /* should add up to 1K */
+/** @} */ /* end_addtogroup thread_types */
+
+
+/**@ingroup func_qurt_system_tcb_dump_get
+Gets the debug thread information (TCB dump) contents for the specified thread.
+
+@datatypes
+#qurt_thread_t
+
+@param[in] thread_id Thread on which the operation must be performed.
+@param[in, out] ptr Pointer to the local buffer where contents are written.
+@param[in] size Size of the debug thread information structure obtained by calling
+                qurt_system_tcb_dump_get_size().
+
+@return
+#QURT_EOK -- Success \n
+Failure otherwise
+
+@dependencies
+None.
+*/
+int qurt_system_tcb_dump_get(qurt_thread_t thread_id, void *ptr, size_t size);
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_THREAD_CONTEXT_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_timer.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_timer.h
new file mode 100755
index 0000000000000..7bdfdb8f3c3df
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_timer.h
@@ -0,0 +1,560 @@
+#ifndef QURT_TIMER_H
+#define QURT_TIMER_H
+/**
+  @file qurt_timer.h
+  @brief Prototypes of the qurt_timer API
+
+EXTERNAL FUNCTIONS
+   None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+
+#include "qurt_anysignal.h"
+#include "qurt_signal2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+        CONSTANTS AND MACROS
+=============================================================================*/
+/**@addtogroup timer_const_macros
+@{ */
+/**
+  Default values.
+*/
+/** @xreflabel{hdr:QURT_TIMER_ONESHOT}*/
+#define QURT_TIMER_DEFAULT_TYPE      QURT_TIMER_ONESHOT  /**< One shot.*/
+#define QURT_TIMER_DEFAULT_DURATION  1000uL              /**< Default duration. */
+#define QURT_TIMER_DEFAULT_EXPIRY    0uL                 /**< Default expiration. */
+
+/**
+  Conversion from microseconds to timer ticks.
+ */
+#define QURT_TIMER_TIMETICK_FROM_US(us)   QURT_SYSCLOCK_TIMETICK_FROM_US(us)
+
+/**
+  Conversion from timer ticks to microseconds at the nominal frequency.
+*/
+#define QURT_TIMER_TIMETICK_TO_US(ticks)  qurt_timer_timetick_to_us(ticks)
+
+/** Minimum microseconds value is 100 microseconds (sleep timer).*/
+#define QURT_TIMER_MIN_DURATION  100uL
+
+/**
+  Maximum microseconds value for the Qtimer is 1,042,499 hours.
+*/
+#define QURT_TIMER_MAX_DURATION  QURT_SYSCLOCK_MAX_DURATION
+
+/**
+  Timer clock for the Qtimer is 19.2 MHz.
+*/
+#define QURT_TIMER_MAX_DURATION_TICKS  QURT_SYSCLOCK_MAX_DURATION_TICKS
+
+/**
+  Sleep timer error margin for the Qtimer is 1,000 ticks, approximately 52 us.
+*/
+#define QURT_TIMETICK_ERROR_MARGIN  QURT_SYSCLOCK_ERROR_MARGIN
+
+/*
+  qurt_timer group defines.
+*/
+#define QURT_TIMER_MAX_GROUPS     5U  /**< Maximum groups.*/
+#define QURT_TIMER_DEFAULT_GROUP  0U  /**< Default groups. */
+/** @} */ /* end_addtogroup timer_const_macros */
+
+/** @addtogroup timer_types
+@{ */
+/**
+  QuRT timer types.
+ */
+typedef enum
+{
+    QURT_TIMER_ONESHOT = 0,  /**< One shot.*/
+    /** @xreflabel{hdr:QURT_TIMER_PERIODIC}*/
+    QURT_TIMER_PERIODIC      /**< Periodic.
*/ +} qurt_timer_type_t; + + +/*============================================================================= + TYPEDEFS +=============================================================================*/ + +/** QuRT timer type.*/ +typedef unsigned int qurt_timer_t; + +/** QuRT timer duration type. */ +typedef unsigned long long qurt_timer_duration_t; + +/** QuRT timer time type. */ +typedef unsigned long long qurt_timer_time_t; + +typedef void (*pfn_t)(void); +/** QuRT timer attribute type. */ +typedef struct +{ + /** @cond */ + unsigned int magic; /**< Magic number to verify the qmsgq_attr_t pointer. */ + + qurt_timer_duration_t duration; /**< Specifies the duration of the new timer. */ + + qurt_timer_time_t expiry; /**< Specifies the absolute expiry of the new timer. */ + + qurt_timer_duration_t remaining; /**< Specifies the remaining time of an active timer. */ + + qurt_timer_type_t type; /**< Specifies the timer type; only #QURT_TIMER_ONESHOT and + #QURT_TIMER_PERIODIC are supported. */ + + unsigned int group; /**< Group number of the timer; the criterion used to disable or enable the set + of timers. */ + pfn_t pFn; /**< Callback other than the signal set */ + /** @endcond */ +} +qurt_timer_attr_t; + +/** @} */ /* end_addtogroup timer_types */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_timer_stop + @xreflabel{sec:qurt_timer_stop} + Stops a running timer. + The timer must be a one-shot timer. + + @note1hang Restart stopped timers with the timer restart operation, + see Section @xref{sec:qurt_timer_restart}. + + @datatypes + #qurt_timer_t + + @param[in] timer Timer object. + + @return + #QURT_EOK -- Success. \n + #QURT_EINVALID -- Invalid timer ID or duration value. \n + #QURT_ENOTALLOWED -- Timer is not a one shot timer. \n + #QURT_EMEM -- Out of memory error. + + @dependencies + None. + */ +int qurt_timer_stop (qurt_timer_t timer); + +/**@ingroup func_qurt_timer_restart + @xreflabel{sec:qurt_timer_restart} + Restarts a stopped timer with the specified duration. The timer must be a one-shot timer. + Timers stop after they have expired or after they are explicitly stopped with qurt_timer_stop(). + A restarted timer expires after the specified duration, the starting time is when the function is called. + + @note1hang Timers stop after they have expired or after they are explicitly + stopped with the timer stop operation, see Section @xref{sec:qurt_timer_stop}. + + @datatypes + #qurt_timer_t \n + #qurt_timer_duration_t + + @param[in] timer Timer object. + @param[in] duration Timer duration (in microseconds) before the restarted timer + expires again. + The valid range is #QURT_TIMER_MIN_DURATION to + #QURT_TIMER_MAX_DURATION. + + @return + #QURT_EOK -- Success. \n + #QURT_EINVALID -- Invalid timer ID or duration value. \n + #QURT_ENOTALLOWED -- Timer is not a one-shot timer. \n + #QURT_EMEM -- Out-of-memory error. + + @dependencies + None. + */ +int qurt_timer_restart (qurt_timer_t timer, qurt_timer_duration_t duration); + + +/**@ingroup func_qurt_timer_create + Creates a timer.\n + Allocates and initializes a timer object, and starts the timer. + + @note1hang A timer event handler must be defined to wait on the specified signal + to handle the timer event. + + @datatypes + #qurt_timer_t \n + #qurt_timer_attr_t \n + #qurt_anysignal_t + + @param[out] timer Pointer to the created timer object. 
+ @param[in] attr Pointer to the timer attribute structure. + @param[in] signal Pointer to the signal object set when timer expires. + @param[in] mask Signal mask, which specifies the signal to set in the signal object when the + time expires. + + @return + #QURT_EOK -- Success. \n + #QURT_EMEM -- Not enough memory to create the timer. \n + #QURT_EINVALID -- One of the arguments in the attr field is invalid. \n + Other error code -- Operation failed. \n + + @dependencies + None. + */ +int qurt_timer_create (qurt_timer_t *timer, const qurt_timer_attr_t *attr, + const qurt_anysignal_t *signal, unsigned int mask); + +int qurt_timer_create_sig2 (qurt_timer_t *timer, const qurt_timer_attr_t *attr, + const qurt_signal2_t *signal, unsigned int mask); + +/**@ingroup func_qurt_timer_attr_init + Initializes the specified timer attribute structure with default attribute values: \n + - Timer duration -- #QURT_TIMER_DEFAULT_DURATION (Section @xref{dox:timers}) \n + - Timer type -- #QURT_TIMER_ONESHOT \n + - Timer group -- #QURT_TIMER_DEFAULT_GROUP + + @datatypes + #qurt_timer_attr_t + + @param[in,out] attr Pointer to the destination structure for the timer attributes. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_init(qurt_timer_attr_t *attr); + + +/*Tech Comm note: removed qurt_timer_attr_set_pfn from documentation 9/10/2020 +@ingroup func_qurt_timer_attr_set_pfn + + @datatypes + #qurt_timer_attr_t + + @param[in,out] attr Pointer to the destination structure for the timer attributes. + @param[in] pFn pFn. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_pfn(qurt_timer_attr_t *attr, pfn_t pFn); + + +/**@ingroup func_qurt_timer_attr_set_duration + Sets the timer duration in the specified timer attribute structure.\n + + The timer duration specifies the interval (in microseconds) between the creation of the + timer object and the generation of the corresponding timer event. + + The timer duration value must be between #QURT_TIMER_MIN_DURATION and + #QURT_TIMER_MAX_DURATION (Section @xref{dox:timers}). Otherwise, the set operation is ignored. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_duration_t + + @param[in,out] attr Pointer to the timer attribute structure. + @param[in] duration Timer duration (in microseconds). + Valid range is #QURT_TIMER_MIN_DURATION to + #QURT_TIMER_MAX_DURATION. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_duration(qurt_timer_attr_t *attr, qurt_timer_duration_t duration); + +/**@ingroup func_qurt_timer_attr_set_expiry + Sets the absolute expiry time in the specified timer attribute structure.\n + The timer expiry specifies the absolute time (in microseconds) of the generation of the + corresponding timer event.\n + Timer expiries are relative to when the system first began executing. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_time_t + + @param[in,out] attr Pointer to the timer attribute structure. + @param[in] time Timer expiry. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_expiry(qurt_timer_attr_t *attr, qurt_timer_time_t time); + +/**@ingroup func_qurt_timer_attr_get_duration + Gets the timer duration from the specified timer attribute structure. + The value returned is the duration that was originally set for the timer. + + @note1hang This function does not return the remaining time of an active timer; + use qurt_timer_attr_get_remaining() to get the remaining time. 
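+
+  An illustrative sketch (not part of the SDK documentation; my_timer is a
+  placeholder for a valid active timer):
+  @code
+  qurt_timer_attr_t attr;
+  qurt_timer_duration_t remaining = 0;
+  if (qurt_timer_get_attr(my_timer, &attr) == QURT_EOK) {
+      qurt_timer_attr_get_remaining(&attr, &remaining); /* microseconds left */
+  }
+  @endcode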
+ + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_duration_t + + @param[in] attr Pointer to the timer attributes object + @param[out] duration Pointer to the destination variable for timer duration. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_get_duration(qurt_timer_attr_t *attr, qurt_timer_duration_t *duration); + +/**@ingroup func_qurt_timer_attr_get_remaining + Gets the timer remaining duration from the specified timer attribute structure. \n + + The timer remaining duration indicates (in microseconds) how much time remains before + the generation of the next timer event on the corresponding timer. + In most cases this function assumes that the timer attribute structure was obtained by + calling qurt_timer_get_attr(). + + @note1hang This attribute is read-only and thus has no set operation defined for it. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_duration_t + + @param[in] attr Pointer to the timer attribute object. + @param[out] remaining Pointer to the destination variable for remaining time. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_get_remaining(qurt_timer_attr_t *attr, qurt_timer_duration_t *remaining); + +/**@ingroup func_qurt_timer_attr_set_type + Sets the timer type in the specified timer attribute structure. + + The timer type specifies the functional behavior of the timer: \n + - A one-shot timer (#QURT_TIMER_ONESHOT) waits for the specified timer duration + and then generates a single timer event. After this the timer is nonfunctional. \n + - A periodic timer (#QURT_TIMER_PERIODIC) repeatedly waits for the specified + timer duration and then generates a timer event. The result is a series of timer + events with interval equal to the timer duration. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_type_t + + @param[in,out] attr Pointer to the timer attribute structure. + @param[in] type Timer type. Values are: \n + - #QURT_TIMER_ONESHOT -- One-shot timer. \n + - #QURT_TIMER_PERIODIC -- Periodic timer. @tablebulletend + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_type(qurt_timer_attr_t *attr, qurt_timer_type_t type); + +/**@ingroup func_qurt_timer_attr_get_type + Gets the timer type from the specified timer attribute structure. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_type_t + + @param[in] attr Pointer to the timer attribute structure. + @param[out] type Pointer to the destination variable for the timer type. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_get_type(qurt_timer_attr_t *attr, qurt_timer_type_t *type); + +/**@ingroup func_qurt_timer_attr_set_group + Sets the timer group identifier in the specified timer attribute structure.\n + The timer group identifier specifies the group that the timer belongs to. Timer groups are + used to enable or disable one or more timers in a single operation. \n + The timer group identifier value must be between 0 and (#QURT_TIMER_MAX_GROUPS - 1). + See Section @xref{dox:timers}. + + @datatypes + #qurt_timer_attr_t + + @param[in,out] attr Pointer to the timer attribute object. + @param[in] group Timer group identifier; + Valid range is 0 to (#QURT_TIMER_MAX_GROUPS - 1). + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_group(qurt_timer_attr_t *attr, unsigned int group); + +/**@ingroup func_qurt_timer_attr_get_group + Gets the timer group identifier from the specified timer attribute structure. 
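+
+  As an illustrative sketch, a typical attribute setup using the setters
+  documented above (the duration and group values here are arbitrary
+  examples, not recommended settings):
+  @code
+  qurt_timer_attr_t attr;
+  unsigned int group;
+  qurt_timer_attr_init(&attr);                           // start from defaults
+  qurt_timer_attr_set_duration(&attr, 100000ULL);        // 100 ms
+  qurt_timer_attr_set_type(&attr, QURT_TIMER_PERIODIC);  // periodic timer
+  qurt_timer_attr_set_group(&attr, 1U);                  // assign to group 1
+  qurt_timer_attr_get_group(&attr, &group);              // group is now 1
+  @endcode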
+
+  @datatypes
+  #qurt_timer_attr_t
+
+  @param[in]  attr  Pointer to the timer attribute structure.
+  @param[out] group Pointer to the destination variable for the timer group identifier.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_timer_attr_get_group(qurt_timer_attr_t *attr, unsigned int *group);
+
+/**@ingroup func_qurt_timer_get_attr
+  @xreflabel{hdr:qurt_timer_get_attr}
+  Gets the attributes with which the specified timer was created.
+
+  @datatypes
+  #qurt_timer_t \n
+  #qurt_timer_attr_t
+
+  @param[in]  timer Timer object.
+  @param[out] attr  Pointer to the destination structure for timer attributes.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EVAL -- Argument passed is not a valid timer.
+
+  @dependencies
+  None.
+ */
+int qurt_timer_get_attr(qurt_timer_t timer, qurt_timer_attr_t *attr);
+
+/**@ingroup func_qurt_timer_delete
+  Deletes the timer.\n
+  Destroys the specified timer and deallocates the timer object.
+
+  @datatypes
+  #qurt_timer_t
+
+  @param[in] timer Timer object.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EVAL -- Argument passed is not a valid timer.
+
+  @dependencies
+  None.
+ */
+int qurt_timer_delete(qurt_timer_t timer);
+
+/**@ingroup func_qurt_timer_sleep
+  Suspends the current thread for the specified amount of time.
+  The sleep duration value must be between #QURT_TIMER_MIN_DURATION and
+  #QURT_TIMER_MAX_DURATION (Section @xref{dox:timers}).
+
+  @datatypes
+  #qurt_timer_duration_t
+
+  @param[in] duration Interval (in microseconds) between when the thread is suspended
+                      and when it is re-awakened.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EMEM -- Not enough memory to perform the operation.
+
+  @dependencies
+  None.
+ */
+
+int qurt_timer_sleep(qurt_timer_duration_t duration);
+
+/**@ingroup func_qurt_timer_group_disable
+  Disables all timers that are assigned to the specified timer group.
+  Timers in the group that are already disabled are ignored.
+  Timers in the group that have expired are not processed.
+  If the specified timer group is empty, no operation is performed.
+
+  @note1hang While a timer is disabled, its remaining time does not change, so it
+             cannot generate a timer event.
+
+  @param[in] group Timer group identifier.
+
+  @return
+  #QURT_EOK -- Success.
+
+  @dependencies
+  None.
+ */
+int qurt_timer_group_disable (unsigned int group);
+
+/**@ingroup func_qurt_timer_group_enable
+  Enables all timers that are assigned to the specified timer group.
+  Timers in the group that are already enabled are ignored.
+  Timers in the group that have expired are processed.
+  If the specified timer group is empty, no operation is performed.
+
+  @param[in] group Timer group identifier.
+
+  @return
+  #QURT_EOK -- Success.
+
+  @dependencies
+  None.
+ */
+int qurt_timer_group_enable (unsigned int group);
+
+
+/**
+  Notifies the timer server of recovery from power collapse. The server
+  must account for any interrupts missed during power collapse.
+ */
+void qurt_timer_recover_pc (void);
+
+/**
+  Determines whether the Qtimer is initialized.
+
+  @return
+  0 -- Not initialized. \n
+  Nonzero -- Initialized.
+ */
+static inline int qurt_timer_is_init (void) {return 1;}
+
+/**@ingroup func_qurt_timer_get_ticks
+  Gets the current tick count. Ticks are accumulated since the RTOS
+  started. Each tick is equal to a single timer clock
+  cycle, where the frequency is 32 kHz on RGPT or 19.2 MHz on Qtimer.
+
+  @return
+  Ticks since the system started.
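+
+  As an illustrative sketch, ticks can be compared against a microsecond
+  interval with the qurt_timer_timetick_from_us() macro defined below
+  (the 1 ms threshold is an arbitrary example):
+  @code
+  unsigned long long start = qurt_timer_get_ticks();
+  // ... do some work ...
+  unsigned long long elapsed = qurt_timer_get_ticks() - start;
+  if (elapsed >= qurt_timer_timetick_from_us(1000)) {
+      // more than 1 ms has passed
+  }
+  @endcode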
+ */
+unsigned long long qurt_timer_get_ticks (void);
+
+#define qurt_timer_timetick_from_us(us) QURT_SYSCLOCK_TIMETICK_FROM_US(us)
+
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_TIMER_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_tlb.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_tlb.h
new file mode 100755
index 0000000000000..b1b2d261d31c0
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_tlb.h
@@ -0,0 +1,215 @@
+#ifndef QURT_TLB_H
+#define QURT_TLB_H
+
+/**
+  @file qurt_tlb.h
+  @brief Prototypes of TLB API
+  The TLB APIs allow explicit control of the portion of the TLB between TLB_first_replaceable and TLB_LAST_REPLACEABLE.
+  Both are nonconfigurable for the time being. This portion of the TLB is permanently assigned/locked unless manually removed
+  by qurt_tlb_remove. The implementation does not change depending on the configuration, such as whether CONFIG_STATIC is set or not.
+  With CONFIG_STATIC=y, TLB_LAST_REPLACEABLE is set to the last TLB index, which indicates that the entire TLB is permanently
+  assigned and is not backed by a page table (no page table exists). TLB indices are maintained through a 64-bit bitmask.
+  A new entry is placed in the first available slot.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2013, 2021, 2023
+All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+=============================================================================*/
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/*=============================================================================
+ FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_tlb_entry_create
+  Creates a new TLB entry with the specified mapping attributes in the TLB of the Hexagon processor. \n
+  @note1hang If the specified attributes are not valid (such as if the address is not aligned with the
+             size), the entry is not created and an error result is returned.\n
+  @note1cont To set the G bit in the new TLB entry, set the ASID argument to -1.
+
+  @datatypes
+  #qurt_addr_t \n
+  #qurt_paddr_t \n
+  #qurt_mem_cache_mode_t \n
+  #qurt_perm_t
+
+  @param[out] entry_id      TLB entry identifier.
+  @param[in]  vaddr         Virtual memory address.
+  @param[in]  paddr         Physical memory address.
+  @param[in]  size          Size of memory region to map (in bytes).
+  @param[in]  cache_attribs Cache mode (writeback, and so on).
+  @param[in]  perms         Access permissions.
+  @param[in]  asid          ASID (space ID).
+
+  @return
+  #QURT_EOK -- TLB entry successfully created.\n
+  #QURT_EFATAL -- Entry is not created; the TLB is full. \n
+  #QURT_ETLBCREATESIZE -- Entry is not created; an incorrect size was specified. \n
+  #QURT_ETLBCREATEUNALIGNED -- Entry is not created; an unaligned address was specified. \n
+  #QURT_EINVALID -- Invalid cache attributes / permissions provided.
+
+ */
+int qurt_tlb_entry_create (unsigned int *entry_id, qurt_addr_t vaddr, qurt_paddr_t paddr, qurt_size_t size, qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perms, int asid);
+
+/**@ingroup func_qurt_tlb_entry_create_64
+  Creates a new TLB entry with the specified mapping attributes in the TLB of the Hexagon processor.
\n + @note1hang If the specified attributes are not valid (the address is not aligned with the + size), the entry is not created, and an error result is returned.\n + @note1cont To set the G bit in the new TLB entry, set the asid argument to -1. + + @param[out] entry_id TLB entry identifier. + @param[in] vaddr Virtual memory address. + @param[in] paddr_64 64-bit physical memory address. + @param[in] size Size of memory region to map (in bytes). + @param[in] cache_attribs Cache mode (writeback, and so on). + @param[in] perms Access permissions. + @param[in] asid ASID (space ID). + + @return + #QURT_EOK -- TLB entry successfully created.\n + #QURT_EFATAL -- Entry was not created; the TLB is full. \n + #QURT_ETLBCREATESIZE -- Entry was not created; the incorrect size was specified. \n + #QURT_ETLBCREATEUNALIGNED -- Entry was not created; an unaligned address was specified. \n + #QURT_EINVALID -- Invalid cache attributes / permissions provided. + + */ +int qurt_tlb_entry_create_64 (unsigned int *entry_id, qurt_addr_t vaddr, qurt_paddr_64_t paddr_64, qurt_size_t size, qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perms, int asid); + +/**@ingroup func_qurt_tlb_entry_delete + Deletes the specified TLB entry from the TLB of the Hexagon processor. + If the specified entry does not exist, no deletion occurs and an error result is returned. + + @param[in] entry_id TLB entry identifier. + + @return + #QURT_EOK -- TLB entry successfully deleted. \n + #QURT_EFATAL -- TLB entry does not exist. + + @dependencies + None. + **/ +int qurt_tlb_entry_delete (unsigned int entry_id); + +/**@ingroup func_qurt_tlb_entry_query + Searches for the specified TLB entry in the TLB of the Hexagon processor. + If the TLB entry is found, its entry identifier is returned. + + @datatypes + #qurt_addr_t + + @param[out] entry_id TLB entry identifier. + @param[in] vaddr Virtual memory address. + @param[in] asid ASID (space ID). + + @return + #QURT_EOK -- TLB entry successfully returned. \n + #QURT_EFATAL -- TLB entry does not exist. + + @dependencies + None. + **/ +int qurt_tlb_entry_query (unsigned int *entry_id, qurt_addr_t vaddr, int asid); + +/**@ingroup func_qurt_tlb_entry_set + Sets the TLB entry by storing an entry at the specified location + in the TLB of the Hexagon processor. + + @param[in] entry_id TLB entry identifier. + @param[in] entry 64-bit TLB entry to store. + + @return + #QURT_EOK -- Entry successfully stored in the TLB. \n + #QURT_EFATAL -- Entry not set at the specified location. + + @dependencies + None. + **/ +int qurt_tlb_entry_set (unsigned int entry_id, unsigned long long int entry); + +/**@ingroup func_qurt_tlb_entry_get + Gets the TLB entry. \n + Returns the specified 64-bit TLB entry in the TLB of the Hexagon processor. + + @param[in] entry_id TLB entry identifier. + @param[out] entry 64-bit TLB entry. + + @return + #QURT_EOK -- TLB entry successfully returned. \n + #QURT_EFATAL -- TLB entry does not exist. + + @dependencies + None. + **/ +int qurt_tlb_entry_get (unsigned int entry_id, unsigned long long int *entry); + +/**@ingroup func_qurt_tlb_get_pager_physaddrs + Searches the TLB of the Hexagon processor, and returns all physical addresses that belong to the pager. + Each returned address indicates the starting address of an active page. + +The function return value indicates the number of addresses returned. + + @param[out] pager_phys_addrs Pointer to the return array of pager physical addresses. + + @return + Integer -- Number of addresses returned in array. + + @dependencies + None. 
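+
+  Usage sketch (illustrative only; assumes the returned array is owned by
+  the kernel and remains valid while it is read):
+  @code
+  unsigned int *phys;
+  unsigned int n = qurt_tlb_get_pager_physaddr(&phys);
+  for (unsigned int i = 0U; i < n; i++) {
+      // phys[i] is the starting physical address of an active page
+  }
+  @endcode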
+*/ + +unsigned int qurt_tlb_get_pager_physaddr(unsigned int** pager_phys_addrs); + +/**@ingroup func_qurt_tlb_get_pager_virtaddr + Searches the TLB of the Hexagon processor, and returns all virtual addresses that belong to the pager. + Each returned address indicates the starting address of an active page. + +The function return value indicates the number of addresses returned. + + @param[out] pager_virt_addrs Pointer to the return array of pager virtual addresses. + + @return + Integer -- Number of addresses returned in the array. + + @dependencies + None. +*/ + +unsigned int qurt_tlb_get_pager_virtaddr(unsigned int** pager_virt_addrs); + + +/**@ingroup func_qurt_tlb_entry_set2 + Sets the TLB entry by storing an entry at the specified location + in the TLB of the Hexagon processor. An additional option can be passed + to lock the TLB entry in the TLB of the Hexagon processor. + + @param[in] id TLB entry identifier. + @param[in] tlb 64-bit TLB entry to store. + @param[in] lock Nonzero value indicates that the TLB entry must be locked in the hardware TLB. + + @return + #QURT_EOK -- Entry successfully stored in the TLB. \n + #QURT_EFATAL -- Entry not set at the specified location. + + @dependencies + None. + **/ +int qurt_tlb_entry_set2(unsigned id, unsigned long long tlb, unsigned lock); + + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_TLB_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_tls.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_tls.h new file mode 100755 index 0000000000000..6ec3b39ff5cb0 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_tls.h @@ -0,0 +1,100 @@ +#ifndef QURT_TLS_H +#define QURT_TLS_H +/** + @file qurt_tls.h + @brief Prototypes of TLS APIs + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + + +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_tls_create_key + @xreflabel{sec:tls_create_key} + Creates a key for accessing a thread local storage data item.\n + Subsequent get and set operations use the key value. + + @note1hang The destructor function performs any clean-up operations needed by a thread + local storage item when its containing thread is deleted (Section @xref{sec:qurt_thread_exit}). + + @param[out] key Pointer to the newly created thread local storage key value. + @param[in] destructor Pointer to the key-specific destructor function. Passing NULL + specifies that no destructor function is defined for the key. + + @return + #QURT_EOK -- Key successfully created. \n + #QURT_ETLSAVAIL -- No free TLS key available. + + @dependencies + None. + */ +int qurt_tls_create_key (int *key, void (*destructor)(void *)); + +/**@ingroup func_qurt_tls_set_specific + Stores a data item to thread local storage along with the specified key. + + @param[in] key Thread local storage key value. + @param[in] value Pointer to user data value to store. + + @return + #QURT_EOK -- Data item successfully stored. \n + #QURT_EINVALID -- Invalid key. \n + #QURT_EFAILED -- Invoked from a non-thread context. 
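+
+  Minimal sketch of the TLS flow using the functions in this header
+  (no destructor is registered; error checks abbreviated):
+  @code
+  int key;
+  if (qurt_tls_create_key(&key, NULL) == QURT_EOK) {
+      static int value = 42;                     // example per-thread data
+      qurt_tls_set_specific(key, &value);        // store in the calling thread
+      int *p = (int *)qurt_tls_get_specific(key);
+      qurt_tls_delete_key(key);                  // no destructor is run
+  }
+  @endcode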
+ */ +int qurt_tls_set_specific (int key, const void *value); + +/**@ingroup func_qurt_tls_get_specific + Loads the data item from thread local storage. \n + Returns the data item that is stored in thread local storage with the specified key. + The data item is always a pointer to user data. + + @param[in] key Thread local storage key value. + + @return + Pointer -- Data item indexed by key in thread local storage. \n + 0 (NULL) -- Key out of range. + + @dependencies + None. + */ +void * __attribute__((section(".text.qurt_tls_get_specific "))) qurt_tls_get_specific (int key); + + +/**@ingroup func_qurt_tls_delete_key + Deletes the specified key from thread local storage. + + @note1hang Explicitly deleting a key does not execute any destructor function that is + associated with the key (Section @xref{sec:tls_create_key}). + + @param[in] key Thread local storage key value to delete. + + @return + #QURT_EOK -- Key successfully deleted. \n + #QURT_ETLSENTRY -- Key already free. + + @dependencies + None. + */ +int qurt_tls_delete_key (int key); + + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_TLS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_trace.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_trace.h new file mode 100755 index 0000000000000..541f8f1d34bf6 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_trace.h @@ -0,0 +1,317 @@ +#ifndef QURT_TRACE_H +#define QURT_TRACE_H +/** + @file qurt_trace.h + @brief Prototypes of system call tracing helpers API + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021-2023 by Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + GLOBAL VARIABLES +=============================================================================*/ +/** @cond internal_only */ +/** @addtogroup etm_macros +@{ */ +/* ETM trace types. */ +#define QURT_ETM_TYPE_PC_ADDR (1U<<0) /**< PC address.*/ +#define QURT_ETM_TYPE_MEMORY_ADDR (1U<<1) /**< Memory address. */ +#define QURT_ETM_TYPE_TESTBUS (1U<<2) /**< Test bus. */ +#define QURT_ETM_TYPE_CYCLE_ACCURATE (1U<<3) /**< Cycle accurate. */ +#define QURT_ETM_TYPE_CYCLE_COARSE (1U<<4) /**< Cycle coarse. */ +#define QURT_ETM_TYPE_PC_AND_MEMORY_ADDR (QURT_ETM_TYPE_PC_ADDR|QURT_ETM_TYPE_MEMORY_ADDR) /**< PC and memory address. */ +#define QURT_ETM_TYPE_PC_ADDR_AND_TESTBUS (QURT_ETM_TYPE_PC_ADDR|QURT_ETM_TYPE_TESTBUS) /**< PC address and test bus. */ +#define QURT_ETM_TYPE_MEMORY_ADDR_AND_TESTBUS (QURT_ETM_TYPE_MEMORY_ADDR|QURT_ETM_TYPE_TESTBUS) /**< Memory address and test bus.*/ +#define QURT_ETM_TYPE_PC_AND_MEMORY_ADDR_AND_TESTBUS (QURT_ETM_TYPE_PC_ADDR|QURT_ETM_TYPE_MEMORY_ADDR|QURT_ETM_TYPE_TESTBUS) /**< PC, memory address, and test bus. */ + +/* ETM routes. */ +#define QURT_ETM_ROUTE_TO_QDSS 0U /**< ETM route to QDSS. */ +#define QURT_ETM_ROUTE_TO_Q6ETB 1U /**< ETM route to Q6ETB. */ + +/* ETM filters. */ +#define QURT_ETM_TRACE_FILTER_ALL_DEFAULT 0U /*< Filter all as default. */ +#define QURT_ETM_TRACE_FILTER_HNUM0 (1U<<0) /*< Filter HNUM0. */ +#define QURT_ETM_TRACE_FILTER_HNUM1 (1U<<1) /*< Filter HNUM1. */ +#define QURT_ETM_TRACE_FILTER_HNUM2 (1U<<2) /*< Filter HNUM2. 
*/ +#define QURT_ETM_TRACE_FILTER_HNUM3 (1U<<3) /*< Filter HNUM3. */ +#define QURT_ETM_TRACE_FILTER_HNUM4 (1U<<4) /*< Filter HNUM4. */ +#define QURT_ETM_TRACE_FILTER_HNUM5 (1U<<5) /*< Filter HNUM5. */ +#define QURT_ETM_TRACE_FILTER_HNUM6 (1U<<6) /*< Filter HNUM6. */ +#define QURT_ETM_TRACE_FILTER_HNUM7 (1U<<7) /*< Filter HNUM7. */ +#define QURT_ETM_TRACE_FILTER_HNUM8 (1U<<8) /*< Filter HNUM8. */ +#define QURT_ETM_TRACE_FILTER_HNUM9 (1U<<9) /*< Filter HNUM9. */ +#define QURT_ETM_TRACE_FILTER_HNUM10 (1U<<10) /*< Filter HNUM10. */ +#define QURT_ETM_TRACE_FILTER_HNUM11 (1U<<11) /*< Filter HNUM11. */ +#define QURT_ETM_TRACE_FILTER_HNUM12 (1U<<12) /*< Filter HNUM12. */ +#define QURT_ETM_TRACE_FILTER_HNUM13 (1U<<13) /*< Filter HNUM13. */ +#define QURT_ETM_TRACE_FILTER_HNUM14 (1U<<14) /*< Filter HNUM14. */ +#define QURT_ETM_TRACE_FILTER_HNUM15 (1U<<15) /*< Filter HNUM15. */ +#define QURT_ETM_TRACE_FILTER_ALL QURT_ETM_TRACE_FILTER_ALL_DEFAULT + +#define QURT_ETM_TRACE_FILTER_CLUSTER0 (1<<16) /*< Filter trace cluster0 address. */ +#define QURT_ETM_TRACE_FILTER_CLUSTER1 (1<<17) /*< Filter trace cluster1 address. */ +#define QURT_ETM_TRACE_FILTER_PC_RANGE (1<<19) /*< Filter PC address range. */ + +/* ETM memory source - PC or data access */ +#define QURT_ETM_SOURCE_PC 0U /**< ETM memory source of SAC* is PC. */ +#define QURT_ETM_SOURCE_DATA 1U /**< ETM memory source of SAC* is data. */ + +/* Period between synchronization traces */ +#define QURT_ETM_ASYNC_PERIOD 0 /**< Async.*/ +#define QURT_ETM_ISYNC_PERIOD 1 /**< Isync.*/ +#define QURT_ETM_GSYNC_PERIOD 2 /**< Gsync. */ + +/* ETM enable flags */ +#define QURT_ETM_OFF 0U /**< ETM off. */ +#define QURT_ETM_ON 1U /**< ETM on. */ +/** @endcond */ +/** @} */ /* end_addtogroup etm_macros */ + +/** @addtogroup function_tracing_macro +@{ */ +/* ETM setup return values */ +#define QURT_ETM_SETUP_OK 0 /**< ETM setup OK. */ +#define QURT_ETM_SETUP_ERR 1 /**< ETM setup error. */ +/** @} */ /* end_addtogroup function_tracing_macro */ +/* ETM breakpoint types */ +#define QURT_ETM_READWRITE_BRKPT 0U /**< ETM read/write breakpoint. */ +#define QURT_ETM_READ_BRKPT 1U /**< ETM read breakpoint. */ +#define QURT_ETM_WRITE_BRKPT 2U /**< ETM write breakpoint. */ +#define QURT_ETM_BRKPT_INVALIDATE 3U /**< Invalidate breakpoint. */ +/** @addtogroup function_tracing_macro +@{ */ +/* ATB status flags */ +#define QURT_ATB_OFF 0 /**< ATB off. */ +#define QURT_ATB_ON 1 /**< ATB on. */ +/** @} */ /* end_addtogroup function_tracing_macro */ +/* DTM enable flags */ +#define QURT_DTM_OFF 0 /**< DTM off. */ +#define QURT_DTM_ON 1 /**< DTM on. */ + +/** @addtogroup function_tracing_datatypes +@{ */ +/**STM trace information. */ +typedef struct qurt_stm_trace_info { + /** @cond */ + unsigned int stm_port_addr[6]; /* STM port address to which trace data must be written.*/ + unsigned int thread_event_id; /* Event ID for context switches.*/ + unsigned int interrupt_event_id; /* Event ID for interrupts. */ + unsigned int marker; /* Marker value that must be written at the beginning of the trace. */ + /** @endcond */ +} qurt_stm_trace_info_t; +/** @} */ /* end_addtogroup function_tracing_datatypes */ +/*============================================================================= + GLOBAL FUNCTIONS +=============================================================================*/ + + +/**@ingroup func_qurt_trace_get_marker + Gets the kernel trace marker.\n + Returns the current value of the kernel trace marker. 
+  The marker consists of a hardware thread identifier and an index into the kernel trace
+  buffer. The trace buffer records kernel events.
+
+  @note1hang Using this function with qurt_trace_changed()
+             determines whether certain kernel events occurred in a block of code.
+
+  @return
+  Integer -- Kernel trace marker.
+
+  @dependencies
+  None.
+*/
+unsigned int qurt_trace_get_marker(void);
+
+/**@ingroup func_qurt_trace_changed
+  Determines whether specific kernel events have occurred. \n
+  Returns a value that indicates whether the specified kernel events have been recorded in the
+  kernel trace buffer since the specified kernel trace marker was obtained.
+
+  The prev_trace_marker parameter specifies a kernel trace marker that was obtained by calling
+  qurt_trace_get_marker().
+  @cond rest_dist For more information on the mask value, see the description of the trace_mask element in
+  @xhyperref{80VB41992,80-VB419-92}. \n @endcond
+
+  @note1hang Used with qurt_trace_get_marker(), this function determines whether
+             certain kernel events occurred in a block of code.\n
+  @note1cont This function cannot determine whether a specific kernel event type has
+             occurred unless that event type has been enabled in the trace_mask element
+             of the system configuration file. \n
+  @note1cont QuRT supports the recording of interrupt and context switch events only (that is,
+             a trace_mask value of 0x3).
+
+  @param[in] prev_trace_marker Previous kernel trace marker.
+  @param[in] trace_mask        Mask value that indicates which kernel events to check for.
+
+  @returns
+  1 -- Kernel events of the specified type have occurred since the
+       specified trace marker was obtained.\n
+  0 -- No kernel events of the specified type have occurred since the
+       specified trace marker was obtained.
+
+  @dependencies
+  None.
+*/
+int qurt_trace_changed(unsigned int prev_trace_marker, unsigned int trace_mask);
+
+/*=============================================================================
+ CONSTANTS AND MACROS
+=============================================================================*/
+/** @addtogroup function_tracing_macro
+@{ */
+#ifndef QURT_DEBUG
+#define QURT_TRACE(str, ...) __VA_ARGS__
+  /**< Function tracing is implemented with the QURT_TRACE debug macro, which
+   optionally generates printf statements both before and after every function call that is
+   passed as a macro argument.
+
+   For example, the following macro call in the source code:
+   @code
+   QURT_TRACE(myfunc, my_func(33))
+   @endcode
+   generates the following debug output:
+   @code
+   myfile:nnn: myfunc: >>> calling my_func(33)
+   myfile:nnn: myfunc: <<< my_func(33) returned
+   @endcode
+   The debug output includes the source file and line number of the function call, along with
+   the text of the call. Compile the client source file with -D __FILENAME__
+   set to its file name.
+
+   The library function qurt_printf() generates the debug output.
+   The QURT_DEBUG symbol controls generation of the debug output. If this symbol is
+   not defined, function tracing is not generated.\n
+   @note1hang The debug macro is accessed through the QuRT API header file.
+  */
+#else
+#define QURT_TRACE(str, ...) \
+    do { \
+        qurt_printf("%s:%d: %s: >>> calling %s\n",__FILENAME__,__LINE__,(str),#__VA_ARGS__); \
+        __VA_ARGS__; \
+        qurt_printf("%s:%d: %s: <<< %s returned\n",__FILENAME__,__LINE__,(str),#__VA_ARGS__); \
+    } while (0)
+#endif
+/** @} */ /* end_addtogroup function_tracing_macro */
+
+/**@ingroup func_qurt_etm_set_pc_range
+  Sets the PC address range for ETM filtering.
+  Depending on the Hexagon core design, a maximum of four PC ranges are supported.
+
+  @param[in] range_num 0 to 3.
+  @param[in] low_addr  Lower boundary of the PC address range.
+  @param[in] high_addr Higher boundary of the PC address range.
+
+  @returns
+  #QURT_ETM_SETUP_OK -- Success. \n
+  #QURT_ETM_SETUP_ERR -- Failure.
+
+  @dependencies
+  None.
+*/
+unsigned int qurt_etm_set_pc_range(unsigned int range_num, unsigned int low_addr, unsigned int high_addr);
+
+/**@ingroup func_qurt_etm_set_range
+  Sets the address range for ETM filtering.
+  Allows the user to select the source type of the addresses: #QURT_ETM_SOURCE_PC or #QURT_ETM_SOURCE_DATA.
+
+  @param[in] addr_source_type Type of the address source:\n
+                              - #QURT_ETM_SOURCE_PC \n
+                              - #QURT_ETM_SOURCE_DATA @tablebulletend
+  @param[in] trig_block_num   0 to 3.
+  @param[in] pid              PID of the process: \n
+                              - Any valid PID number enables ASID-based trace filtering. \n
+                              - QURT_ETM_NO_PID disables ASID-based trace filtering.
+  @param[in] low_addr         Lower boundary of the address range.
+  @param[in] high_addr        Higher boundary of the address range.
+
+  @returns
+  #QURT_ETM_SETUP_OK -- Success. \n
+  #QURT_ETM_SETUP_ERR -- Failure.
+
+  @dependencies
+  None.
+*/
+unsigned int qurt_etm_set_range(unsigned int addr_source_type, unsigned int trig_block_num, unsigned int pid, unsigned int low_addr, unsigned int high_addr);
+
+/**@ingroup func_qurt_etm_set_atb
+  Sets the advanced trace bus (ATB) state to notify QuRT that the ATB is actively enabled or disabled.
+  QuRT performs the corresponding actions at low power management.
+
+  @param[in] flag Values: \n
+                  #QURT_ATB_ON \n
+                  #QURT_ATB_OFF
+
+  @returns
+  #QURT_ETM_SETUP_OK -- Success. \n
+  #QURT_ETM_SETUP_ERR -- Failure.
+
+  @dependencies
+  None.
+*/
+unsigned int qurt_etm_set_atb(unsigned int flag);
+
+/**@ingroup func_qurt_etm_set_sync_period
+  Sets the period for each type of synchronization trace packet. \n
+  ASYNC defines the period between alignment synchronization packets;
+  the period is in terms of bytes in the packet stream. \n
+  ISYNC defines the period between instruction synchronization packets;
+  the period is per thread and is defined as the bytes sent out for that thread. \n
+  GSYNC defines the period, in thread cycles, between GSYNC packets.
+
+  @param[in] sync_type Type of synchronization packets: \n
+                       #QURT_ETM_ASYNC_PERIOD \n
+                       #QURT_ETM_ISYNC_PERIOD \n
+                       #QURT_ETM_GSYNC_PERIOD
+  @param[in] period    Period value.
+
+  @return
+  #QURT_ETM_SETUP_OK -- Success. \n
+  #QURT_ETM_SETUP_ERR -- Failure.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_etm_set_sync_period(unsigned int sync_type, unsigned int period);
+
+/**@ingroup func_qurt_stm_trace_set_config
+  Sets up an STM port for tracing events.
+
+  @datatypes
+  #qurt_stm_trace_info_t
+
+  @param[in] stm_config_info Pointer to the STM trace information used to set up the trace
+                             in the kernel.
+                             The structure must contain the following:\n
+                             - One port address per hardware thread \n
+                             - Event ID for context switches \n
+                             - Event ID for interrupt tracing \n
+                             - Header or marker to identify the beginning of the trace. @tablebulletend
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EINVALID -- Failure; possibly because the passed port address is not in the page table.
+
+  @dependencies
+  None.
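+
+  Illustrative sketch (the port addresses, event IDs, and marker below are
+  placeholder values, not real configuration data):
+  @code
+  qurt_stm_trace_info_t cfg = {
+      .stm_port_addr      = {0},       // one STM port address per hardware thread
+      .thread_event_id    = 1,         // event ID for context switches
+      .interrupt_event_id = 2,         // event ID for interrupts
+      .marker             = 0xC0FFEE,  // written at the beginning of the trace
+  };
+  unsigned int rc = qurt_stm_trace_set_config(&cfg);
+  // rc is #QURT_EOK on success, #QURT_EINVALID on failure
+  @endcode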
+ */ +unsigned int qurt_stm_trace_set_config(qurt_stm_trace_info_t *stm_config_info); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_TRACE_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_types.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_types.h new file mode 100755 index 0000000000000..bdb83a3fe2fb2 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_types.h @@ -0,0 +1,294 @@ +#ifndef QURT_TYPES_H +#define QURT_TYPES_H +/** + @file qurt_types.h + @brief Contains types common to all configurations + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + + +//#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +#define PGA_BITFIELD_MASK(hi,lo) (((~0u)>>(31U-((hi)-(lo))))<<(lo)) +#define PGA_BITFIELD_GET(x,hi,lo) (((x)&PGA_BITFIELD_MASK((hi),(lo)))>>(lo)) +#define PGA_BITFIELD_INS(hi,lo,v) (((v)<<(lo))&PGA_BITFIELD_MASK((hi),(lo))) +#define PGA_BITFIELD_SET(x,hi,lo,v) ((x)=((x)&~PGA_BITFIELD_MASK((hi),(lo)))|PGA_BITFIELD_INS((hi),(lo),(v))) +#define QURT_PGATTR_C_GET(pga) PGA_BITFIELD_GET((pga).pga_value, 3U, 0U) /* Bits 3-0: cache */ +#define QURT_PGATTR_A_GET(pga) PGA_BITFIELD_GET((pga).pga_value, 5U, 4U) /* Bits 5-4: bus attr */ +#define QURT_PGATTR_C_SET(pga,v) PGA_BITFIELD_SET((pga).pga_value, 3U, 0U, (v)) /* Bits 3-0: cache */ +#define QURT_PGATTR_A_SET(pga,v) PGA_BITFIELD_SET((pga).pga_value, 5U, 4U, (v)) /* Bits 5-4: bus attr */ +#define QURT_PGATTR_MKRAW(v) ((qurt_pgattr_t){.pga_value = (v)}) +#define QURT_PGATTR_MK(c,a) QURT_PGATTR_MKRAW(PGA_BITFIELD_INS(3U,0U,(c))|PGA_BITFIELD_INS(5U,4U,(a))) + +/*return types for qurt_island_get_status2*/ +#define QURT_ISLAND_MODE_NORMAL 0U /**< Normal operating mode */ +#define QURT_ISLAND_MODE_ISLAND 1U /**< Island mode */ +#define QURT_ISLAND_MODE_EXITING 2U /**< In transition from Island mode to Normal mode */ + +/*============================================================================= + FORWARD DECLARATIONS & TYPEDEFS +=============================================================================*/ +/** @addtogroup memory_management_types +@{ */ +typedef unsigned int qurt_addr_t; /**< QuRT address type.*/ +typedef unsigned int qurt_paddr_t; /**< QuRT physical memory address type. */ +/** @cond rest_reg_dist */ +typedef unsigned long long qurt_addr_64_t; /**< QuRT 64-bit memory address type. */ +typedef unsigned long long qurt_paddr_64_t; /**< QuRT 64-bit physical memory address type. */ +typedef unsigned int qurt_mem_region_t; /**< QuRT memory regions type. */ +typedef unsigned int qurt_mem_fs_region_t; /**< QuRT memory FS region type. */ +/**@endcond */ +typedef unsigned int qurt_mem_pool_t; /**< QuRT memory pool type.*/ +typedef unsigned int qurt_size_t; /**< QuRT size type. */ +/** @cond */ +typedef unsigned long long qurt_mmu_entry_t;/**< QuRT MMU entry type. 
*/
+#define QURT_PHYSPOOL_NAME_LEN (32)
+typedef char qurt_physpool_name_t[QURT_PHYSPOOL_NAME_LEN];
+
+
+/*
+ * Mapping type
+ *
+ * QMEM_MAPPING_VIRTUAL is the default mode, in which the system
+ * picks an available range of the virtual address space and maps it to
+ * available contiguous physical addresses. Physical-to-virtual mapping
+ * is not guaranteed to be 1:1; both the virtual and the physical memory
+ * are contiguous.
+ *
+ * In QMEM_MAPPING_IDEMPOTENT mode, the user provides the physical address;
+ * the kernel allocates virtual memory mapped 1:1 to it. The primary use
+ * of this mapping is to allocate memory with a 1:1 physical-to-virtual
+ * mapping.
+ *
+ * In QMEM_MAPPING_PHYS_CONTIGUOUS mode, the virtual address might
+ * not be the same as the physical address, but the physical address of the
+ * memory region is guaranteed to be contiguous, starting at the provided
+ * address; a fixed physical address must be provided. The primary
+ * use of this mapping is to allocate physical memory from a particular
+ * address, where a 1:1 physical-to-virtual mapping is not required.
+ *
+ * QMEM_MAPPING_NONE mode must be used to reserve a virtual memory
+ * area (VMA); no physical memory is reserved or mapped to this virtual
+ * space. All standard qmem_region APIs apply to a VMA; however, the physical
+ * address is always INVALID_ADDR. In this mode, qmem_region_create()
+ * returns a handle to the VMA; both virt_addr and phys_addr must
+ * be set to INVALID_ADDR, and the kernel allocates any available virtual
+ * memory of the specified size. Obtain the starting virtual address
+ * of the VMA through qmem_region_attr_getvirtaddr().
+ * The primary purpose of this mapping mode is to provide a mechanism for
+ * delayed binding in QuRT, for example, reserving virtual memory and mapping
+ * it at some later time to possibly discontiguous physical blocks. Thus, a
+ * single VMA can be partitioned among several physical-virtual mappings
+ * created via qmem_region_create() with QMEM_VIRTUAL_FIXED mapping mode.
+ * Each VMA keeps track of its associated mapped regions.
+ * Deletion of a VMA succeeds only if all associated "virtual_fixed"
+ * regions are freed prior to VMA deletion.
+ *
+ * Use QMEM_MAPPING_VIRTUAL_FIXED mode to create a region
+ * from virtual space that has been reserved via qmem_region_create()
+ * with QMEM_MAPPING_NONE mapping. A valid virt_addr is required; if
+ * phys_addr is specified, the kernel attempts to map it accordingly,
+ * and if no phys_addr is specified, the kernel maps any available physical
+ * memory. All standard qmem_region APIs apply to such a region. Remapping
+ * a virtual range without first freeing the region is not permitted.
+ * When such a region is deleted, its corresponding VMA remains intact.
+ *
+ * QMEM_MAPPING_PHYS_DISCONTIGUOUS mode obtains contiguous
+ * virtual memory, but the physical memory can be discontiguous. This method
+ * tries to combine small physical memory blocks to satisfy the requested
+ * size, and is useful when no contiguous block of the requested size is
+ * available. If the client does not need contiguous physical memory
+ * (for example, if the client does not use physical addressing), this helps
+ * use smaller physical memory blocks rather than contiguous memory.
+ * Note: When memory is allocated through this method, the physical address is
+ * not returned to the caller by the qurt_mem_region_attr_get() API, as there
+ * might not be a single physical address.
+ *
+ */
+/**@endcond */
+/** QuRT memory region mapping type. */
+typedef enum {
+        QURT_MEM_MAPPING_VIRTUAL=0, /**< Default mode.
The region virtual address range maps to an
+                                             available contiguous area of physical memory. For the most
+                                             efficient use of virtual memory, the QuRT system
+                                             chooses the base address in physical memory. This works for most memory
+                                             use cases.*/
+        QURT_MEM_MAPPING_PHYS_CONTIGUOUS = 1, /**< The region virtual address space must be mapped to a
+                                             contiguous area of physical memory. This is necessary when the
+                                             memory region is accessed by external devices that bypass Hexagon
+                                             virtual memory addressing. The base address in physical
+                                             memory must be explicitly specified.*/
+        QURT_MEM_MAPPING_IDEMPOTENT=2, /**< Region virtual address space maps
+                                             to the identical area of physical memory. */
+        QURT_MEM_MAPPING_VIRTUAL_FIXED=3, /**< Virtual address space of the region maps either to the
+                                             specified area of physical memory or (if no area is specified)
+                                             to available physical memory. Use this mapping to create
+                                             regions from virtual space that was reserved by calling
+                                             qurt_mem_region_create() with the #QURT_MEM_MAPPING_NONE mapping type. */
+        QURT_MEM_MAPPING_NONE=4, /**< Reserves a virtual memory area (VMA). Remapping a virtual range is not
+                                      permitted without first deleting the memory region. When such a region is
+                                      deleted, its corresponding virtual memory addressing remains intact. */
+        QURT_MEM_MAPPING_VIRTUAL_RANDOM=7, /**< System chooses a random virtual address and
+                                             maps it to available contiguous physical addresses.*/
+        QURT_MEM_MAPPING_PHYS_DISCONTIGUOUS=8, /**< While virtual memory is contiguous, allocates in discontiguous physical
+                                                    memory blocks. This helps when only contiguous blocks smaller
+                                                    than the requested size are available.
+                                                    The physical address is not provided as part of the get_attr call. */
+        QURT_MEM_MAPPING_INVALID=10, /**< Reserved as an invalid mapping type. */
+} qurt_mem_mapping_t;
+
+
+/** QuRT cache mode type. */
+typedef enum {
+        QURT_MEM_CACHE_WRITEBACK=7, /**< Write back. */
+        QURT_MEM_CACHE_NONE_SHARED=6, /**< Normal uncached memory that can be shared with other subsystems.*/
+        QURT_MEM_CACHE_WRITETHROUGH=5, /**< Write through. */
+        QURT_MEM_CACHE_WRITEBACK_NONL2CACHEABLE=0, /**< Write back non-L2-cacheable.*/
+        QURT_MEM_CACHE_WRITETHROUGH_NONL2CACHEABLE=1, /**< Write through non-L2-cacheable. */
+        QURT_MEM_CACHE_WRITEBACK_L2CACHEABLE=QURT_MEM_CACHE_WRITEBACK, /**< Write back L2 cacheable. */
+        QURT_MEM_CACHE_WRITETHROUGH_L2CACHEABLE=QURT_MEM_CACHE_WRITETHROUGH, /**< Write through L2 cacheable. */
+        QURT_MEM_CACHE_DEVICE = 4, /**< Volatile memory-mapped device. Access to device memory cannot be cancelled by interrupts, re-ordered, or replayed.*/
+        QURT_MEM_CACHE_NONE = 4, /**< Deprecated -- use #QURT_MEM_CACHE_DEVICE instead. */
+        QURT_MEM_CACHE_DEVICE_SFC = 2, /**< Enables placing limitations on the number of outstanding transactions. */
+        QURT_MEM_CACHE_INVALID=10, /**< Reserved as an invalid cache type. */
+} qurt_mem_cache_mode_t;
+
+/** Memory access permission. */
+#define QURT_PERM_NONE    0x0U /**< No permission. */
+#define QURT_PERM_READ    0x1U /**< Read permission. */
+#define QURT_PERM_WRITE   0x2U /**< Write permission. */
+#define QURT_PERM_EXECUTE 0x4U /**< Execution permission. */
+#define QURT_PERM_NODUMP  0x8U
+                               /**< Skip dumping the mapping. During a process domain dump, some mappings
+                                    on host memory must be skipped to avoid a race condition
+                                    where the memory is removed from the host and the DSP process
+                                    crashes before the mapping is removed. */
+#define QURT_PERM_FULL (QURT_PERM_READ | QURT_PERM_WRITE | QURT_PERM_EXECUTE) /**< Read, write, and execute permission.
*/ + +typedef unsigned char qurt_perm_t; + + +/** @cond rest_reg_dist*/ +/** QuRT cache type; specifies data cache or instruction cache. */ +typedef enum { + QURT_MEM_ICACHE, /**< Instruction cache.*/ + QURT_MEM_DCACHE /**< Data cache.*/ +} qurt_mem_cache_type_t; + +/** QuRT cache operation code type. */ +typedef enum { + QURT_MEM_CACHE_FLUSH, /**< Flush. */ + QURT_MEM_CACHE_INVALIDATE, /**< Invalidate */ + QURT_MEM_CACHE_FLUSH_INVALIDATE, /**< Flush invalidate. */ + QURT_MEM_CACHE_FLUSH_ALL, /**< Flush all. */ + QURT_MEM_CACHE_FLUSH_INVALIDATE_ALL, /**< Flush invalidate all. */ + QURT_MEM_CACHE_TABLE_FLUSH_INVALIDATE, /**< Table flush invalidate. */ + QURT_MEM_CACHE_FLUSH_INVALIDATE_L2, /**< L2 flush invalidate.*/ +} qurt_mem_cache_op_t; + +/** QuRT memory region type. */ +typedef enum { + QURT_MEM_REGION_LOCAL=0, /**< Local. */ + QURT_MEM_REGION_SHARED=1, /**< Shared.*/ + QURT_MEM_REGION_USER_ACCESS=2, /**< User access. */ + QURT_MEM_REGION_FS=4, /**< FS. */ + QURT_MEM_REGION_INVALID=10, /**< Reserved as an invalid region type. */ +} qurt_mem_region_type_t; + +/* Cache and bus attributes are combined into a value of this type for convenience, + and macros for combining and extracting fields are defined here. */ +/** @cond */ +struct qurt_pgattr { + unsigned pga_value; /**< PGA value.*/ +}; +typedef struct qurt_pgattr qurt_pgattr_t; +/** @endcond */ +/** QuRT memory region attributes type.*/ +/* QMEM_MAPPING_IDEMPOTENT and QMEM_MAPPING_PHYS_CONTIGUOUS mode can specify physaddr. + virtaddr cannot be specified for a memory region, it can only be queried by the + qmem_attr_getvirtaddr() function. + */ +typedef struct { + /** @cond */ + qurt_mem_mapping_t mapping_type; + unsigned char perms; + unsigned short owner; + qurt_pgattr_t pga; + unsigned ppn; //physical page number (physical>>12) + qurt_addr_t virtaddr; + qurt_mem_region_type_t type; + qurt_size_t size; + /** @endcond */ +} qurt_mem_region_attr_t; + + +/** QuRT user physical memory pool type. */ +typedef struct { + /** @cond */ + char name[32]; + struct ranges{ + unsigned int start; + unsigned int size; + } ranges[MAX_POOL_RANGES]; + /** @endcond */ +} qurt_mem_pool_attr_t; + +/** QuRT memory pool status type.*/ +typedef struct _qurt_mem_pool_status { + + qurt_size_t contig_size; /**< Largest contiguous free memory in bytes. */ + qurt_size_t free_size; /**< Total free memory in bytes. */ + qurt_size_t total_size; /**< Total declared memory in bytes. */ + +} qurt_mem_pool_status_t; + +typedef enum { + HEXAGON_L1_I_CACHE = 0, /**< Hexagon L1 instruction cache. */ + HEXAGON_L1_D_CACHE = 1, /**< Hexagon L1 data cache. */ + HEXAGON_L2_CACHE = 2 /**< Hexagon L2 cache. */ +} qurt_cache_type_t; + +typedef enum { + FULL_SIZE = 0, /**< Fully shared cache, without partitioning. */ + HALF_SIZE = 1, /**< 1/2 for main, 1/2 for auxiliary. */ + THREE_QUARTER_SIZE = 2, /**< 3/4 for main, 1/4 for auxiliary. */ + SEVEN_EIGHTHS_SIZE = 3 /**< 7/8 for main, 1/8 for auxiliary; for L2 cache only. */ +} qurt_cache_partition_size_t; + +typedef enum { + QURT_PROCESS_CB_GENERIC, /**< generic unconditional cb called after image loading. */ + QURT_PROCESS_NOTE_CB_PRE_MAP, /**< note cb called before segment loading. */ + QURT_PROCESS_NOTE_CB_POST_MAP /**< note cb called after segment loading. 
*/ +} qurt_process_cb_type_t; + +typedef union { + void *ptr; + int num; +} qurt_process_callback_arg_t; + + +/**@endcond*/ + +/** @} */ /* end_addtogroup memory_management_types */ +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_TYPES_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_user_dma.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_user_dma.h new file mode 100755 index 0000000000000..e05a6429fd703 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_user_dma.h @@ -0,0 +1,44 @@ +#ifndef QURT_USER_DMA_H +#define QURT_USER_DMA_H + +/** + @file qurt_user_dma.h + @brief Definitions, macros, and prototypes used for handling user DMA. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup qurt_user_dma_dmsyncht + Sends the DMSyncht command to the user DMA engine. + + Call this function to ensure all posted DMA memory operations are + complete. + + This stalls the current thread until the instruction + is complete and returns. + + @return + QURT_EOK - On dmsyncht completion \n + QURT_ENOTSUPPORTED - User DMA not supported + + @dependencies + None. +*/ +int qurt_user_dma_dmsyncht(void); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_vtlb.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_vtlb.h new file mode 100755 index 0000000000000..e064042e447ac --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_vtlb.h @@ -0,0 +1,76 @@ +/*============================================================================= + + qurt_vtlb.h + +GENERAL DESCRIPTION + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2019, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. +=============================================================================*/ +#ifndef QURT_VTLB_H +#define QURT_VTLB_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* +|| Names starting with "qurt_i_vtlb" are the internal low-level functions. +|| These should be considered subject to change. 
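+||
+|| Illustrative sketch of the statistics query (internal API, subject to
+|| change as noted above):
+||
+||   unsigned stats[3];
+||   (void)qurt_i_vtlb_statistics(stats);
+||   // stats[0]: total VTLB entries, stats[1]: available entries,
+||   // stats[2]: max size of the VTLB tree since boot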
+*/ + +int qurt_i_vtlb_entry_create(unsigned *pIndex, + unsigned tlb_lo, + unsigned tlb_hi, + unsigned extension); + +int qurt_i_vtlb_entry_create_with_pid(unsigned *pIndex, + unsigned tlb_lo, + unsigned tlb_hi, + unsigned extension, + unsigned target_pid); + +int qurt_i_vtlb_entry_delete(unsigned index); + +int qurt_i_vtlb_entry_read(unsigned index, unsigned *tlbinfo); + +int qurt_i_vtlb_entry_write(unsigned index, unsigned tlb_lo, unsigned tlb_hi, unsigned extension); + +int qurt_i_vtlb_entry_write_with_pid(unsigned index, unsigned tlb_lo, unsigned tlb_hi, unsigned extension, unsigned target_pid); + +int qurt_i_vtlb_entry_probe(const void *vaddr, unsigned *tlbinfo, unsigned *pIndex); + +int qurt_i_vtlb_entry_probe_with_pid(const void *vaddr, unsigned *tlbinfo, unsigned *pIndex, unsigned target_pid); + + +int qurt_i_vtlb_statistics(unsigned *stats); // Returns stats[0] -- total number of VTLB entries + // stats[1] -- number of available VTLB entries + // stats[2] -- max size of VTLB tree since boot + +//can return index to an entry that was specialed, change it to take addresses instead of pages +int qurt_i_vtlb_set_special(int index, unsigned pageno, unsigned asid, unsigned size); + +int qurt_i_vtlb_queue_ppage(unsigned pageno, unsigned vtlb_index); + +#define QURT_VTLB_EXT_DEFAULT 0U +#define QURT_VTLB_EXT_LOCKED 1U +#define QURT_VTLB_EXT_EXCLUDE_DUMP 2U /* Temporary ability to skip certain mappings in pd dump */ +#define QURT_VTLB_EXT_FREELIST 0x800000u + +#define QURT_VTLB_ERR_OVERLAP -64 +#define QURT_VTLB_ERR_TREE_NO_SPACE -65 +#define QURT_VTLB_ERR_INVALID_SIZE -68 +#define QURT_VTLB_ERR_INVALID_EXT -69 +#define QURT_VTLB_ERR_DEL_PGT_LOCKED -70 +#define QURT_VTLB_ERR_PGT_LOCK_CNT -71 + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif // QURT_VTLB_H diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/libposix.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/libposix.a new file mode 100755 index 0000000000000..f338fbee708ef Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/libposix.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/libqurt.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/libqurt.a new file mode 100755 index 0000000000000..e35606134ddfa Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/libqurt.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/libqurtcfs.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/libqurtcfs.a new file mode 100755 index 0000000000000..02250fa425ac4 Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/libqurtcfs.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/libtimer_island.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/libtimer_island.a new file mode 100755 index 0000000000000..bce4fe8cc49b2 Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/libtimer_island.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/libtimer_main.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/libtimer_main.a new file mode 100755 index 0000000000000..041565908f9c6 Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/libtimer_main.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/pic/libposix.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/pic/libposix.a new file mode 100755 
index 0000000000000..044c93bb65797
Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/pic/libposix.a differ
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/pic/libqurt.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/pic/libqurt.a
new file mode 100755
index 0000000000000..a91e0fbb660b7
Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/pic/libqurt.a differ
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/pic/libqurtcfs.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/pic/libqurtcfs.a
new file mode 100755
index 0000000000000..02250fa425ac4
Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/pic/libqurtcfs.a differ
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/pic/libtimer.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/pic/libtimer.a
new file mode 100755
index 0000000000000..10bc3e63c2efc
Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/pic/libtimer.a differ
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/bits/confname.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/bits/confname.h
new file mode 100755
index 0000000000000..d9ca3135501e3
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/bits/confname.h
@@ -0,0 +1,528 @@
+#ifndef CONFNAME_H
+#define CONFNAME_H
+/**
+  @file confname.h
+  @brief Named literals for the 'name' argument of sysconf and pathconf
+
+EXTERNAL FUNCTIONS
+  None
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  Do not include this header directly; include unistd.h instead. For now, since the
+  toolchain does not provide a hook for including bits/confname.h, we stick this
+  header in QuRT's sys/types.h
+
+Copyright (c) 2018, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+==============================================================================*/
+/* Values for the NAME argument to `pathconf' and `fpathconf'. */
+enum
+{
+  _PC_LINK_MAX,
+#define _PC_LINK_MAX _PC_LINK_MAX
+  _PC_MAX_CANON,
+#define _PC_MAX_CANON _PC_MAX_CANON
+  _PC_MAX_INPUT,
+#define _PC_MAX_INPUT _PC_MAX_INPUT
+  _PC_NAME_MAX,
+#define _PC_NAME_MAX _PC_NAME_MAX
+  _PC_PATH_MAX,
+#define _PC_PATH_MAX _PC_PATH_MAX
+  _PC_PIPE_BUF,
+#define _PC_PIPE_BUF _PC_PIPE_BUF
+  _PC_CHOWN_RESTRICTED,
+#define _PC_CHOWN_RESTRICTED _PC_CHOWN_RESTRICTED
+  _PC_NO_TRUNC,
+#define _PC_NO_TRUNC _PC_NO_TRUNC
+  _PC_VDISABLE,
+#define _PC_VDISABLE _PC_VDISABLE
+  _PC_SYNC_IO,
+#define _PC_SYNC_IO _PC_SYNC_IO
+  _PC_ASYNC_IO,
+#define _PC_ASYNC_IO _PC_ASYNC_IO
+  _PC_PRIO_IO,
+#define _PC_PRIO_IO _PC_PRIO_IO
+  _PC_SOCK_MAXBUF,
+#define _PC_SOCK_MAXBUF _PC_SOCK_MAXBUF
+  _PC_FILESIZEBITS,
+#define _PC_FILESIZEBITS _PC_FILESIZEBITS
+  _PC_REC_INCR_XFER_SIZE,
+#define _PC_REC_INCR_XFER_SIZE _PC_REC_INCR_XFER_SIZE
+  _PC_REC_MAX_XFER_SIZE,
+#define _PC_REC_MAX_XFER_SIZE _PC_REC_MAX_XFER_SIZE
+  _PC_REC_MIN_XFER_SIZE,
+#define _PC_REC_MIN_XFER_SIZE _PC_REC_MIN_XFER_SIZE
+  _PC_REC_XFER_ALIGN,
+#define _PC_REC_XFER_ALIGN _PC_REC_XFER_ALIGN
+  _PC_ALLOC_SIZE_MIN,
+#define _PC_ALLOC_SIZE_MIN _PC_ALLOC_SIZE_MIN
+  _PC_SYMLINK_MAX,
+#define _PC_SYMLINK_MAX _PC_SYMLINK_MAX
+  _PC_2_SYMLINKS
+#define _PC_2_SYMLINKS _PC_2_SYMLINKS
+};
+
+/* Values for the argument to `sysconf'.
*/ +enum +{ + _SC_ARG_MAX, +#define _SC_ARG_MAX _SC_ARG_MAX + _SC_CHILD_MAX, +#define _SC_CHILD_MAX _SC_CHILD_MAX + _SC_CLK_TCK, +#define _SC_CLK_TCK _SC_CLK_TCK + _SC_NGROUPS_MAX, +#define _SC_NGROUPS_MAX _SC_NGROUPS_MAX + _SC_OPEN_MAX, +#define _SC_OPEN_MAX _SC_OPEN_MAX + _SC_STREAM_MAX, +#define _SC_STREAM_MAX _SC_STREAM_MAX + _SC_TZNAME_MAX, +#define _SC_TZNAME_MAX _SC_TZNAME_MAX + _SC_JOB_CONTROL, +#define _SC_JOB_CONTROL _SC_JOB_CONTROL + _SC_SAVED_IDS, +#define _SC_SAVED_IDS _SC_SAVED_IDS + _SC_REALTIME_SIGNALS, +#define _SC_REALTIME_SIGNALS _SC_REALTIME_SIGNALS + _SC_PRIORITY_SCHEDULING, +#define _SC_PRIORITY_SCHEDULING _SC_PRIORITY_SCHEDULING + _SC_TIMERS, +#define _SC_TIMERS _SC_TIMERS + _SC_ASYNCHRONOUS_IO, +#define _SC_ASYNCHRONOUS_IO _SC_ASYNCHRONOUS_IO + _SC_PRIORITIZED_IO, +#define _SC_PRIORITIZED_IO _SC_PRIORITIZED_IO + _SC_SYNCHRONIZED_IO, +#define _SC_SYNCHRONIZED_IO _SC_SYNCHRONIZED_IO + _SC_FSYNC, +#define _SC_FSYNC _SC_FSYNC + _SC_MAPPED_FILES, +#define _SC_MAPPED_FILES _SC_MAPPED_FILES + _SC_MEMLOCK, +#define _SC_MEMLOCK _SC_MEMLOCK + _SC_MEMLOCK_RANGE, +#define _SC_MEMLOCK_RANGE _SC_MEMLOCK_RANGE + _SC_MEMORY_PROTECTION, +#define _SC_MEMORY_PROTECTION _SC_MEMORY_PROTECTION + _SC_MESSAGE_PASSING, +#define _SC_MESSAGE_PASSING _SC_MESSAGE_PASSING + _SC_SEMAPHORES, +#define _SC_SEMAPHORES _SC_SEMAPHORES + _SC_SHARED_MEMORY_OBJECTS, +#define _SC_SHARED_MEMORY_OBJECTS _SC_SHARED_MEMORY_OBJECTS + _SC_AIO_LISTIO_MAX, +#define _SC_AIO_LISTIO_MAX _SC_AIO_LISTIO_MAX + _SC_AIO_MAX, +#define _SC_AIO_MAX _SC_AIO_MAX + _SC_AIO_PRIO_DELTA_MAX, +#define _SC_AIO_PRIO_DELTA_MAX _SC_AIO_PRIO_DELTA_MAX + _SC_DELAYTIMER_MAX, +#define _SC_DELAYTIMER_MAX _SC_DELAYTIMER_MAX + _SC_MQ_OPEN_MAX, +#define _SC_MQ_OPEN_MAX _SC_MQ_OPEN_MAX + _SC_MQ_PRIO_MAX, +#define _SC_MQ_PRIO_MAX _SC_MQ_PRIO_MAX + _SC_VERSION, +#define _SC_VERSION _SC_VERSION + _SC_PAGESIZE, +#define _SC_PAGESIZE _SC_PAGESIZE +#define _SC_PAGE_SIZE _SC_PAGESIZE + _SC_RTSIG_MAX, +#define _SC_RTSIG_MAX _SC_RTSIG_MAX + _SC_SEM_NSEMS_MAX, +#define _SC_SEM_NSEMS_MAX _SC_SEM_NSEMS_MAX + _SC_SEM_VALUE_MAX, +#define _SC_SEM_VALUE_MAX _SC_SEM_VALUE_MAX + _SC_SIGQUEUE_MAX, +#define _SC_SIGQUEUE_MAX _SC_SIGQUEUE_MAX + _SC_TIMER_MAX, +#define _SC_TIMER_MAX _SC_TIMER_MAX + + /* Values for the argument to `sysconf' + corresponding to _POSIX2_* symbols. 
*/ + _SC_BC_BASE_MAX, +#define _SC_BC_BASE_MAX _SC_BC_BASE_MAX + _SC_BC_DIM_MAX, +#define _SC_BC_DIM_MAX _SC_BC_DIM_MAX + _SC_BC_SCALE_MAX, +#define _SC_BC_SCALE_MAX _SC_BC_SCALE_MAX + _SC_BC_STRING_MAX, +#define _SC_BC_STRING_MAX _SC_BC_STRING_MAX + _SC_COLL_WEIGHTS_MAX, +#define _SC_COLL_WEIGHTS_MAX _SC_COLL_WEIGHTS_MAX + _SC_EQUIV_CLASS_MAX, +#define _SC_EQUIV_CLASS_MAX _SC_EQUIV_CLASS_MAX + _SC_EXPR_NEST_MAX, +#define _SC_EXPR_NEST_MAX _SC_EXPR_NEST_MAX + _SC_LINE_MAX, +#define _SC_LINE_MAX _SC_LINE_MAX + _SC_RE_DUP_MAX, +#define _SC_RE_DUP_MAX _SC_RE_DUP_MAX + _SC_CHARCLASS_NAME_MAX, +#define _SC_CHARCLASS_NAME_MAX _SC_CHARCLASS_NAME_MAX + + _SC_2_VERSION, +#define _SC_2_VERSION _SC_2_VERSION + _SC_2_C_BIND, +#define _SC_2_C_BIND _SC_2_C_BIND + _SC_2_C_DEV, +#define _SC_2_C_DEV _SC_2_C_DEV + _SC_2_FORT_DEV, +#define _SC_2_FORT_DEV _SC_2_FORT_DEV + _SC_2_FORT_RUN, +#define _SC_2_FORT_RUN _SC_2_FORT_RUN + _SC_2_SW_DEV, +#define _SC_2_SW_DEV _SC_2_SW_DEV + _SC_2_LOCALEDEF, +#define _SC_2_LOCALEDEF _SC_2_LOCALEDEF + + _SC_PII, +#define _SC_PII _SC_PII + _SC_PII_XTI, +#define _SC_PII_XTI _SC_PII_XTI + _SC_PII_SOCKET, +#define _SC_PII_SOCKET _SC_PII_SOCKET + _SC_PII_INTERNET, +#define _SC_PII_INTERNET _SC_PII_INTERNET + _SC_PII_OSI, +#define _SC_PII_OSI _SC_PII_OSI + _SC_POLL, +#define _SC_POLL _SC_POLL + _SC_SELECT, +#define _SC_SELECT _SC_SELECT + _SC_UIO_MAXIOV, +#define _SC_UIO_MAXIOV _SC_UIO_MAXIOV + _SC_IOV_MAX = _SC_UIO_MAXIOV, +#define _SC_IOV_MAX _SC_IOV_MAX + _SC_PII_INTERNET_STREAM, +#define _SC_PII_INTERNET_STREAM _SC_PII_INTERNET_STREAM + _SC_PII_INTERNET_DGRAM, +#define _SC_PII_INTERNET_DGRAM _SC_PII_INTERNET_DGRAM + _SC_PII_OSI_COTS, +#define _SC_PII_OSI_COTS _SC_PII_OSI_COTS + _SC_PII_OSI_CLTS, +#define _SC_PII_OSI_CLTS _SC_PII_OSI_CLTS + _SC_PII_OSI_M, +#define _SC_PII_OSI_M _SC_PII_OSI_M + _SC_T_IOV_MAX, +#define _SC_T_IOV_MAX _SC_T_IOV_MAX + + /* Values according to POSIX 1003.1c (POSIX threads). 
+   */
+  _SC_THREADS,
+#define _SC_THREADS _SC_THREADS
+  _SC_THREAD_SAFE_FUNCTIONS,
+#define _SC_THREAD_SAFE_FUNCTIONS _SC_THREAD_SAFE_FUNCTIONS
+  _SC_GETGR_R_SIZE_MAX,
+#define _SC_GETGR_R_SIZE_MAX _SC_GETGR_R_SIZE_MAX
+  _SC_GETPW_R_SIZE_MAX,
+#define _SC_GETPW_R_SIZE_MAX _SC_GETPW_R_SIZE_MAX
+  _SC_LOGIN_NAME_MAX,
+#define _SC_LOGIN_NAME_MAX _SC_LOGIN_NAME_MAX
+  _SC_TTY_NAME_MAX,
+#define _SC_TTY_NAME_MAX _SC_TTY_NAME_MAX
+  _SC_THREAD_DESTRUCTOR_ITERATIONS,
+#define _SC_THREAD_DESTRUCTOR_ITERATIONS _SC_THREAD_DESTRUCTOR_ITERATIONS
+  _SC_THREAD_KEYS_MAX,
+#define _SC_THREAD_KEYS_MAX _SC_THREAD_KEYS_MAX
+  _SC_THREAD_STACK_MIN,
+#define _SC_THREAD_STACK_MIN _SC_THREAD_STACK_MIN
+  _SC_THREAD_THREADS_MAX,
+#define _SC_THREAD_THREADS_MAX _SC_THREAD_THREADS_MAX
+  _SC_THREAD_ATTR_STACKADDR,
+#define _SC_THREAD_ATTR_STACKADDR _SC_THREAD_ATTR_STACKADDR
+  _SC_THREAD_ATTR_STACKSIZE,
+#define _SC_THREAD_ATTR_STACKSIZE _SC_THREAD_ATTR_STACKSIZE
+  _SC_THREAD_PRIORITY_SCHEDULING,
+#define _SC_THREAD_PRIORITY_SCHEDULING _SC_THREAD_PRIORITY_SCHEDULING
+  _SC_THREAD_PRIO_INHERIT,
+#define _SC_THREAD_PRIO_INHERIT _SC_THREAD_PRIO_INHERIT
+  _SC_THREAD_PRIO_PROTECT,
+#define _SC_THREAD_PRIO_PROTECT _SC_THREAD_PRIO_PROTECT
+  _SC_THREAD_PROCESS_SHARED,
+#define _SC_THREAD_PROCESS_SHARED _SC_THREAD_PROCESS_SHARED
+
+  _SC_NPROCESSORS_CONF,
+#define _SC_NPROCESSORS_CONF _SC_NPROCESSORS_CONF
+  _SC_NPROCESSORS_ONLN,
+#define _SC_NPROCESSORS_ONLN _SC_NPROCESSORS_ONLN
+  _SC_PHYS_PAGES,
+#define _SC_PHYS_PAGES _SC_PHYS_PAGES
+  _SC_AVPHYS_PAGES,
+#define _SC_AVPHYS_PAGES _SC_AVPHYS_PAGES
+  _SC_ATEXIT_MAX,
+#define _SC_ATEXIT_MAX _SC_ATEXIT_MAX
+  _SC_PASS_MAX,
+#define _SC_PASS_MAX _SC_PASS_MAX
+
+  _SC_XOPEN_VERSION,
+#define _SC_XOPEN_VERSION _SC_XOPEN_VERSION
+  _SC_XOPEN_XCU_VERSION,
+#define _SC_XOPEN_XCU_VERSION _SC_XOPEN_XCU_VERSION
+  _SC_XOPEN_UNIX,
+#define _SC_XOPEN_UNIX _SC_XOPEN_UNIX
+  _SC_XOPEN_CRYPT,
+#define _SC_XOPEN_CRYPT _SC_XOPEN_CRYPT
+  _SC_XOPEN_ENH_I18N,
+#define _SC_XOPEN_ENH_I18N _SC_XOPEN_ENH_I18N
+  _SC_XOPEN_SHM,
+#define _SC_XOPEN_SHM _SC_XOPEN_SHM
+
+  _SC_2_CHAR_TERM,
+#define _SC_2_CHAR_TERM _SC_2_CHAR_TERM
+  _SC_2_C_VERSION,
+#define _SC_2_C_VERSION _SC_2_C_VERSION
+  _SC_2_UPE,
+#define _SC_2_UPE _SC_2_UPE
+
+  _SC_XOPEN_XPG2,
+#define _SC_XOPEN_XPG2 _SC_XOPEN_XPG2
+  _SC_XOPEN_XPG3,
+#define _SC_XOPEN_XPG3 _SC_XOPEN_XPG3
+  _SC_XOPEN_XPG4,
+#define _SC_XOPEN_XPG4 _SC_XOPEN_XPG4
+
+  _SC_CHAR_BIT,
+#define _SC_CHAR_BIT _SC_CHAR_BIT
+  _SC_CHAR_MAX,
+#define _SC_CHAR_MAX _SC_CHAR_MAX
+  _SC_CHAR_MIN,
+#define _SC_CHAR_MIN _SC_CHAR_MIN
+  _SC_INT_MAX,
+#define _SC_INT_MAX _SC_INT_MAX
+  _SC_INT_MIN,
+#define _SC_INT_MIN _SC_INT_MIN
+  _SC_LONG_BIT,
+#define _SC_LONG_BIT _SC_LONG_BIT
+  _SC_WORD_BIT,
+#define _SC_WORD_BIT _SC_WORD_BIT
+  _SC_MB_LEN_MAX,
+#define _SC_MB_LEN_MAX _SC_MB_LEN_MAX
+  _SC_NZERO,
+#define _SC_NZERO _SC_NZERO
+  _SC_SSIZE_MAX,
+#define _SC_SSIZE_MAX _SC_SSIZE_MAX
+  _SC_SCHAR_MAX,
+#define _SC_SCHAR_MAX _SC_SCHAR_MAX
+  _SC_SCHAR_MIN,
+#define _SC_SCHAR_MIN _SC_SCHAR_MIN
+  _SC_SHRT_MAX,
+#define _SC_SHRT_MAX _SC_SHRT_MAX
+  _SC_SHRT_MIN,
+#define _SC_SHRT_MIN _SC_SHRT_MIN
+  _SC_UCHAR_MAX,
+#define _SC_UCHAR_MAX _SC_UCHAR_MAX
+  _SC_UINT_MAX,
+#define _SC_UINT_MAX _SC_UINT_MAX
+  _SC_ULONG_MAX,
+#define _SC_ULONG_MAX _SC_ULONG_MAX
+  _SC_USHRT_MAX,
+#define _SC_USHRT_MAX _SC_USHRT_MAX
+
+  _SC_NL_ARGMAX,
+#define _SC_NL_ARGMAX _SC_NL_ARGMAX
+  _SC_NL_LANGMAX,
+#define _SC_NL_LANGMAX _SC_NL_LANGMAX
+  _SC_NL_MSGMAX,
+#define _SC_NL_MSGMAX _SC_NL_MSGMAX
+  _SC_NL_NMAX,
+#define _SC_NL_NMAX _SC_NL_NMAX
+  _SC_NL_SETMAX,
+#define _SC_NL_SETMAX _SC_NL_SETMAX
+  _SC_NL_TEXTMAX,
+#define _SC_NL_TEXTMAX _SC_NL_TEXTMAX
+
+  _SC_XBS5_ILP32_OFF32,
+#define _SC_XBS5_ILP32_OFF32 _SC_XBS5_ILP32_OFF32
+  _SC_XBS5_ILP32_OFFBIG,
+#define _SC_XBS5_ILP32_OFFBIG _SC_XBS5_ILP32_OFFBIG
+  _SC_XBS5_LP64_OFF64,
+#define _SC_XBS5_LP64_OFF64 _SC_XBS5_LP64_OFF64
+  _SC_XBS5_LPBIG_OFFBIG,
+#define _SC_XBS5_LPBIG_OFFBIG _SC_XBS5_LPBIG_OFFBIG
+
+  _SC_XOPEN_LEGACY,
+#define _SC_XOPEN_LEGACY _SC_XOPEN_LEGACY
+  _SC_XOPEN_REALTIME,
+#define _SC_XOPEN_REALTIME _SC_XOPEN_REALTIME
+  _SC_XOPEN_REALTIME_THREADS,
+#define _SC_XOPEN_REALTIME_THREADS _SC_XOPEN_REALTIME_THREADS
+
+  _SC_ADVISORY_INFO,
+#define _SC_ADVISORY_INFO _SC_ADVISORY_INFO
+  _SC_BARRIERS,
+#define _SC_BARRIERS _SC_BARRIERS
+  _SC_BASE,
+#define _SC_BASE _SC_BASE
+  _SC_C_LANG_SUPPORT,
+#define _SC_C_LANG_SUPPORT _SC_C_LANG_SUPPORT
+  _SC_C_LANG_SUPPORT_R,
+#define _SC_C_LANG_SUPPORT_R _SC_C_LANG_SUPPORT_R
+  _SC_CLOCK_SELECTION,
+#define _SC_CLOCK_SELECTION _SC_CLOCK_SELECTION
+  _SC_CPUTIME,
+#define _SC_CPUTIME _SC_CPUTIME
+  _SC_THREAD_CPUTIME,
+#define _SC_THREAD_CPUTIME _SC_THREAD_CPUTIME
+  _SC_DEVICE_IO,
+#define _SC_DEVICE_IO _SC_DEVICE_IO
+  _SC_DEVICE_SPECIFIC,
+#define _SC_DEVICE_SPECIFIC _SC_DEVICE_SPECIFIC
+  _SC_DEVICE_SPECIFIC_R,
+#define _SC_DEVICE_SPECIFIC_R _SC_DEVICE_SPECIFIC_R
+  _SC_FD_MGMT,
+#define _SC_FD_MGMT _SC_FD_MGMT
+  _SC_FIFO,
+#define _SC_FIFO _SC_FIFO
+  _SC_PIPE,
+#define _SC_PIPE _SC_PIPE
+  _SC_FILE_ATTRIBUTES,
+#define _SC_FILE_ATTRIBUTES _SC_FILE_ATTRIBUTES
+  _SC_FILE_LOCKING,
+#define _SC_FILE_LOCKING _SC_FILE_LOCKING
+  _SC_FILE_SYSTEM,
+#define _SC_FILE_SYSTEM _SC_FILE_SYSTEM
+  _SC_MONOTONIC_CLOCK,
+#define _SC_MONOTONIC_CLOCK _SC_MONOTONIC_CLOCK
+  _SC_MULTI_PROCESS,
+#define _SC_MULTI_PROCESS _SC_MULTI_PROCESS
+  _SC_SINGLE_PROCESS,
+#define _SC_SINGLE_PROCESS _SC_SINGLE_PROCESS
+  _SC_NETWORKING,
+#define _SC_NETWORKING _SC_NETWORKING
+  _SC_READER_WRITER_LOCKS,
+#define _SC_READER_WRITER_LOCKS _SC_READER_WRITER_LOCKS
+  _SC_SPIN_LOCKS,
+#define _SC_SPIN_LOCKS _SC_SPIN_LOCKS
+  _SC_REGEXP,
+#define _SC_REGEXP _SC_REGEXP
+  _SC_REGEX_VERSION,
+#define _SC_REGEX_VERSION _SC_REGEX_VERSION
+  _SC_SHELL,
+#define _SC_SHELL _SC_SHELL
+  _SC_SIGNALS,
+#define _SC_SIGNALS _SC_SIGNALS
+  _SC_SPAWN,
+#define _SC_SPAWN _SC_SPAWN
+  _SC_SPORADIC_SERVER,
+#define _SC_SPORADIC_SERVER _SC_SPORADIC_SERVER
+  _SC_THREAD_SPORADIC_SERVER,
+#define _SC_THREAD_SPORADIC_SERVER _SC_THREAD_SPORADIC_SERVER
+  _SC_SYSTEM_DATABASE,
+#define _SC_SYSTEM_DATABASE _SC_SYSTEM_DATABASE
+  _SC_SYSTEM_DATABASE_R,
+#define _SC_SYSTEM_DATABASE_R _SC_SYSTEM_DATABASE_R
+  _SC_TIMEOUTS,
+#define _SC_TIMEOUTS _SC_TIMEOUTS
+  _SC_TYPED_MEMORY_OBJECTS,
+#define _SC_TYPED_MEMORY_OBJECTS _SC_TYPED_MEMORY_OBJECTS
+  _SC_USER_GROUPS,
+#define _SC_USER_GROUPS _SC_USER_GROUPS
+  _SC_USER_GROUPS_R,
+#define _SC_USER_GROUPS_R _SC_USER_GROUPS_R
+  _SC_2_PBS,
+#define _SC_2_PBS _SC_2_PBS
+  _SC_2_PBS_ACCOUNTING,
+#define _SC_2_PBS_ACCOUNTING _SC_2_PBS_ACCOUNTING
+  _SC_2_PBS_LOCATE,
+#define _SC_2_PBS_LOCATE _SC_2_PBS_LOCATE
+  _SC_2_PBS_MESSAGE,
+#define _SC_2_PBS_MESSAGE _SC_2_PBS_MESSAGE
+  _SC_2_PBS_TRACK,
+#define _SC_2_PBS_TRACK _SC_2_PBS_TRACK
+  _SC_SYMLOOP_MAX,
+#define _SC_SYMLOOP_MAX _SC_SYMLOOP_MAX
+  _SC_STREAMS,
+#define _SC_STREAMS _SC_STREAMS
+  _SC_2_PBS_CHECKPOINT,
+#define _SC_2_PBS_CHECKPOINT _SC_2_PBS_CHECKPOINT
+
+  _SC_V6_ILP32_OFF32,
+#define _SC_V6_ILP32_OFF32 _SC_V6_ILP32_OFF32
+  _SC_V6_ILP32_OFFBIG,
+#define _SC_V6_ILP32_OFFBIG _SC_V6_ILP32_OFFBIG
+  _SC_V6_LP64_OFF64,
+#define _SC_V6_LP64_OFF64 _SC_V6_LP64_OFF64
+  _SC_V6_LPBIG_OFFBIG,
+#define _SC_V6_LPBIG_OFFBIG _SC_V6_LPBIG_OFFBIG
+
+  _SC_HOST_NAME_MAX,
+#define _SC_HOST_NAME_MAX _SC_HOST_NAME_MAX
+  _SC_TRACE,
+#define _SC_TRACE _SC_TRACE
+  _SC_TRACE_EVENT_FILTER,
+#define _SC_TRACE_EVENT_FILTER _SC_TRACE_EVENT_FILTER
+  _SC_TRACE_INHERIT,
+#define _SC_TRACE_INHERIT _SC_TRACE_INHERIT
+  _SC_TRACE_LOG,
+#define _SC_TRACE_LOG _SC_TRACE_LOG
+
+  _SC_LEVEL1_ICACHE_SIZE,
+#define _SC_LEVEL1_ICACHE_SIZE _SC_LEVEL1_ICACHE_SIZE
+  _SC_LEVEL1_ICACHE_ASSOC,
+#define _SC_LEVEL1_ICACHE_ASSOC _SC_LEVEL1_ICACHE_ASSOC
+  _SC_LEVEL1_ICACHE_LINESIZE,
+#define _SC_LEVEL1_ICACHE_LINESIZE _SC_LEVEL1_ICACHE_LINESIZE
+  _SC_LEVEL1_DCACHE_SIZE,
+#define _SC_LEVEL1_DCACHE_SIZE _SC_LEVEL1_DCACHE_SIZE
+  _SC_LEVEL1_DCACHE_ASSOC,
+#define _SC_LEVEL1_DCACHE_ASSOC _SC_LEVEL1_DCACHE_ASSOC
+  _SC_LEVEL1_DCACHE_LINESIZE,
+#define _SC_LEVEL1_DCACHE_LINESIZE _SC_LEVEL1_DCACHE_LINESIZE
+  _SC_LEVEL2_CACHE_SIZE,
+#define _SC_LEVEL2_CACHE_SIZE _SC_LEVEL2_CACHE_SIZE
+  _SC_LEVEL2_CACHE_ASSOC,
+#define _SC_LEVEL2_CACHE_ASSOC _SC_LEVEL2_CACHE_ASSOC
+  _SC_LEVEL2_CACHE_LINESIZE,
+#define _SC_LEVEL2_CACHE_LINESIZE _SC_LEVEL2_CACHE_LINESIZE
+  _SC_LEVEL3_CACHE_SIZE,
+#define _SC_LEVEL3_CACHE_SIZE _SC_LEVEL3_CACHE_SIZE
+  _SC_LEVEL3_CACHE_ASSOC,
+#define _SC_LEVEL3_CACHE_ASSOC _SC_LEVEL3_CACHE_ASSOC
+  _SC_LEVEL3_CACHE_LINESIZE,
+#define _SC_LEVEL3_CACHE_LINESIZE _SC_LEVEL3_CACHE_LINESIZE
+  _SC_LEVEL4_CACHE_SIZE,
+#define _SC_LEVEL4_CACHE_SIZE _SC_LEVEL4_CACHE_SIZE
+  _SC_LEVEL4_CACHE_ASSOC,
+#define _SC_LEVEL4_CACHE_ASSOC _SC_LEVEL4_CACHE_ASSOC
+  _SC_LEVEL4_CACHE_LINESIZE,
+#define _SC_LEVEL4_CACHE_LINESIZE _SC_LEVEL4_CACHE_LINESIZE
+  /* Leave room here, maybe we need a few more cache levels some day. */
+
+  _SC_IPV6 = _SC_LEVEL1_ICACHE_SIZE + 50,
+#define _SC_IPV6 _SC_IPV6
+  _SC_RAW_SOCKETS,
+#define _SC_RAW_SOCKETS _SC_RAW_SOCKETS
+
+  _SC_V7_ILP32_OFF32,
+#define _SC_V7_ILP32_OFF32 _SC_V7_ILP32_OFF32
+  _SC_V7_ILP32_OFFBIG,
+#define _SC_V7_ILP32_OFFBIG _SC_V7_ILP32_OFFBIG
+  _SC_V7_LP64_OFF64,
+#define _SC_V7_LP64_OFF64 _SC_V7_LP64_OFF64
+  _SC_V7_LPBIG_OFFBIG,
+#define _SC_V7_LPBIG_OFFBIG _SC_V7_LPBIG_OFFBIG
+
+  _SC_SS_REPL_MAX,
+#define _SC_SS_REPL_MAX _SC_SS_REPL_MAX
+
+  _SC_TRACE_EVENT_NAME_MAX,
+#define _SC_TRACE_EVENT_NAME_MAX _SC_TRACE_EVENT_NAME_MAX
+  _SC_TRACE_NAME_MAX,
+#define _SC_TRACE_NAME_MAX _SC_TRACE_NAME_MAX
+  _SC_TRACE_SYS_MAX,
+#define _SC_TRACE_SYS_MAX _SC_TRACE_SYS_MAX
+  _SC_TRACE_USER_EVENT_MAX,
+#define _SC_TRACE_USER_EVENT_MAX _SC_TRACE_USER_EVENT_MAX
+
+  _SC_XOPEN_STREAMS,
+#define _SC_XOPEN_STREAMS _SC_XOPEN_STREAMS
+
+  _SC_THREAD_ROBUST_PRIO_INHERIT,
+#define _SC_THREAD_ROBUST_PRIO_INHERIT _SC_THREAD_ROBUST_PRIO_INHERIT
+  _SC_THREAD_ROBUST_PRIO_PROTECT
+#define _SC_THREAD_ROBUST_PRIO_PROTECT _SC_THREAD_ROBUST_PRIO_PROTECT
+
+};
+#endif
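The enum above only assigns selector names; the values a program actually sees come from sysconf() at run time. A minimal usage sketch in C (illustrative only; it assumes a target libc where sysconf() is linked in and returns -1 for unsupported selectors):

#include <stdio.h>
#include <unistd.h>

int main(void)
{
    long page = sysconf(_SC_PAGESIZE);          /* page size in bytes, or -1 */
    long cpus = sysconf(_SC_NPROCESSORS_ONLN);  /* online hw threads, or -1 */

    if (page > 0)
        printf("page size: %ld bytes\n", page);
    if (cpus > 0)
        printf("online processors: %ld\n", cpus);
    return 0;
}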
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/bits/posix1_lim.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/bits/posix1_lim.h
new file mode 100755
index 0000000000000..0739958c5a6c4
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/bits/posix1_lim.h
@@ -0,0 +1,34 @@
+#ifndef POSIX1_LIM_H
+#define POSIX1_LIM_H
+/**
+  @file posix1_lim.h
+  @brief POSIX Minimum values
+
+EXTERNAL FUNCTIONS
+  None
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None
+
+TODO
+  This header should ideally be relocated under api/posix/bits (something that
+  doesn't exist today) and be included from api/posix/bits/limits.h, which in
+  turn should be included from the toolchain's limits.h
+
+Copyright (c) 2018, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+==============================================================================*/
+
+#ifndef _POSIX_PATH_MAX
+/** @brief Maximum number of bytes in a pathname, including the terminating
+    null character */
+#define _POSIX_PATH_MAX 256
+#endif
+
+#ifndef _POSIX_SEM_NSEMS_MAX
+/** @brief Maximum number of semaphores that a process may have */
+#define _POSIX_SEM_NSEMS_MAX 16
+#endif
+#endif
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/common/time.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/common/time.h
new file mode 100755
index 0000000000000..76b0d39ab7039
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/common/time.h
@@ -0,0 +1 @@
+#include
\ No newline at end of file
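Note that _POSIX_PATH_MAX is the POSIX-mandated minimum, not the target's real limit, so code relying only on this header should size buffers from it and length-check explicitly. A small sketch (the include path and helper below are hypothetical):

#include <string.h>
#include "bits/posix1_lim.h"   /* hypothetical include path for this header */

static char path_buf[_POSIX_PATH_MAX];

/* Copy a path only if it fits, counting the terminating null. */
static int set_path(const char *p)
{
    if (strlen(p) >= sizeof path_buf)
        return -1;
    strcpy(path_buf, p);
    return 0;
}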
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/fcntl.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/fcntl.h
new file mode 100755
index 0000000000000..c80ec98a449b6
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/fcntl.h
@@ -0,0 +1,51 @@
+#ifndef _FCNTL_H
+#define _FCNTL_H
+
+/*==========================================================================
+ * FILE:         fcntl.h
+ *
+ * SERVICES:     POSIX fcntl.h
+ *
+ * DESCRIPTION:  This header is needed by the open() and fcntl()
+ *               system calls, which have a variety of parameters and
+ *               flags. They are described here.
+ *
+ *               The formats of the calls to each of these are:
+ *
+ *               open(path, oflag [,mode])    open a file
+ *               fcntl(fd, cmd [,arg])        get or set file attributes
+ *
+ * Copyright (c) 2013,2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+
+ *==========================================================================*/
+
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Oflag values for open(). POSIX Table 6-4. */
+#define POSIX_O_CREAT    0x100  /* create file if it doesn't exist */
+#define POSIX_O_EXCL     0x200  /* exclusive use flag */
+#define POSIX_O_NOCTTY   0x400  /* do not assign a controlling terminal */
+#define POSIX_O_TRUNC    0x1000 /* truncate flag */
+
+/* File status flags for open() and fcntl(). POSIX Table 6-5. */
+#define POSIX_O_APPEND   0x2000 /* set append mode */
+#define POSIX_O_NONBLOCK 0x4000 /* no delay */
+
+/* File access modes for open() and fcntl(). POSIX Table 6-6. */
+#define POSIX_O_RDONLY   0      /* open(name, POSIX_O_RDONLY) opens read only */
+#define POSIX_O_WRONLY   1      /* open(name, POSIX_O_WRONLY) opens write only */
+#define POSIX_O_RDWR     2      /* open(name, POSIX_O_RDWR) opens read/write */
+
+/* Mask for use with file access modes. POSIX Table 6-7. */
+#define POSIX_O_ACCMODE  0x3    /* mask for file access modes */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _FCNTL_H */
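As in a standard fcntl.h, an oflag is one access mode OR'ed with any number of status flags, and the access mode is recovered with the POSIX_O_ACCMODE mask. A short sketch (illustrative; it assumes an open()/fcntl() implementation that accepts these QuRT-prefixed flag names):

/* Build an oflag: write-only, create if missing, truncate if present. */
int oflag = POSIX_O_WRONLY | POSIX_O_CREAT | POSIX_O_TRUNC;

/* Test only the access-mode bits of a combined oflag. */
int is_write_only(int flags)
{
    return (flags & POSIX_O_ACCMODE) == POSIX_O_WRONLY;
}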
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/hooks/unistd.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/hooks/unistd.h
new file mode 100755
index 0000000000000..1c618bfe36b4f
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/hooks/unistd.h
@@ -0,0 +1,115 @@
+#ifndef UNISTD_H
+#define UNISTD_H
+/**
+  @file posix/hooks/unistd.h
+  @brief POSIX-related declarations in unistd.h that are missing in the
+         toolchain header
+
+EXTERNAL FUNCTIONS
+  None
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  Don't include this header directly! Instead include unistd.h.
+
+Copyright (c) 2018, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+==============================================================================*/
+#include  /* For various POSIX ID types from toolchain headers */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern long pathconf (char const * path, int name);
+
+/* Process */
+
+/** The getppid() function shall return the parent process ID of the calling process.
+ * Please refer to POSIX standard for details.
+ * @return The parent process ID.
+ */
+pid_t getppid(void);
+
+/** The getpgid() function shall return the process group ID of the process whose process ID is equal to pid.
+ * Please refer to POSIX standard for details.
+ * @param pid [in] process ID
+ * @return The process group ID.
+ */
+pid_t getpgid(pid_t pid);
+
+/** The getpgrp() function shall return the process group ID of the calling process.
+ * Please refer to POSIX standard for details.
+ * @return The process group ID of the calling process.
+ */
+pid_t getpgrp(void);
+
+/** The getuid() function shall return the real user ID of the calling process.
+ * Please refer to POSIX standard for details.
+ * @return The real user ID of the calling process.
+ */
+uid_t getuid(void);
+
+/** The geteuid() function shall return the effective user ID of the calling process.
+ * Please refer to POSIX standard for details.
+ * @return The effective user ID of the calling process.
+ */
+uid_t geteuid(void);
+
+/** The getegid() function shall return the effective group ID of the calling process.
+ * Please refer to POSIX standard for details.
+ * @return The effective group ID of the calling process.
+ */
+gid_t getegid(void);
+
+/** The getgid() function shall return the real group ID of the calling process.
+ * Please refer to POSIX standard for details.
+ * @return The real group ID of the calling process.
+ */
+gid_t getgid(void);
+
+/** seteuid - set the effective user ID.
+ * Please refer to POSIX standard for details.
+ * @param uid [in] effective user ID
+ * @return Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error.
+ */
+int seteuid(uid_t uid);
+
+/** setpgrp - set the process group ID.
+ * Please refer to POSIX standard for details.
+ * @return Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error.
+ */
+pid_t setpgrp(void);
+
+/** setuid - set the user ID.
+ * Please refer to POSIX standard for details.
+ * @param uid [in] user ID
+ * @return Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error.
+ */
+int setuid(uid_t uid);
+
+/** setpgid - set process group ID for job control.
+ * Please refer to POSIX standard for details.
+ * @param pid [in] PID of the process
+ * @param pgid [in] PGID to be set
+ * @return Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error.
+ */
+int setpgid(pid_t pid, pid_t pgid);
+
+/** setsid - create a session and set the process group ID.
+ * Please refer to POSIX standard for details.
+ * @return Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error.
+ */
+pid_t setsid(void);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
+
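These hooks mirror the single-process subset of the POSIX ID calls. A trivial calling sketch (illustrative; on a QuRT image several of these are effectively stubs):

#include <unistd.h>

void show_ids(void)
{
    pid_t pgrp = getpgrp();  /* process group of the caller */
    uid_t uid  = getuid();   /* real user ID */
    uid_t euid = geteuid();  /* effective user ID */
    (void)pgrp; (void)uid; (void)euid;
}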
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/mqueue.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/mqueue.h
new file mode 100755
index 0000000000000..74dcc2fa202c6
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/mqueue.h
@@ -0,0 +1,203 @@
+#ifndef _POSIX_MQUEUE_H_
+#define _POSIX_MQUEUE_H_
+
+/*==========================================================================
+ * FILE:         mqueue.h
+ *
+ * SERVICES:     POSIX Message Queue API interface
+ *
+ * DESCRIPTION:  POSIX Message Queue API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013, 2016, 2023 Qualcomm Technologies, Inc.
+ * All Rights Reserved.
+ * Confidential and Proprietary - Qualcomm Technologies, Inc.
+ *==========================================================================*/
+
+#include  /* ssize_t */
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MQ_PRIO_MAX     255 /* max priority */
+#define MQ_PRIO_DEFAULT 0   /* default priority */
+
+typedef int mqd_t;
+
+struct mq_attr
+{
+  long mq_flags;   /* message queue flags */
+  long mq_maxmsg;  /* maximum number of messages */
+  long mq_msgsize; /* maximum message size */
+  long mq_curmsgs; /* number of messages currently queued */
+};
+
+typedef struct mq_attr mqueue_attr;
+
+/** \details
+ * This provides the POSIX Message Queue API.
+ *
+ * mq_notify is not supported.
+ *
+ * Since this implementation of POSIX kernel API is a subset of PSE51,
+ * it only supports message sending and receiving within one process.
+ * Message sending and receiving among processes are not supported.
+ */
+
+/** \defgroup mqueue POSIX Message Queue API */
+/** \ingroup mqueue */
+/** @{ */
+
+/** Open a message queue.
+ * Please refer to POSIX standard for details.
+ */
+mqd_t mq_open(const char *name, int oflag, /* mode_t mode, struct mq_attr *attr */...);
+
+/** Close a message queue.
+ * Please refer to POSIX standard for details.
+ */
+int mq_close(mqd_t mq_desc);
+
+/** Remove a message queue.
+ * Please refer to POSIX standard for details.
+ */
+int mq_unlink(const char *name);
+
+/** Send a message to a message queue.
+ * Please refer to POSIX standard for details.
+ *
+ * If the queue is full, instead of blocking the sender, this function
+ * will return -1 with errno EAGAIN, in this implementation. This behavior
+ * may change in the future.
+ */
+int mq_send(mqd_t mqdes, const char *msg_ptr, size_t msg_len, unsigned int msg_prio);
+
+/** Send a message to a message queue with timeout.
+ * Please refer to POSIX standard for details.
+ * @param abs_timeout [in] Only abs_timeout={0,0} is supported in this
+ *        implementation. This behavior may change in the future.
+ */
+int mq_timedsend(mqd_t mqdes, const char *msg_ptr, size_t msg_len, unsigned int msg_prio, const struct timespec *abs_timeout);
+
+/** Receive a message from a message queue.
+ * Please refer to POSIX standard for details.
+ */
+ssize_t mq_receive(mqd_t mqdes, char *msg_ptr, size_t msg_len, unsigned int *msg_prio);
+
+/** Receive a message from a message queue with timeout.
+ * Please refer to POSIX standard for details.
+ * @param abs_timeout [in] Only abs_timeout={0,0} is supported in this
+ *        implementation. This behavior may change in the future.
+ */
+ssize_t mq_timedreceive(mqd_t mqdes, char *restrict msg_ptr, size_t msg_len, unsigned int *restrict msg_prio, const struct timespec *restrict abs_timeout);
+
+/** Get message queue attributes.
+ * Please refer to POSIX standard for details.
+ */
+int mq_getattr(mqd_t mqdes, struct mq_attr *mqstat);
+
+/** Set message queue attributes.
+ * Please refer to POSIX standard for details.
+ */
+int mq_setattr(mqd_t mqdes, const struct mq_attr *restrict mqstat, struct mq_attr *restrict omqstat);
+
+/** @} */
+
+#define NBBY 8U /* number of bits in a byte */
+
+/*
+ * Select uses bit masks of file descriptors in longs. These macros
+ * manipulate such bit fields (the filesystem macros use chars).
+ * FD_SETSIZE may be defined by the user, but the default here should
+ * be enough for most uses.
+ */
+#ifndef FD_SETSIZE
+#define FD_SETSIZE 256U
+#endif
+
+typedef unsigned long fd_mask;
+#define NFDBITS (sizeof(fd_mask) * (unsigned int)NBBY) /* bits per mask */
+
+#ifndef howmany
+#define howmany(x, y) (((x) + ((y) - 1U)) / (y))
+#endif
+
+// equivalent of fd_set for WINNT env
+typedef struct fd_set
+{
+  fd_mask fds_bits[howmany(FD_SETSIZE, NFDBITS)];
+} fd_set;
+
+/** \addtogroup mqueue */
+/** @{ */
+
+/** Sets the bit for the file descriptor fd in the file descriptor set fdset.
+ */
+#define FD_SET(n, p) ((p)->fds_bits[((unsigned int) (n)) / NFDBITS] |= (1UL << (((unsigned int) (n)) % NFDBITS)))
+
+/** Clears the bit for the file descriptor fd in the file descriptor set fdset.
+ */
+#define FD_CLR(n, p) ((p)->fds_bits[((unsigned int) (n)) / NFDBITS] &= ~(1UL << (((unsigned int) (n)) % NFDBITS)))
+
+/** Returns a non-zero value if the bit for the file descriptor fd is set in the file descriptor set pointed to by fdset, and 0 otherwise.
+ */
+#define FD_ISSET(n, p) ((unsigned long)(p)->fds_bits[((unsigned int) (n)) / NFDBITS] & (unsigned long)((unsigned)1U << (((unsigned int) (n)) % NFDBITS)))
+
+/** Copies the file descriptor set.
+ */
+#define FD_COPY(f, t) (void)(memcpy)((t), (f), sizeof(*(f)))
+
+/** Initializes the file descriptor set fdset to have zero bits for all file descriptors.
+ */
+#define FD_ZERO(p) (void)memset((p), 0, sizeof(*(p)))
+
+/** Error check the file descriptor set.
+ */
+#define FD_BAD(fd) ((fd) < 0 /*|| fd >= fd_arraylen || fd_array[fd].obj == 0*/)
+
+/*! Wait for both message queues and signals. In this implementation, only
+ *  message queue file descriptors are supported.
+ * @param nfds [in] This is an integer one more than the maximum of any file
+ *        descriptor in any of the sets. In other words, while you are busy
+ *        adding file descriptors to your sets, you must calculate the maximum
+ *        integer value of all of them, then increment this value by one, and
+ *        then pass this as nfds to select().
+ * @param readfds [in] the file descriptor set on all message queues.
+ * @param writefds [in] ignored in this implementation.
+ * @param errorfds [in] ignored in this implementation.
+ * @param timeout [in] Only timeout={0,0} is supported in this
+ *        implementation. This behavior may change in the future.
+ */
+int pselect(int nfds, fd_set *restrict readfds,
+            fd_set *restrict writefds, fd_set *restrict errorfds,
+            const struct timespec *restrict timeout,
+            const sigset_t *restrict sigmask);
+
+/*! Wait for multiple message queues. In this implementation, only
+ *  message queue file descriptors are supported.
+ * @param nfds [in] This is an integer one more than the maximum of any file
+ *        descriptor in any of the sets. In other words, while you are busy
+ *        adding file descriptors to your sets, you must calculate the maximum
+ *        integer value of all of them, then increment this value by one, and
+ *        then pass this as nfds to select().
+ * @param readfds [in] the file descriptor set on all message queues.
+ * @param writefds [in] ignored in this implementation.
+ * @param errorfds [in] ignored in this implementation.
+ * @param timeout [in] Only timeout={0,0} is supported in this
+ *        implementation. This behavior may change in the future.
+ */
+int select(int nfds, fd_set *restrict readfds,
+           fd_set *restrict writefds, fd_set *restrict errorfds,
+           struct timeval *restrict timeout);
+
+/** @} */
+
+/* this function is needed for the test framework, which needs to clean up memory at teardown */
+void _mq_teardown(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
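A minimal open/send/receive sketch for this queue API (illustrative only: the queue name and attribute values are made up, the oflag names are assumed to come from this SDK's fcntl.h, and error paths are abbreviated; per the header, a full queue fails with EAGAIN instead of blocking):

#include "fcntl.h"
#include "mqueue.h"

void mq_demo(void)
{
    struct mq_attr attr;
    attr.mq_flags   = 0;
    attr.mq_maxmsg  = 8;    /* queue depth */
    attr.mq_msgsize = 64;   /* bytes per message */
    attr.mq_curmsgs = 0;

    mqd_t q = mq_open("/demo", POSIX_O_CREAT | POSIX_O_RDWR, 0, &attr);
    if (q < 0)
        return;

    const char msg[] = "hello";
    (void)mq_send(q, msg, sizeof msg, MQ_PRIO_DEFAULT);

    char buf[64];
    unsigned int prio;
    (void)mq_receive(q, buf, sizeof buf, &prio);

    (void)mq_close(q);
    (void)mq_unlink("/demo");
}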
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/pthread.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/pthread.h
new file mode 100755
index 0000000000000..f64242e8dc683
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/pthread.h
@@ -0,0 +1,287 @@
+#ifndef QURT_PTHREAD_H
+#define QURT_PTHREAD_H
+
+/*==========================================================================
+ * FILE:         pthread.h
+ *
+ * SERVICES:     POSIX pthread API interface
+ *
+ * DESCRIPTION:  POSIX pthread API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013,2016,2023 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+ *==========================================================================
+ *
+ * EDIT HISTORY FOR MODULE
+ *
+ * This section contains comments describing changes made to the module.
+ * Notice that changes are listed in reverse chronological order.
+ *
+ * when       who     what, where, why
+ * --------   ---     -------------------------------------------------------
+ * 10/13/08   cz      Initial version.
+ *==========================================================================*/
+
+#include
+#include "sys/sched.h"  /* For struct sched_param */
+#include "sys/errno.h"  /* error values */
+#include
+#include
+#include
+#include
+#include
+#include "pthread_types.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* the range of the set supported by the kernel data type used to represent CPU sets. */
+#define CONFIG_NR_CPUS QURT_THREAD_CFG_BITMASK_ALL
+
+#define UNIMPLEMENTED(FUNC, RETURNTYPE, ARGS) static inline RETURNTYPE FUNC ARGS { qurt_printf("Unimplemented: %s... exiting\n", __FUNCTION__); exit(1); }
+
+/** @brief Magic (non-portable) value for a stack's address to enable usage
+    of the auto-stack feature (if available) */
+#define PTHREAD_AUTO_STACK_MAGIC_ADDR_NP ((void *)0xFFF)
+
+/** \details
+ * This provides the POSIX thread API.
+ *
+ */
+
+/** \defgroup pthread POSIX pthread API */
+/** \ingroup pthread */
+/** @{ */
+
+/** Compare Two Threads.
+ * Please refer to POSIX standard for details.
+ */
+static inline int pthread_equal(pthread_t t1, pthread_t t2)
+{
+  return (t1 == t2) ? 1 : 0;
+}
+
+/** Create Thread.
+ * Please refer to POSIX standard for details.
+ */
+int pthread_create(pthread_t * tid, const pthread_attr_t * attr, void *(*start)(void *), void *arg);
+
+/** Terminate Calling Thread.
+ * Please refer to POSIX standard for details.
+ */
+void pthread_exit(void *value_ptr);
+
+/** Wait for thread termination.
+ * Please refer to POSIX standard for details.
+ * @param thread [in] the thread to be joined
+ * @param value_ptr [out] the pointer to the exit status
+ */
+int pthread_join(pthread_t thread, void **value_ptr);
+
+/** Detach a joinable thread.
+ * Please refer to POSIX standard for details.
+ * @param id [in] ID of the thread to be detached.
+ */
+int pthread_detach(pthread_t id);
+
+/** Dynamic package initialisation.
+ * Please refer to POSIX standard for details.
+ */
+int pthread_once(pthread_once_t *once_control, void (*init_routine)(void));
+
+pthread_t pthread_self(void);
+int pthread_cancel(pthread_t thread);
+static inline void pthread_yield(void)
+{
+  return;
+}
+
+int pthread_kill(pthread_t thread, int sig);
+
+/**
+ * @brief Return the name of a thread
+ * @warning Do not call this in the error-handling path as it may cause deadlock
+ *          due to underlying OS calls
+ * @param thread [in] Thread whose name is to be retrieved
+ * @param name [out] Buffer used to return the thread name
+ * @param len [in] Number of bytes available in name
+ * @return 0 on success, ESRCH, ERANGE on failure
+ */
+extern int pthread_getname_np (pthread_t thread, char * name, size_t len);
+
+int pthread_getschedparam(pthread_t thread, int *restrict policy, struct sched_param *restrict param);
+int pthread_setschedparam(pthread_t thread, int policy, const struct sched_param *param);
+int pthread_setschedprio(pthread_t thread, int prio);
+int pthread_setcancelstate(int state, int *oldstate);
+int pthread_setcanceltype(int type, int *oldtype);
+
+/* Attribute functions */
+int pthread_attr_init(pthread_attr_t *attr);
+int pthread_attr_destroy(pthread_attr_t *attr);
+int pthread_attr_setschedparam(pthread_attr_t *restrict attr, const sched_param *restrict param);
+int pthread_attr_getschedparam(const pthread_attr_t *restrict attr, sched_param *restrict param);
+int pthread_attr_setstacksize(pthread_attr_t *attr, size_t stacksize);
+int pthread_attr_getstacksize(const pthread_attr_t *attr, size_t *stacksize);
+int pthread_attr_setstackaddr(pthread_attr_t *attr, void * stackaddr);
+int pthread_attr_getstackaddr(const pthread_attr_t *attr, void ** stackaddr);
+int pthread_attr_getdetachstate(const pthread_attr_t *attr, int *detachstate);
+int pthread_attr_setdetachstate(pthread_attr_t *attr, int detachstate);
+int pthread_attr_setstack(pthread_attr_t *attr, void *stackaddr, size_t stacksize);
+int pthread_attr_getstack(const pthread_attr_t *attr, void **stackaddr, size_t *stacksize);
+int pthread_attr_setscope(pthread_attr_t *attr, int scope);
+int pthread_attr_getscope(const pthread_attr_t *attr, int *scope);
+int pthread_attr_setinheritsched(pthread_attr_t *attr, int inheritsched);
+int pthread_attr_getinheritsched(const pthread_attr_t *attr, int *inheritsched);
+int pthread_attr_getguardsize(const pthread_attr_t * attr, size_t * guardsize);
+int pthread_attr_setautostack(pthread_attr_t *attr);
+int pthread_attr_setbuspriority(pthread_attr_t *attr, unsigned short bus_priority);
+
+/* Qualcomm additions to pthread get/set attribute functions */
+int pthread_attr_setthreadname(pthread_attr_t *attr, const char * name);
+int pthread_attr_getthreadname(const pthread_attr_t *attr, char * name, int size);
+int pthread_attr_settimetestid(pthread_attr_t *attr, unsigned int tid);
+int pthread_attr_gettimetestid(const pthread_attr_t *attr, unsigned int* tid);
+
+/* Mutexes */
+int pthread_mutex_init(pthread_mutex_t *mutex, pthread_mutexattr_t *attr);
+int pthread_mutex_lock(pthread_mutex_t *mutex);
+int pthread_mutex_unlock(pthread_mutex_t *mutex);
+int pthread_mutex_trylock(pthread_mutex_t *mutex);
+int pthread_mutex_destroy(pthread_mutex_t *mutex);
+int pthread_mutex_getprioceiling(const pthread_mutex_t *restrict mutex, int *restrict prioceiling);
+int pthread_mutex_setprioceiling(pthread_mutex_t *restrict mutex, int prioceiling, int *restrict old_ceiling);
+
+/* For a mutex of type PTHREAD_MUTEX_NORMAL, priority inheritance is not
+ * supported even if PTHREAD_PRIO_INHERIT is defined, since QuRT does not
+ * support this kind of mutex */
+int pthread_mutexattr_init(pthread_mutexattr_t *attr);
+int pthread_mutexattr_destroy(pthread_mutexattr_t *attr);
+int pthread_mutexattr_gettype(const pthread_mutexattr_t *restrict, int *restrict);
+int pthread_mutexattr_settype(pthread_mutexattr_t *attr, int type);
+int pthread_mutexattr_getprotocol(const pthread_mutexattr_t *restrict, int *restrict);
+int pthread_mutexattr_setprotocol(pthread_mutexattr_t *attr, int protocol);
+int pthread_mutexattr_getpshared(const pthread_mutexattr_t *restrict, int *restrict);
+int pthread_mutexattr_setpshared(pthread_mutexattr_t *, int);
+int pthread_mutexattr_getprioceiling(const pthread_mutexattr_t *restrict attr, int *restrict prioceiling);
+int pthread_mutexattr_setprioceiling(pthread_mutexattr_t *attr, int prioceiling);
+
+/* Spinlocks */
+int pthread_spin_init(pthread_spinlock_t *lock, int pshared);
+int pthread_spin_destroy(pthread_spinlock_t *lock);
+int pthread_spin_lock(pthread_spinlock_t *lock);
+int pthread_spin_trylock(pthread_spinlock_t *lock);
+int pthread_spin_unlock(pthread_spinlock_t *lock);
+
+/* Condition variables */
+int pthread_condattr_init(pthread_condattr_t *attr);
+int pthread_condattr_destroy(pthread_condattr_t *attr);
+int pthread_condattr_setpshared(pthread_condattr_t *attr, int pshared);
+int pthread_condattr_getpshared(const pthread_condattr_t *restrict attr, int *restrict pshared);
+int pthread_condattr_setclock(pthread_condattr_t *attr, clockid_t clock);
+int pthread_condattr_getclock(const pthread_condattr_t *restrict attr, clockid_t *restrict clock);
+int pthread_cond_init(pthread_cond_t *cond, pthread_condattr_t *attr);
+int pthread_cond_destroy(pthread_cond_t *cond);
+int pthread_cond_signal(pthread_cond_t *cond);
+int pthread_cond_broadcast(pthread_cond_t *cond);
+int pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex);
+int pthread_cond_timedwait(pthread_cond_t * cond, pthread_mutex_t * mutex, const struct timespec *time);
+
+/* Barriers */
+int pthread_barrier_init(pthread_barrier_t *restrict barrier, const pthread_barrierattr_t *restrict attr, unsigned count);
+int pthread_barrier_destroy(pthread_barrier_t *barrier);
+int pthread_barrier_wait(pthread_barrier_t *barrier);
+int pthread_barrierattr_init(pthread_barrierattr_t *attr);
+int pthread_barrierattr_destroy(pthread_barrierattr_t *attr);
+int pthread_barrierattr_getpshared(const pthread_barrierattr_t *restrict attr, int *restrict pshared);
+
+
+/* Read-write locks */
+int pthread_rwlock_init(pthread_rwlock_t *, const pthread_rwlockattr_t *);
+int pthread_rwlock_destroy(pthread_rwlock_t *);
+int pthread_rwlockattr_init(pthread_rwlockattr_t *);
+int pthread_rwlockattr_destroy(pthread_rwlockattr_t *);
+int pthread_rwlockattr_getpshared(const pthread_rwlockattr_t *, int *);
+int pthread_rwlockattr_setpshared(pthread_rwlockattr_t *, int);
+int pthread_rwlock_rdlock(pthread_rwlock_t *);
+int pthread_rwlock_tryrdlock(pthread_rwlock_t *);
+int pthread_rwlock_wrlock(pthread_rwlock_t *);
+int pthread_rwlock_trywrlock(pthread_rwlock_t *);
+int pthread_rwlock_unlock(pthread_rwlock_t *);
+
+
+/** Please refer to the POSIX standard document.
+ */
+int pthread_barrierattr_setpshared(pthread_barrierattr_t *attr, int pshared);
+
+/** Set the CPU affinity attribute in a thread attributes object.
+ * @param attr [in] pthread attributes
+ * @param cpusetsize [in] The argument cpusetsize is the length (in bytes)
+ *        of the buffer pointed to by cpuset. Typically, this argument
+ *        would be specified as sizeof(cpu_set_t).
+ * @param cpuset [in] This data set is a bitset where each bit represents
+ *        a CPU (hw thread). How the system's CPUs are mapped to bits in
+ *        the bitset is system dependent. For the QuRT kernel, bit 0
+ *        corresponds to hw thread 0, and so on. If the corresponding bit
+ *        is set to 1, then the software thread is eligible to run on this
+ *        hw thread. 0x3f means it can run on any hw thread; 0x0 also
+ *        means it can run on any hw thread.
+ * @return On success, this function returns 0; on error, it returns a
+ *         non-zero error number.
+ *         EINVAL - cpuset specified a CPU that was outside the set supported
+ *         by the kernel. (The kernel configuration option CONFIG_NR_CPUS
+ *         defines the range of the set supported by the kernel data type
+ *         used to represent CPU sets.)
+ * @note This function is a non-standard GNU extension; hence the suffix "_np"
+ *       (non-portable) in the name.
+ */
+int pthread_attr_setaffinity_np(pthread_attr_t *attr, size_t cpusetsize, const cpu_set_t *cpuset);
+
+/** Get the CPU affinity attribute from a thread attributes object.
+ * @param attr [in] pthread attributes
+ * @param cpusetsize [in] The argument cpusetsize is the length (in bytes)
+ *        of the buffer pointed to by cpuset. Typically, this argument
+ *        would be specified as sizeof(cpu_set_t).
+ * @param cpuset [out] This data set is a bitset where each bit represents
+ *        a CPU (hw thread). How the system's CPUs are mapped to bits in
+ *        the bitset is system dependent. For the QuRT kernel, bit 0
+ *        corresponds to hw thread 0, and so on. If the corresponding bit
+ *        is set to 1, then the software thread is eligible to run on this
+ *        hw thread. 0x3f means it can run on any hw thread; 0x0 also
+ *        means it can run on any hw thread.
+ * @return On success, this function returns 0; on error, it returns a
+ *         non-zero error number.
+ *         EINVAL - cpusetsize is smaller than the size of the affinity mask
+ *         used by the kernel.
+ * @note This function is a non-standard GNU extension; hence the suffix "_np"
+ *       (non-portable) in the name.
+ */
+int pthread_attr_getaffinity_np(pthread_attr_t *attr, size_t cpusetsize, cpu_set_t *cpuset);
+
+/* TLS */
+int pthread_key_create(pthread_key_t *key, void (*destructor)(void*));
+int pthread_key_delete(pthread_key_t key);
+int pthread_setspecific(pthread_key_t key, const void *value);
+void *pthread_getspecific(pthread_key_t key);
+int pthread_getattr_np(pthread_t thread, pthread_attr_t * restrict attr);
+
+/** @} */
+
+/* A non-pthread caller uses this function to create a pthread TCB without creating an actual thread */
+int pthread_fake(pthread_t * restrict thread, const pthread_attr_t * restrict attr);
+int pthread_fake_destroy(pthread_t thread);
+
+//amitkulk: move these to unistd.h after we move that header within qurt
+int posix_memalign(void **memptr, size_t alignment, size_t size);
+void exit(int status);
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* QURT_PTHREAD_H */
+
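A minimal create/join sketch against this pthread API (illustrative; the stack size comes from pthread_types.h and is not an SDK recommendation, and error handling is abbreviated):

#include "pthread.h"

static void *worker(void *arg)
{
    (void)arg;
    return NULL;
}

int spawn_and_wait(void)
{
    pthread_t tid;
    pthread_attr_t attr;
    void *ret;

    if (pthread_attr_init(&attr) != 0)
        return -1;
    (void)pthread_attr_setstacksize(&attr, PTHREAD_DEFAULT_STACKSIZE);

    if (pthread_create(&tid, &attr, worker, NULL) != 0)
        return -1;
    (void)pthread_join(tid, &ret);   /* reaps the thread; ret is worker's return value */
    (void)pthread_attr_destroy(&attr);
    return 0;
}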
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/pthread_types.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/pthread_types.h
new file mode 100755
index 0000000000000..51c3b9dbca243
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/pthread_types.h
@@ -0,0 +1,193 @@
+#ifndef _PTHREAD_TYPES_H_
+#define _PTHREAD_TYPES_H_
+
+/*==========================================================================
+ * FILE:         pthread_types.h
+ *
+ * SERVICES:     types used in POSIX API interface
+ *
+ * DESCRIPTION:  POSIX API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2016, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+
+ *==========================================================================*/
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __GNUC__
+#define restrict __restrict__
+#else
+#define restrict
+#endif
+
+#define _SSIZE_T
+
+#ifndef TRUE
+#define TRUE 1
+#endif
+
+#ifndef FALSE
+#define FALSE 0
+#endif
+
+#define PTHREAD_MAX_THREADS 512U
+
+#define PTHREAD_NAME_LEN          16
+#define PTHREAD_MIN_STACKSIZE     512 //4096
+#define PTHREAD_MAX_STACKSIZE     1048576
+#define PTHREAD_DEFAULT_STACKSIZE 16384
+
+#define PTHREAD_STACK_MIN        (4096U*2U)
+#define PTHREAD_MIN_PRIORITY     0U
+#define PTHREAD_MAX_PRIORITY     255U
+#define PTHREAD_DEFAULT_PRIORITY 1
+
+/* Mutex initialization status */
+#define PTHREAD_MUTEX_ATTR_UNINITIALIZED 0
+#define PTHREAD_MUTEX_ATTR_INITIALIZED   1
+
+/* Condition attributes initialization status */
+#define PTHREAD_COND_ATTR_UNINITIALIZED 0
+#define PTHREAD_COND_ATTR_INITIALIZED   1
+
+#define PTHREAD_DEFAULT_NAME "Anonymous"
+
+#define PTHREAD_MUTEX_INITIALIZER ((pthread_mutex_t) 0xFFFFFFFFU)
+
+#define PTHREAD_COND_INITIALIZER ((pthread_cond_t) 0xFFFFFFFFU)
+
+/* mutex and cond_var shared */
+#define PTHREAD_PROCESS_PRIVATE 0
+#define PTHREAD_PROCESS_SHARED  1
+
+/* mutex type */
+#define PTHREAD_MUTEX_ERRORCHECK 0
+#define PTHREAD_MUTEX_NORMAL     1
+#define PTHREAD_MUTEX_RECURSIVE  2
+#define PTHREAD_MUTEX_DEFAULT    3
+
+/* mutex protocol */
+#define PTHREAD_PRIO_NONE    0
+#define PTHREAD_PRIO_INHERIT 1
+#define PTHREAD_PRIO_PROTECT 2
+
+#define PTHREAD_SPINLOCK_UNLOCKED 0
+#define PTHREAD_SPINLOCK_LOCKED   1
+
+#define PTHREAD_ONCE_INIT (0)
+
+#define PTHREAD_MUTEX_OPAQUE //ToDo: amitkulk: debug
+
+typedef signed int ssize_t;
+
+/* detachstate of a pthread */
+#define PTHREAD_CREATE_JOINABLE 1
+#define PTHREAD_CREATE_DETACHED 0
+
+/* contention scope */
+#define PTHREAD_SCOPE_PROCESS 1
+#define PTHREAD_SCOPE_SYSTEM  0
+
+/* scheduler */
+#define PTHREAD_INHERIT_SCHED  1
+#define PTHREAD_EXPLICIT_SCHED 0
+
+/*
+ * Types and structure definitions
+ *
+ */
+typedef unsigned int cpu_set_t;
+
+typedef unsigned int pthread_t;
+
+typedef struct pthread_attr_t
+{
+  void *stackaddr;
+  int internal_stack; /* this flag==1 means the stack needs to be freed by posix */
+  size_t stacksize;
+  int priority;
+  unsigned short timetest_id;
+  /* This flag indicates whether the thread will be an autostack thread */
+  unsigned short autostack:1;
+  /* This flag indicates whether the thread's bus_priority is high or low:
+     bus_priority = 0 -- bus_priority is low
+     bus_priority = 1 -- bus_priority is high
+     bus_priority = 3 -- bus_priority is default (takes the default set for the process)
+  */
+  unsigned short bus_priority:2;
+  unsigned short reserved:13;
+  cpu_set_t cpumask;
+  char name[PTHREAD_NAME_LEN];
+  /* This flag indicates whether the pthread lib should create thread contexts for other OSALs */
+  /* This is used internally by POSIX and not available for general usage */
+  int ext_context;
+  int detachstate;
+} pthread_attr_t;
+
+// mutex attr
+typedef struct pthread_mutexattr_t pthread_mutexattr_t;
+struct pthread_mutexattr_t
+{
+  int is_initialized;
+  int type;
+  int pshared;
+  int protocol;
+};
+
+typedef unsigned int pthread_mutex_t;
+
+typedef unsigned int pthread_spinlock_t;
+
+typedef struct pthread_condattr_t
+{
+  int is_initialized;
+  int pshared;
+  clockid_t clock_id;
+} pthread_condattr_t;
+
+typedef unsigned int pthread_cond_t;
+
+typedef struct pthread_barrierattr_t
+{
+  int is_initialized;
+  int pshared;
+} pthread_barrierattr_t;
+
+typedef unsigned int pthread_barrier_t;
+
+typedef int pthread_key_t;
+
+typedef int pthread_once_t;
+
+
+/* Read-write locks */
+#define PTW32_RWLOCK_MAGIC 0xfacade2
+#define PTHREAD_RWLOCK_INITIALIZER ((pthread_rwlock_t)(size_t) -1)
+
+struct pthread_rwlockattr_t_
+{
+  int pshared;
+};
+
+struct pthread_rwlock_t_
+{
+  pthread_mutex_t mtxExclusiveAccess;
+  pthread_mutex_t mtxSharedAccessCompleted;
+  pthread_cond_t  cndSharedAccessCompleted;
+  int nSharedAccessCount;
+  int nExclusiveAccessCount;
+  int nCompletedSharedAccessCount;
+  int nMagic;
+};
+
+typedef struct pthread_rwlock_t_ * pthread_rwlock_t;
+typedef struct pthread_rwlockattr_t_ * pthread_rwlockattr_t;
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _PTHREAD_TYPES_H_ */
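Because pthread_mutex_t and pthread_cond_t are plain unsigned ints here, static initialization is just the sentinel values above. A small sketch (illustrative; lazy setup on first lock is assumed to be the library's job):

#include "pthread.h"

static pthread_mutex_t g_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  g_cv   = PTHREAD_COND_INITIALIZER;

void critical_section(void)
{
    (void)pthread_mutex_lock(&g_lock);
    /* ... touch shared state ... */
    (void)pthread_cond_signal(&g_cv);
    (void)pthread_mutex_unlock(&g_lock);
}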
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/sched.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/sched.h
new file mode 100755
index 0000000000000..faf3365be9f82
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/sched.h
@@ -0,0 +1,21 @@
+/*=============================================================================
+
+                      sched.h
+
+GENERAL DESCRIPTION
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+  Copyright (c) 2013,2016 by Qualcomm Technologies, Inc. All Rights Reserved.
+=============================================================================*/
+#ifndef __SCHED_H__
+#define __SCHED_H__
+
+#include "sys/sched.h"
+
+#endif //__SCHED_H__
+
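Since this wrapper just forwards to sys/sched.h, the priority-range helpers below are the useful entry points. A sketch of picking a mid-range priority portably instead of hard-coding one (illustrative; SCHED_FIFO is the only policy the implementation documents as valid):

#include "sched.h"

int mid_fifo_priority(void)
{
    int lo = sched_get_priority_min(SCHED_FIFO);
    int hi = sched_get_priority_max(SCHED_FIFO);
    return lo + (hi - lo) / 2;
}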
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/semaphore.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/semaphore.h
new file mode 100755
index 0000000000000..d9145b295ae62
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/semaphore.h
@@ -0,0 +1,114 @@
+#ifndef SEMAPHORE_H
+#define SEMAPHORE_H
+
+/*==========================================================================
+ * FILE:         semaphore.h
+ *
+ * SERVICES:     POSIX semaphore API interface
+ *
+ * DESCRIPTION:  POSIX semaphore API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013, 2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+
+ *==========================================================================*/
+#include            // Get all C sys types - includes POSIX specific
+#include "sys/errno.h"  // error values
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+                                  TYPEDEFS
+=============================================================================*/
+/** User-facing semaphore container with an opaque pointer to the implementation */
+typedef struct
+{
+  unsigned int *opaque;
+} sem_t;
+#define _SEM_T
+
+/*=============================================================================
+                            CONSTANTS AND MACROS
+=============================================================================*/
+/* constant definitions */
+#define SEM_FAILED ((sem_t*) 0)
+
+/* @todo siqbal Should we put such configuration items in a common place
+   instead of this user-facing header? */
+#define SEM_VALUE_MAX ((unsigned int) 30) // If need be, increase this
+
+/*=============================================================================
+                                  FUNCTIONS
+=============================================================================*/
+
+/** \details
+ * The POSIX standard comes with two kinds of semaphores: named and unnamed
+ * semaphores.
+ *
+ * This implementation of POSIX kernel API provides unnamed and named
+ * semaphores.
+ *
+ * sem_timedwait() is not provided.
+ */
+
+/** \defgroup semaphore POSIX Semaphore API */
+
+/** \ingroup semaphore */
+/** @{ */
+
+/** Initialize an unnamed semaphore.
+ * Please refer to POSIX standard for details.
+ * @param pshared [in] This implementation does not support a non-zero value,
+ *        i.e., a semaphore cannot be shared between processes in this
+ *        implementation.
+ */
+int sem_init(sem_t *sem, int pshared, unsigned int value);
+
+/** Lock a semaphore.
+ * Please refer to POSIX standard for details.
+ */
+int sem_wait(sem_t *sem);
+
+/** Try to lock a semaphore without blocking.
+ * Please refer to POSIX standard for details.
+ */
+int sem_trywait(sem_t *sem);
+
+/** Unlock a semaphore.
+ * Please refer to POSIX standard for details.
+ */
+int sem_post(sem_t *sem);
+
+/** Get the value of a semaphore.
+ * Please refer to POSIX standard for details.
+ */
+int sem_getvalue(sem_t *sem, int *value);
+
+/** Destroy an unnamed semaphore.
+ * Please refer to POSIX standard for details.
+ */
+int sem_destroy(sem_t *sem);
+
+/** Create and initialize a named semaphore.
+ * Please refer to POSIX standard for details.
+ */
+sem_t * sem_open(const char* name , int oflag , ...);
+
+/** Close a semaphore.
+ * Please refer to POSIX standard for details.
+ */
+int sem_close(sem_t *sem);
+
+/** Unlink a named semaphore.
+ * Please refer to POSIX standard for details.
+ */
+int sem_unlink(const char *name);
+/** @} */
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* SEMAPHORE_H */
+
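A counting-semaphore sketch for the unnamed variant (illustrative; pshared must be 0 per the header's note, and the initial count of 4 is arbitrary):

#include "semaphore.h"

static sem_t slots;

int init_pool(void)
{
    return sem_init(&slots, 0, 4);   /* four available slots */
}

void use_slot(void)
{
    (void)sem_wait(&slots);          /* acquire a slot */
    /* ... use the resource ... */
    (void)sem_post(&slots);          /* release it */
}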
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/signal.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/signal.h
new file mode 100755
index 0000000000000..35cb1f1a9a319
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/signal.h
@@ -0,0 +1,201 @@
+#ifndef _SIGNAL_H_
+#define _SIGNAL_H_
+
+/*==========================================================================
+ * FILE:         signal.h
+ *
+ * SERVICES:     POSIX Signal API interface
+ *
+ * DESCRIPTION:  POSIX Signal API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013, 2016, 2023 Qualcomm Technologies, Inc.
+ * All Rights Reserved.
+ * Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+ *==========================================================================*/
+
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* POSIX signal bits */
+
+#define POSIX_MSG   7 /* POSIX msg type used in Qube API */
+#define POSIX_NOTIF 8 /* POSIX msg type used in Qube API */
+#define SIGKILL     9 /* kill (cannot be caught or ignored) */
+
+#define SIGRTMIN 10
+#define SIGRTMAX 32
+
+/* Notification Types. */
+/* No asynchronous notification is delivered when the event of interest occurs. */
+#define SIGEV_NONE 0
+/* The signal specified in sigev_signo shall be generated for the process when
+   the event of interest occurs. */
+#define SIGEV_SIGNAL 1
+/* A notification function is called to perform notification. */
+#define SIGEV_THREAD 2
+#define SA_SIGINFO 1
+
+/*
+ * Flags for sigprocmask:
+ */
+#define SIG_BLOCK   1 /* block specified signal set */
+#define SIG_UNBLOCK 2 /* unblock specified signal set */
+#define SIG_SETMASK 3 /* set specified signal set */
+
+typedef unsigned long int sigset_t;
+
+union sigval
+{
+  int sival_int;   /* Integer signal value. */
+  void *sival_ptr; /* Pointer signal value. */
+};
+
+typedef struct sigevent sigevent;
+struct sigevent
+{
+  int sigev_notify;         /* Notification type. */
+  int sigev_signo;          /* Signal number. */
+  union sigval sigev_value; /* Signal value. */
+  void (*sigev_notify_function)(union sigval); /* Notification function. */
+  pthread_attr_t *sigev_notify_attributes;
+};
+
+typedef struct siginfo_t siginfo_t;
+struct siginfo_t
+{
+  int si_signo;
+  int si_code;
+  union sigval si_value;
+/* int si_errno;
+   pid_t si_pid;
+   uid_t si_uid;
+   void *si_addr;
+   int si_status;
+   long si_band;*/
+};
+struct sigaction
+{
+  void (*sa_handler)(int);
+  sigset_t sa_mask;
+  int sa_flags;
+  void (*sa_sigaction)(int, siginfo_t *, void *);
+};
+
+/* Signal functions */
+
+/** \details
+ * This provides the POSIX Signal API. Please note that this
+ * implementation does not fully comply with the POSIX standard.
+ *
+ * In the POSIX standard, a signal can be used as an 'interrupt', which means
+ * an incoming signal will interrupt a running thread. After the
+ * registered signal handler is executed, the thread will resume.
+ * This behavior cannot be implemented without modifying the L4 or QuRT kernel.
+ * On the other hand, applications need to be carefully written to avoid
+ * problems caused by 'interrupting' signals.
+ *
+ * Therefore, in this implementation of POSIX signals, a thread will
+ * only receive signals when it explicitly waits for signals, i.e., when
+ * the thread calls either sigwait() or sigsuspend().
+ *
+ * Therefore, pthread_sigmask(), which sets or gets the signal mask for a
+ * thread, is not supported, since the signal mask will be set by sigwait()
+ * and sigsuspend().
+ *
+ * Since this implementation of POSIX kernel API is a subset of PSE51,
+ * only threads can send and receive signals. The functions related to
+ * signal operations with processes, such as kill(), sigqueue(),
+ * sigprocmask(), are not provided.
+ *
+ * Queued signals are not supported.
+ *
+ * Applications will use signals from SIGRTMIN to SIGRTMAX.
+ *
+ * SIGEV_SIGNAL and SIGEV_THREAD are supported. SIGEV_NONE is not
+ * supported.
+ *
+ */
+
+/** \defgroup signal POSIX Signal API */
+/** \ingroup signal */
+/** @{ */
+
+/** Wait for signals. This implementation does not support queued signals.
+ *
+ * Please refer to POSIX standard for details.
+ */
+int sigwait(const sigset_t *restrict set, int *restrict sig);
+
+/** Examine and Change Signal Action.
+ * Please refer to POSIX standard for details.
+ *
+ * @param act [in] A pointer to the sigaction structure that describes the
+ *        action to be taken for the signal. Can be NULL.
+ *        The following flags for the sa_flags field in struct sigaction are
+ *        not supported: SA_NOCLDSTOP, SA_ONSTACK, SA_RESETHAND, SA_RESTART,
+ *        SA_NOCLDWAIT and SA_NODEFER. Only the flag SA_SIGINFO is supported.
+ *
+ * @note Define sigaction as a macro to avoid a warning when included from
+ *       C++ code - it's causing a "sigaction(...) hides constructor for
+ *       'struct sigaction'" warning.
+ */
+/*lint -esym(123,sigaction) Suppress "macro used with no arguments" */
+#define sigaction(sig,act,oact) _sigaction((sig),(act),(oact))
+
+/** Wait for signals.
+ * Please refer to POSIX standard for details.
+ */
+int sigsuspend(const sigset_t *sigmask);
+
+/** Add Signal to Signal Set.
+ * Please refer to POSIX standard for details.
+ */
+int sigaddset(sigset_t *set, int signo);
+
+/** Delete Signal from Signal Set.
+ * Please refer to POSIX standard for details.
+ */
+int sigdelset(sigset_t *set, int signo);
+
+/** Initialize and Empty Signal Set.
+ * Please refer to POSIX standard for details.
+ */
+int sigemptyset(sigset_t *set);
+
+/** Initialize and Fill Signal Set.
+ * Please refer to POSIX standard for details.
+ */
+int sigfillset(sigset_t *set);
+
+/** Test for Signal in Signal Set.
+ * Please refer to POSIX standard for details.
+ */
+int sigismember(const sigset_t *set, int signo);
+
+/** @} */
+
+/* this is not a public api function */
+int _sigaction(int sig, const struct sigaction *act, struct sigaction *oact);
+
+/* have to move #include here to solve circular include problems between time.h and signal.h */
+#include
+
+/** Wait for the time interval specified in the timespec structure referenced
+ * by timeout. This implementation does not support queued signals.
+ * For struct siginfo_t, si_code and si_value are ignored in this implementation.
+ *
+ * Please refer to POSIX standard for details.
+ */
+int sigtimedwait(const sigset_t *restrict set, siginfo_t *restrict info,
+                 const struct timespec *restrict timeout);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SIGNAL_H_ */
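Because this port only delivers signals synchronously, a receiver simply blocks on a set with sigwait(). A minimal sketch (illustrative; the choice of SIGRTMIN is arbitrary):

#include "signal.h"

int wait_for_rt_signal(void)
{
    sigset_t set;
    int sig = 0;

    (void)sigemptyset(&set);
    (void)sigaddset(&set, SIGRTMIN);
    if (sigwait(&set, &sig) != 0)   /* blocks until SIGRTMIN arrives */
        return -1;
    return sig;                     /* the signal number received */
}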
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/sys/errno.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/sys/errno.h
new file mode 100755
index 0000000000000..b9edf57bab6c3
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/sys/errno.h
@@ -0,0 +1,20 @@
+#ifndef _SYS_ERRNO_H_
+#define _SYS_ERRNO_H_
+
+/*==========================================================================
+ * FILE:         errno.h
+ *
+ * SERVICES:     POSIX errno header file
+ *
+ * DESCRIPTION:  POSIX errno based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013, 2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+
+ *==========================================================================*/
+
+#include
+#ifndef EOK
+#define EOK 0
+#endif
+
+#endif /* _SYS_ERRNO_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/sys/sched.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/sys/sched.h
new file mode 100755
index 0000000000000..2acc34d821725
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/sys/sched.h
@@ -0,0 +1,67 @@
+#ifndef _POSIX_SCHED_H_
+#define _POSIX_SCHED_H_
+
+/*==========================================================================
+ * FILE:         sched.h
+ *
+ * SERVICES:     POSIX Thread sched API interface
+ *
+ * DESCRIPTION:  POSIX Thread sched API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013, 2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+
+ *==========================================================================*/
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define SCHED_FIFO     0 /* First in, first out (FIFO) scheduling policy. */
+#define SCHED_RR       1 /* Round robin scheduling policy. */
+#define SCHED_SPORADIC 2 /* Sporadic server scheduling policy. */
+#define SCHED_OTHER    3 /* Another scheduling policy. */
+
+typedef struct sched_param sched_param;
+struct sched_param
+{
+  void *unimplemented;
+  int sched_priority;
+};
+
+/** \details
+ * This provides the POSIX sched API.
+ */
+
+/** \defgroup sched POSIX sched API */
+/** \ingroup sched */
+/** @{ */
+
+/** Relinquish the CPU.
+ * Please refer to POSIX standard for details.
+ */
+static inline int sched_yield(void)
+{
+  return 0;
+}
+
+/** Get the maximum priority.
+ * Please refer to POSIX standard for details.
+ * @param policy [in] SCHED_FIFO is the only valid input for this implementation.
+ */
+int sched_get_priority_max(int policy);
+
+/** Get the minimum priority.
+ * Please refer to POSIX standard for details.
+ * @param policy [in] SCHED_FIFO is the only valid input for this implementation.
+ */
+int sched_get_priority_min(int policy);
+
+/** @} */
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _POSIX_SCHED_H_ */
+ */
+int sched_get_priority_min(int policy);
+
+/** @} */
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _POSIX_SCHED_H_ */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/sys/types.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/sys/types.h
new file mode 100755
index 0000000000000..700026f9f9e4e
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/sys/types.h
@@ -0,0 +1,35 @@
+#ifndef _SYS_TYPES_H_
+#define _SYS_TYPES_H_
+
+/*==========================================================================
+ * FILE: types.h
+ *
+ * SERVICES: types used in POSIX API interface
+ *
+ * DESCRIPTION: POSIX API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013, 2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+
+ *==========================================================================*/
+
+#if !defined( _PID_T ) || !defined( __pid_t_defined )
+/* POSIX defines pid_t as a signed 32-bit type. The Hexagon toolchain's header
+   defines it as an unsigned 32-bit type, citing a conflict with the QuRT POSIX
+   compatibility layer. If any such conflicts exist, we should fix them.
+   pid_t is being defined *BEFORE* inclusion of generic/sys/types.h
+   *INTENTIONALLY* to fix this */
+typedef int pid_t;
+#define _PID_T
+#define __pid_t_defined
+#endif
+#include
+#include
+#include
+#include
+
+#ifndef __DEFINED_off_t
+typedef long off_t;
+#define __DEFINED_off_t
+#endif
+
+#endif /* _SYS_TYPES_H_ */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/time.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/time.h
new file mode 100755
index 0000000000000..13aeb1ea9920d
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/time.h
@@ -0,0 +1,142 @@
+#ifndef _POSIX_TIME_H_
+#define _POSIX_TIME_H_
+
+/*==========================================================================
+ * FILE: time.h
+ *
+ * SERVICES: POSIX Timer API interface
+ *
+ * DESCRIPTION: POSIX Timer API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013,2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+ *==========================================================================*/
+
+
+#include
+
+typedef int clockid_t; /* ignored */
+#define _CLOCKID_T
+#define _PROVIDE_POSIX_TIME_DECLS 1
+#include
+/* @todo anandj sys/time.h has the definition for struct timeval but is not
+   included by generic/time.h */
+#include <sys/time.h>
+
+#define CLOCK_FREQ_NOT_DEFINED -1
+/* Frequency of the Sclk used */
+#define TIME_CONV_SCLK_FREQ 19200000
+
+#define RES_CONV_FACTOR1 1
+#define RES_CONV_FACTOR2 1000000000
+
+#if !defined(CLOCK_REALTIME)
+# define CLOCK_REALTIME 0
+#endif
+
+#if !defined(CLOCK_MONOTONIC)
+# define CLOCK_MONOTONIC 1
+#endif
+
+#if !defined(CLOCK_THREAD_CPUTIME_ID)
+# define CLOCK_THREAD_CPUTIME_ID 2
+#endif
+
+#if !defined(CLOCK_PROCESS_CPUTIME_ID)
+# define CLOCK_PROCESS_CPUTIME_ID 3
+#endif
+
+#if !defined(CLOCK_MONOTONIC_RAW)
+# define CLOCK_MONOTONIC_RAW 4
+#endif
+
+#if !defined(CLOCK_REALTIME_COARSE)
+# define CLOCK_REALTIME_COARSE 5
+#endif
+
+#if !defined(CLOCK_MONOTONIC_COARSE)
+# define CLOCK_MONOTONIC_COARSE 6
+#endif
+
+#if !defined(CLOCK_BOOTTIME)
+# define CLOCK_BOOTTIME 7
+#endif
+
+struct itimerspec
+{
+    struct timespec it_interval; /* Timer period. */
+    struct timespec it_value;    /* Timer expiration.
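+                                    When the timer is armed, it_value holds
+                                    the first expiration and it_interval the
+                                    reload period; a zero it_interval gives a
+                                    one-shot timer (see timer_settime() below).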
*/ +}; + +/* have to move #include here to solve circular include problems between time.h and signal.h */ +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* Timer functions */ + +/** \details + * POSIX timers can be either of two types: a one-shot type or a periodic + * type. + * + * A one-shot is an armed timer that is set to an expiration time relative + * to either a current time or an absolute time. The timer expires once and + * is disarmed. + * + * A periodic timer is armed with an initial expiration time and a repetition + * interval. Every time the interval timer + * expires, the timer is reloaded with the repetition interval. The timer + * is then rearmed. + */ + +/** \defgroup timer POSIX Timer API */ + +/** \ingroup timer */ +/** @{ */ + +/** Create a POSIX timer. + * Please refer to POSIX standard for details. + * @param clockid [in] ignored in this implementation + * @param evp [in] if non-NULL, points to a sigevent structure. This + * structure, allocated by the application, defines the asynchronous + * notification to occur when the timer expires. If the evp argument is + * NULL, the effect is as if the evp argument pointed to a sigevent + * structure with the sigev_notify member having the value SIGEV_SIGNAL, + * the sigev_signo having a default signal number (SIGALRM), and the + * sigev_value member having the value of the timer ID. + */ +int timer_create(clockid_t clockid, struct sigevent *restrict evp, + timer_t *restrict timerid); + +/** Delete a POSIX timer. + * Please refer to POSIX standard for details. + */ +int timer_delete(timer_t timerid); + +/** Get the time remaining on a POSIX timer. + * Please refer to POSIX standard for details. + */ +int timer_gettime(timer_t timerid, struct itimerspec *value); + + +/** Set the time remaining on a POSIX timer. + * Please refer to POSIX standard for details. + * @param flags [in] ignored in this implementation + */ +int timer_settime(timer_t timerid, int flags, + const struct itimerspec *restrict value, + struct itimerspec *restrict ovalue); +/** Obtain ID of a process CPU-time clock + * @param pid [in] Process ID + * @param clock_id [out] Clock ID + * @return Error values as per POSIX standard + */ +int clock_getcpuclockid (pid_t pid, clockid_t * clock_id); +/** @} */ + +#ifdef __cplusplus +} +#endif + +#endif /* _POSIX_TIME_H_ */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qube/qube.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qube/qube.h new file mode 100755 index 0000000000000..1e31e2deedb38 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qube/qube.h @@ -0,0 +1,51 @@ +#ifndef QUBE_H +#define QUBE_H +/*============================================================================= + + qube.h -- H E A D E R F I L E + +GENERAL DESCRIPTION + Prototypes of qpd API + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2013 by Qualcomm Technologies, Inc. All Rights Reserved. 
+
+=============================================================================*/
+
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include
+
+/* Define error codes as the QuRT error codes prefixed with QURT_ */
+#ifndef EOK
+#define EOK QURT_EOK
+#endif /* EOK */
+#ifndef EVAL
+#define EVAL QURT_EVAL
+#endif /* EVAL */
+#ifndef EMEM
+#define EMEM QURT_EMEM
+#endif /* EMEM */
+#ifndef EINVALID
+#define EINVALID QURT_EINVALID
+#endif /* EINVALID */
+
+
+/*=============================================================================
+                        FUNCTION DECLARATIONS
+=============================================================================*/
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QUBE_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/atomic_ops.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/atomic_ops.h
new file mode 100755
index 0000000000000..0a9a9f8ba7db5
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/atomic_ops.h
@@ -0,0 +1,197 @@
+#ifndef ATOMIC_OPS_H
+#define ATOMIC_OPS_H
+/**
+  @file atomic_ops.h
+
+  @brief Backward-compatible type definitions.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+=============================================================================*/
+
+
+/*
+ * Australian Public Licence B (OZPLB)
+ *
+ * Version 1-0
+ *
+ * Copyright (c) 2007, Open Kernel Labs, Inc.
+ *
+ * All rights reserved.
+ *
+ * Developed by: Embedded, Real-time and Operating Systems Program (ERTOS)
+ *               National ICT Australia
+ *               http://www.ertos.nicta.com.au
+ *
+ * Permission is granted by National ICT Australia, free of charge, to
+ * any person obtaining a copy of this software and any associated
+ * documentation files (the "Software") to deal with the Software without
+ * restriction, including (without limitation) the rights to use, copy,
+ * modify, adapt, merge, publish, distribute, communicate to the public,
+ * sublicense, and/or sell, lend or rent out copies of the Software, and
+ * to permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimers.
+ *
+ *     * Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimers in the documentation and/or other materials provided
+ *       with the distribution.
+ *
+ *     * Neither the name of National ICT Australia, nor the names of its
+ *       contributors, may be used to endorse or promote products derived
+ *       from this Software without specific prior written permission.
+ *
+ * EXCEPT AS EXPRESSLY STATED IN THIS LICENCE AND TO THE FULL EXTENT
+ * PERMITTED BY APPLICABLE LAW, THE SOFTWARE IS PROVIDED "AS-IS", AND
+ * NATIONAL ICT AUSTRALIA AND ITS CONTRIBUTORS MAKE NO REPRESENTATIONS,
+ * WARRANTIES OR CONDITIONS OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+ * BUT NOT LIMITED TO ANY REPRESENTATIONS, WARRANTIES OR CONDITIONS
+ * REGARDING THE CONTENTS OR ACCURACY OF THE SOFTWARE, OR OF TITLE,
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT,
+ * THE ABSENCE OF LATENT OR OTHER DEFECTS, OR THE PRESENCE OR ABSENCE OF
+ * ERRORS, WHETHER OR NOT DISCOVERABLE.
+ *
+ * TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL
+ * NATIONAL ICT AUSTRALIA OR ITS CONTRIBUTORS BE LIABLE ON ANY LEGAL
+ * THEORY (INCLUDING, WITHOUT LIMITATION, IN AN ACTION OF CONTRACT,
+ * NEGLIGENCE OR OTHERWISE) FOR ANY CLAIM, LOSS, DAMAGES OR OTHER
+ * LIABILITY, INCLUDING (WITHOUT LIMITATION) LOSS OF PRODUCTION OR
+ * OPERATION TIME, LOSS, DAMAGE OR CORRUPTION OF DATA OR RECORDS; OR LOSS
+ * OF ANTICIPATED SAVINGS, OPPORTUNITY, REVENUE, PROFIT OR GOODWILL, OR
+ * OTHER ECONOMIC LOSS; OR ANY SPECIAL, INCIDENTAL, INDIRECT,
+ * CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES, ARISING OUT OF OR IN
+ * CONNECTION WITH THIS LICENCE, THE SOFTWARE OR THE USE OF OR OTHER
+ * DEALINGS WITH THE SOFTWARE, EVEN IF NATIONAL ICT AUSTRALIA OR ITS
+ * CONTRIBUTORS HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH CLAIM, LOSS,
+ * DAMAGES OR OTHER LIABILITY.
+ *
+ * If applicable legislation implies representations, warranties, or
+ * conditions, or imposes obligations or liability on National ICT
+ * Australia or one of its contributors in respect of the Software that
+ * cannot be wholly or partly excluded, restricted or modified, the
+ * liability of National ICT Australia or the contributor is limited, to
+ * the full extent permitted by the applicable legislation, at its
+ * option, to:
+ * a.  in the case of goods, any one or more of the following:
+ * i.   the replacement of the goods or the supply of equivalent goods;
+ * ii.  the repair of the goods;
+ * iii. the payment of the cost of replacing the goods or of acquiring
+ *      equivalent goods;
+ * iv.  the payment of the cost of having the goods repaired; or
+ * b.  in the case of services:
+ * i.   the supplying of the services again; or
+ * ii.  the payment of the cost of having the services supplied again.
+ *
+ * The construction, validity and performance of this licence is governed
+ * by the laws in force in New South Wales, Australia.
+ */
+
+/*
+ * Author: Malcolm Purvis
+ * Author: Carlos Dyonisio
+ */
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef unsigned int atomic_plain_word_t;
+
+/*-------------------------------------------------------------------------*/
+                           /* Atomic Ops API. */
+
+/*
+ * IMPORTANT!
+ * If you plan to change the structure atomic_word_t, please add the new
+ * elements after value. For more information, read the comment in
+ * arch/arm/libs/atomic_ops/v5/src/arm_atomic_ops.spp:66
+ */
+
+typedef struct {
+    volatile atomic_plain_word_t value;
+} atomic_word_t;
+
+#define ATOMIC_INIT(i) { (i) }
+
+static inline void
+atomic_init(atomic_word_t *a, atomic_plain_word_t v)
+{
+    a->value = v;
+}
+
+#if defined(ARCH_ARM) && defined(ARCH_VER) && (ARCH_VER < 6) && \
+    (!defined(__ATOMIC_OPS_IN_KERNEL__) || defined(MACHINE_SMP))
+
+/*
+ * If it is ARMv4/v5, the function declarations may change
+ * and are defined in the arch specific header file,
+ * as some of them cannot be declared static because of
+ * the assembler implementation.
+ */
+
+#else
+
+/* Arithmetic operations. */
+
+void atomic_sub(atomic_word_t *target, atomic_plain_word_t v);
+
+/* Architecture independent definitions.
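+
+   For instance (an illustrative sketch, not part of the original
+   documentation), the plain-read helpers below are used like this:
+
+       atomic_word_t counter = ATOMIC_INIT(0);
+       atomic_plain_word_t now = atomic_read(&counter);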
*/ + +static inline atomic_plain_word_t atomic_read(atomic_word_t *target) +{ + return target->value; +} + +typedef unsigned long long atomic64_plain_word_t; + +typedef struct { + volatile atomic64_plain_word_t value; +} atomic64_word_t; + +static inline void +atomic64_init(atomic64_word_t *a, atomic64_plain_word_t v) +{ + a->value = v; +} + +/********************* + Support 64-bit + *********************/ + +atomic64_plain_word_t atomic64_set(atomic64_word_t* target, + atomic64_plain_word_t value); + +void atomic64_xor(atomic64_word_t* target, + atomic64_plain_word_t mask); + +/*---------------------------------------------------------------------------*/ + +/* Architecture independent definitions. */ + +static inline atomic64_plain_word_t atomic64_read(atomic64_word_t *target) +{ + return target->value; +} + +#endif + + +/* Architecture dependent definitions. */ +#include + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* ATOMIC_OPS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/atomic_ops_plat.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/atomic_ops_plat.h new file mode 100755 index 0000000000000..b54b3ff83d978 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/atomic_ops_plat.h @@ -0,0 +1,86 @@ +#ifndef ATOMIC_OPS_PLAT_H +#define ATOMIC_OPS_PLAT_H +/** + @file atomic_ops_plat.h + + @brief Prototypes of atomic operations API backwards compatible. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2013, 2023 Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. +=============================================================================*/ + + +#include + +#ifdef __cplusplus +extern "C" { +#endif +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +#define atomic_set(a,b) qurt_atomic_set((unsigned int *)(a),(unsigned int)(b)) +#define atomic_and(a,b) qurt_atomic_and((unsigned int *)(a),(unsigned int)(b)) +#define atomic_and_return(a,b) qurt_atomic_and_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_or(a,b) qurt_atomic_or((unsigned int *)(a),(unsigned int)(b)) +#define atomic_or_return(a,b) qurt_atomic_or_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_xor(a,b) qurt_atomic_xor((unsigned int *)(a),(unsigned int)(b)) +#define atomic_xor_return(a,b) qurt_atomic_xor_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_set_bit(a,b) qurt_atomic_set_bit((unsigned int *)(a),(unsigned int)(b)) +#define atomic_clear_bit(a,b) qurt_atomic_clear_bit((unsigned int *)(a),(unsigned int)(b)) +#define atomic_change_bit(a,b) qurt_atomic_change_bit((unsigned int *)(a),(unsigned int)(b)) +#define atomic_add(a,b) qurt_atomic_add((unsigned int *)(a),(unsigned int)(b)) +#define atomic_add_return(a,b) qurt_atomic_add_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_add_unless(a,b,c) qurt_atomic_add_unless((unsigned int *)(a),(unsigned int)(b),(unsigned int)(c)) +#define atomic_sub(a,b) qurt_atomic_sub((unsigned int *)(a),(unsigned int)(b)) +#define atomic_sub_return(a,b) qurt_atomic_sub_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_inc(a) qurt_atomic_inc((unsigned int *)(a)) +#define atomic_inc_return(a) qurt_atomic_inc_return((unsigned int *)(a)) +#define atomic_dec(a) qurt_atomic_dec((unsigned 
int *)(a)) +#define atomic_dec_return(a) qurt_atomic_dec_return((unsigned int *)(a)) +#define atomic_compare_and_set(a,b,c) qurt_atomic_compare_and_set((unsigned int *)(a),(unsigned int)(b),(unsigned int)(c)) +#define atomic_barrier qurt_atomic_barrier +#define atomic_barrier_write qurt_atomic_barrier_write +#define atomic_barrier_write_smp qurt_atomic_barrier_write_smp +#define atomic_barrier_read_smp qurt_atomic_barrier_read_smp +#define atomic_barrier_smp qurt_atomic_barrier_smp + +/*============================ + * 64 bits support + *============================ */ +#define atomic64_set(a,b) qurt_atomic64_set((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_and(a,b) qurt_atomic64_and((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_and_return(a,b) qurt_atomic64_and_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_or(a,b) qurt_atomic64_or((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_or_return(a,b) qurt_atomic64_or_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_xor(a,b) qurt_atomic64_xor((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_xor_return(a,b) qurt_atomic64_xor_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_set_bit(a,b) qurt_atomic64_set_bit((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_clear_bit(a,b) qurt_atomic64_clear_bit((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_change_bit(a,b) qurt_atomic64_change_bit((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_add(a,b) qurt_atomic64_add((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_add_return(a,b) qurt_atomic64_add_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_sub(a,b) qurt_atomic64_sub((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_sub_return(a,b) qurt_atomic64_sub_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_inc(a) qurt_atomic64_inc((unsigned long long *)(a)) +#define atomic64_inc_return(a) qurt_atomic64_inc_return((unsigned long long *)(a)) +#define atomic64_dec(a) qurt_atomic64_dec((unsigned long long *)(a)) +#define atomic64_dec_return(a) qurt_atomic64_dec_return((unsigned long long *)(a)) +#define atomic64_compare_and_set(a,b,c) qurt_atomic64_compare_and_set((unsigned long long *)(a),(unsigned long long )(b),(unsigned long long )(c)) +#define atomic64_barrier qurt_atomic64_barrier +#define atomic64_barrier_write qurt_atomic64_barrier_write +#define atomic64_barrier_write_smp qurt_atomic64_barrier_write_smp +#define atomic64_barrier_read_smp qurt_atomic64_barrier_read_smp +#define atomic64_barrier_smp qurt_atomic64_barrier_smp + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* ATOMIC_OPS_PLAT_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt.h new file mode 100755 index 0000000000000..4d25c9b2b6243 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt.h @@ -0,0 +1,111 @@ +#ifndef QURT_H +#define QURT_H + +/** + @file qurt.h + @brief Contains kernel header files that provide kernel OS API functions, constants, and + definitions + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013,2021,2023 by Qualcomm Technologies, Inc. All Rights Reserved. 
+ Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ +/*====================================================================== + * + * EDIT HISTORY FOR FILE + * + * This section contains comments describing changes made to the + * module. Notice that changes are listed in reverse chronological + * order. + * + * + * + * + * when who what, where, why + * ---------- --- ------------------------------------------------ + * 2011-02-25 op Add Header file + 2012-12-16 cm (Tech Pubs) Edited/added Doxygen comments and markup. + ======================================================================*/ + + +#ifdef __cplusplus +extern "C" { +#endif + +#include "qurt_consts.h" +#include "qurt_api_version.h" +#include "qurt_alloc.h" +#include "qurt_futex.h" +#include "qurt_mutex.h" +#include "qurt_pipe.h" +#include "qurt_printf.h" +#include "qurt_assert.h" +#include "qurt_thread.h" +#include "qurt_trace.h" +#include "qurt_cycles.h" +#include "qurt_profile.h" +#include "qurt_sem.h" +#include "qurt_cond.h" +#include "qurt_barrier.h" +#include "qurt_fastint.h" +#include "qurt_allsignal.h" +#include "qurt_anysignal.h" +#include "qurt_signal.h" +#include "qurt_rmutex.h" +#include "qurt_pimutex.h" +#include "qurt_signal2.h" +#include "qurt_rmutex2.h" +#include "qurt_pimutex2.h" +#include "qurt_int.h" +#include "qurt_lifo.h" +#include "qurt_power.h" +#include "qurt_event.h" +#include "qurt_pmu.h" +#include "qurt_stid.h" +//#include "qurt_version.h" +#include "qurt_tlb.h" +#include "qurt_vtlb.h" +#include "qurt_memory.h" +#include "qurt_qdi.h" +#include "qurt_sclk.h" +#include "qurt_space.h" +#include "qurt_process.h" +#include "qurt_timer.h" +#include "qurt_tls.h" +#include "qurt_thread_context.h" +#include "qurt_hvx.h" +#include "qurt_hmx.h" +#include "qurt_mailbox.h" +#include "qurt_island.h" +#include "qurt_qdi_proxy.h" +#include "qurt_l2cfg.h" +#include "qurt_mmap.h" +#include "qurt_isr.h" +#include "qurt_busywait.h" +#include "qurt_ecc.h" +#include "qurt_callback.h" +#include "qurt_error.h" +#include "qurt_except.h" +#include "qurt_mq.h" +#include "qurt_user_dma.h" +#include "qurt_fs_hub.h" +#include "qurt_os_services.h" + +#ifndef MAIN_ONLY +#define INCLUDE_ISLAND_CONTENTS +#endif +#ifndef ISLAND_ONLY +#define INCLUDE_MAIN_CONTENTS +#endif + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_alloc.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_alloc.h new file mode 100755 index 0000000000000..da37a4c0a714e --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_alloc.h @@ -0,0 +1,145 @@ +#ifndef QURT_ALLOC_H +#define QURT_ALLOC_H + +/** + @file qurt_alloc.h + @brief Prototypes of kernel memory allocation API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021, 2023 Qualcomm Technologies, Inc. + All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +/*======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup func_qurt_malloc + Dynamically allocates the specified array on the QuRT system heap. + The return value is the address of the allocated memory area. 
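+
+   A minimal usage sketch (the size is an illustrative assumption):
+   @code
+   unsigned int *buf = (unsigned int *)qurt_malloc(64U * sizeof(unsigned int));
+   if (buf != 0) {
+       buf[0] = 1U;        /* area arrives zero-initialized; see note below */
+       qurt_free(buf);     /* return the area to the QuRT system heap */
+   }
+   @endcode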
+ + @note1hang The allocated memory area is automatically initialized to zero. + + @param[in] size Size (in bytes) of the memory area. + + @return + Nonzero -- Pointer to the allocated memory area. \n + 0 -- Not enough memory in heap to allocate memory area. + + @dependencies + None. + + */ +/* ======================================================================*/ +void *qurt_malloc( unsigned int size); + +/*======================================================================*/ +/**@ingroup func_qurt_calloc + Dynamically allocates the specified array on the QuRT system heap. + The return value is the address of the allocated array. + + @note1hang The allocated memory area is automatically initialized to zero. + + @param[in] elsize Size (in bytes) of each array element. + @param[in] num Number of array elements. + + @return + Nonzero -- Pointer to allocated array.\n + Zero -- Not enough memory in heap to allocate array. + + @dependencies + None. + + */ + /* ======================================================================*/ +void *qurt_calloc(unsigned int elsize, unsigned int num); + +/*======================================================================*/ +/**@ingroup func_qurt_realloc + Reallocates memory on the heap. \n + Changes the size of a memory area that is already allocated on the QuRT system heap. + The reallocate memory operation is functionally similar to realloc. It accepts a pointer + to an existing memory area on the heap, and resizes the memory area to the specified size + while preserving the original contents of the memory area. + + @note1hang This function might change the address of the memory area. + If the value of ptr is NULL, this function is equivalent to + qurt_malloc(). + If the value of new_size is 0, it is equivalent to qurt_free(). + If the memory area is expanded, the added memory is not initialized. + + @param[in] *ptr Pointer to the address of the memory area. + @param[in] newsize Size (in bytes) of the reallocated memory area. + + @return + Nonzero -- Pointer to reallocated memory area. \n + 0 -- Not enough memory in heap to reallocate the memory area. + + @dependencies + None. + + */ + /* ======================================================================*/ +void *qurt_realloc(void *ptr, int newsize); + +/*======================================================================*/ +/**@ingroup func_qurt_free + Frees allocated memory from the heap.\n + Deallocates the specified memory from the QuRT system heap. + + @param[in] *ptr Pointer to the address of the memory to deallocate. + + @return + None. + + @dependencies + The memory item that the ptr value specifies must have been previously + allocated using one of the qurt_calloc(), + qurt_malloc(), or qurt_realloc() memory allocation functions. + Otherwise the behavior of QuRT is undefined. + + */ + /* ======================================================================*/ +void qurt_free( void *ptr); + + +void *qurt_memalign(unsigned int alignment, unsigned int size); + +/* +|| Macro to define a static heap for a QuRT program. +|| +|| Usage: +|| Declare at the top-level of any C source file that +|| is part of the build (and is guaranteed +|| to actually be pulled into the build). Place +|| it in the same function with main(): +|| +|| QURT_DECLARE_STATIC_HEAP(512000); +|| +|| The only argument is the size in bytes, and it is +|| rounded up to the nearest 64 bytes (size of an +|| L2 cache block). 
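+||
+|| As a sketch of the mechanism (inferred from the definition below, not
+|| separately documented here): the macro reserves a 64-byte-aligned
+|| static array and publishes its bounds through the override_heap_Base
+|| and override_heap_Limit symbols, which the QuRT runtime is presumably
+|| set up to consume in place of its default heap.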
+|| +*/ + +#define QURT_DECLARE_STATIC_HEAP(sz) \ + static struct qurt_static_heap { \ + char space[(sz)] __attribute__((aligned(64))); \ + } static_heap[1]; \ + void * const override_heap_Base = &static_heap[0]; \ + void * const override_heap_Limit = &static_heap[1] + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ALLOC_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_allsignal.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_allsignal.h new file mode 100755 index 0000000000000..5dc89e495130d --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_allsignal.h @@ -0,0 +1,176 @@ + +#ifndef QURT_ALLSIGNAL_H +#define QURT_ALLSIGNAL_H + +/** + @file qurt_allsignal.h + @brief Prototypes of kernel signal API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup all_signal_types +@{ */ +/*===================================================================== + Typedefs + ======================================================================*/ + +/** +qurt_signal_t supersedes qurt_allsignal_t. This type definition was added for backwards compatibility. */ +typedef union { + /** @cond */ + unsigned long long int raw; + struct { + unsigned int waiting; /**< */ + unsigned int signals_in; /**< */ + unsigned int queue; /**< */ + unsigned int reserved; /**< */ + }X; + /** @endcond */ +} qurt_allsignal_t; +/** @} */ /* end_addtogroup all_signal_types */ + +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_init + Initializes an all-signal object.\n + The all-signal object is initially cleared. + + @datatypes + #qurt_allsignal_t + + @param[out] signal Pointer to the all-signal object to initialize. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_allsignal_init(qurt_allsignal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_destroy + Destroys the specified all-signal object.\n + @note1hang All-signal objects must be destroyed when they are no longer in use. + Failure to do this causes resource leaks in the QuRT kernel. \n + @note1cont All-signal objects must not be destroyed while they are still in use. + If this occurs, the behavior of QuRT is undefined. + + @datatypes + #qurt_allsignal_t + + @param[in] signal Pointer to the all-signal object to destroy. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_allsignal_destroy(qurt_allsignal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_get + Gets signal values from the all-signal object. + + Returns the current signal values of the specified all-signal object. + + @datatypes + #qurt_allsignal_t + + @param[in] signal Pointer to the all-signal object to access. 
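+
+ For context, a hedged sketch of the surrounding all-signal pattern (the
+ mask value 0x3 is an illustrative assumption):
+ @code
+ qurt_allsignal_t done;
+ qurt_allsignal_init(&done);
+ unsigned int cur = qurt_allsignal_get(&done);  // snapshot; 0 right after init
+ qurt_allsignal_wait(&done, 0x3U);  // blocks until another thread sets bits 0 and 1
+ @endcode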
+ + @return + Bitmask with current signal values. + + @dependencies + None. +*/ +/* ======================================================================*/ +static inline unsigned int qurt_allsignal_get(qurt_allsignal_t *signal) +{ return signal->X.signals_in; } + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_wait + Waits on the all-signal object.\n + Suspends the current thread until all of the specified signals are set. + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be waited on, and 0 that it is not to be waited on. + + If a signal is set in an all-signal object, and a thread is waiting on the all-signal object for + that signal, the thread is awakened. If the awakened thread has higher priority than + the current thread, a context switch can occur. + + Unlike any-signals, all-signals do not need to explicitly clear any set signals in an all-signal + object before waiting on them again -- clearing is done automatically by the wait + operation. + + @note1hang At most, one thread can wait on an all-signal object at any given time. + Because signal clearing is done by the wait operation, no clear operation is + defined for all-signals. + + @datatypes + #qurt_allsignal_t + + @param[in] signal Pointer to the all-signal object to wait on. + @param[in] mask Signal mask value, which identifies the individual signals in the all-signal object + to wait on. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_allsignal_wait(qurt_allsignal_t *signal, unsigned int mask); + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_set + Set signals in the specified all-signal object. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit + value of 1 indicates that a signal must be set, and 0 indicates not to set the signal. + + @datatypes + #qurt_allsignal_t + + @param[in] signal Pointer to the all-signal object to modify. + @param[in] mask Signal mask value identifying the individual signals to + set in the all-signal object. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_allsignal_set(qurt_allsignal_t *signal, unsigned int mask); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ALLSIGNAL_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_anysignal.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_anysignal.h new file mode 100755 index 0000000000000..9619e2de562b4 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_anysignal.h @@ -0,0 +1,225 @@ +#ifndef QURT_ANYSIGNAL_H +#define QURT_ANYSIGNAL_H +/** + @file qurt_anysignal.h + Prototypes of kernel signal API functions. + + EXTERNALIZED FUNCTIONS + None + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None + +Copyright (c) 2021 Qualcomm Technologies, Inc. +All rights reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. 
+ ======================================================================*/ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*===================================================================== +Typedefs +======================================================================*/ + +/**@ingroup anysignals_types + qurt_signal_t supersedes qurt_anysignal_t. This type definition was added for backwards compatibility. */ +typedef qurt_signal_t qurt_anysignal_t; + +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_anysignal_init + Initializes an any-signal object.\n + The any-signal object is initially cleared. + + @datatypes + #qurt_anysignal_t + + @param[out] signal Pointer to the initialized any-signal object. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +static inline void qurt_anysignal_init(qurt_anysignal_t *signal) +{ + qurt_signal_init(signal); +} + +/*======================================================================*/ +/**@ingroup func_qurt_anysignal_destroy + Destroys the specified any-signal object. + + @note1hang Any-signal objects must be destroyed when they are no longer in use. Failure + to do this causes resource leaks in the QuRT kernel.\n + @note1cont Any-signal objects must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + + @datatypes + #qurt_anysignal_t + + @param[in] signal Pointer to the any-signal object to destroy. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +static inline void qurt_anysignal_destroy(qurt_anysignal_t *signal) +{ + qurt_signal_destroy(signal); +} + +/*======================================================================*/ +/**@ingroup func_qurt_anysignal_wait + Wait on the any-signal object. \n + Suspends the current thread until any one of the specified signals is set. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be waited on, and 0 indicates not to wait on the signal. + If a signal is set in an any-signal object, and a thread is waiting on the any-signal object for + that signal, the thread is awakened. If the awakened thread has higher priority than + the current thread, a context switch can occur. + + @note1hang At most, one thread can wait on an any-signal object at any given time. + + @datatypes + #qurt_anysignal_t + + @param[in] signal Pointer to the any-signal object to wait on. + @param[in] mask Signal mask value, which specifies the individual signals in the any-signal + object to wait on. + + @return + Bitmask of current signal values. + + @dependencies + None. + */ +/* ======================================================================*/ +static inline unsigned int qurt_anysignal_wait(qurt_anysignal_t *signal, unsigned int mask) +{ + return qurt_signal_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ANY); +} + +/*======================================================================*/ +/**@ingroup func_qurt_anysignal_set + Sets signals in the specified any-signal object. \n + Signals are represented as bits 0 through 31 in the 32-bit mask value. 
A mask bit value of 1
+ indicates that a signal must be set, and 0 indicates not to set the signal.
+
+ @datatypes
+ #qurt_anysignal_t
+
+ @param[in] signal Pointer to the any-signal object to modify.
+ @param[in] mask Signal mask value identifying the individual signals to
+ set in the any-signal object.
+
+ @return
+ Bitmask of old signal values (before set).
+
+ @dependencies
+ None.
+ */
+/* ======================================================================*/
+unsigned int qurt_anysignal_set(qurt_anysignal_t *signal, unsigned int mask);
+
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_anysignal_get
+ Gets signal values from the any-signal object.\n
+ Returns the current signal values of the specified any-signal object.
+
+ @datatypes
+ #qurt_anysignal_t
+
+ @param[in] signal Pointer to the any-signal object to access.
+
+ @return
+ A bitmask with the current signal values of the specified any-signal object.
+
+ @dependencies
+ None.
+ */
+/* ======================================================================*/
+static inline unsigned int qurt_anysignal_get(qurt_anysignal_t *signal)
+{
+    return qurt_signal_get(signal);
+}
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_anysignal_clear
+ @xreflabel{sec:anysignal_clear}
+ Clears signals in the specified any-signal object.\n
+ Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1
+ indicates that a signal must be cleared, and 0 indicates not to clear the signal.
+
+ @datatypes
+ #qurt_anysignal_t
+
+ @param[in] signal Pointer to the any-signal object, which specifies the any-signal object to modify.
+ @param[in] mask Signal mask value identifying the individual signals to
+ clear in the any-signal object.
+
+ @return
+ Bitmask -- Old signal values (before clear).
+
+ @dependencies
+ None.
+ */
+/* ======================================================================*/
+unsigned int qurt_anysignal_clear(qurt_anysignal_t *signal, unsigned int mask);
+
+/*======================================================================*/
+/**@ingroup func_qurt_anysignal_wait_timed
+ Waits on the any-signal object. \n
+ Suspends the current thread until any of the specified signals is set or the timeout expires.
+
+ Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1
+ indicates that a signal must be waited on, and 0 indicates not to wait on the signal.
+ If a signal is set in an any-signal object, and a thread was waiting on the any-signal object for
+ that signal, the thread is awakened. If the awakened thread has higher priority than
+ the current thread, a context switch can occur.
+
+ @note1hang At most, one thread can wait on an any-signal object at any given time.
+
+ @datatypes
+ #qurt_anysignal_t
+
+ @param[in] signal Pointer to the any-signal object to wait on.
+ @param[in] mask Signal mask value, which specifies the individual signals in the any-signal
+ object to wait on.
+ @param[out] signals Bitmask of current signal values.
+ @param[in] duration Interval (in microseconds); the duration must be between #QURT_TIMER_MIN_DURATION and
+ #QURT_TIMER_MAX_DURATION.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_ETIMEDOUT -- Timeout \n
+ #QURT_EINVALID -- Duration out of range
+
+ @dependencies
+ None.
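+
+ A hedged usage sketch (the mask 0x5 and the one-millisecond duration are
+ illustrative assumptions):
+ @code
+ qurt_anysignal_t sig;
+ unsigned int got = 0U;
+ qurt_anysignal_init(&sig);
+ if (qurt_anysignal_wait_timed(&sig, 0x5U, &got, 1000ULL) == QURT_EOK) {
+     // at least one of bits 0 or 2 was set; 'got' holds the snapshot
+ }
+ @endcode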
+ */ +/* ======================================================================*/ + +int qurt_anysignal_wait_timed(qurt_anysignal_t *signal, unsigned int mask, unsigned int *signals, unsigned long long int duration); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ANYSIGNAL_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_api_version.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_api_version.h new file mode 100755 index 0000000000000..dfe53ae755054 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_api_version.h @@ -0,0 +1,77 @@ +#ifndef QURT_API_VERSION_H +#define QURT_API_VERSION_H +/*============================================================================== + +qurt_api_version.h + +GENERAL DESCRIPTION + API version file + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +==============================================================================*/ + +/*============================================================================== + CONSTANTS AND DEFINITIONS +==============================================================================*/ +/** + * Each field of the QURT_API_VERSION definitions is an 8-bit unsigned integer. + * Main release has first 3 fields updated - Major, Minor and Release. + * - QURT_API_VERSION = Major, Minor, Release. + * Patch releases are supported by adding the extra field. + * - QURT_API_VERSION = Major, Minor, Release, Patch. + */ +// Major version is incremented for incompatible API changes. +#define QURT_API_VER_MAJOR 1 + +// Minor version is incremented for backward-compatible enhancements in the API +// set. +#define QURT_API_VER_MINOR 4 + +// RELEASE version is incremented for each release within a `MAJOR.MINOR` +// release. +#define QURT_API_VER_RELEASE 1 + +// Patch version is incremented when new API content is introduced on older LTS +// release. +#define QURT_API_VER_PATCH 0 + +/* Update the QURT_API_VERSION function macro. */ +#define QURT_API_VERSION_ENCODE(major, minor, release, patch) \ + ((((major) & 0xFF) << 24) | (((minor) & 0xFF) << 16) | \ + (((release) & 0xFF) << 8) | ((patch) & 0xFF)) + +/* Update the QURT_API_VERSION Macro. */ +#define QURT_API_VERSION \ + QURT_API_VERSION_ENCODE(QURT_API_VER_MAJOR, QURT_API_VER_MINOR, \ + QURT_API_VER_RELEASE, QURT_API_VER_PATCH) + +/** Usage: + * + * #if QURT_API_VERSION >= QURT_API_VERSION_ENCODE(1,4,0,0) + * qurt_func_2(a,b,c); + * #else + * qurt_func(a); + * #endif + * + */ +/* + Gets the QuRT API version. + + @return + QuRT API version. + + @dependencies + None. + */ +unsigned int qurt_api_version(void); + +#endif /* QURT_API_VERSION_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_assert.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_assert.h new file mode 100755 index 0000000000000..13cc2afd2e973 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_assert.h @@ -0,0 +1,51 @@ +#ifndef QURT_ASSERT_H +#define QURT_ASSERT_H +/** + @file qurt_assert.h + @brief Prototypes of qurt_assert API + + EXTERNAL FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. 
+ Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/**@ingroup func_qurt_assert_error + Writes diagnostic information to the debug buffer, and raises an error to the QuRT kernel. + + @datatypes + None. + + @param[in] filename Pointer to the file name string. + @param[in] lineno Line number. + + @return + None. + + @dependencies + None. + */ +void qurt_assert_error(const char *filename, int lineno) __attribute__((noreturn)); + +#define qurt_assert(cond) ((cond)?(void)0:qurt_assert_error(__QURTFILENAME__,__LINE__)) + +/** @} */ /* end_ingroup func_qurt_assert */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ASSERT_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_atomic_ops.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_atomic_ops.h new file mode 100755 index 0000000000000..d9b2cff7d737c --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_atomic_ops.h @@ -0,0 +1,1298 @@ +#ifndef QURT_ATOMIC_OPS_H +#define QURT_ATOMIC_OPS_H +/** + @file qurt_atomic_ops.h + @brief Prototypes of kernel atomic operations API. + + EXTERNAL FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021, 2022 by Qualcomm Technologies, Inc. All Rights Reserved +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +/* + * Australian Public Licence B (OZPLB) + * + * Version 1-0 + * + * Copyright (c) 2007, Open Kernel Labs, Inc. + * + * All rights reserved. + * + * Developed by: Embedded, Real-time and Operating Systems Program (ERTOS) + * National ICT Australia + * http://www.ertos.nicta.com.au + * + * Permission is granted by National ICT Australia, free of charge, to + * any person obtaining a copy of this software and any associated + * documentation files (the "Software") to deal with the Software without + * restriction, including (without limitation) the rights to use, copy, + * modify, adapt, merge, publish, distribute, communicate to the public, + * sublicense, and/or sell, lend or rent out copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject + * to the following conditions: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimers. + * + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimers in the documentation and/or other materials provided + * with the distribution. + * + * * Neither the name of National ICT Australia, nor the names of its + * contributors, may be used to endorse or promote products derived + * from this Software without specific prior written permission. 
+ * + * EXCEPT AS EXPRESSLY STATED IN THIS LICENCE AND TO THE FULL EXTENT + * PERMITTED BY APPLICABLE LAW, THE SOFTWARE IS PROVIDED "AS-IS", AND + * NATIONAL ICT AUSTRALIA AND ITS CONTRIBUTORS MAKE NO REPRESENTATIONS, + * WARRANTIES OR CONDITIONS OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO ANY REPRESENTATIONS, WARRANTIES OR CONDITIONS + * REGARDING THE CONTENTS OR ACCURACY OF THE SOFTWARE, OR OF TITLE, + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, + * THE ABSENCE OF LATENT OR OTHER DEFECTS, OR THE PRESENCE OR ABSENCE OF + * ERRORS, WHETHER OR NOT DISCOVERABLE. + * + * TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL + * NATIONAL ICT AUSTRALIA OR ITS CONTRIBUTORS BE LIABLE ON ANY LEGAL + * THEORY (INCLUDING, WITHOUT LIMITATION, IN AN ACTION OF CONTRACT, + * NEGLIGENCE OR OTHERWISE) FOR ANY CLAIM, LOSS, DAMAGES OR OTHER + * LIABILITY, INCLUDING (WITHOUT LIMITATION) LOSS OF PRODUCTION OR + * OPERATION TIME, LOSS, DAMAGE OR CORRUPTION OF DATA OR RECORDS; OR LOSS + * OF ANTICIPATED SAVINGS, OPPORTUNITY, REVENUE, PROFIT OR GOODWILL, OR + * OTHER ECONOMIC LOSS; OR ANY SPECIAL, INCIDENTAL, INDIRECT, + * CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES, ARISING OUT OF OR IN + * CONNECTION WITH THIS LICENCE, THE SOFTWARE OR THE USE OF OR OTHER + * DEALINGS WITH THE SOFTWARE, EVEN IF NATIONAL ICT AUSTRALIA OR ITS + * CONTRIBUTORS HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH CLAIM, LOSS, + * DAMAGES OR OTHER LIABILITY. + * + * If applicable legislation implies representations, warranties, or + * conditions, or imposes obligations or liability on National ICT + * Australia or one of its contributors in respect of the Software that + * cannot be wholly or partly excluded, restricted or modified, the + * liability of National ICT Australia or the contributor is limited, to + * the full extent permitted by the applicable legislation, at its + * option, to: + * a. in the case of goods, any one or more of the following: + * i. the replacement of the goods or the supply of equivalent goods; + * ii. the repair of the goods; + * iii. the payment of the cost of replacing the goods or of acquiring + * equivalent goods; + * iv. the payment of the cost of having the goods repaired; or + * b. in the case of services: + * i. the supplying of the services again; or + * ii. the payment of the cost of having the services supplied again. + * + * The construction, validity and performance of this licence is governed + * by the laws in force in New South Wales, Australia. + */ + +/* + * Author: Malcolm Purvis + * + * This file is only included by the main atomic_ops.h, so all of that + * file's definitions are available. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ + +///* Sanity check to ensure the smp flag is set in machines.py */ +//#if defined(__ATOMIC_OPS_IN_KERNEL__) && !defined(MACHINE_SMP) && CONFIG_NUM_UNITS > 1 +//#error CONFIG_NUM_UNITS > 1 but smp not defined in machines.py. +//#endif +#define QURT_INLINE __attribute__((always_inline)) + +/*============================================================================= + FUNCTIONS +=============================================================================*/ +/**@ingroup func_qurt_atomic_set + Sets the atomic variable with the specified value. 
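+
+ A hedged usage sketch (variable and value are illustrative):
+ @code
+ unsigned int word = 0U;
+ (void)qurt_atomic_set(&word, 0x80000000U);   // returns the value written
+ @endcode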
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] value Value to set.
+
+ @return
+ Value successfully set.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_set(unsigned int* target, unsigned int value)
+{
+   unsigned long tmp;
+
+   __asm__ __volatile__(
+       "1:     %0 = memw_locked(%2)\n"
+       "       memw_locked(%2, p0) = %3\n"
+       "       if !p0 jump 1b\n"
+       : "=&r" (tmp),"+m" (*target)
+       : "r" (target), "r" (value)
+       : "p0");
+   return value;
+}
+
+/**@ingroup func_qurt_atomic_and
+ Bitwise AND operation of the atomic variable with mask.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] mask Mask for bitwise AND.
+
+ @return
+ None
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE void
+qurt_atomic_and(unsigned int* target, unsigned int mask)
+{
+   unsigned int result;
+
+   __asm__ __volatile__(
+       "1:     %0 = memw_locked(%2)\n"
+       "       %0 = and(%0, %3)\n"
+       "       memw_locked(%2, p0) = %0\n"
+       "       if !p0 jump 1b\n"
+       : "=&r" (result),"+m" (*target)
+       : "r" (target),"r" (mask)
+       : "p0");
+}
+
+/**@ingroup func_qurt_atomic_and_return
+ Bitwise AND operation of the atomic variable with mask.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] mask Mask for bitwise AND.
+
+ @return
+ AND result of the atomic variable with mask.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_and_return(unsigned int* target, unsigned int mask)
+{
+   unsigned int result;
+
+   __asm__ __volatile__(
+       "1:     %0 = memw_locked(%2)\n"
+       "       %0 = and(%0, %3)\n"
+       "       memw_locked(%2, p0) = %0\n"
+       "       if !p0 jump 1b\n"
+       : "=&r" (result),"+m" (*target)
+       : "r" (target), "r" (mask)
+       : "p0");
+
+   return result;
+}
+
+/**@ingroup func_qurt_atomic_or
+ Bitwise OR operation of the atomic variable with mask.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] mask Mask for bitwise OR.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE void
+qurt_atomic_or(unsigned int* target, unsigned int mask)
+{
+   unsigned int result;
+
+   __asm__ __volatile__(
+       "1:     %0 = memw_locked(%2)\n"
+       "       %0 = or(%0, %3)\n"
+       "       memw_locked(%2, p0) = %0\n"
+       "       if !p0 jump 1b\n"
+       : "=&r" (result),"+m" (*target)
+       : "r" (target), "r" (mask)
+       : "p0");
+}
+
+/**@ingroup func_qurt_atomic_or_return
+ Bitwise OR operation of the atomic variable with mask.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] mask Mask for bitwise OR.
+
+ @return
+ Returns the OR result of the atomic variable with mask.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_or_return(unsigned int* target, unsigned int mask)
+{
+   unsigned int result;
+
+   __asm__ __volatile__(
+       "1:     %0 = memw_locked(%2)\n"
+       "       %0 = or(%0, %3)\n"
+       "       memw_locked(%2, p0) = %0\n"
+       "       if !p0 jump 1b\n"
+       : "=&r" (result),"+m" (*target)
+       : "r" (target), "r" (mask)
+       : "p0");
+
+   return result;
+}
+
+/**@ingroup func_qurt_atomic_xor
+ Bitwise XOR operation of the atomic variable with mask.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
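+
+ For example (an illustrative sketch), XOR can flip a group of flag bits
+ in place:
+ @code
+ unsigned int mode = 0x5U;
+ qurt_atomic_xor(&mode, 0x3U);   // mode is now 0x6
+ @endcode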
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] mask Mask for bitwise XOR.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE void
+qurt_atomic_xor(unsigned int* target, unsigned int mask)
+{
+   unsigned int result;
+
+   __asm__ __volatile__(
+       "1:     %0 = memw_locked(%2)\n"
+       "       %0 = xor(%0, %3)\n"
+       "       memw_locked(%2, p0) = %0\n"
+       "       if !p0 jump 1b\n"
+       : "=&r" (result),"+m" (*target)
+       : "r" (target), "r" (mask)
+       : "p0");
+}
+
+/**@ingroup func_qurt_atomic_xor_return
+ Bitwise XOR operation of the atomic variable with mask.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] mask Mask for bitwise XOR.
+
+ @return
+ XOR result of the atomic variable with mask.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_xor_return(unsigned int* target, unsigned int mask)
+{
+   unsigned int result;
+
+   __asm__ __volatile__(
+       "1:     %0 = memw_locked(%2)\n"
+       "       %0 = xor(%0, %3)\n"
+       "       memw_locked(%2, p0) = %0\n"
+       "       if !p0 jump 1b\n"
+       : "=&r" (result),"+m" (*target)
+       : "r" (target), "r" (mask)
+       : "p0");
+
+   return result;
+}
+
+/**@ingroup func_qurt_atomic_set_bit
+ Sets a bit in the atomic variable at a specified position.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] bit Bit position to set.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE void
+qurt_atomic_set_bit(unsigned int *target, unsigned int bit)
+{
+   unsigned int result;
+   unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U);
+   unsigned int sbit = bit % ((unsigned int)sizeof(unsigned int) * 8U);
+   unsigned int *wtarget= (unsigned int *)&target[aword];
+
+   __asm__ __volatile__(
+       "1:     %0 = memw_locked(%2)\n"
+       "       %0 = setbit(%0, %3)\n"
+       "       memw_locked(%2, p0) = %0\n"
+       "       if !p0 jump 1b\n"
+       : "=&r" (result),"+m" (*wtarget)
+       : "r" (wtarget), "r" (sbit)
+       : "p0");
+}
+
+/**@ingroup func_qurt_atomic_clear_bit
+ Clears a bit in the atomic variable at a specified position.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] bit Bit position to clear.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE void
+qurt_atomic_clear_bit(unsigned int *target, unsigned int bit)
+{
+   unsigned int result;
+   unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U);
+   unsigned int sbit = bit % ((unsigned int)sizeof(unsigned int) * 8U);
+   unsigned int *wtarget= (unsigned int *)&target[aword];
+
+   __asm__ __volatile__(
+       "1:     %0 = memw_locked(%2)\n"
+       "       %0 = clrbit(%0, %3)\n"
+       "       memw_locked(%2, p0) = %0\n"
+       "       if !p0 jump 1b\n"
+       : "=&r" (result),"+m" (*wtarget)
+       : "r" (wtarget), "r" (sbit)
+       : "p0");
+}
+
+/**@ingroup func_qurt_atomic_change_bit
+ Toggles a bit in an atomic variable at a specified position.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] bit Bit position to toggle.
+
+ @return
+ None.
+
+ @dependencies
+ None.
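+
+ A short sketch combining the bit helpers above (bit index 3 is an
+ illustrative assumption):
+ @code
+ unsigned int ready_mask = 0U;
+ qurt_atomic_set_bit(&ready_mask, 3U);      // mark unit 3 ready
+ qurt_atomic_change_bit(&ready_mask, 3U);   // toggle it back off
+ qurt_atomic_clear_bit(&ready_mask, 3U);    // clearing again is harmless
+ @endcode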
+*/
+static inline QURT_INLINE void
+qurt_atomic_change_bit(unsigned int *target, unsigned int bit)
+{
+    unsigned int result;
+    unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U);
+    unsigned int sbit = bit & 0x1fU;
+    unsigned int *wtarget = (unsigned int *)&target[aword];
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = togglebit(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*wtarget)
+        : "r" (wtarget),"r" (sbit)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic_add
+   Adds an integer to an atomic variable.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] v Integer value to add.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic_add(unsigned int *target, unsigned int v)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = add(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (v)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic_add_return
+   Adds an integer to an atomic variable.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] v Integer value to add.
+
+   @return
+   Result of arithmetic sum.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_add_return(unsigned int *target, unsigned int v)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = add(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (v)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic_add_unless
+   Adds the delta value to an atomic variable unless the current value in the target
+   matches the unless variable.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] delta Value to add to the current value.
+   @param[in] unless Perform the addition only when the current value is not
+                     equal to this unless value.
+   @return
+   TRUE -- 1 - Addition was performed. \n
+   FALSE -- 0 - Addition was not performed.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_add_unless(unsigned int* target,
+                       unsigned int delta,
+                       unsigned int unless)
+{
+    unsigned int current_val;
+    unsigned int new_val;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%3)\n"
+        "       p0 = cmp.eq(%0, %5)\n"
+        "       if p0 jump 2f\n"
+        "       %1 = add(%0, %4)\n"
+        "       memw_locked(%3, p0) = %1\n"
+        "       if !p0 jump 1b\n"
+        "2:\n"
+        : "=&r" (current_val),"=&r" (new_val),"+m" (*target)
+        : "r" (target), "r" (delta), "r" (unless)
+        : "p0");
+
+    return (unsigned int)(current_val != unless);
+}
+
+/**@ingroup func_qurt_atomic_sub
+   Subtracts an integer from an atomic variable.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] v Integer value to subtract.
+
+   @return
+   None.
+
+   @dependencies
+   None.
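+
+   @par Example
+   A minimal usage sketch (illustrative only, not from the original SDK
+   documentation); the shared counter name is hypothetical:
+   @code
+   static unsigned int credits = 100U;    // shared budget
+
+   void consume_credits(unsigned int n)
+   {
+       // Atomically deduct n credits; when the updated value is needed,
+       // qurt_atomic_sub_return() would be used instead.
+       qurt_atomic_sub(&credits, n);
+   }
+   @endcode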
+*/ +static inline QURT_INLINE void +qurt_atomic_sub(unsigned int *target, unsigned int v) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = sub(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); +} + +/**@ingroup func_qurt_atomic_sub_return + Subtracts an integer from an atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v Integer value to subtract. + + @return + Result of arithmetic subtraction. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_sub_return(unsigned int *target, unsigned int v) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = sub(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_inc + Increments an atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_inc(unsigned int *target) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, #1)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target) + : "p0"); +} + +/**@ingroup func_qurt_atomic_inc_return + Increments an atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + Incremented value. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_inc_return(unsigned int *target) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, #1)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_dec + Decrements an atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_dec(unsigned int *target) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, #-1)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target) + : "p0"); +} + +/**@ingroup func_qurt_atomic_dec_return + Decrements an atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + Decremented value. + + @dependencies + None. 
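+
+   @par Example
+   A minimal reference-count sketch (illustrative only, not from the original
+   SDK documentation); object_t and object_destroy() are hypothetical:
+   @code
+   typedef struct { unsigned int refs; /* ... payload ... */ } object_t;
+   extern void object_destroy(object_t *obj);   // hypothetical cleanup routine
+
+   void object_release(object_t *obj)
+   {
+       // The decremented value is returned atomically, so exactly one
+       // thread observes the count reaching zero and frees the object.
+       if (qurt_atomic_dec_return(&obj->refs) == 0U) {
+           object_destroy(obj);
+       }
+   }
+   @endcode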
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_dec_return(unsigned int *target)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = add(%0, #-1)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic_compare_and_set
+   Compares the current value of the atomic variable with the
+   specified value and sets it to a new value when the comparison is successful.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] old_val Old value to compare.
+   @param[in] new_val New value to set.
+
+   @return
+   FALSE -- Specified value is not equal to the current value. \n
+   TRUE -- Specified value is equal to the current value.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_compare_and_set(unsigned int* target,
+                            unsigned int old_val,
+                            unsigned int new_val)
+{
+    unsigned int current_val;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       p0 = cmp.eq(%0, %3)\n"
+        "       if !p0 jump 2f\n"
+        "       memw_locked(%2, p0) = %4\n"
+        "       if !p0 jump 1b\n"
+        "2:\n"
+        : "=&r" (current_val),"+m" (*target)
+        : "r" (target), "r" (old_val), "r" (new_val)
+        : "p0");
+
+    return (unsigned int)(current_val == old_val);
+}
+
+/**@ingroup func_qurt_atomic_barrier
+   Allows the compiler to enforce an ordering constraint on memory operations issued
+   before and after the function.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic_barrier(void)
+{
+    __asm__ __volatile__ (
+        ""
+        :
+        :
+        :
+        "memory");
+}
+
+
+/**@ingroup func_qurt_atomic64_set
+   Sets the 64-bit atomic variable with the specified value.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] value 64-bit value to set.
+
+   @return
+   Successfully set value.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_set(unsigned long long* target, unsigned long long value)
+{
+    unsigned long long tmp;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       memd_locked(%2, p0) = %3\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (tmp),"+m" (*target)
+        : "r" (target), "r" (value)
+        : "p0");
+    return value;
+}
+
+/**@ingroup func_qurt_atomic64_and_return
+   Bitwise AND operation of a 64-bit atomic variable with mask.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] mask 64-bit mask for bitwise AND.
+
+   @return
+   AND result of 64-bit atomic variable with mask.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_and_return(unsigned long long* target, unsigned long long mask)
+{
+    unsigned long long result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       %0 = and(%0, %3)\n"
+        "       memd_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (mask)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic64_or
+   Bitwise OR operation of a 64-bit atomic variable with mask.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] mask 64-bit mask for bitwise OR.
+
+   @return
+   None.
+
+   @dependencies
+   None.
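+
+   @par Example
+   A minimal usage sketch (illustrative only, not from the original SDK
+   documentation); the event mask layout is hypothetical:
+   @code
+   static unsigned long long event_mask;  // one bit per event source
+
+   void post_event(unsigned int event_id)
+   {
+       // Atomically set the bit for this event; concurrent posters
+       // cannot lose each other's updates.
+       qurt_atomic64_or(&event_mask, 1ULL << event_id);
+   }
+   @endcode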
+*/ +static inline QURT_INLINE void +qurt_atomic64_or(unsigned long long* target, unsigned long long mask) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = or(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); +} + +/**@ingroup func_qurt_atomic64_or_return + Bitwise OR operation of a 64-bit atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask 64-bit mask for bitwise OR. + + @return + OR result of the atomic variable with mask. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned long long +qurt_atomic64_or_return(unsigned long long* target, unsigned long long mask) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = or(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic64_xor_return + Bitwise XOR operation of 64-bit atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask 64-bit mask for bitwise XOR. + + @return + XOR result of atomic variable with mask. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned long long +qurt_atomic64_xor_return(unsigned long long* target, unsigned long long mask) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = xor(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic64_set_bit + Sets a bit in a 64-bit atomic variable at a specified position. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] bit Bit position to set. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic64_set_bit(unsigned long long *target, unsigned int bit) +{ + unsigned int result; + unsigned int *wtarget; + unsigned int *pwtarget = (unsigned int *)target; + unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int sbit = bit & 0x1FU; + wtarget = (unsigned int *)&pwtarget[aword]; + + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = setbit(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*wtarget) + : "r" (wtarget), "r" (sbit) + : "p0"); +} + +/**@ingroup func_qurt_atomic64_clear_bit + Clears a bit in a 64-bit atomic variable at a specified position. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] bit Bit position to clear. + + @return + None. + + @dependencies + None. 
+*/
+static inline QURT_INLINE void
+qurt_atomic64_clear_bit(unsigned long long *target, unsigned int bit)
+{
+    unsigned int result;
+    unsigned int *wtarget;
+    unsigned int *pwtarget = (unsigned int *)target;
+    unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U);
+    unsigned int sbit = bit & 0x1FU;
+    wtarget = (unsigned int *)&pwtarget[aword];
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = clrbit(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*wtarget)
+        : "r" (wtarget), "r" (sbit)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic64_change_bit
+   Toggles a bit in a 64-bit atomic variable at a specified position.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] bit Bit position to toggle.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic64_change_bit(unsigned long long *target, unsigned int bit)
+{
+    unsigned int result;
+    unsigned int *wtarget;
+    unsigned int *pwtarget = (unsigned int *)target;
+    unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U);
+    unsigned int sbit = bit & 0x1FU;
+    wtarget = (unsigned int *)&pwtarget[aword];
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = togglebit(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*wtarget)
+        : "r" (wtarget),"r" (sbit)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic64_add
+   Adds a 64-bit integer to a 64-bit atomic variable.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] v 64-bit integer value to add.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic64_add(unsigned long long *target, unsigned long long v)
+{
+    unsigned long long result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       %0 = add(%0, %3)\n"
+        "       memd_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (v)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic64_add_return
+   Adds a 64-bit integer to a 64-bit atomic variable.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] v 64-bit integer value to add.
+
+   @return
+   Result of arithmetic sum.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_add_return(unsigned long long *target, unsigned long long v)
+{
+    unsigned long long result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       %0 = add(%0, %3)\n"
+        "       memd_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (v)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic64_sub_return
+   Subtracts a 64-bit integer from an atomic variable.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] v 64-bit integer value to subtract.
+
+   @return
+   Result of arithmetic subtraction.
+
+   @dependencies
+   None.
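+
+   @par Example
+   A minimal usage sketch (illustrative only, not from the original SDK
+   documentation); the byte counter is hypothetical:
+   @code
+   static unsigned long long bytes_in_flight;
+
+   unsigned long long complete_transfer(unsigned long long len)
+   {
+       // Subtract and observe the remaining total in one atomic step;
+       // a separate load after qurt_atomic64_sub() could race.
+       return qurt_atomic64_sub_return(&bytes_in_flight, len);
+   }
+   @endcode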
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_sub_return(unsigned long long *target, unsigned long long v)
+{
+    unsigned long long result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       %0 = sub(%0, %3)\n"
+        "       memd_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (v)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic64_inc
+   Increments a 64-bit atomic variable by one.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic64_inc(unsigned long long *target)
+{
+    unsigned long long result;
+    unsigned long long inc = 1;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       %0 = add(%0, %3)\n"
+        "       memd_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target),"r" (inc)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic64_inc_return
+   Increments a 64-bit atomic variable by one.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+
+   @return
+   Incremented value.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_inc_return(unsigned long long *target)
+{
+    unsigned long long result;
+    unsigned long long inc = 1;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       %0 = add(%0, %3)\n"
+        "       memd_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target),"r" (inc)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic64_dec_return
+   Decrements a 64-bit atomic variable by one.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+
+   @return
+   Decremented value.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_dec_return(unsigned long long *target)
+{
+    unsigned long long result;
+    long long minus1 = 0xFFFFFFFFFFFFFFFFLL;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       %0 = add(%0, %3)\n"
+        "       memd_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target),"r" (minus1)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic64_compare_and_set
+   Compares the current value of a 64-bit atomic variable with
+   the specified value and sets it to a new value when the comparison is successful.
+
+   @note1hang The function keeps retrying until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] old_val 64-bit old value to compare.
+   @param[in] new_val 64-bit new value to set.
+
+   @return
+   FALSE -- Specified value is not equal to the current value. \n
+   TRUE -- Specified value is equal to the current value.
+
+   @dependencies
+   None.
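+
+   @par Example
+   A minimal compare-and-set retry loop (illustrative only, not from the
+   original SDK documentation) that maintains a running 64-bit maximum:
+   @code
+   static unsigned long long high_water_mark;
+
+   void update_high_water_mark(unsigned long long sample)
+   {
+       unsigned long long seen = high_water_mark;
+       // Retry until either the stored value is already >= sample or the
+       // compare-and-set installs sample atomically.
+       while (seen < sample) {
+           if (qurt_atomic64_compare_and_set(&high_water_mark, seen, sample)) {
+               break;
+           }
+           seen = high_water_mark;   // lost the race; reload and retry
+       }
+   }
+   @endcode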
+*/
+static inline QURT_INLINE int
+qurt_atomic64_compare_and_set(unsigned long long *target,
+                              unsigned long long old_val,
+                              unsigned long long new_val)
+{
+    unsigned long long current_val;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       p0 = cmp.eq(%0, %3)\n"
+        "       if !p0 jump 2f\n"
+        "       memd_locked(%2, p0) = %4\n"
+        "       if !p0 jump 1b\n"
+        "2:\n"
+        : "=&r" (current_val),"+m" (*target)
+        : "r" (target), "r" (old_val), "r" (new_val)
+        : "p0");
+
+    return (int)(current_val == old_val);
+}
+
+/**@ingroup func_qurt_atomic64_barrier
+   Allows the compiler to enforce an ordering constraint on memory operations issued
+   before and after the function.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic64_barrier(void)
+{
+    /** @cond */
+    __asm__ __volatile__ (
+        ""
+        :
+        :
+        :
+        "memory");
+    /** @endcond */
+}
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_ATOMIC_OPS_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_barrier.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_barrier.h
new file mode 100755
index 0000000000000..7c6f787d43bc2
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_barrier.h
@@ -0,0 +1,140 @@
+#ifndef QURT_BARRIER_H
+#define QURT_BARRIER_H
+
+/**
+  @file qurt_barrier.h
+  @brief Prototypes of kernel barrier API functions.
+
+  EXTERNALIZED FUNCTIONS
+  None.
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+  Copyright (c) 2021 Qualcomm Technologies, Inc. All rights reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup barrier_types
+@{ */
+/*=====================================================================
+ Constants and macros
+======================================================================*/
+#define QURT_BARRIER_SERIAL_THREAD 1 /**< Serial thread. */
+#define QURT_BARRIER_OTHER 0 /**< Other. */
+
+#ifndef ASM
+#include
+
+/*=====================================================================
+Typedefs
+======================================================================*/
+
+/** QuRT barrier type.
+ */
+typedef union {
+    /** @cond */
+    struct {
+        unsigned short threads_left;
+        unsigned short count;
+        unsigned int threads_total;
+        unsigned int queue;
+        unsigned int reserved;
+    };
+    unsigned long long int raw;
+    /** @endcond */
+} qurt_barrier_t;
+
+/** @} */ /* end_addtogroup barrier_types */
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+/*======================================================================*/
+/**@ingroup func_qurt_barrier_init
+   Initializes a barrier object.
+
+   @datatypes
+   #qurt_barrier_t
+
+   @param[out] barrier Pointer to the barrier object to initialize.
+   @param[in] threads_total Total number of threads to synchronize on the barrier.
+
+
+   @return
+   Unused integer value.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+int qurt_barrier_init(qurt_barrier_t *barrier, unsigned int threads_total);
+
+/*======================================================================*/
+/**@ingroup func_qurt_barrier_destroy
+   Destroys the specified barrier.
+
+   @note1hang Barriers must be destroyed when they are no longer in use. 
Failure + to do this causes resource leaks in the QuRT kernel.\n + @note1cont Barriers must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + + @datatypes + #qurt_barrier_t + + @param[in] barrier Pointer to the barrier object to destroy. + + @return + Unused integer value. + + @dependencies + None. +*/ +/* ======================================================================*/ +int qurt_barrier_destroy(qurt_barrier_t *barrier); + +/*======================================================================*/ +/**@ingroup func_qurt_barrier_wait + Waits on the barrier.\n + Suspends the current thread on the specified barrier. \n + The function return value indicates whether the thread was the last one to + synchronize on the barrier. + When a thread waits on a barrier, it is suspended on the barrier: \n + - If the total number of threads waiting on the barrier is less than the assigned value + of the barrier, no other action occurs. \n + - If the total number of threads waiting on the barrier equals the assigned value of the + barrier, all threads currently waiting on the barrier are awakened, allowing them to + execute past the barrier. + + @note1hang After its waiting threads are awakened, a barrier is automatically reset + and can be used again in the program without the need for re-initialization. + + @datatypes + #qurt_barrier_t + + @param[in] barrier Pointer to the barrier object to wait on. + + @return + #QURT_BARRIER_OTHER -- Current thread awakened from barrier. \n + #QURT_BARRIER_SERIAL_THREAD -- Current thread is last caller of barrier. + + @dependencies + None. +*/ +/* ======================================================================*/ +int qurt_barrier_wait(qurt_barrier_t *barrier); + + +#endif + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_BARRIER_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_busywait.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_busywait.h new file mode 100755 index 0000000000000..a4dab80a2520a --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_busywait.h @@ -0,0 +1,62 @@ +#ifndef QURT_BUSYWAIT_H +#define QURT_BUSYWAIT_H + +/** + @file qurt_busywait.h + @brief Implementation of the busywait() function for + hardware based blocking waits that use the QTIMER as a reference. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ============================================================================*/ +/*============================================================================= + * + * EDIT HISTORY FOR FILE + * + * This section contains comments describing changes made to the + * module. Changes are listed in reverse chronological + * order. 
+ *
+ *
+ * when       who     what, where, why
+ * ---------- ---     -------------------------------------------------------
+ * 2018-03-20 pg      Add Header file
+ ============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_busywait
+   Pauses the execution of a thread for a specified time.\n
+   Use for small microsecond delays.
+
+   @note1hang The function does not return to the caller until
+              the time duration has expired.
+
+   @param[in] pause_time_us Time to pause in microseconds.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+void qurt_busywait (unsigned int pause_time_us);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_BUSYWAIT_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_callback.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_callback.h
new file mode 100755
index 0000000000000..dc9b896c63454
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_callback.h
@@ -0,0 +1,235 @@
+#ifndef QURT_CALLBACK_H
+#define QURT_CALLBACK_H
+
+/**
+  @file qurt_callback.h
+  Definitions, macros, and prototypes for the QuRT callback framework.
+
+  The QDI framework allows the development of root process drivers and services that
+  a user process client can interact with in a secure manner. The QDI framework does
+  this by elevating the privilege of the user process thread, temporarily allowing
+  the thread to execute in root context and letting it fall back to user context once
+  the QDI invocation is finished.
+
+  The QuRT callback framework provides a safe mechanism for root process drivers
+  to execute callback functions in a user process. The framework hosts
+  dedicated worker threads in corresponding processes that handle the execution
+  of the callback function. This ensures that the callbacks occur in context of
+  the appropriate process thread, as a result maintaining privilege boundaries.
+
+  Prerequisites for use of this framework are:
+  1. Driver is a QDI driver and client communicates with drivers using QDI
+     invocations.
+  2. Appropriate callback configuration is specified in cust_config.xml for
+     the user process that intends to use this framework.
+
+  qurt_cb_data_t is the public data structure that allows a client to store all
+  the required information about the callback, including the callback function
+  and the arguments to pass to this function when it executes.
+  The client uses the QDI interface to register this structure with the root driver.
+
+  The callback framework provides the following APIs that a root driver can use to invoke a callback.
+  These functions are described in the qurt_qdi_driver.h header file.
+
+  qurt_qdi_cb_invoke_async() triggers an asynchronous callback wherein the
+  invoking thread does not wait for the callback to finish executing.
+
+  qurt_qdi_cb_invoke_sync() triggers a synchronous callback. Upon invocation
+  the invoking thread gets suspended till the callback function finishes execution.
+
+  qurt_qdi_cb_invoke_sync_with_data() invokes a synchronous callback similar to
+  qurt_qdi_cb_invoke_sync(). It allows the user to pass large data along with
+  the callback invocation to be utilized during the callback execution.
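+
+  As an illustrative sketch only (not part of the original header text), a
+  user-process client might fill in the registration structure like this
+  before handing it to its driver through a QDI invocation; the callback
+  function my_event_handler and the argument MY_EVENT_ID are hypothetical:
+
+  @code
+  qurt_cb_data_t cb_data;
+  qurt_cb_data_init(&cb_data);
+  qurt_cb_data_set_cbfunc(&cb_data, (void *)my_event_handler);
+  qurt_cb_data_set_cbarg(&cb_data, MY_EVENT_ID);
+  // cb_data is then passed to the driver via its QDI interface.
+  @endcode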
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+#include "qurt_qdi.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef int qurt_cb_result_t;
+
+/* Callback framework error codes.
+   The callback framework returns a nonzero value if callback invocation is unsuccessful.
+   The following macros highlight the cause of failure in more detail.
+*/
+#define QURT_CB_ERROR -1 /* Callback registration failed.\n*/
+#define QURT_CB_OK 0 /* Success.\n*/
+#define QURT_CB_MALLOC_FAILED -2 /* QuRTOS malloc failure.\n*/
+#define QURT_CB_WAIT_CANCEL -3 /* Process exit cancelled wait operation.\n*/
+#define QURT_CB_CONFIG_NOT_FOUND -4 /* Callback configuration for process was not found.\n*/
+#define QURT_CB_QUEUE_FULL -5 /* Callback queue is serving at maximum capacity.*/
+/** @addtogroup cb_types
+@{ */
+/** Callback registration data structure.
+    This data structure is used by a client attempting to register a callback with a QDI driver.
+    It holds the address of the callback function and the argument supplied to the callback
+    function when it executes.
+*/
+typedef struct {
+    /** @cond */
+    void* cb_func;   /*< Pointer to the callback function. */
+    unsigned cb_arg; /*< Not interpreted by the framework.*/
+    /** @endcond */
+} qurt_cb_data_t;
+
+/** @cond */
+/* Defines used as default if cust_config does not specify them. */
+#define CALLBACK_WORKER_STACK_SIZE 0x2000
+/** @endcond */
+/** @} */ /* end_addtogroup cb_types */
+/**@ingroup func_qurt_cb_data_init
+   Initializes the callback data structure.
+   An entity registering a callback with the root process driver must call this function
+   to initialize the callback registration data structure to its default value.
+
+   @datatypes
+   #qurt_cb_data_t
+
+   @param[in] cb_data Pointer to the callback data structure.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+static inline void qurt_cb_data_init (qurt_cb_data_t* cb_data){
+    cb_data->cb_func = NULL;
+    cb_data->cb_arg = 0;
+}
+
+/**@ingroup func_qurt_cb_data_set_cbfunc
+   Sets up the callback function in the callback registration data structure.
+
+   @datatypes
+   #qurt_cb_data_t
+
+   @param[in] cb_data Pointer to the callback data structure.
+   @param[in] cb_func Pointer to the callback function.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+static inline void qurt_cb_data_set_cbfunc (qurt_cb_data_t* cb_data, void* cb_func){
+    cb_data->cb_func = cb_func;
+}
+
+/**@ingroup func_qurt_cb_data_set_cbarg
+   Sets up the callback argument.
+   This function sets up the argument passed to the callback function when it executes.
+
+   @datatypes
+   #qurt_cb_data_t
+
+   @param[in] cb_data Pointer to the callback data structure.
+   @param[in] cb_arg Argument for the callback function.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+static inline void qurt_cb_data_set_cbarg (qurt_cb_data_t* cb_data, unsigned cb_arg){
+    cb_data->cb_arg = cb_arg;
+}
+
+/** @cond */
+/**@ingroup driver_support_functions
+   Invokes an asynchronous callback for a specified process.
+   A driver that resides in the root process calls this API to launch a callback in
+   a process described by the client_handle.
+   After the callback is invoked, the framework queues the callback as per its
+   priority and subsequently executes it.
+   The caller of this function is not suspended during the callback execution period.
+
+   The API returns immediately with a success/failure error code.
+
+   @note1hang This function is only accessible to drivers in the root process.
+              User process invocations shall fail with a negative error code return value.
+
+   @param client_handle Obtained from the current invocation function (Section 4.3.1).
+   @param cb_data Pointer to the callback data structure (refer to qurt_callback.h).
+   @param prio Priority at which the callback should execute.
+               This parameter is optional. If -1 is passed, the callback framework
+               executes the callback at the priority of the API caller.
+   @return
+   QURT_EOK -- Callback was successfully communicated to the framework.
+   Negative error code -- Callback cannot be communicated to the framework.
+ */
+qurt_cb_result_t qurt_qdi_cb_invoke_async(int client_handle,
+                                          qurt_cb_data_t* cb_data,
+                                          int prio);
+
+
+/**@ingroup driver_support_functions
+   Invokes a synchronous callback for a specified process.
+   A driver that resides in a root process calls this API to launch a sync callback in
+   a process described by the client_handle.
+   After the callback is invoked, the framework queues the callback as per its
+   priority and subsequently executes it.
+   The caller of this function is suspended during the callback execution period.
+   If the process in which to execute the callback exits or terminates, the caller is
+   woken up with error code #QURT_CB_WAIT_CANCEL (refer to qurt_callback.h).
+
+   @note1hang This function is only accessible to drivers in the root process.
+              User process invocations shall fail with a negative error code return value.
+
+   @param client_handle Obtained from the current invocation function (Section 4.3.1).
+   @param cb_data Pointer to the callback data structure (refer to qurt_callback.h).
+   @param prio Priority at which the callback should execute.
+               This parameter is optional. If -1 is passed, the callback framework
+               executes the callback at the priority of the API caller.
+   @return
+   QURT_EOK -- Callback was successfully communicated to the framework.
+   Negative error code -- Callback cannot be communicated to the framework.
+ */
+qurt_cb_result_t qurt_qdi_cb_invoke_sync(int client_handle,
+                                         qurt_cb_data_t* cb_data,
+                                         int prio);
+
+/**@ingroup driver_support_functions
+   Invokes a synchronous callback for a specified process, passing driver data to the user PD.
+   This function is similar to qurt_qdi_cb_invoke_sync() and allows the driver to pass arbitrary data to
+   the user process as part of the callback invocation.
+
+   @param client_handle Obtained from the current invocation function (Section 4.3.1).
+   @param cb_data Pointer to the callback data structure (refer to qurt_callback.h).
+   @param prio Priority at which the callback should execute.
+               This parameter is optional. If -1 is passed, the callback framework
+               executes the callback at the priority of the API caller.
+   @param data Driver arbitrary data to pass to the user process. Memory pointed to by data
+               must be accessible to the user PD. The root driver can allocate such memory by
+               using qurt_mem_mmap().
+   @param data_len Driver arbitrary data length.
+
+   @return
+   QURT_EOK -- Callback was successfully communicated to the framework.
+   Negative error code -- Callback cannot be communicated to the framework.
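+
+   @par Example
+   An illustrative root-driver sketch (not from the original SDK
+   documentation); client_handle, the shared buffer, and its length are
+   assumed to come from the surrounding QDI invocation:
+   @code
+   qurt_cb_result_t notify_client(int client_handle, qurt_cb_data_t *cb_data,
+                                  void *shared_buf, unsigned buf_len)
+   {
+       // Suspend until the user-process callback has consumed shared_buf;
+       // -1 runs the callback at the caller's priority.
+       return qurt_qdi_cb_invoke_sync_with_data(client_handle, cb_data, -1,
+                                                shared_buf, buf_len);
+   }
+   @endcode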
+ */ +qurt_cb_result_t qurt_qdi_cb_invoke_sync_with_data( int client_handle, + qurt_cb_data_t* cb_data, + int prio, + void *data, + unsigned data_len + ); +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_clade.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_clade.h new file mode 100755 index 0000000000000..d7442cf98dd94 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_clade.h @@ -0,0 +1,62 @@ +#ifndef QURT_CLADE_H +#define QURT_CLADE_H +/** + @file qurt_clade.h + @brief Prototypes of Cache Line Accelerated Decompression Engine (CLADE) API. + CLADE is a cache line level memory compression system that is used to + decrease DRAM usage. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2019-2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_clade2_get + Reads the value of the clade2 register. + + @param[in] offset Offset from the clade2 cfg base. + @param[out] *value Pointer to the register value read from the offset. + + @return + #QURT_EOK - Successfully read the value from the register at offset \n + #QURT_EINVALID - Offset passed is incorrect + + @dependencies + None. + */ +int qurt_clade2_get(unsigned short offset, unsigned int *value); + +/**@ingroup func_qurt_clade2_set + Sets the PMU register; only PMU_SEL register can be set. + + @param[in] offset Offset from the QURTK_clade2_cfg_base. + @param[in] value Value to set at offset. + + @return + #QURT_EOK -- Successfully set the value at offset. \n + #QURT_ENOTALLOWED -- Set operation performed at an offset other than CLADE2_PMU_SELECTION_REG. + + @dependencies + None. + */ +int qurt_clade2_set(unsigned short offset, unsigned int value); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_CLADE_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_cond.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_cond.h new file mode 100755 index 0000000000000..6e65ed82a8393 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_cond.h @@ -0,0 +1,219 @@ +#ifndef QURT_COND_H +#define QURT_COND_H +/** + @file qurt_cond.h + @brief Prototypes of kernel condition variable object API functions. + + EXTERNALIZED FUNCTIONS + None + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021 Qualcomm Technologies, Inc. + All rights reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup condition_variables_types +@{ */ +/*===================================================================== + Typedefs + ======================================================================*/ + +/** QuRT condition variable type. 
*/ +typedef union { + /** @cond */ + unsigned long long raw; + struct { + unsigned int count; + unsigned int n_waiting; + unsigned int queue; + unsigned int reserved; + }X; + /** @endcond */ +} qurt_cond_t; + +/** @} */ /* end_addtogroup condition_variables_types */ + +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_cond_init + Initializes a conditional variable object. + + @datatypes + #qurt_cond_t + + @param[out] cond Pointer to the initialized condition variable object. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_cond_init(qurt_cond_t *cond); + +/*======================================================================*/ +/**@ingroup func_qurt_cond_destroy + Destroys the specified condition variable. + + @note1hang Conditions must be destroyed when they are no longer in use. Failure to do + this causes resource leaks in the QuRT kernel.\n + @note1cont Conditions must not be destroyed while they are still in use. If this occurs, + the behavior of QuRT is undefined. + + @datatypes + #qurt_cond_t + + @param[in] cond Pointer to the condition variable object to destroy. + + @return + None. + + */ +/* ======================================================================*/ +void qurt_cond_destroy(qurt_cond_t *cond); + +/*======================================================================*/ +/**@ingroup func_qurt_cond_signal + Signals a waiting thread that the specified condition is true. \n + + When a thread wishes to signal that a condition is true on a shared data item, it must + perform the following procedure: \n + -# Lock the mutex that controls access to the data item. \n + -# Perform the signal condition operation. \n + -# Unlock the mutex. + + @note1hang Failure to properly lock and unlock a mutex of a condition variable can cause + the threads to never be suspended (or suspended but never awakened). + + @note1cont Use condition variables only with regular mutexes -- attempting to use + recursive mutexes or priority inheritance mutexes results in undefined behavior. + + @datatypes + #qurt_cond_t + + @param[in] cond Pointer to the condition variable object to signal. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_cond_signal(qurt_cond_t *cond); + +/*======================================================================*/ +/**@ingroup func_qurt_cond_broadcast + Signals multiple waiting threads that the specified condition is true.\n + When a thread wishes to broadcast that a condition is true on a shared data item, it must + perform the following procedure: \n + -# Lock the mutex that controls access to the data item. \n + -# Perform the broadcast condition operation. \n + -# Unlock the mutex.\n + + @note1hang Failure to properly lock and unlock the mutex of a condition variable can cause + the threads to never be suspended (or suspended but never awakened). + + @note1cont Use condition variables only with regular mutexes -- attempting to use + recursive mutexes or priority inheritance mutexes results in undefined behavior. + + @datatypes + #qurt_cond_t + + @param[in] cond Pointer to the condition variable object to signal. + + @return + None. + + @dependencies + None. 
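+
+   @par Example
+   A minimal signaling sketch (illustrative only, not from the original SDK
+   documentation) following the lock/update/broadcast procedure above;
+   qurt_mutex_lock()/qurt_mutex_unlock() from qurt_mutex.h are assumed, and
+   the objects are assumed to be initialized elsewhere:
+   @code
+   static qurt_mutex_t lock;
+   static qurt_cond_t  ready;
+   static int          data_ready;
+
+   void publish(void)
+   {
+       qurt_mutex_lock(&lock);
+       data_ready = 1;                 // update the shared data item
+       qurt_cond_broadcast(&ready);    // wake all waiters
+       qurt_mutex_unlock(&lock);
+   }
+
+   void consume(void)
+   {
+       qurt_mutex_lock(&lock);
+       while (data_ready == 0) {
+           qurt_cond_wait(&ready, &lock);  // atomically unlocks and waits
+       }
+       qurt_mutex_unlock(&lock);
+   }
+   @endcode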
+ */ +/* ======================================================================*/ +void qurt_cond_broadcast(qurt_cond_t *cond); + +/*======================================================================*/ +/**@ingroup func_qurt_cond_wait + Suspends the current thread until the specified condition is true. + When a thread wishes to wait for a specific condition on a shared data item, it must + perform the following procedure: \n + -# Lock the mutex that controls access to the data item. \n + -# If the condition is not satisfied, perform the wait condition operation on the + condition variable (suspends the thread and unlocks the mutex). + + @note1hang Failure to properly lock and unlock the mutex of a condition variable can cause + the threads to never be suspended (or suspended but never awakened). + + @note1cont Use condition variables only with regular mutexes -- attempting to use + recursive mutexes or priority inheritance mutexes results in undefined behavior. + + @datatypes + #qurt_cond_t \n + #qurt_mutex_t + + @param[in] cond Pointer to the condition variable object to wait on. + @param[in] mutex Pointer to the mutex associated with condition variable to wait on. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_cond_wait(qurt_cond_t *cond, qurt_mutex_t *mutex); + +/*======================================================================*/ +/**@ingroup func_qurt_cond_wait2 + Suspends the current thread until the specified condition is true. + When a thread wishes to wait for a specific condition on a shared data item, it must + perform the following procedure: \n + -# Lock the mutex that controls access to the data item. \n + -# If the condition is not satisfied, perform the wait condition operation on the + condition variable, which suspends the thread and unlocks the mutex. + + @note1hang Failure to properly lock and unlock the mutex of a condition variable can cause + the threads to never be suspended (or suspended but never awakened). + + @note1cont Use condition variables only with regular mutexes -- attempting to use + recursive mutexes or priority inheritance mutexes results in undefined behavior. + + @note1cont This is the same API as qurt_cond_wait(), use this version + when using mutexes of type #qurt_rmutex2_t. + + @datatypes + #qurt_cond_t \n + #qurt_rmutex2_t + + @param[in] cond Pointer to the condition variable object to wait on. + @param[in] mutex Pointer to the mutex associated with the condition variable to wait on. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_cond_wait2(qurt_cond_t *cond, qurt_rmutex2_t *mutex); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_COND_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_consts.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_consts.h new file mode 100755 index 0000000000000..b1e35998e73b6 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_consts.h @@ -0,0 +1,315 @@ +#ifndef QURT_CONSTS_H +#define QURT_CONSTS_H + +/** + @file qurt_consts.h + @brief QuRT constants and definitions + + EXTERNAL FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None + + Copyright (c) 2013-2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. 
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=====================================================================
+ Constants and macros
+ ======================================================================*/
+
+/* Definitions of system events. System events suspend
+   a thread and put it into suspending_list.
+   The system event number is saved in the CONTEXT::error::cause field
+   of the suspended thread. An event handler thread such as the
+   page fault handler or system error handler can wake up the suspended
+   thread.
+ */
+#define QURT_EVENT_PAGEFAULT 0x1 /* Page fault event. */
+#define QURT_EVENT_SYSTEM_ERR 0x2 /* System error event. */
+#define QURT_EVENT_SUSPEND 0x3
+#define QURT_EVENT_PROCESS_EXIT 0x4 /* Process termination event.*/
+
+#define QURT_SYSENV_MAX_THREADS_TYPE 1 /* Maximum threads object. */
+#define QURT_SYSENV_PROCNAME_TYPE 2 /* Process name object. */
+#define QURT_SYSENV_MAX_PI_PRIO_TYPE 3 /* Maximum PI priority object. */
+#define QURT_SYSENV_ARCH_REV_TYPE 4 /* Architecture version object. */
+#define QURT_SYSENV_APP_HEAP_TYPE 5 /* Application heap object. */
+#define QURT_SYSENV_REGION_ATTR_DEFAULT 7 /* Default region attributes. */
+#define QURT_SYSENV_STACK_PROFILE_COUNT_TYPE 8 /* Stack profile count type. */
+#define QURT_SYSENV_ISLAND_CONFIG_TYPE 9 /* Island configuration check. */
+#define QURT_SYSENV_HTHREADS_TYPE 10 /* Active threads object. */
+#define QURT_SYSENV_CONFIG_IMAGE_START_LO 11 /* Config image start address for DTB parsing. */
+#define QURT_SYSENV_CONFIG_IMAGE_START_HI 12 /* Config image start address for DTB parsing. */
+#define QURT_SYSENV_CHIPPARAMS_LO 13 /* ChipParams for DTB parsing. */
+#define QURT_SYSENV_CHIPPARAMS_HI 14 /* ChipParams for DTB parsing. */
+#define QURT_SYSENV_PLATPARAMS 15 /* PlatformParams for DTB parsing. */
+#define QURT_SYSENV_CONFIG_IMAGE_SIZE 16 /* Config image size for DTB parsing. */
+#define QURT_SYSENV_L2_CACHE_LINE_SIZE 17 /* L2 cache line size. */
+
+/* Get Q6 registers. */
+#define QURT_GET_SSR 1
+#define QURT_GET_CCR 2
+#define QURT_GET_CFGBASE 3
+#define QURT_GET_SYSCFG 4
+#define QURT_GET_REV 5
+
+
+/** @cond rest_reg_dist */
+/** @addtogroup performance_monitor_macros
+@{ */
+
+/* PMU */
+#define QURT_PMUCNT0 0 /**< */
+#define QURT_PMUCNT1 1 /**< */
+#define QURT_PMUCNT2 2 /**< */
+#define QURT_PMUCNT3 3 /**< */
+#define QURT_PMUCFG 4 /**< */
+#define QURT_PMUEVTCFG 5 /**< */
+
+/* New since V55. */
+#define QURT_PMUCNT4 6 /**< */
+#define QURT_PMUCNT5 7 /**< */
+#define QURT_PMUCNT6 8 /**< */
+#define QURT_PMUCNT7 9 /**< */
+#define QURT_PMUEVTCFG1 10 /**< */
+
+/* New since V61. */
+#define QURT_PMUSTID0 11 /**< */
+#define QURT_PMUSTID1 12 /**< */
+
+#define QURT_PMUCNTSTID0 13 /**< */
+#define QURT_PMUCNTSTID1 14 /**< */
+#define QURT_PMUCNTSTID2 15 /**< */
+#define QURT_PMUCNTSTID3 16 /**< */
+#define QURT_PMUCNTSTID4 17 /**< */
+#define QURT_PMUCNTSTID5 18 /**< */
+#define QURT_PMUCNTSTID6 19 /**< */
+#define QURT_PMUCNTSTID7 20 /**< */
+
+/** @} */ /* end_addtogroup performance_monitor_macros */
+/** @endcond */
+
+/*
+   Power collapse operation
+*/
+#define QURT_POWER_SHUTDOWN 0 /**< */
+#define QURT_TCXO_SHUTDOWN 1 /**< */
+#define QURT_POWER_CMD_PREPARE 0 /**< */
+#define QURT_POWER_CMD_PERFORM 1 /**< */
+#define QURT_POWER_CMD_EXIT 2 /**< */
+#define QURT_POWER_CMD_FAIL_EXIT 3 /**< */
+#define QURT_POWER_CMD_PERFORM_L2_RETENTION 4 /**< */
+#define QURT_POWER_CMD_PERFORM_SAVE_TCM 5 /**< */
+#define QURT_POWER_CMD_DEEP_SLEEP 6 /**< */
+
+
+/** 
@addtogroup thread_macros +@{ */ +#define QURT_MAX_HTHREAD_LIMIT 8U /**< Limit on the maximum number of hardware threads supported by QuRT for any + Hexagon version. Use this definition to define arrays, and so on, in + target independent code. */ +/** @} */ /* end_addtogroup thread_macros */ + +/** @cond internal_only */ +/** @addtogroup power_management_macros +@{ */ +/** + L2 cache retention mode +*/ +#define QURT_POWER_SHUTDOWN_TYPE_L2NORET QURT_POWER_CMD_PERFORM /**< */ +#define QURT_POWER_SHUTDOWN_TYPE_L2RET QURT_POWER_CMD_PERFORM_L2_RETENTION /**< */ +#define QURT_POWER_SHUTDOWN_TYPE_SAVETCM QURT_POWER_CMD_PERFORM_SAVE_TCM /**< */ +/** @} */ /* end_addtogroup power_management_macros */ +/** @endcond */ + +/* + QURT_system_state + Use for debugging the shutdown/startup process. + + State transition for cold boot: + QURT_BOOT_SETUP_ISDB --> QURT_CBOOT_BSP_INIT --> + QURT_CBOOT_END_CLEAN_INIT --> QURT_CBOOT_END_OS_INIT --> + QURT_CBOOT_KERNEL_INIT_DONE --> QURT_CBOOT_PLAT_CONFIG_DONE --> + QURT_CBOOT_ROOT_TASK_STARTED + + State transition for power collapse: + QURT_PREPARE_SINGLE_MODE --> QURT_PERFORM_IPEND --> + QURT_PERFORM_SAVE_TLB --> QURT_PERFORM_SWITCH_PC --> + cache flush states (dependent on L2 retention config) + + State transition for warm boot: + QURT_BOOT_SETUP_ISDB --> QURT_WBOOT_INIT_TLB --> + QURT_WBOOT_SET_1TO1_MAP --> QURT_WBOOT_REMOVE_1TO1_MAP --> + QURT_CBOOT_END_CLEAN_INIT --> QURT_CBOOT_END_OS_INIT +*/ +#define QURT_PREPARE_SINGLE_MODE 1 /**< */ +#define QURT_PREPARE_END 2 /**< */ +#define QURT_PERFORM_IPEND 3 /**< */ +#define QURT_PERFORM_SAVE_ISDP 4 /**< */ +#define QURT_PERFORM_SAVE_PMU 5 /**< */ +#define QURT_PERFORM_SAVE_TLB 6 /**< */ +#define QURT_PERFORM_SWITCH_PC 7 /**< */ +#define QURT_PERFORM_EXIT 8 /**< */ +#define QURT_FLUSH_L1CACHE 9 /**< */ +#define QURT_FLUSH_L2CACHE 0xA /**< */ +#define QURT_FLUSH_CACHE_DONE 0xB /**< */ +#define QURT_SWITCH_PC_DONE 0xC /**< */ +#define QURT_BOOT_SETUP_ISDB 0xD /**< */ +#define QURT_WBOOT_INIT_TLB 0xE /**< */ +#define QURT_WBOOT_SET_1TO1_MAP 0xF /**< */ +#define QURT_WBOOT_CFG_ADV_SYSCFG 0x10 /**< */ +#define QURT_WBOOT_REMOVE_1TO1_MAP 0x11 /**< */ +#define QURT_CBOOT_BSP_INIT 0x12 /**< */ +#define QURT_CBOOT_END_CLEAN_L1CACHE 0x13 /**< */ +#define QURT_CBOOT_END_CLEAN_INIT 0x14 /**< */ +#define QURT_CBOOT_END_OS_INIT 0x15 /**< */ +#define QURT_CBOOT_TLB_DUMP_LOAD 0x16 /**< */ +#define QURT_CBOOT_TLB_STATIC_LOAD 0x17 /**< */ +#define QURT_CBOOT_KERNEL_INIT_DONE 0x18 /**< */ +#define QURT_CBOOT_PLAT_CONFIG_DONE 0x19 /**< */ +#define QURT_CBOOT_ROOT_TASK_STARTED 0x1A /**< */ +#define QURT_IMPRECISE_EXCEPTION 0x1B /**< */ +#define QURT_WBOOT_DEBUG_L2_START 0x1C /**< */ +#define QURT_WBOOT_DEBUG_L2_END 0x1D /**< */ +#define QURT_NMI_SAVE_L2VIC_COMPLETE 0x1E /**< */ +#define QURT_NMI_HANDLER_COMPLETE 0x1F /**< */ +#define QURT_NMI_AFTER_SAVE_GLOBAL 0x20 /**< */ +#define QURT_WBOOT_START 0x21 /**< */ +#define QURT_ENTER_ISLAND 0x22 /**< */ +#define QURT_EXIT_ISLAND 0x23 /**< */ +#define QURT_LOAD_NOTIFIER_TCB 0x24 /**< */ +#define QURT_ABNORMAL_RESET 0x25 /**< */ +/* + Thread attributes +*/ + +#define QURT_THREAD_ATTR_GP 0x00000002 /*< */ +#define QURT_THREAD_ATTR_UGP 0x00000003 /*< User general pointer (UGP)*/ +#define QURT_THREAD_ATTR_PREFETCH 0x00000004 /*< */ +#define QURT_THREAD_ATTR_TID 0x00000005 /*< */ +#define QURT_THREAD_ATTR_CACHE_PART 0x00000007 /*< */ +#define QURT_THREAD_ATTR_COPROCESSOR 0x00000008 /*< */ +#define QURT_THREAD_ATTR_GET_L2CACHE_PART 0x00000009 /*< */ +#define QURT_THREAD_ATTR_SET_FRML 
0x0000000A /*< */ +#define QURT_THREAD_ATTR_STID_GET 0x0000000B /*< */ +#define QURT_THREAD_ATTR_STID_SET 0x0000000C /*< */ +#define QURT_THREAD_ATTR_AUTOSTACK 0x0000000D /*< */ +#define QURT_THREAD_ATTR_SYSTEM_THREAD 0x0000000E /*< */ +#define QURT_THREAD_ATTR_STID_SET2 0x0000000F /*< */ +#define QURT_THREAD_ATTR_STID_SET2_ACKNOWLEDGE 0x00000010 /*< */ +#define QURT_THREAD_ATTR_STID_GET2 0x00000011 /*< */ + +/** Cache operations*/ +#define QURT_DCCLEAN 0U /* Clean Dcache. */ +#define QURT_DCINV 1U /* Invalidate Dcache. */ +#define QURT_DCCLEANINV 2U /* Clean and invalidate Dcache. */ +#define QURT_ICINV 3U /* Invalidate Icache. */ +#define QURT_DUMP_DCTAGS 4U /* For testing purpose. */ +#define QURT_FLUSH_ALL 5U /* Flush entire L1 and L2 cache. */ +#define QURT_TABLE_FLUSH 6U /* Flush based on table of physical pages */ +#define QURT_CLEAN_INVALIDATE_ALL 7U /* Flush and invalidate entire L1 and L2 cache. */ +#define QURT_L2CACHE_LOCK_LINES 8U /* l2 cache lock lines */ +#define QURT_L2CACHE_UNLOCK_LINES 9U /* l2 cache unlock lines */ +#define QURT_CLEAN 10U /* Flush L1 and L2 cache */ +#define QURT_CLEAN_INVALIDATE 11U /* Flush and invalidate L1 and L2 cache. */ +#define QURT_CLEAN_INVALIDATE_L2 12U /* Flush and invalidate entire L2 cache. */ + +/**@ingroup chapter_prefined_symbols */ +/**@xreflabel{hdr:QURT_API_VERSION}*/ + + +/* Process state. */ +#define QURT_UPDATE_PROCESS_STATE 0 /**< */ +#define QURT_MP_INIT 1 /*< */ +#define QURT_MP_RUNNING 2 /*< */ +#define QURT_MP_STOPPED 3 /*< */ + +/* QuRT reset reason. */ +#define QURT_NORMAL_BOOT 0 /* Normal boot. */ +#define QURT_WARM_BOOT 1 /* Power collapse warm boot. */ +#define QURT_WARM_BOOT_L2_RETENTION 2 /* Power collapse with L2 retention warm boot. */ +#define QURT_WARM_BOOT_SAVE_TCM 3 /* Power collapse with saving TCM. */ +#define QURT_QUICK_BOOT 4 /* Deep sleep. */ + +/* QuRT Wait for Idle command */ +#define QURT_WAIT_FOR_IDLE_DISABLE 0 /*< */ +#define QURT_WAIT_FOR_IDLE_ENABLE 1 /*< */ +#define QURT_WAIT_FOR_IDLE 2 /*< */ +#define QURT_WAIT_FOR_IDLE_CANCEL 3 /*< */ + +/*QuRT island exit stages */ +#define QURT_ISLAND_EXIT_STAGE1 1 /*< */ +#define QURT_ISLAND_EXIT_STAGE2 2 /*< */ + +#define QURT_MAX_NAME_LEN 64 /*< */ + +#define MAX_POOL_RANGES 16 /*< */ + +/* key definitions for debug thread info */ +//#define MAX_TCB_KEY 40 //whatever is a good number or makes debug thread structure be 1K +#define KEY_SCHDULER_STATE 1 /*< */ +#define KEY_PRIORITY 2 /*< */ +#define KEY_PRIORITY_ORIG 3 /*< */ +#define KEY_STACK_BOTTOM 4 // Currently not populated +#define KEY_STACK_TOP 5 // Currently not populated +#define KEY_HVX_STATE 6 /*< */ +#define KEY_FUTEX_OBJECT 7 /*< */ +#define KEY_THREAD_ID 8 /*< */ +#define KEY_PROFILE_CYCLE_LO 9 // Currently not populated +#define KEY_PROFILE_CYCLE_HI 10 // Currently not populated +#define KEY_ERROR_ADDRESS 11 // This holds the BADVA +#define KEY_ERROR_CAUSE 12 // This is the same as QURT_error_info.cause +#define KEY_ERROR_CAUSE2 13 // This is the same as QURT_error_info.cause2 +#define KEY_ERROR_SSR 14 /*< Holds the SSR value */ +#define QURT_RESERVED -1 + +/* VTLB method IDs. 
*/ +#define QURT_VTLB_ENTRY_CREATE 0U +#define QURT_VTLB_ENTRY_DELETE 1U +#define QURT_VTLB_ENTRY_READ 2U +#define QURT_VTLB_ENTRY_WRITE 3U +#define QURT_VTLB_ENTRY_PROBE 4U +#define QURT_VTLB_ENTRY_SPLIT 5U +#define QURT_VTLB_ENTRY_MERGE 6U +#define QURT_VTLB_ENTRY_STATISTICS 7U +#define QURT_VTLB_ENTRY_SET_SPECIAL 8U +#define QURT_VTLB_QUEUE_PPAGE 9U +#define QURT_VTLB_RECLAIM_STACK_PAGES 10U +#define QURT_VTLB_ASID_SET_STATE_FAST 11U +#define QURT_VTLB_ASID_SET_STATE 12U +#define QURT_VTLB_ENTRY_SET_EXTENSION 13U +#define QURT_VTLB_ENTRY_CLEAR_EXTENSION 14U + +/* VTCM window access control HWIO programming. */ +#define QURT_VTCM_WINDOW_ENABLE 1U +#define QURT_VTCM_WINDOW_DISABLE 0U +#define QURT_VTCM_WINDOW_HI_OFFSET_DEFAULT 0xFFFU +#define QURT_VTCM_WINDOW_LO_OFFSET_DEFAULT 0U + +/** @cond */ +/* ETM source - PC or data access */ +#define QURT_ETM_SOURCE_PC 0U /**< Memory source of SAC* is PC. */ +#define QURT_ETM_SOURCE_DATA 1U /**< Memory source of SAC* is data. */ + +/* ETM PID status flags */ +#define QURT_ETM_NO_PID 0xFFFFFFFF /**< No PID is selected. */ +/** @endcond */ + +/* execution context */ +#define QURT_CTX_USER 1 +#define QURT_CTX_GUEST 2 + +/* Profiling STID */ +#define QURT_STID_DEFAULT 0U + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_CONSTS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_cycles.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_cycles.h new file mode 100755 index 0000000000000..b599493f5d563 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_cycles.h @@ -0,0 +1,301 @@ + +#ifndef QURT_CYCLES_H +#define QURT_CYCLES_H 1 +/** + @file qurt_cycles.h + Prototypes of kernel pcycle API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + + /*===================================================================== + Functions + ======================================================================*/ + +/*======================================================================*/ + +/**@ingroup func_qurt_profile_reset_idle_pcycles + @xreflabel{hdr:qurt_profile_reset_idle_pcycles} + Sets the per-hardware-thread idle cycle counts to zero. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_profile_reset_idle_pcycles (void); + +/*======================================================================*/ +/**@ingroup func_qurt_profile_get_thread_pcycles + @xreflabel{hdr:qurt_profile_get_thread_pcycles} + Gets the count of the running processor cycles for the current thread.\n + Returns the current running processor cycle count for the current QuRT thread. + + @note1hang Profiling shall be enabled first to start the cycle counting. + The cycles are accumulated once the profiling is enabled and + resets on #qurt_profile_reset_threadid_pcycles + + @return + Integer -- Running processor cycle count for current thread. + + @dependencies + None. 
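+
+   A minimal usage sketch (illustrative only; do_work() is a placeholder for the
+   code region being measured):
+   @code
+   qurt_profile_enable(1);                          // start cycle counting
+   unsigned long long start = qurt_profile_get_thread_pcycles();
+   do_work();                                       // region being measured
+   unsigned long long used = qurt_profile_get_thread_pcycles() - start;
+   qurt_profile_enable(0);                          // stop cycle counting
+   @endcode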
+*/
+/* ======================================================================*/
+unsigned long long int qurt_profile_get_thread_pcycles(void);
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_get_core_pcycles
+   @xreflabel{hdr:qurt_get_core_pcycles}
+   Gets the count of core processor cycles executed.\n
+   Returns the current number of running processor cycles executed since the Hexagon
+   processor was last reset.
+
+   This value is based on the hardware core clock, which varies in speed according to the
+   processor clock frequency.
+
+   @note1hang Because the hardware core clock stops running when the processor shuts
+              down (due to all of the hardware threads being idle), treat the cycle values returned
+              by this operation as relative rather than absolute.
+
+   @note1cont Thread cycle counts are valid only in the V4 Hexagon processor version.
+
+   @return
+   Integer -- Current count of core processor cycles.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+unsigned long long int qurt_get_core_pcycles(void);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_get_idle_pcycles
+
+   @deprecated Use #qurt_profile_get_idle_pcycles2 instead.
+
+   Gets the current idle processor cycle counts for a maximum of 6 hardware threads. Use
+   #qurt_profile_get_idle_pcycles2 to read pcycles without a limit on the maximum number of
+   hardware threads.
+
+   This operation accepts a pointer to a user-defined array, and writes to the array the current
+   idle cycle count for each hardware thread.
+
+   Each count value represents the number of processor cycles that have elapsed on the
+   corresponding hardware thread while that thread has been in Wait mode.\n
+
+   @note1hang This operation does not return the idle cycles that occur when the Hexagon
+              processor shuts down (due to all of the hardware threads being idle).
+              Idle cycle counts accumulate regardless of whether profiling is enabled,
+              and are reset by #qurt_profile_reset_idle_pcycles.
+
+   @param[out] pcycles User array where the function stores the current idle cycle count values.
+                       The array size should be at least the number of hardware threads of interest.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+void qurt_profile_get_idle_pcycles (unsigned long long *pcycles);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_get_idle_pcycles2
+   Gets the current idle processor cycle counts for the maximum available hardware threads.
+
+   This operation accepts a pointer to a user-defined array with a length in bytes, and writes
+   to the array the current idle cycle count for each hardware thread.
+
+   Each count value represents the number of processor cycles that have elapsed on the
+   corresponding hardware thread while that thread has been in Wait mode.\n
+
+   @note1hang This operation does not return the idle cycles that occur when the Hexagon
+              processor shuts down (due to all of the hardware threads being idle).
+              Idle cycle counts accumulate regardless of whether profiling is enabled,
+              and are reset by #qurt_profile_reset_idle_pcycles.
+
+   @param[out] pcycles User array where the function stores the current idle cycle count values.
+                       The array size should equal the number of hardware threads of interest.
+                       Call #qurt_sysenv_get_max_hw_threads to determine the array size required.
+
+   @param[in] length_in_bytes Length of the pcycles array in bytes. If the array is smaller
+                              than required for the maximum available hardware threads,
+                              an error code is returned.
+
+   @return
+   #QURT_EOK -- Successful operation; all data was stored to the destination array. \n
+   #QURT_EFAILED -- Operation failed because the #pcycles array is too small.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+int qurt_profile_get_idle_pcycles2 (unsigned long long *pcycles, unsigned int length_in_bytes);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_get_threadid_pcycles
+
+   @deprecated Use #qurt_profile_get_threadid_pcycles2 instead.
+
+   Gets the current per-hardware-thread running cycle counts for the specified QuRT
+   thread for a maximum of 6 hardware threads.
+
+   Each count value represents the number of processor cycles that have elapsed on the
+   corresponding hardware thread while that thread has been scheduled for the specified
+   QuRT thread.
+
+   @note1hang Profiling must be enabled first to start the cycle counting.
+              Cycles accumulate while profiling is enabled, and are reset by
+              #qurt_profile_reset_threadid_pcycles.
+
+   @param[in] thread_id Valid thread identifier.
+   @param[out] pcycles Pointer to a user array where the function stores the current running
+                       cycle count values. The array size should be at least the number of
+                       hardware threads of interest.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+void qurt_profile_get_threadid_pcycles (int thread_id, unsigned long long *pcycles);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_get_threadid_pcycles2
+
+   Gets the current per-hardware-thread running cycle counts for the specified QuRT
+   thread for the maximum available hardware threads.
+
+   Each count value represents the number of processor cycles that have elapsed on the
+   corresponding hardware thread while that thread has been scheduled for the specified
+   QuRT thread.
+
+   @note1hang Profiling must be enabled first to start the cycle counting.
+              Cycles accumulate while profiling is enabled, and are reset by
+              #qurt_profile_reset_threadid_pcycles.
+
+   @param[in] thread_id Thread identifier.
+   @param[out] pcycles Pointer to a user array where the function stores the current running
+                       cycle count values. The array size should equal the number of
+                       hardware threads of interest.
+                       Call #qurt_sysenv_get_max_hw_threads to determine the array size required.
+   @param[in] length_in_bytes Length of the pcycles array in bytes. If the array is smaller
+                              than required for the maximum available hardware threads,
+                              an error code is returned.
+
+   @return
+   #QURT_EOK -- Successful operation; all data was stored to the destination array. \n
+   #QURT_EFAILED -- Operation failed because the #pcycles array is too small. \n
+   #QURT_ENOTHREAD -- Operation failed because #thread_id is invalid.
+
+   @dependencies
+   None.
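+
+   A minimal usage sketch (illustrative only; thread_id is assumed to be a valid
+   QuRT thread identifier, and QURT_MAX_HTHREAD_LIMIT from qurt_consts.h is used
+   as a safe upper bound on the hardware thread count):
+   @code
+   unsigned long long pcycles[QURT_MAX_HTHREAD_LIMIT];
+   int rc = qurt_profile_get_threadid_pcycles2(thread_id, pcycles, sizeof(pcycles));
+   if (rc == QURT_EOK) {
+       // pcycles[i] holds the running cycle count on hardware thread i.
+   }
+   @endcode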
+*/
+/* ======================================================================*/
+int qurt_profile_get_threadid_pcycles2 (int thread_id, unsigned long long *pcycles, unsigned int length_in_bytes);
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_reset_threadid_pcycles
+   @xreflabel{hdr:qurt_profile_reset_threadid_pcycles}
+   Sets the per-hardware-thread running cycle counts to zero for the specified QuRT thread.
+
+   @param[in] thread_id Thread identifier.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+void qurt_profile_reset_threadid_pcycles (int thread_id);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_enable
+   @xreflabel{hdr:qurt_profile_enable}
+   Enables or disables profiling.\n
+   Enables or disables cycle counting of the running and idle processor cycles.
+   Profiling is disabled by default. \n
+
+   @note1hang Enabling profiling does not automatically reset the cycle counts -- this must be
+              done explicitly by calling the reset operations before starting cycle counting.
+              Cycle counting starts the instant profiling is enabled with this API, and
+              halts when profiling is disabled.
+
+   @param[in] enable Profiling. Values: \n
+                     - 0 -- Disable profiling \n
+                     - 1 -- Enable profiling @tablebulletend
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+void qurt_profile_enable (int enable);
+
+/*======================================================================*/
+/**@ingroup func_qurt_get_hthread_pcycles
+   @xreflabel{hdr:qurt_get_hthread_pcycles}
+   Reads the GCYCLE_nT register to allow performance measurement when N threads are in run mode.\n
+
+   @note1hang Returns 0 when the architecture is earlier than v67 or for an invalid HW thread ID.
+
+   @param[in] n Threads in run mode. Valid values are 1 through .
+
+   @return
+   Value read from the GCYCLE_nT register. This value indicates the total number of pcycles
+   executed, from reset to the current point of execution, while n threads are in run mode.
+
+   @dependencies
+   PMU must be enabled.
+*/
+/* ======================================================================*/
+unsigned int qurt_get_hthread_pcycles(int n);
+
+/*======================================================================*/
+/**@ingroup func_qurt_get_hthread_commits
+   @xreflabel{hdr:qurt_get_hthread_commits}
+   Reads the GCOMMIT_nT register to allow performance measurement when N threads are in run mode.\n
+
+   @note1hang Returns 0 when the architecture is earlier than v67 or for an invalid HW thread ID.
+
+   @param[in] n Threads in run mode. Valid values: 1 through .
+
+   @return
+   Value read from the GCOMMIT_nT register. This value indicates the total number of packets
+   committed, from reset to the current point of execution, while n threads are in run mode.
+
+   @dependencies
+   PMU must be enabled.
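+
+   A minimal usage sketch (illustrative only) that estimates packets per cycle
+   while two threads are in run mode:
+   @code
+   unsigned int cycles  = qurt_get_hthread_pcycles(2);
+   unsigned int commits = qurt_get_hthread_commits(2);
+   // Both calls return 0 on pre-v67 targets or for an invalid thread count.
+   double ppc = (cycles != 0U) ? (double)commits / (double)cycles : 0.0;
+   @endcode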
+*/
+/* ======================================================================*/
+unsigned int qurt_get_hthread_commits(int n);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_devtree.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_devtree.h
new file mode 100755
index 0000000000000..4adee45bb44a2
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_devtree.h
@@ -0,0 +1,161 @@
+#ifndef QURT_DEVTREE_H
+#define QURT_DEVTREE_H
+/**
+  @file qurt_devtree.h
+  @brief Prototypes and structures for device tree aware QuRT library functions.
+
+Copyright (c) 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+*/
+/* qurt_callback is included by qurt_qdi_driver.h and depends on NULL being defined.
+   The callback is not used here, so define NULL here to avoid including the world. */
+#ifndef NULL
+#define NULL ((void *) 0)
+#endif
+
+#include "libfdt.h"
+#include "DTBExtnLib.h"
+#include "qurt_qdi_ext.h"
+#include "qurt_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define INVALID_BLOB_ID (-1)
+#define DEFAULT_BLOB_ID 0
+
+/** QuRT device tree mapping macros. */
+#define QURT_DT_MAPPING_FAILED (-1)
+#define QURT_DT_FLAG_ISLAND 0x1
+#define QURT_DT_FLAG_PHYSADDR 0x2
+
+/** Device tree type for the root PD device tree.
+    The root PD device tree typically describes the hardware in the subsystem.
+    This is the /soc portion of the device tree. */
+#define QURT_DT_BLOB_TYPE_ROOT 0
+
+/** Device tree type for the local device tree.
+    The local device tree typically contains the software settings.
+    This is the /sw portion of the device tree. */
+#define QURT_DT_BLOB_TYPE_LOCAL 1
+
+int qurt_devtree_init(void);
+
+/**@ingroup func_qurt_dt_mapping_create
+   Creates a memory mapping from the specified property of the specified device
+   tree node. Returns virtual addresses and sizes.
+
+   @param[in] devtreeNode Device tree node.
+   @param[in] flags Flags to configure memory. Overloaded as the property
+                    index if regionName is NULL.
+   @param[in] regionName Identifies the property to use for the mapping; should
+                         resemble a region.
+   @param[in] regionIdx Index of the range to use within the property.
+   @param[out] vaddr Return pointer for the virtual region address.
+   @param[out] size Return pointer for the virtual region size.
+
+   @return
+   Result code indicating success or failure. \n
+*/
+int qurt_dt_mapping_create(fdt_node_handle *devtreeNode, int flags, char *regionName, int regionIdx,
+                           unsigned long long *vaddr, unsigned long long *size);
+
+/**@ingroup func_qurt_dt_mapping_create2
+
+   Creates a memory mapping from the specified property of the specified device
+   tree node.
+
+   Returns virtual addresses and sizes according to the architecture (that is, either 32-bit or 64-bit).
+
+   @param[in] devtreeNode Device tree node.
+
+   @param[in] dt_map_flags Flags to configure the memory mapping; reserved for future use.
+                           (0) - Default value; assumes the details from the DT node are a physical address and size.
+                           QURT_DT_FLAG_ISLAND
+
+                           NOTE: The PA must be added to the corresponding island spec to create an island mapping.
+
+   @param[in] regionName NULL, or the name of the index in the range to return; should
+                         resemble a region. For example, reg-names = "base", "rx", "tx";
+
+   @param[in] regionIdx Index of the range to return. For example, reg = <0x1000 0x20>, <0x10000 0x100>, <0x18000 0x100 >;
+
+                        NOTE: If the client specifies both regionName and regionIdx, the
+                        region name takes precedence and the region index is ignored.
+
+   @param[in] dt_map_perm Mapping access permissions (R/W):
+                          QURT_PERM_READ
+                          QURT_PERM_WRITE
+
+   @param[in] cache_attr QuRT cache mode types:
+                         QURT_MEM_CACHE_DEVICE
+                         QURT_MEM_CACHE_WRITEBACK
+                         Other required cache type enums in qurt_types.h can also be passed.
+
+                         NOTE: There is no default value for the cache and permission attributes.
+                         The client must always pass one of the defined flags.
+
+   @param[out] vaddr Return pointer to the variable that holds the virtual address.
+   @param[out] size Return pointer for the virtual region size.
+
+   @return
+   #QURT_EOK Success; the mapping was created properly.
+   #QURT_DT_MAPPING_FAILED Failed to create the mapping.
+   #QURT_EINVALID Mismatch in the architecture.
+
+   else an FdtLib or third-party error code.
+
+*/
+int qurt_dt_mapping_create2(fdt_node_handle *devtreeNode, unsigned int dt_map_flags,
+                            char *regionName, int regionIdx, unsigned int dt_map_perm, int cache_attr, void **vaddr, size_t *size);
+
+/**@ingroup func_qurt_dt_isr_register
+   Device tree aware registration of an interrupt service routine (ISR) to an ISR thread.
+   The interrupt defined in the specified device tree node is enabled when this function returns success.
+
+   @datatypes
+   #qurt_thread_t \n
+   #fdt_node_handle
+
+   @param[in] dt_node Device tree node that specifies the interrupt property.
+   @param[in] dt_int_index Index of the specific interrupt to use within the device tree node structure.
+                           Specify either this or dt_int_name; use -1 if the name string is used.
+   @param[in] dt_int_name Name of the specific interrupt to use within the device tree node structure.
+                          Specify either this or dt_int_index; use NULL if the index is used.
+   @param[in] isr_thread_id ISR thread ID, returned from qurt_isr_create(), defined by qurt_isr_register2().
+   @param[in] prio Priority of the ISR, defined by qurt_isr_register2().
+   @param[in] flags Defines the ACK type. Values: \n
+                    #QURT_INT_NON_DELAYED_ACK - The ISR is acknowledged by the interrupt handling routine
+                    in the kernel.
+                    #QURT_INT_DELAYED_ACK - The client chooses to acknowledge.
+                    Defined by qurt_isr_register2().
+   @param[in] isr ISR with prototype void isr (void *arg, int int_num), defined by qurt_isr_register2().
+   @param[in] arg First argument of the ISR when it is called to service the interrupt, defined by qurt_isr_register2().
+
+   @return
+   #QURT_EOK -- Successfully registered the ISR for the interrupt \n
+   #QURT_EINT -- Interrupt not configured \n
+   #QURT_EINVALID -- Invalid thread ID \n
+   #QURT_EDISABLED -- The feature is disabled \n
+   #QURT_EDUPLICATE -- Interrupt is already registered
+
+   @dependencies
+   Create the thread ID with qurt_isr_create().
+   Complete ISR registration with qurt_isr_register2().
+ */
+int qurt_dt_isr_register(fdt_node_handle *dt_node, int dt_int_index, char * dt_int_name, qurt_thread_t isr_thread_id,
+                         unsigned short prio, unsigned short flags, void (*isr) (void *, int), void *arg);
+
+/**@ingroup func_qurt_dt_blob_id_get
+   Returns the blob ID for the blob type passed.
+   The value returned from this API can be passed as the blob ID parameter to DTBExtnLib APIs.
+
+   @param[in] blob_type Blob type to look up.
+   @return Blob ID for the passed blob type.
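+
+   A minimal usage sketch (illustrative only; treating INVALID_BLOB_ID as a
+   failure sentinel is an assumption, not documented behavior):
+   @code
+   int blob_id = qurt_dt_blob_id_get(QURT_DT_BLOB_TYPE_LOCAL);
+   if (blob_id != INVALID_BLOB_ID) {
+       // Pass blob_id to the DTBExtnLib APIs that take a blob ID parameter.
+   }
+   @endcode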
+*/ +int qurt_dt_blob_id_get(unsigned int blob_type); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_ecc.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_ecc.h new file mode 100755 index 0000000000000..09312684e99af --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_ecc.h @@ -0,0 +1,168 @@ +#ifndef QURT_ECC_H +#define QURT_ECC_H + + +/*===================================================================== + + @file qurt_ecc.h + @brief Prototypes of QuRT memory ECC API functions + + Copyright (c) 2018, 2020-2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + TYPEDEFS +=============================================================================*/ +/** @addtogroup exception_handling_types +@{ */ +// ECC memory definition +typedef enum { + QURT_ECC_MEM_L1_ICACHE = 0, /**< ECC memory L1 ICache. */ + QURT_ECC_MEM_L1_DCACHE = 1, /**< ECC memory L1 DCache.*/ + QURT_ECC_MEM_L2_CACHE = 2, /**< ECC memory L2 Cache.*/ + QURT_ECC_MEM_VTCM = 3 /**< ECC memory VTCM.*/ +} qurt_ecc_memory_t; +/** @} */ /* end_addtogroup exception_handling_types */ + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/** @addtogroup exception_handling_macros +@{ */ + +#define QURT_ECC_ERR_DETECTED_STATUS 0 /**< ECC error detected. */ +#define QURT_ECC_ERR_TYPE 1 /**< ECC error type.*/ +// ECC status type + +#define QURT_ECC_CORRECTABLE_COUNT (1<<0) /**< ECC correctable count.*/ +#define QURT_ECC_UNCORRECTABLE_COUNT (1<<1) /**< ECC uncorrectable count.*/ +#define QURT_ECC_REGION_LOGGING (1<<2) /**< ECC region logging.*/ +// ECC enable/disable definition + +#define QURT_ECC_PROTECTION_DISABLE (0<<0) /**< Bit 0. */ +#define QURT_ECC_PROTECTION_ENABLE (1<<0) /**< Bit 0. */ +/** @} */ /* end_addtogroup exception_handling_macros */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ + + +/**@ingroup func_qurt_ecc_enable + Enables or disables ECC protection on a specified memory. + + @datatypes + #qurt_ecc_memory_t + + @param[in] memory Set to one of the following values: + - #QURT_ECC_MEM_L1_ICACHE + - #QURT_ECC_MEM_L1_DCACHE + - #QURT_ECC_MEM_L2_CACHE + - #QURT_ECC_MEM_VTCM @tablebulletend + + @param[in] enable Set to one of the following values: + - #QURT_ECC_PROTECTION_ENABLE + - #QURT_ECC_PROTECTION_DISABLE @tablebulletend + + @return + - #QURT_EOK -- ECC enabling or disabling setup is performed successfully + - Others -- Failure + + @dependencies + None. + */ +int qurt_ecc_enable( qurt_ecc_memory_t memory, unsigned int enable ); + + +/**@ingroup func_qurt_ecc_get_error_status + Gets ECC error status for a specified memory. 
+ + @datatypes + #qurt_ecc_memory_t + + @param[in] memory Set to one of the following: + - #QURT_ECC_MEM_L1_ICACHE + - #QURT_ECC_MEM_L1_DCACHE + - #QURT_ECC_MEM_L2_CACHE + - #QURT_ECC_MEM_VTCM @tablebulletend + + @param[in] type Set to one of the following: + - #QURT_ECC_ERR_DETECTED_STATUS + - #QURT_ECC_ERR_TYPE @tablebulletend + + @return + Returns the following when the type is #QURT_ECC_ERR_DETECTED_STATUS: + - 0 -- No error detected \n + - 1 -- At least one error detected \n + Returns the following when the type is #QURT_ECC_ERR_TYPE: \n + - 0 through 1 -- Correctable error \n + - 2 -- Uncorrectable error + + @dependencies + None. + */ +int qurt_ecc_get_error_status( qurt_ecc_memory_t memory, unsigned int type ); + + +/**@ingroup func_qurt_ecc_get_error_count + Gets the ECC error count for a specified memory. + + @datatypes + #qurt_ecc_memory_t + + @param[in] memory Set to one of the following values:\n + - #QURT_ECC_MEM_L1_ICACHE \n + - #QURT_ECC_MEM_L1_DCACHE \n + - #QURT_ECC_MEM_L2_CACHE \n + - #QURT_ECC_MEM_VTCM @tablebulletend + + @param[in] type Set to one of the following values: \n + - #QURT_ECC_CORRECTABLE_COUNT \n + - #QURT_ECC_UNCORRECTABLE_COUNT @tablebulletend + + @return + Error count for the specified error type. + + @dependencies + None. + */ +int qurt_ecc_get_error_count( qurt_ecc_memory_t memory, unsigned int type ); + + +/**@ingroup func_qurt_ecc_clear_error_count + Clears ECC error count or region logging for a specified memory. + + @datatypes + #qurt_ecc_memory_t + + @param[in] memory Set to one of the following values: \n + - #QURT_ECC_MEM_L1_ICACHE \n + - #QURT_ECC_MEM_L1_DCACHE \n + - #QURT_ECC_MEM_L2_CACHE \n + - #QURT_ECC_MEM_VTCM @tablebulletend + + @param[in] type Set to one or multiple OR'ed of the following values: \n + - #QURT_ECC_CORRECTABLE_COUNT \n + - #QURT_ECC_UNCORRECTABLE_COUNT \n + - #QURT_ECC_REGION_LOGGING @tablebulletend + + @return + #QURT_EOK -- Error count successfully cleared \n + Others -- Failure at clearing the error count + + @dependencies + None. + */ +int qurt_ecc_clear_error_count( qurt_ecc_memory_t memory, unsigned int type ); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ECC_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_error.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_error.h new file mode 100755 index 0000000000000..f4666b396c378 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_error.h @@ -0,0 +1,149 @@ +#ifndef QURT_ERROR_H +#define QURT_ERROR_H + +/** + @file qurt_error.h + Error results- QURT defines a set of standard symbols for the error result values. This file lists the + symbols and their corresponding values. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021-2022 , 2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc.. + ======================================================================*/ +#include "qurt_except.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup chapter_error +@{ */ + +/*===================================================================== +Constants and macros +======================================================================*/ +#define QURT_EOK 0 /**< Operation successfully performed. */ +#define QURT_EVAL 1 /**< Wrong values for the parameters. The specified page does not exist. 
*/ +#define QURT_EMEM 2 /**< Not enough memory to perform the operation.*/ + +#define QURT_EINVALID 4 /**< Invalid argument value; invalid key. */ +/** @cond */ +#define QURT_EUNKNOWN 6 /**< Defined but never used in QuRT. */ +#define QURT_ENOMSGS 7 /**< Message queue is empty. */ +#define QURT_EBADF 9 /**< Bad message queue descriptor. */ +/** @endcond */ +#define QURT_EFAILED 12 /**< Operation failed. */ + +#define QURT_ENOTALLOWED 13 /**< Operation not allowed. */ + +/** @cond */ +#define QURT_EDUPCLSID 14 /*< Duplicate class ID. */ +/** @endcond */ +/** @cond rest_reg_dist */ +#define QURT_ENOREGISTERED 20 /**< No registered interrupts.*/ +/** @endcond */ + + +/** @cond */ +#define QURT_EISDB 21 /*< Power collapse failed due to ISDB being enabled. */ +#define QURT_ESTM 22 /*< Power collapse failed in a Single-threaded mode check. */ +/** @endcond */ + + +/** @cond rest_reg_dist */ +#define QURT_ETLSAVAIL 23 /**< No free TLS key is available. */ +#define QURT_ETLSENTRY 24 /**< TLS key is not already free. */ +/** @endcond */ + +#define QURT_EINT 26 /**< Invalid interrupt number (not registered). */ +/** @cond rest_reg_dist */ +#define QURT_ESIG 27 /**< Invalid signal bitmask (cannot set more than one signal at a time). */ +/** @endcond */ + +/** @cond */ +#define QURT_EHEAP 28 /**< No heap space is available. */ +#define QURT_ENOSPC 28 /**< No space to create another queue in the system. */ +#define QURT_EMEMMAP 29 /**< Physical address layout is not supported by the kernel. */ +/** @endcond */ +/** @cond rest_reg_dist */ +#define QURT_ENOTHREAD 30 /**< Thread no longer exists. */ +/** @endcond */ +/** @cond */ +#define QURT_EL2CACHE 31 /**< L2cachable is not supported in kernel invalidate/cleaninv. */ +/** @endcond */ +/** @cond rest_reg_dist */ +#define QURT_EALIGN 32 /**< Not aligned. */ +#define QURT_EDEREGISTERED 33 /**< Interrupt is already deregistered.*/ +/** @endcond */ + +/** @cond internal_only */ + +#define QURT_ETLBCREATESIZE 34 /**< TLB create error -- Incorrect size.*/ +#define QURT_ETLBCREATEUNALIGNED 35 /**< TLB create error -- Unaligned address.*/ +/** @endcond */ +/** @cond rest_reg_dist*/ +#define QURT_EEXISTS 35 /**< File or message queue already exists. */ +#define QURT_ENAMETOOLONG 36 /**< Name too long for message queue creation. */ +#define QURT_EPRIVILEGE 36 /**< Caller does not have privilege for this operation.*/ + +#define QURT_ECANCEL 37 /**< A cancellable request was canceled because the associated process was asked to exit.*/ +/** @endcond */ + +/** @cond */ +#define QURT_EISLANDTRAP 38 /*< Unsupported TRAP is called in Island mode.*/ + +#define QURT_ERMUTEXUNLOCKNONHOLDER 39 /*< Rmutex unlock by a non-holder.*/ +#define QURT_ERMUTEXUNLOCKFATAL 40 /*< Rmutex unlock error, all except the non-holder error.*/ +#define QURT_EMUTEXUNLOCKNONHOLDER 41 /*< Mutex unlock by a non-holder.*/ +#define QURT_EMUTEXUNLOCKFATAL 42 /*< Mutex unlock error, all except the non-holder error.*/ +#define QURT_EINVALIDPOWERCOLLAPSE 43 /*< Invalid power collapse mode requested. */ +/** @endcond */ +#define QURT_EISLANDUSEREXIT 44 /**< User call has resulted in island exit.*/ +#define QURT_ENOISLANDENTRY 45 /**< Island mode had not yet been entered.*/ +#define QURT_EISLANDINVALIDINT 46 /**< Exited Island mode due to an invalid island interrupt.*/ +/** @cond rest_reg_dist */ +#define QURT_ETIMEDOUT 47 /**< Operation timed-out. */ +#define QURT_EALREADY 48 /**< Operation already in progress. */ +/** @endcond */ + +#define QURT_ERETRY 49 /*< Retry the operation. 
*/
+#define QURT_EDISABLED 50 /*< Resource disabled. */
+#define QURT_EDUPLICATE 51 /*< Duplicate resource. */
+#define QURT_EBADR 53 /*< Invalid request descriptor. */
+#define QURT_ETLB 54 /*< Exceeded maximum allowed TLBs. */
+#define QURT_ENOTSUPPORTED 55 /*< Operation not supported. */
+/** @cond rest_reg_dist */
+#define QURT_ENORESOURCE 56 /**< No resource. */
+/** @endcond */
+
+#define QURT_EDTINIT 57 /**< Problem with device tree initialization. */
+#define QURT_EBUFLOCK 58 /*< Buffer lock failed because it was already locked many times. */
+#define QURT_ELOCKED 59 /**< Current operation failed as the buffer is locked. */
+#define QURT_EMSGSIZE 90 /*< Message queue msg_len is greater than the mq_msgsize attribute of the message queue. */
+
+
+#define QURT_ENOTCONFIGURED 91 /*< Interrupt is NOT configured. */
+
+#define QURT_EBANDWIDTHLIMIT 92 /*< Message queue send exceeds the bandwidth limit. */
+
+#define QURT_ECFIVIOLATION 93 /*< CFI violation detected. */
+
+#define QURT_EDESTROY 94 /**< A destroy request was made to waiting threads.*/
+
+#define QURT_EHMXNOTAVAIL 95 /**< HMX is not available to target thread.*/
+#define QURT_EHMXNOTDETACHABLE 96 /**< HMX is not detachable from target thread.*/
+
+#define QURT_EFATAL -1 /**< Fatal error. */
+
+/** @} */ /* end_addtogroup chapter_error */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_ERROR_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_event.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_event.h
new file mode 100755
index 0000000000000..987f0fe79f227
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_event.h
@@ -0,0 +1,452 @@
+#ifndef QURT_EVENT_H
+#define QURT_EVENT_H
+/**
+  @file qurt_event.h
+  @brief Prototypes of kernel event API functions.
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2018-2021, 2023 Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#include "qurt_consts.h"
+#include "qurt_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * System environment object type.
+ */
+/**@addtogroup sys_env_types
+@{ */
+/** QuRT swap pool information type. */
+typedef struct qurt_sysenv_swap_pools {
+    /** @cond */
+    unsigned int spoolsize;  /* Swap pool size.*/
+    unsigned int spooladdr;  /* Swap pool start address.*/
+    /** @endcond */
+}qurt_sysenv_swap_pools_t;
+
+/** QuRT application heap information type. */
+typedef struct qurt_sysenv_app_heap {
+    /** @cond */
+    unsigned int heap_base;  /* Heap base address.*/
+    unsigned int heap_limit; /* Heap end address.*/
+    /** @endcond */
+} qurt_sysenv_app_heap_t ;
+
+/** QuRT architecture version information type. */
+typedef struct qurt_sysenv_arch_version {
+    /** @cond */
+    unsigned int arch_version; /* Architecture version.*/
+    /** @endcond */
+}qurt_arch_version_t;
+
+/** QuRT maximum hardware threads information type. */
+typedef struct qurt_sysenv_max_hthreads {
+    /** @cond */
+    unsigned int max_hthreads; /* Maximum number of hardware threads.*/
+    /** @endcond */
+}qurt_sysenv_max_hthreads_t;
+
+/** QuRT active hardware threads information type. */
+typedef struct qurt_sysenv_hthreads {
+    /** @cond */
+    unsigned int hthreads; /* Number of hardware threads initialized by QuRT.*/
+    /** @endcond */
+}qurt_sysenv_hthreads_t;
+
+/** QuRT maximum pi priority information type. */
+typedef struct qurt_sysenv_max_pi_prio {
+    /** @cond */
+    unsigned int max_pi_prio; /* Maximum pi priority.*/
+    /** @endcond */
+}qurt_sysenv_max_pi_prio_t;
+
+/** QuRT process name information type. */
+typedef struct qurt_sysenv_procname {
+    /** @cond */
+    union {
+        unsigned int asid; /* Address space ID.*/
+        unsigned int pid;  /* Process ID.*/
+    };
+    char name[QURT_MAX_NAME_LEN]; /* Process name.*/
+    /** @endcond */
+}qurt_sysenv_procname_t;
+
+/** QuRT stack profile count information type. */
+typedef struct qurt_sysenv_stack_profile_count {
+    /** @cond */
+    unsigned int count;           /* Stack profile count for usage.*/
+    unsigned int count_watermark; /* Stack profile count for watermark.*/
+    /** @endcond */
+}qurt_sysenv_stack_profile_count_t;
+
+/**
+  QuRT system error event type.
+ */
+typedef struct _qurt_sysevent_error_t
+{
+    unsigned int thread_id; /**< Thread ID. */
+    unsigned int fault_pc;  /**< Fault PC. */
+    unsigned int sp;        /**< Stack pointer. */
+    unsigned int badva;     /**< Virtual data address where the exception occurred. */
+    unsigned int cause;     /**< QuRT error result. */
+    unsigned int ssr;       /**< Supervisor status register. */
+    unsigned int fp;        /**< Frame pointer. */
+    unsigned int lr;        /**< Link register. */
+    unsigned int pid;       /**< PID of the process to which this thread belongs.*/
+ } qurt_sysevent_error_t ;
+
+/** QuRT system error event type, extended with the framekey and reserved fields. */
+typedef struct _qurt_sysevent_error_1_t
+{
+    unsigned int thread_id; /**< Thread ID. */
+    unsigned int fault_pc;  /**< Fault PC. */
+    unsigned int sp;        /**< Stack pointer. */
+    unsigned int badva;     /**< Virtual data address where the exception occurred. */
+    unsigned int cause;     /**< QuRT error result. */
+    unsigned int ssr;       /**< Supervisor status register. */
+    unsigned int fp;        /**< Frame pointer. */
+    unsigned int lr;        /**< Link register. */
+    unsigned int pid;       /**< PID of the process to which this thread belongs.*/
+    unsigned int fkey;      /**< Framekey.*/
+    unsigned int reserved1; /**< Reserved.*/
+    unsigned int reserved2; /**< Reserved.*/
+    unsigned int reserved3; /**< Reserved.*/
+ } qurt_sysevent_error_1_t ;
+
+/** QuRT page fault error event information type. */
+typedef struct qurt_sysevent_pagefault {
+    qurt_thread_t thread_id; /**< Thread ID of the page fault thread. */
+    unsigned int fault_addr; /**< Accessed address that caused the page fault. */
+    unsigned int ssr_cause;  /**< SSR cause code for the page fault. */
+} qurt_sysevent_pagefault_t ;
+/** @} */ /* @endaddtogroup sys_env_types */
+/*=============================================================================
+                                                                 FUNCTIONS
+=============================================================================*/
+
+/*======================================================================*/
+/**
+   Gets the environment swap pool 0 information from the kernel.
+
+   @datatypes
+   #qurt_sysenv_swap_pools_t
+
+   @param[out] pools Pointer to the pools information.
+
+   @return
+   #QURT_EOK -- Success.
+
+   @dependencies
+   None.
+*/
+int qurt_sysenv_get_swap_spool0 (qurt_sysenv_swap_pools_t *pools );
+
+/*
+   Gets the environment swap pool 1 information from the kernel.
+
+   @datatypes
+   #qurt_sysenv_swap_pools_t
+
+   @param[out] pools Pointer to the pools information.
+
+   @return
+   #QURT_EOK -- Success.
+
+   @dependencies
+   None.
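+
+   A minimal usage sketch (illustrative only):
+   @code
+   qurt_sysenv_swap_pools_t pool;
+   if (qurt_sysenv_get_swap_spool1(&pool) == QURT_EOK) {
+       // pool.spooladdr and pool.spoolsize describe swap pool 1.
+   }
+   @endcode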
+*/ +int qurt_sysenv_get_swap_spool1(qurt_sysenv_swap_pools_t *pools ); + +/**@ingroup func_qurt_sysenv_get_app_heap + Gets information on the program heap from the kernel. + + @datatypes + #qurt_sysenv_app_heap_t + + @param[out] aheap Pointer to information on the program heap. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_app_heap(qurt_sysenv_app_heap_t *aheap ); + +/**@ingroup func_qurt_sysenv_get_arch_version + Gets the Hexagon processor architecture version from the kernel. + + @datatypes + #qurt_arch_version_t + + @param[out] vers Pointer to the Hexagon processor architecture version. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter + + @dependencies + None. +*/ +int qurt_sysenv_get_arch_version(qurt_arch_version_t *vers); + +/**@ingroup func_qurt_sysenv_get_max_hw_threads + Gets the maximum number of hardware threads supported in the Hexagon processor. + The API includes the disabled hardware threads to reflect the maximum + hardware thread count. + For example, if the image is configured for four hardware threads and hthread_mask is set to 0x5 in + cust_config.xml, only HW0 and HW2 are initialized by QuRT. + HW1 and HW3 are not used at all. Under such a scenario, + qurt_sysenv_get_max_hw_threads() still returns four. + + @datatypes + #qurt_sysenv_max_hthreads_t + + @param[out] mhwt Pointer to the maximum number of hardware threads supported in the Hexagon processor. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_max_hw_threads(qurt_sysenv_max_hthreads_t *mhwt ); + +/**@ingroup func_qurt_sysenv_get_hw_threads + Gets the number of hardware threads initialized by QuRT in Hexagon processor. + For example, if the image is configured for four hardware threads and hthread_mask is set to 0x5 in + cust_config.xml, QuRT only initializes HW0 and HW2. + HW1 and HW3 are not used. In this scenario, qurt_sysenv_get_hw_threads() returns 2. + + @datatypes + #qurt_sysenv_hthreads_t + + @param[out] mhwt Pointer to the number of hardware threads active in the Hexagon processor. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_hw_threads(qurt_sysenv_hthreads_t *mhwt ); + +/**@ingroup func_qurt_sysenv_get_max_pi_prio + Gets the maximum priority inheritance mutex priority from the kernel. + + @datatypes + #qurt_sysenv_max_pi_prio_t + + @param[out] mpip Pointer to the maximum priority inheritance mutex priority. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_max_pi_prio(qurt_sysenv_max_pi_prio_t *mpip ); + +/**@ingroup func_qurt_sysenv_get_process_name2 + Gets information on the system environment process names based on the client_handle argument. + + @datatypes + #qurt_sysenv_procname_t + + @param[in] client_handle Obtained from the current invocation function (Section 3.4.1). + @param[out] pname Pointer to information on the process names in the system. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_process_name2(int client_handle, qurt_sysenv_procname_t *pname ); + +/**@ingroup func_qurt_sysenv_get_process_name + Gets information on the system environment process names from the kernel. 
+ + @datatypes + #qurt_sysenv_procname_t + + @param[out] pname Pointer to information on the process names in the system. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_process_name(qurt_sysenv_procname_t *pname ); + +/**@ingroup func_qurt_sysenv_get_stack_profile_count + Gets information on the stack profile count from the kernel. + + @datatypes + #qurt_sysenv_stack_profile_count_t + + @param[out] count Pointer to information on the stack profile count. + + @return + #QURT_EOK -- Success. + + @dependencies + None. +*/ +int qurt_sysenv_get_stack_profile_count(qurt_sysenv_stack_profile_count_t *count ); + +/**@ingroup func_qurt_exception_wait + Registers the program exception handler. + This function assigns the current thread as the QuRT program exception handler and suspends the + thread until a program exception occurs. + + When a program exception occurs, the thread is awakened with error information + assigned to the parameters of this operation. + + @note1hang If no program exception handler is registered, or if the registered handler + calls exit, QuRT raises a kernel exception. + If a thread runs in Supervisor mode, any errors are treated as kernel + exceptions. + + @param[out] ip Pointer to the instruction memory address where the exception occurred. + @param[out] sp Stack pointer. + @param[out] badva Pointer to the virtual data address where the exception occurred. + @param[out] cause Pointer to the QuRT error result code. + + @return + Registry status: \n + Thread identifier -- Handler successfully registered. \n + #QURT_EFATAL -- Registration failed. + + @dependencies + None. +*/ +unsigned int qurt_exception_wait (unsigned int *ip, unsigned int *sp, + unsigned int *badva, unsigned int *cause); + +unsigned int qurt_exception_wait_ext (qurt_sysevent_error_t * sys_err); + +/**@ingroup func_qurt_exception_wait3 + Registers the current thread as the QuRT program exception handler, and suspends the thread until a + program exception occurs. + When a program exception occurs, the thread is awakened with error information assigned to the specified + error event record. + If a program exception is raised when no handler is registered (or when a handler is registered, but it calls + exit), the exception is treated as fatal.\n + @note1hang If a thread runs in Monitor mode, all exceptions are treated as kernel exceptions.\n + @note1cont This function differs from qurt_exception_wait() by returning the error information in a data + structure rather than as individual variables. It also returns additional information (for example, SSR, FP, and LR). + + @param[out] sys_err Pointer to the qurt_sysevent_error_1_t type structure. + @param[in] sys_err_size Size of the qurt_sysevent_error_1_t structure. + + @return + Registry status: \n + - #QURT_EFATAL -- Failure. \n + - Thread ID -- Success. + + @dependencies + None. +*/ + +unsigned int qurt_exception_wait3(void * sys_err, unsigned int sys_err_size); + +/**@ingroup func_qurt_exception_raise_nonfatal + Raises a nonfatal program exception in the QuRT program system. + + For more information on program exceptions, see Section @xref{dox:exception_handling}. + + This operation never returns -- the program exception handler is assumed to perform all + exception handling before terminating or reloading the QuRT program system. + + @note1hang The C library function abort() calls this operation to indicate software + errors. 
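+
+   A minimal usage sketch (illustrative only; QURT_EFAILED is just an example
+   error result code from qurt_error.h):
+   @code
+   // Hand a software-detected failure to the program exception handler.
+   qurt_exception_raise_nonfatal(QURT_EFAILED);   // never returns
+   @endcode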
+
+   @param[in] error QuRT error result code (Section @xref{dox:error_results}).
+
+   @return
+   Integer -- Unused.
+
+   @dependencies
+   None.
+*/
+int qurt_exception_raise_nonfatal (int error) __attribute__((noreturn));
+
+
+/**@ingroup func_qurt_exception_raise_fatal
+   Raises a fatal program exception in the QuRT system.
+
+   Fatal program exceptions terminate the execution of the QuRT system without invoking
+   the program exception handler.
+
+   For more information on fatal program exceptions, see Section @xref{dox:exception_handling}.
+
+   This operation always returns, so the calling program can perform the necessary shutdown
+   operations (data logging, and so on).
+
+   @note1hang Context switches do not work after this operation has been called.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+void qurt_exception_raise_fatal (void);
+
+unsigned int qurt_enable_floating_point_exception(unsigned int mask);
+
+/**@ingroup func_qurt_exception_enable_fp_exceptions
+   Enables the specified floating point exceptions as QuRT program exceptions.
+
+   The exceptions are enabled by setting the corresponding bits in the Hexagon
+   control user status register (USR).
+
+   The mask argument specifies a mask value identifying the individual floating
+   point exceptions to set. The exceptions are represented as defined symbols
+   that map into bits 0 through 31 of the 32-bit flag value.
+   Multiple floating point exceptions are specified by OR'ing together the individual
+   exception symbols.\n
+   @note1hang This function must be called before performing any floating point operations.
+
+   @param[in] mask Floating point exception types. Values: \n
+                   - #QURT_FP_EXCEPTION_ALL \n
+                   - #QURT_FP_EXCEPTION_INEXACT \n
+                   - #QURT_FP_EXCEPTION_UNDERFLOW \n
+                   - #QURT_FP_EXCEPTION_OVERFLOW \n
+                   - #QURT_FP_EXCEPTION_DIVIDE0 \n
+                   - #QURT_FP_EXCEPTION_INVALID @tablebulletend
+
+   @return
+   Updated contents of the USR.
+
+   @dependencies
+   None.
+*/
+
+static inline unsigned int qurt_exception_enable_fp_exceptions(unsigned int mask)
+{
+    return qurt_enable_floating_point_exception(mask);
+}
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_EVENT_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_except.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_except.h
new file mode 100755
index 0000000000000..e1684c80e3d50
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_except.h
@@ -0,0 +1,185 @@
+#ifndef QURT_EXCEPT_H
+#define QURT_EXCEPT_H
+
+/**
+  @file qurt_except.h
+  @brief Defines Cause and Cause2 codes for error-handling.
+
+EXTERNAL FUNCTIONS
+   None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+  Copyright (c) 2021-2022 by Qualcomm Technologies, Inc. All Rights Reserved.
+
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+  QuRT supports error handling for CPU-detected exceptions and software errors.
+  QuRT treats all errors as either fatal errors or nonfatal errors.
+
+  @section sec1 Fatal errors
+  All supervisor mode exceptions are treated as fatal errors.
+  If a registered exception handler calls qurt_exit(), it is treated as a fatal error.
+  Fatal errors result in saving the context of the primary hardware thread to QURT_error_info and the rest of the thread contexts to the corresponding TCBs.
+  All hardware threads are eventually stopped and the cache is flushed.
+  The NMI exception is treated a little differently from other fatal errors. QuRT saves the contexts of all the hardware threads into QURT_error_info.\n
+
+  @subsection subsection1 Debugging fatal errors
+  - QURT_error_info.status.status -- Indicates that an error occurred.
+  - QURT_error_info.status.cause -- Cause code for the fatal error; Cause and Cause2 details are listed below.
+  - QURT_error_info.status.cause2 -- Cause2 code for the fatal error; Cause and Cause2 details are listed below.
+  - QURT_error_info.status.fatal -- Indicates whether a fatal error occurred. A user error can result in a fatal error if the exception handler is not registered.
+  - QURT_error_info.status.hw_tnum -- Indicates the index of QURT_error_info.locregs[], where the context is saved when the error is a fatal error.
+  - QURT_error_info.global_regs -- Contains the values of the global registers of the Q6.
+  - QURT_error_info.local_regs[QURT_error_info.status.hw_tnum] -- Provides the CPU context when the error is a supervisor error.
+
+  @subsection subsection2 Debugging nonfatal errors
+  - QURT_error_info.user_errors -- All user errors are logged here.
+  - QURT_error_info.user_errors.counter -- Index to the last logged error.
+  - QURT_error_info.user_errors.entry[0...counter] -- Structure for the logged error.
+  - QURT_error_info.user_errors.entry[0...counter].error_tcb -- TCB for the user error.
+  - QURT_error_info.user_errors.entry[0...counter].error_tcb.error -- Information about the error; Cause, Cause2, Badva, and hardware thread ID.
+  - QURT_error_info.user_errors.entry[0...counter].error_code -- ((cause2 << 8) 'Logical OR' (cause)); Cause and Cause2 details are listed below.
+  - QURT_error_info.user_errors.entry[0...counter].hw_thread -- Hardware thread ID for the error.
+  - QURT_error_info.user_errors.entry[0...counter].pcycle -- Pcycle for the error.
+
+@note
+  Important usage note:
+  Cause and Cause2 are error codes used to distinguish multiple errors.
+  SSR and BADVA are inconclusive without the vector number.
+  Both cause and cause2 can range from 1 to 255, and every cause can have 1 to 255 error codes.
+  Hence the system can have up to 255 * 255 unique error codes.
+  The combinations are represented as ((cause2 << 8) 'Logical OR' (cause)).
+  Some Cause2 codes are statically defined, whereas some are obtained from the SSR[7:0] cause codes; it depends on the cause code.
+  SSR cause codes are defined in the Hexagon reference manual.
+  All possible combinations are listed below.
+*/
+/** @addtogroup chapter_error
+@{ */
+/* cause - error type - 8-bits*/
+#define QURT_EXCEPT_PRECISE 0x01U /**< Precise exception occurred. For this cause code, Cause2 is SSR[7:0].*/
+#define QURT_EXCEPT_NMI 0x02U /**< NMI occurred; Cause2 is not defined. */
+#define QURT_EXCEPT_TLBMISS 0x03U /**< TLBMISS RW occurred; for this cause code, Cause2 is SSR[7:0]. */
+#define QURT_EXCEPT_RSVD_VECTOR 0x04U /**< Interrupt raised on a reserved vector, which must never occur. Cause2 is not defined. */
+#define QURT_EXCEPT_ASSERT 0x05U /**< Kernel assert. The Cause2 values QURT_ABORT_* are listed below. */
+#define QURT_EXCEPT_BADTRAP 0x06U /**< trap0(num) called with unsupported num. Cause2 is 0. */
+#define QURT_EXCEPT_UNDEF_TRAP1 0x07U /**< Trap1 is not supported. Using Trap1 causes this error. Cause2 is not defined. */
+#define QURT_EXCEPT_EXIT 0x08U /**< Application called qurt_exit() or qurt_exception_raise_nonfatal(). Can be called from the C library.
Cause2 is "[Argument passed to qurt_exception_raise_nonfatal() & 0xFF]". */ +#define QURT_EXCEPT_TLBMISS_X 0x0AU /**< TLBMISS X (execution) occurred. Cause2 is not defined. */ +#define QURT_EXCEPT_STOPPED 0x0BU /**< Running thread stopped due to fatal error on other hardware thread. Cause2 is not defined. */ +#define QURT_EXCEPT_FATAL_EXIT 0x0CU /**< Application called qurt_fatal_exit(). Cause2 is not defined. */ +#define QURT_EXCEPT_INVALID_INT 0x0DU /**< Kernel received an invalid L1 interrupt. Cause2 is not defined. */ +#define QURT_EXCEPT_FLOATING_POINT 0x0EU /**< Kernel received an floating point error. Cause2 is not defined. */ +#define QURT_EXCEPT_DBG_SINGLE_STEP 0x0FU /**< Cause2 is not defined. */ +#define QURT_EXCEPT_TLBMISS_RW_ISLAND 0x10U /**< Read write miss in Island mode. Cause2 QURT_TLB_MISS_RW_MEM* are listed below. */ +#define QURT_EXCEPT_TLBMISS_X_ISLAND 0x11U /**< Execute miss in Island mode. For this cause code, Cause2 is SSR[7:0]. */ +#define QURT_EXCEPT_SYNTHETIC_FAULT 0x12U /**< Synthetic fault with user request that kernel detected. Cause2 QURT_SYNTH_* are listed below. */ +#define QURT_EXCEPT_INVALID_ISLAND_TRAP 0x13U /**< Invalid trap in Island mode. Cause2 is trap number. */ +#define QURT_EXCEPT_UNDEF_TRAP0 0x14U /**< trap0(num) was called with unsupported num. Cause2 is trap number. */ +#define QURT_EXCEPT_PRECISE_DMA_ERROR 0x28U /**< Precise DMA error. Cause2 is DM4[15:8]. Badva is DM5 register. */ + +#define QURT_ECODE_UPPER_LIBC (0U << 16) /**< Upper 16 bits is 0 for libc. */ +#define QURT_ECODE_UPPER_QURT (0U << 16) /**< Upper 16 bits is 0 for QuRT. */ +#define QURT_ECODE_UPPER_ERR_SERVICES (2U << 16) /**< Upper 16 bits is 2 for error service. */ +/** @cond */ +#define QURT_ECODE_ISLAND_INVALID_QDI 3U /**< Passing invalid QDI method in island. */ +/** @endcond */ + +/* Cause2 for QURT_EXCEPT_SYNTHETIC_FAULT cause- 8bits */ +#define QURT_SYNTH_ERR 0x01U /**< */ +#define QURT_SYNTH_INVALID_OP 0x02U /**< */ +#define QURT_SYNTH_DATA_ALIGNMENT_FAULT 0x03U /**< */ +#define QURT_SYNTH_FUTEX_INUSE 0x04U /**< */ +#define QURT_SYNTH_FUTEX_BOGUS 0x05U /**< */ +#define QURT_SYNTH_FUTEX_ISLAND 0x06U /**< */ +#define QURT_SYNTH_FUTEX_DESTROYED 0x07U /**< */ +#define QURT_SYNTH_PRIVILEGE_ERR 0x08U /**< */ + +/* Cause2 - Abort cause reason - 8 bits */ +/* ERR_ASSERT cause */ +#define QURT_ABORT_FUTEX_WAKE_MULTIPLE 0x01U /**< Abort cause - futex wake multiple. */ +#define QURT_ABORT_WAIT_WAKEUP_SINGLE_MODE 0x02U /**< Abort cause - thread waiting to wake up in Single Threaded mode. */ +#define QURT_ABORT_TCXO_SHUTDOWN_NOEXIT 0x03U /**< Abort cause - call TCXO shutdown without exit. */ +#define QURT_ABORT_FUTEX_ALLOC_QUEUE_FAIL 0x04U /**< Abort cause - futex allocation queue failure - QURTK_futexhash_lifo empty. */ +#define QURT_ABORT_INVALID_CALL_QURTK_WARM_INIT 0x05U /**< Abort cause - invalid call QURTK_warm_init() in NONE CONFIG_POWER_MGMT mode. */ +#define QURT_ABORT_THREAD_SCHEDULE_SANITY 0x06U /**< Abort cause - sanity schedule thread is not supposed to run on the current hardware thread. */ +#define QURT_ABORT_REMAP 0x07U /**< Remap in the page table; the correct behavior must remove mapping if necessary. */ +#define QURT_ABORT_NOMAP 0x08U /**< No mapping in page table when removing a user mapping. */ +#define QURT_ABORT_OUT_OF_SPACES 0x09U +#define QURT_ABORT_INVALID_MEM_MAPPING_TYPE 0x0AU /**< Invalid memory mapping type when creating qmemory. */ +#define QURT_ABORT_NOPOOL 0x0BU /**< No pool available to attach. 
*/ +#define QURT_ABORT_LIFO_REMOVE_NON_EXIST_ITEM 0x0CU /**< Cannot allocate more futex waiting queue. */ +#define QURT_ABORT_ARG_ERROR 0x0DU +#define QURT_ABORT_ASSERT 0x0EU /**< Assert abort. */ +#define QURT_ABORT_FATAL 0x0FU /**< Fatal error; must never occur. */ +#define QURT_ABORT_FUTEX_RESUME_INVALID_QUEUE 0x10U /**< Abort cause - invalid queue ID in futex resume. */ +#define QURT_ABORT_FUTEX_WAIT_INVALID_QUEUE 0x11U /**< Abort cause - invalid queue ID in futex wait. */ +#define QURT_ABORT_FUTEX_RESUME_INVALID_FUTEX 0x12U /**< Abort cause - invalid futex object in hashtable. */ +#define QURT_ABORT_NO_ERHNDLR 0x13U /**< No registered error handler. */ +#define QURT_ABORT_ERR_REAPER 0x14U /**< Exception in the reaper thread. */ +#define QURT_ABORT_FREEZE_UNKNOWN_CAUSE 0x15U /**< Abort in thread freeze operation. */ +#define QURT_ABORT_FUTEX_WAIT_WRITE_FAILURE 0x16U /**< During futex wait processing, could not perform a necessary write operation to userland data; most likely due to a DLPager eviction. */ +#define QURT_ABORT_ERR_ISLAND_EXP_HANDLER 0x17U /**< Exception in Island exception handler task. */ +#define QURT_ABORT_L2_TAG_DATA_CHECK_FAIL 0x18U /**< Detected error in L2 tag/data during warm boot. The L2 tag/data check is done when CONFIG_DEBUG_L2_POWER_COLLAPSE is enabled. */ +#define QURT_ABORT_ERR_SECURE_PROCESS 0x19U /**< Abort error in secure process. */ +#define QURT_ABORT_ERR_EXP_HANDLER 0x20U /**< No exception handler, or the handler caused an exception. */ +#define QURT_ABORT_ERR_NO_PCB 0x21U /**< PCB of the thread context failed initialization, PCB was NULL. */ +#define QURT_ABORT_NO_PHYS_ADDR 0x22U /**< Unable to find the physical address for the virtual address. */ +#define QURT_ABORT_OUT_OF_FASTINT_CONTEXTS 0x23U /**< Fast interrupt contexts exhausted. */ +#define QURT_ABORT_CLADE_ERR 0x24U /**< Fatal error seen with CLADE interrupt. */ +#define QURT_ABORT_ETM_ERR 0x25U /**< Fatal error seen with ETM interrupt. */ +#define QURT_ABORT_ECC_DED_ASSERT 0x26U /**< ECC two-bit DED error. */ +#define QURT_ABORT_VTLB_ERR 0x27U /**< Fatal error in the VTLB layer. */ +#define QURT_ABORT_TLB_ENCODE_DECODE_FAILURE 0x28U /**< Failure during the TLB encode or decode operation. */ +#define QURT_ABORT_VTLB_WALKOBJS_BOUND_FAILURE 0x29U /**< Failure to lookup entry in the page table. */ +#define QURT_ABORT_PHY_MEMORY_OWNERSHIP_FAILURE 0x30U /**< Failure to claim phy memory ownership. */ +#define QURT_ABORT_JTLB_SIZE_CHECK_FAIL 0x31U /**< JTLB size configured is more than actual size in hardware */ +#define QURT_ABORT_AUTOSTACK_ASSERT 0x32U /**< Error while handling stack flimit exception. */ + +/* Cause2 - TLB-miss_X - 8bits */ +#define QURT_TLB_MISS_X_FETCH_PC_PAGE 0x60U /**< */ +#define QURT_TLB_MISS_X_2ND_PAGE 0x61U /**< */ +#define QURT_TLB_MISS_X_ICINVA 0x62U /**< */ + +/* Cause2 - TLB-miss_RW - 8bits */ +#define QURT_TLB_MISS_RW_MEM_READ 0x70U /**< */ +#define QURT_TLB_MISS_RW_MEM_WRITE 0x71U /**< */ + +/** @cond rest_reg_dist */ +/* Cause2 - Floating point exception - 8 bits */ +#define QURT_FLOATING_POINT_EXEC_ERR 0xBFU /**< Execute floating-point. 
*/ +/** @endcond */ + +/** Cause2 - autostackv2 - 8 bits */ +#define QURT_AUTOSTACKV2_CANARY_NOT_MATCH 0xC1U +#define QURT_AUTOSTACKV2_POOL_IDX_OFF_RANGE 0xC2U + +/** Cause2 - CFI violation - 8 bits */ +#define QURT_CFI_VIOLATION 0xC3U + +/** @cond rest_reg_dist*/ +/* Enable floating point exceptions */ +#define QURT_FP_EXCEPTION_ALL 0x1FU << 25 /**< */ +#define QURT_FP_EXCEPTION_INEXACT 0x1U << 29 /**< */ +#define QURT_FP_EXCEPTION_UNDERFLOW 0x1U << 28 /**< */ +#define QURT_FP_EXCEPTION_OVERFLOW 0x1U << 27 /**< */ +#define QURT_FP_EXCEPTION_DIVIDE0 0x1U << 26 /**< */ +#define QURT_FP_EXCEPTION_INVALID 0x1U << 25 /**< */ + +/** @endcond */ +/** @} */ /* end_addtogroup chapter_error */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_EXCEPT_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_fastint.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_fastint.h new file mode 100755 index 0000000000000..ea65dc0917fc0 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_fastint.h @@ -0,0 +1,71 @@ +#ifndef QURT_FASTINT_H +#define QURT_FASTINT_H + +/** + @file qurt_fastint.h + @brief QuRT fast interrupt functions + + Copyright (c) 2013-2021 by Qualcomm Technologies, Inc. All Rights Reserved. + + ======================================================================*/ + +/*======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup func_qurt_fastint_register + Register fast interrupt callback function + + Fast interrupt callback should be designed to perform the minimal necessary + actions for the interrupt, and/or perform some operations, such as signaling + another regular software thread to start any additional processing. + The callback should be a fast and short function. When a fast interrupt callback + is running, the corresponding interrupt cannot be re-enabled until the callback + returns. + + The fast interrupt callback must not use any system blocking calls, such as + mutex lock or signal wait. Otherwise, it results in errors. + + The fast interrupt callback function has a single integer argument and the + function ends with no return. The argument value passed in is the interrupt + number, and therefore a single callback function can handle + multiple fast interrupts. + + @param[in] intno Interrupt number to register. + @param[in] fn Interrupt callback function. + + @return + #QURT_EOK -- Fast interrupt registration is successful. \n + #QURT_EINVALID -- Interrupt is already registered. \n + #QURT_EINT -- Invalid interrupt number. +*/ +/* ======================================================================*/ +unsigned int qurt_fastint_register(int intno, void (*fn)(int)); + + +/*======================================================================*/ +/**@ingroup func_qurt_fastint_deregister + Deregisters the fast interrupt callback function. + + @param[in] intno Level-one interrupt number to deregister. Valid range is 1 and 10 through 31 + (simulator only). + + @return + #QURT_EOK -- Interrupt deregistration is successful. \n + #QURT_EINT -- Invalid interrupt number (not registered). \n + #QURT_EINVALID -- Invalid interrupt number (already deregistered). + + @dependencies + None. 
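+
+   A minimal usage sketch pairing qurt_fastint_register() with this
+   function (illustrative only; the interrupt number and handler name
+   are assumptions, not values defined by this header):
+
+   @code
+   static void my_fastint_handler(int intno)
+   {
+       (void)intno; /* keep this short; e.g., signal a regular worker thread */
+   }
+
+   void fastint_example(void)
+   {
+       if (qurt_fastint_register(23, my_fastint_handler) == QURT_EOK) {
+           /* ... interrupt 23 is now serviced by the callback ... */
+           (void)qurt_fastint_deregister(23);
+       }
+   }
+   @endcode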
+*/
+/* ======================================================================*/
+unsigned int qurt_fastint_deregister(int intno);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_FASTINT_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_fs_hub.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_fs_hub.h
new file mode 100755
index 0000000000000..aaa050a6c838b
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_fs_hub.h
@@ -0,0 +1,58 @@
+#ifndef QURT_FS_HUB_H
+#define QURT_FS_HUB_H
+
+/**
+  @file qurt_fs_hub.h
+  @brief Definitions, macros, and prototypes used when writing a
+  QDI driver that provides file-system functionality.
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+  ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+  This structure tracks a file designator for an FS-hub QDI driver.
+  A file system's QDI interface should use this object to encapsulate
+  the true file descriptor and return a QDI handle. The file-system
+  hub then uses this QDI handle as the file descriptor.
+ */
+
+typedef struct qurt_qdi_fs_obj
+{
+ qurt_qdi_obj_t qdi_obj;
+ int client_handle;
+ int fd;
+} qurt_qdi_fs_obj_t;
+
+
+/**@ingroup fs_hub_support_functions
+  This function allows a file system to register its QDI interface with the
+  file-system hub. Once registered, all file open operations for any filename
+  containing the mount point are forwarded to the QDI interface.
+
+  The mount-point string must be enclosed in forward slashes, e.g., "/mountpoint/".
+
+  @param mtpoint Mount point for the file system being registered.
+  @param opener  Opener structure for the QDI driver interface.
+
+  @return
+  QURT_EOK -- Successfully registered the QDI driver with the file-system hub.
+  Negative error code -- Failed to register with the file-system hub.
+ */
+int qurt_fs_hub_mtpoint_register(const char *mtpoint, qurt_qdi_obj_t *opener);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_futex.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_futex.h
new file mode 100755
index 0000000000000..1fdcc79a43f01
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_futex.h
@@ -0,0 +1,82 @@
+#ifndef QURT_FUTEX_H
+#define QURT_FUTEX_H
+/**
+  @file qurt_futex.h
+
+  @brief Prototypes of QuRT futex API functions
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2013, 2020-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+  ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+
+/**@ingroup func_qurt_futex_wait
+  Moves the caller thread into the waiting state when a memory object address
+  contains a value that is the same as a specified value.
+
+  @param[in] lock Pointer to the object memory.
+  @param[in] val  Value to check against the object content.
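+
+  A usage sketch pairing this call with qurt_futex_wake() (illustrative
+  only; the shared word, the function names, and the GCC/Clang __atomic
+  builtins are assumptions, not part of this API):
+
+  @code
+  static int flag = 0; /* hypothetical shared word */
+
+  void waiter(void)
+  {
+      /* Sleeps only while flag still holds 0. */
+      while (__atomic_load_n(&flag, __ATOMIC_ACQUIRE) == 0) {
+          (void)qurt_futex_wait(&flag, 0);
+      }
+  }
+
+  void poster(void)
+  {
+      __atomic_store_n(&flag, 1, __ATOMIC_RELEASE);
+      (void)qurt_futex_wake(&flag, 1); /* wake at most one waiter */
+  }
+  @endcode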
+
+  @return
+  #QURT_EOK -- Success \n
+  Other values -- Failure
+
+  @dependencies
+  None.
+ */
+int qurt_futex_wait(void *lock, int val);
+
+
+/**@ingroup func_qurt_futex_wait_cancellable
+  If a memory object address contains a value that is the same as a specified
+  value, moves the caller thread into the waiting state.
+  The kernel can cancel the waiting state when there is a special need.
+
+  @param[in] lock Pointer to the object memory.
+  @param[in] val  Value to check against the object content.
+
+  @return
+  #QURT_EOK -- Success \n
+  Other values -- Failure
+
+  @dependencies
+  None.
+ */
+int qurt_futex_wait_cancellable(void *lock, int val);
+
+
+/**@ingroup func_qurt_futex_wake
+  Wakes up a specified number of threads that have been waiting
+  for the object to change with qurt_futex_wait().
+
+  @param[in] lock      Pointer to the object memory.
+  @param[in] n_to_wake Maximum number of threads to wake up.
+
+  @return
+  Number of threads woken up by this function.
+
+  @dependencies
+  None.
+ */
+int qurt_futex_wake(void *lock, int n_to_wake);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_FUTEX_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_hmx.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_hmx.h
new file mode 100755
index 0000000000000..e4037dbeae514
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_hmx.h
@@ -0,0 +1,226 @@
+#ifndef QURT_HMX_H
+#define QURT_HMX_H
+/**
+  @file qurt_hmx.h
+  @brief Prototypes of the QuRT HMX API.
+
+Copyright (c) 2019-2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+ TYPEDEFS
+=============================================================================*/
+
+
+/** @addtogroup hmx_types
+@{ */
+/* HMX locking type */
+#define QURT_HMX_NON_SHARED_LOCK 0U /**< HMX locking type.*/
+#define QURT_HMX_SHARED_LOCK 1U /**< HMX locking type.*/
+
+/* HMX unlocking type */
+#define QURT_HMX_NON_SHARED_UNLOCK 0U /**< HMX unlocking type.*/
+#define QURT_HMX_SHARED_UNLOCK 1U /**< HMX unlocking type.*/
+
+/* HMX hardware context */
+#define QURT_HMX_UNIT_0 0U /**< HMX hardware context #0 */
+#define QURT_HMX_UNIT_1 1U /**< HMX hardware context #1 */
+ /** @} */ /* end_addtogroup hmx_types */
+
+
+/*=============================================================================
+ FUNCTIONS
+=============================================================================*/
+
+
+/**@ingroup func_qurt_hmx_lock2
+  Locks a HMX unit with the specified locking type.
+
+  #QURT_HMX_NON_SHARED_LOCK:
+  - If a HMX unit is available, lock the unit and return success of #QURT_EOK.
+  - If the HMX unit is already locked by another thread, the caller thread is suspended
+    until the HMX is available and gets locked by this function.
+  - If there is no HMX hardware supported, returns #QURT_EVAL.
+
+  #QURT_HMX_SHARED_LOCK:
+  - If a HMX unit is available, enables HMX access for the caller thread, and returns
+    success of #QURT_EOK.
+  - If the HMX is already enabled on the caller thread, return #QURT_EFAILED.
- If the HMX is locked by another thread in the same user process of the caller
+    thread with locking type of #QURT_HMX_SHARED_LOCK, enable HMX access for the caller
+    thread, and return success of #QURT_EOK.
+  - If the HMX is locked by another thread in the same user process of the caller
+    thread with locking type of #QURT_HMX_NON_SHARED_LOCK, return #QURT_EFAILED.
+  - If the HMX is locked by a thread from another user process different from the
+    user process of the caller thread, return #QURT_EFAILED.
+  - If there is no HMX hardware supported, return #QURT_EVAL.
+
+  @param[in] type Locking type.
+
+  @return
+  #QURT_EOK -- HMX lock successful.\n
+  #QURT_EFAILED -- Failure due to wrong locking condition.\n
+  #QURT_EVAL -- Failure because no HMX hardware is supported.
+
+  @dependencies
+  None.
+
+ */
+int qurt_hmx_lock2(unsigned int type);
+
+
+/**@ingroup func_qurt_hmx_unlock2
+  Unlocks a HMX unit with the unlocking type.
+
+  #QURT_HMX_NON_SHARED_UNLOCK:
+  - If there is a HMX unit locked by the caller thread, unlock the HMX unit and clear the
+    HMX accumulators (assuming a fixed-point type).
+  - If there is no HMX unit locked by the caller thread, return #QURT_EFAILED.
+  - If there is no HMX hardware supported, return #QURT_EVAL.
+
+  #QURT_HMX_SHARED_UNLOCK:
+  - If the caller thread has locked HMX with type #QURT_HMX_SHARED_LOCK, disable the
+    HMX access on the caller thread, and return success of #QURT_EOK.
+    Note: If the caller thread is the last thread that unlocks for #QURT_HMX_SHARED_LOCK
+    in its user process, the unlock function clears the HMX accumulators.
+  - If the caller thread has locked HMX with type #QURT_HMX_NON_SHARED_LOCK, return
+    failure of #QURT_EFAILED.
+  - If the caller thread has not locked HMX, return failure of #QURT_EFAILED.
+  - If there is no HMX hardware supported, return #QURT_EVAL.
+
+  @param[in] type Unlocking type.
+
+  @return
+  #QURT_EOK -- HMX unlocked successfully. \n
+  #QURT_EFAILED -- Failure due to wrong unlocking condition. \n
+  #QURT_EVAL -- Failure because no HMX hardware is supported.
+
+  @dependencies
+  None.
+
+ */
+int qurt_hmx_unlock2(unsigned int type);
+
+
+/**@ingroup func_qurt_hmx_lock
+  Locks a HMX unit.
+  If a HMX unit is available, this function locks the unit and returns right away.
+  If there is no HMX unit available, the caller is blocked until a HMX is available
+  and is locked by the function.
+
+  @return
+  #QURT_EOK -- HMX lock successful. \n
+  #QURT_EFAILED -- Failure due to wrong locking condition. \n
+  #QURT_EVAL -- Failure because no HMX hardware is supported.
+
+  @dependencies
+  None.
+ */
+int qurt_hmx_lock(void);
+
+
+/**@ingroup func_qurt_hmx_unlock
+  Unlocks a HMX unit.
+  If a HMX unit is locked by the caller thread, unlock the HMX unit and clear its
+  accumulators (assuming a fixed-point type).
+  If there is no HMX unit locked by the caller thread, return failure.
+
+  @return
+  #QURT_EOK -- HMX unlock successful. \n
+  #QURT_EFAILED -- Failure due to wrong unlocking condition. \n
+  #QURT_EVAL -- Failure because no HMX hardware is supported.
+
+  @dependencies
+  None.
+ */
+int qurt_hmx_unlock(void);
+
+
+/**@ingroup func_qurt_hmx_try_lock
+  Tries to lock a HMX unit.
+  If a HMX unit is available, this function locks the unit and returns right away;
+  if there is no HMX unit available, the function returns failure without blocking the caller.
+
+  @return
+  #QURT_EOK -- HMX lock successful. \n
+  #QURT_EFAILED -- Failure due to wrong locking condition.\n
+  #QURT_EVAL -- Failure because no HMX hardware is supported.
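+
+  A non-blocking usage sketch (illustrative only; the fixed-point workload
+  and fallback path are placeholders, not SDK functions):
+
+  @code
+  void hmx_example(void)
+  {
+      if (qurt_hmx_try_lock() == QURT_EOK) {
+          /* HMX is locked by this thread; run the fixed-point matrix work. */
+          (void)qurt_hmx_unlock(); /* also clears the HMX accumulators */
+      } else {
+          /* HMX busy or not present; use a non-HMX fallback. */
+      }
+  }
+  @endcode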
+ + @dependencies + None. + */ +int qurt_hmx_try_lock(void); + + +/**@ingroup func_qurt_hmx_assign + Assign a HMX unit to a target thread specified by its thread identifier. + The HMX unit (HMX hardware context) is specified by hmx_unit. + The caller of this function is limited to the SRM process. + If the requested hmx_unit is already assigned to another thread with QURT_HMX_NON_SHARED_LOCK, + kernel will detach it from the thread, and re-assign it to the target thread. + If the target thread has HVX enabled, it cannot have HMX enabled. + + Locking type + #QURT_HMX_NON_SHARED_LOCK: + - If the HMX unit is available, lock the HMX unit and return success of #QURT_EOK. + - If the HMX unit is already enabled on the target thread, return #QURT_EOK. + - If the HMX unit is already locked by another thread, detach the HMX from the thread. + Re-assign the HMX unit to the target thread, and return #QURT_EOK. + + @param[in] thread_id Thread identifier + @param[in] type Locking type + #QURT_HMX_NON_SHARED_LOCK -- non-shared lock + @param[in] hmx_unit HMX hardware context number + #QURT_HMX_UNIT_0 + #QURT_HMX_UNIT_1 + + @return + #QURT_EOK -- The HMX is assigned successfully. This includes the case that \n + the target thread already has HMX assigned. \n + #QURT_EFAILED -- Failure due to wrong assigning conditions. \n + #QURT_EINVALID -- Failure because no HMX hardware is supported. + + @dependencies + None. + */ +int qurt_hmx_assign ( unsigned int thread_id, unsigned int type, unsigned int hmx_unit ); + + +/**@ingroup func_qurt_hmx_release + Release a HMX unit from a target thread specified by its thread identifier. + The HMX unit (HMX hardware context) is specified by hmx_unit. + The caller of this function is limited to the SRM process. + + Qurt detaches the specified HMX unit from the target thread, and return success of + #QURT_EOK. If the HMX unit is already released from the target thread, return #QURT_EOK. + + @param[in] thread_id Thread identifier + @param[in] hmx_unit HMX hardware context number + #QURT_HMX_UNIT_0 + #QURT_HMX_UNIT_1 + + @return + #QURT_EOK -- The HMX is released successfully. This includes the case that \n + the target thread already has the HMX released. \n + #QURT_EFAILED -- Failure due to wrong assigning condition. \n + #QURT_EINVALID -- Failure because no HMX hardware is supported. + + @dependencies + None. + */ +int qurt_hmx_release ( unsigned int thread_id, unsigned int hmx_unit ); + + + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_HMX_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_hvx.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_hvx.h new file mode 100755 index 0000000000000..13c213d49ac84 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_hvx.h @@ -0,0 +1,421 @@ +#ifndef QURT_HVX_H +#define QURT_HVX_H +/** + @file qurt_hvx.h + @brief Prototypes of QuRT HVX API. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021-2022 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. 
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+ TYPEDEFS
+=============================================================================*/
+/** @cond */
+
+typedef enum {
+ QURT_HVX_MODE_64B = 0, /**< HVX mode of 64 bytes */
+ QURT_HVX_MODE_128B = 1 /**< HVX mode of 128 bytes */
+} qurt_hvx_mode_t;
+/** @endcond */
+/*=============================================================================
+ CONSTANTS AND MACROS
+=============================================================================*/
+/** @cond internal_only*/
+/** @addtogroup hvx_macros
+@{ */
+#define QURT_HVX_HW_UNITS_2X128B_4X64B 0x00000204 /**< Bits 15 through 8 are for the number of 128B units. */
+ /**< Bits 7 through 0 are for the number of 64B units. */
+#define QURT_HVX_HW_UNITS_4X128B_0X64B 0x00000400
+#define QURT_HVX_HW_UNITS_6X128B_0X64B 0x00000600
+
+/* HVX locking status */
+
+#define QURT_HVX_UNLOCKED (0) /* Has not locked HVX unit */
+#define QURT_HVX_LOCKED (1) /* Has locked HVX unit */
+#define QURT_HVX_ERROR (-1) /* Error, no HVX support */
+
+/* Input value for HVX reservation */
+
+#define QURT_HVX_RESERVE_ALL (4) /* All the HVX units in terms of 64B_MODE are requested to be reserved */
+#define QURT_HVX_RESERVE_ALL_AVAILABLE (0xff) /* All remaining unlocked HVX units in terms of 64B_MODE are requested to be reserved */
+
+/* Return values for HVX reservation */
+
+#define QURT_HVX_RESERVE_NOT_SUPPORTED (-1) /* There is no HVX hardware, or fewer units in the hardware than requested */
+#define QURT_HVX_RESERVE_NOT_SUCCESSFUL (-2) /* Some HVX units are already locked/reserved by another PD; not enough units are left for the reservation. */
+#define QURT_HVX_RESERVE_ALREADY_MADE (-3) /* There is already a HVX reservation made. */
+#define QURT_HVX_RESERVE_CANCEL_ERR (-4) /* Canceling the reservation failed because this protection domain has no prior reservation. */
+
+// HVX set requests
+
+#define QURT_HVX_64B 0 /**< */
+#define QURT_HVX_128B 1 /**< */
+#define QURT_HVX_NO_USE 2 /**< */
+#define QURT_HVX_RELEASE_CONTEXT 3 /**< */
+#define QURT_HVX_IMMEDIATE_USE 4 /**< */
+
+// HVX set masks
+
+#define QURT_HVX_64B_PREFERRED (1<<(QURT_HVX_64B + 8))/**< */
+#define QURT_HVX_128B_PREFERRED (1<<(QURT_HVX_128B + 8))/**< */
+#define QURT_HVX_64B_ACCEPTABLE (1<<(QURT_HVX_64B + 12))/**< */
+#define QURT_HVX_128B_ACCEPTABLE (1<<(QURT_HVX_128B + 12))/**< */
+
+// HVX set return "result"
+
+#define QURT_EOK 0 /**< */
+#define QURT_HVX_SET_ERROR 0xFF /**< */
+
+// hvx_mode_assigned for QURT_HVX_IMMEDIATE_USE
+#define QURT_HVX_64B_ASSIGNED (1<<(QURT_HVX_64B + 8)) /**< */
+#define QURT_HVX_128B_ASSIGNED (1<<(QURT_HVX_128B + 8)) /**< */
+
+// Sizes of HVX dump buffer
+
+#define QURT_HVX_V65_64B_VSIZE 2084U /**< 64 x 32 + 8 x 4 + 4 (version). */
+#define QURT_HVX_V65_128B_VSIZE 4164U /**< 128 x 32 + 16 x 4 + 4 (version). */
+#define QURT_HVX_V66_128B_VSIZE 4420U /**< 128 x (32 + 2) + 16 x 4 + 4 (version). */
+#define QURT_HVX_V68_128B_VSIZE 4164U /**< 128 x 32 + 16 x 4 + 4 (version). */
+#define QURT_HVX_V79_128B_VSIZE 4740U /**< 128 x (32+4+1) + 4 (version).
*/
+#define QURT_HVX_VREG_BUF_SIZE QURT_HVX_V79_128B_VSIZE /**< */
+
+// HVX dump versions
+
+#define QURT_HVX_DUMP_V65_64B 1U /**< */
+#define QURT_HVX_DUMP_V65_128B 2U /**< */
+#define QURT_HVX_DUMP_V66_128B 3U /**< */
+#define QURT_HVX_DUMP_V68_128B 4U /**< */
+#define QURT_HVX_DUMP_V79_128B 5U /**< */
+/** @} */ /* end_addtogroup hvx_macros */
+/** @endcond */
+/** @cond */
+// QuRT data struct for hvx_set input
+typedef struct qurt_hvx_set_struct_ {
+ unsigned char set_req; // LSB
+ struct {
+ unsigned char preferred_mask:4;
+ unsigned char acceptable_mask:4;
+ };
+ unsigned short resvd; // MSB
+} qurt_hvx_set_struct_t; // 4 bytes
+
+
+// QuRT data struct for hvx_set return
+typedef struct qurt_hvx_set_return_str_ {
+ unsigned char result; // LSB
+ unsigned char hvx_mode_assigned;
+ unsigned short resvd; // MSB
+} qurt_hvx_set_return_struct_t; // 4 bytes
+/** @endcond */
+
+
+/*=============================================================================
+ FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_hvx_lock
+  Locks one HVX unit specified by the HVX mode.
+
+  @note1hang Input variable can be 128B_MODE or 64B_MODE. If an HVX unit in this mode
+             is available, this function locks the unit and returns right away.
+             If the current HVX mode is different from the requested mode, the current
+             thread is blocked. When all HVX units become idle, QuRT changes
+             the mode, locks the HVX unit, and returns.
+
+             Starting from Q6v65 with HVX context switch support, qurt_hvx_lock() is
+             mapped as qurt_hvx_set(64_BYTE or 128_BYTE).
+
+  @datatypes
+  #qurt_hvx_mode_t
+
+  @param[in] lock_mode #QURT_HVX_MODE_64B or #QURT_HVX_MODE_128B.
+
+  @return
+  #QURT_EOK -- Success \n
+  Other value -- Failure
+
+  @dependencies
+  None.
+
+ */
+int qurt_hvx_lock(qurt_hvx_mode_t lock_mode);
+
+/**@ingroup func_qurt_hvx_unlock
+  Unlocks the HVX unit held by this software thread.
+
+  @note1hang Starting from Q6v65 with HVX context switch support, qurt_hvx_unlock()
+             maps as qurt_hvx_set(QURT_HVX_RELEASE_CONTEXT).
+
+  @return
+  #QURT_EOK -- Successful return \n
+  Other values -- Failure
+
+  @dependencies
+  None.
+
+ */
+int qurt_hvx_unlock(void);
+
+/**@ingroup func_qurt_hvx_try_lock
+  Tries to lock one HVX unit specified by the HVX mode.
+
+  @note1hang Input variable can be 128B_MODE or 64B_MODE. If an HVX unit in this mode
+             is available, this function locks the unit and returns #QURT_EOK; otherwise,
+             the function returns a failure, but does not block the current software
+             thread waiting for the HVX unit.
+             Starting from Q6v65 with HVX context switch support, qurt_hvx_try_lock()
+             maps to qurt_hvx_set(QURT_HVX_IMMEDIATE_USE | preferred_mask | acceptable_mask).
+
+  @datatypes
+  #qurt_hvx_mode_t
+
+  @param[in] lock_mode #QURT_HVX_MODE_64B or #QURT_HVX_MODE_128B.
+
+  @return
+  #QURT_EOK -- Successful return \n
+  Other values -- Failure
+
+  @dependencies
+  None.
+
+ */
+int qurt_hvx_try_lock(qurt_hvx_mode_t lock_mode);
+
+/**@ingroup func_qurt_hvx_get_mode
+  Gets the current HVX mode configured by QuRT.
+
+  @note1hang Returns #QURT_HVX_MODE_128B or #QURT_HVX_MODE_64B, based on
+             the current HVX configuration.
+
+  @param[out]
+  None.
+
+  @return
+  #QURT_HVX_MODE_128B \n
+  #QURT_HVX_MODE_64B \n
+  -1 -- Not available.
+
+  @dependencies
+  None.
+ */
+int qurt_hvx_get_mode(void);
+
+
+/**@ingroup func_qurt_hvx_get_units
+  Gets the HVX hardware configuration that the chipset supports.
+
+  @note1hang The function returns the HVX hardware configuration supported by the chipset.
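+
+  For example, the unit counts can be decoded from the returned bitmask
+  (a sketch based on the bit layout documented for
+  #QURT_HVX_HW_UNITS_2X128B_4X64B above):
+
+  @code
+  void hvx_units_example(void)
+  {
+      int units = qurt_hvx_get_units();
+      if (units > 0) {
+          unsigned num_128b = ((unsigned)units >> 8) & 0xFFU; /* bits 15..8 */
+          unsigned num_64b  = (unsigned)units & 0xFFU;        /* bits 7..0  */
+          /* QURT_HVX_HW_UNITS_2X128B_4X64B (0x204) yields 2 and 4. */
+          (void)num_128b; (void)num_64b;
+      }
+  }
+  @endcode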
+
+  @return
+  Bitmask of the units: 1X64, 2X64, 4X64, 1X128, 2X128, and so on.\n
+  - QURT_HVX_HW_UNITS_2X128B_4X64B -- V60, V62, or V65 HVX \n
+  - QURT_HVX_HW_UNITS_4X128B_0X64B -- V66 CDSP or newer \n
+  - 0 -- not available
+
+  @dependencies
+  None.
+
+ */
+int qurt_hvx_get_units(void);
+
+
+/**@ingroup func_qurt_hvx_reserve
+  Reserves HVX units in terms of 64-byte mode for the protection domain (PD) of the caller.
+
+  @note1hang Only one HVX reservation in the system is supported.
+             If one HVX unit is already locked by the application in the same PD, the unit is
+             added to the returned count as one reserved unit for the PD.
+             Starting from Q6v65 with HVX context switch support, qurt_hvx_reserve()
+             only does basic sanity checks on HVX units.
+
+  @datatypes
+  None.
+
+  @param[in] num_units Number of HVX units in terms of 64B_MODE to reserve for the PD.
+                       QURT_HVX_RESERVE_ALL to reserve all the HVX units.
+                       QURT_HVX_RESERVE_ALL_AVAILABLE to reserve the remaining unlocked units.
+
+  @return
+  Number of units successfully reserved, including the units already locked in the same PD. \n
+  #QURT_HVX_RESERVE_NOT_SUPPORTED \n
+  #QURT_HVX_RESERVE_NOT_SUCCESSFUL \n
+  #QURT_HVX_RESERVE_ALREADY_MADE
+
+
+  @dependencies
+  None.
+
+ */
+int qurt_hvx_reserve(int num_units);
+
+
+/**@ingroup func_qurt_hvx_cancel_reserve
+  Cancels the HVX reservation in the protection domain (PD) of the caller.
+
+  @note1hang Only one HVX reservation in the system is supported.
+
+  @return
+  0 -- Success \n
+  #QURT_HVX_RESERVE_CANCEL_ERR -- Failure
+
+  @dependencies
+  None.
+
+ */
+int qurt_hvx_cancel_reserve(void);
+
+
+/**@ingroup func_qurt_hvx_get_lock_val
+  Gets the HVX locking status value of the thread of the caller.
+
+  @note1hang Returns whether the caller thread has already locked a HVX unit.
+
+  @datatypes
+  None.
+
+  @return
+  #QURT_HVX_UNLOCKED \n
+  #QURT_HVX_LOCKED \n
+  #QURT_HVX_ERROR
+
+  @dependencies
+  None.
+ */
+int qurt_hvx_get_lock_val(void);
+
+/** @cond internal_only*/
+/**@ingroup func_qurt_hvx_set
+  Sets the HVX configuration for the software thread of the caller.
+
+  @datatypes
+  None.
+
+  @param[in] input_arg Composed of set_request | hvx_preferred_mode_mask
+                       | hvx_acceptable_mode_mask where set_request can be set to: \n
+                       - #QURT_HVX_64B \n
+                       - #QURT_HVX_128B \n
+                       - #QURT_HVX_NO_USE \n
+                       - #QURT_HVX_RELEASE_CONTEXT \n
+                       - #QURT_HVX_IMMEDIATE_USE \n
+                       When set_request is QURT_HVX_IMMEDIATE_USE,
+                       hvx_preferred_mode_mask can be set to: \n
+                       - #QURT_HVX_64B_PREFERRED \n
+                       - #QURT_HVX_128B_PREFERRED
+                       When set_request is QURT_HVX_IMMEDIATE_USE,
+                       hvx_acceptable_mode_mask can be set to: \n
+                       - #QURT_HVX_64B_ACCEPTABLE \n
+                       - #QURT_HVX_128B_ACCEPTABLE @tablebulletend
+
+  @return
+  Result of the HVX setting in the least significant 8 bits of the returned data. \n
+  #QURT_EOK -- 0 \n
+  #QURT_HVX_SET_ERROR -- 0xFF \n
+  When #QURT_HVX_IMMEDIATE_USE has a result of #QURT_EOK,
+  bits 8 to 15 of the returned data contain hvx_mode_assigned:\n
+  - #QURT_HVX_64B_ASSIGNED \n
+  - #QURT_HVX_128B_ASSIGNED
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_hvx_set(unsigned int input_arg);
+
+
+/**@ingroup func_qurt_system_hvx_regs_get_maxsize
+  Returns the maximum buffer size for saving HVX registers.
+
+  @datatypes
+  None.
+
+  @return
+  0 -- No HVX supported in the target. \n
+  #QURT_HVX_VREG_BUF_SIZE -- Maximum buffer size for saving HVX registers.
+
+  @dependencies
+  None.
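+
+  A sizing and alignment sketch (it mirrors the scheme described for
+  qurt_system_hvx_regs_get() below; tid is a hypothetical thread ID):
+
+  @code
+  void hvx_dump_example(unsigned int tid)
+  {
+      unsigned char vbuf[QURT_HVX_VREG_BUF_SIZE + 256];
+      unsigned char *pBuf = vbuf;
+      /* Make the register area (fifth byte onward) 256-byte aligned. */
+      pBuf += (256 - 4 - ((unsigned)pBuf % 256));
+      (void)qurt_system_hvx_regs_get(tid, pBuf,
+                                     sizeof(vbuf) - (size_t)(pBuf - vbuf));
+  }
+  @endcode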
+ */
+unsigned int qurt_system_hvx_regs_get_maxsize(void);
+
+
+/**@ingroup func_qurt_system_hvx_regs_get_size
+  Returns the buffer size for saving HVX registers for a specified thread.
+
+  @param[in] thread_id Thread ID of the target thread.
+
+  @return
+  0 -- No HVX assigned to the thread. \n
+  size -- Size of the buffer in bytes for saving HVX registers for the specified thread: \n
+          - #QURT_HVX_V65_64B_VSIZE -- 64 x 32 + 8 x 4 + 4 (version) \n
+          - #QURT_HVX_V65_128B_VSIZE -- 128 x 32 + 16 x 4 + 4 (version) \n
+          - #QURT_HVX_V66_128B_VSIZE -- 128 x (32 + 2) + 16 x 4 + 4 (version) \n
+          - #QURT_HVX_V68_128B_VSIZE -- 128 x 32 + 16 x 4 + 4 (version) \n
+          - #QURT_HVX_V79_128B_VSIZE -- 128 x (32+4+1) + 4 (version)
+
+
+  @dependencies
+  None.
+
+ */
+unsigned int qurt_system_hvx_regs_get_size(unsigned int thread_id);
+
+
+
+/**@ingroup func_qurt_system_hvx_regs_get
+  Saves the HVX registers into the specified buffer.
+  Returns the size of the data saved into the buffer.
+  After calling this function for the first time on a specified thread_id, the QuRT kernel removes the internal HVX saving buffer
+  from the specified thread. When calling the function on the same thread_id for the second time, this function returns 0.
+
+  @param[in] thread_id Thread ID of the target thread.
+  @param[in] pBuf Pointer to the buffer for HVX register saving.
+                  The first four bytes of the buffer are for saving the HVX version. HVX registers are saved from
+                  the fifth byte of the buffer. The address of the fifth byte should be 256-byte aligned.
+                  For example, a buffer can be declared at first as: \n
+                  unsigned char vbuf[QURT_HVX_VREG_BUF_SIZE+256];\n
+                  unsigned char *pBuf; \n
+                  then align the buffer pointer to: \n
+                  pBuf = vbuf; \n
+                  pBuf += (256 - 4 - (unsigned)pBuf%256);
+  @param[in] size Size of the provided buffer, which pBuf points to. The buffer size should not be smaller than that
+                  returned from qurt_system_hvx_regs_get_size(), and pBuf should be aligned as described above.
+  @param[out] pBuf Buffer returned with the saved HVX registers (unsigned char hvx_regs[];), which are saved from the fifth
+                  byte of the buffer, and the HVX version (unsigned int hvx_version;), whose first four bytes
+                  contain one of the HVX dump versions:\n
+                  - #QURT_HVX_DUMP_V65_64B \n
+                  - #QURT_HVX_DUMP_V65_128B \n
+                  - #QURT_HVX_DUMP_V66_128B \n
+                  - #QURT_HVX_DUMP_V68_128B \n
+                  - #QURT_HVX_DUMP_V79_128B \n
+                  @tablebulletend
+
+  @return
+  Total bytes of the data saved in the provided buffer. \n
+  0 -- No HVX assigned to the thread \n
+  #QURT_HVX_V65_64B_VSIZE -- 64 x 32 + 8 x 4 + 4 (version) \n
+  #QURT_HVX_V65_128B_VSIZE -- 128 x 32 + 16 x 4 + 4 (version) \n
+  #QURT_HVX_V66_128B_VSIZE -- 128 x (32 + 2) + 16 x 4 + 4 (version) \n
+  #QURT_HVX_V68_128B_VSIZE -- 128 x 32 + 16 x 4 + 4 (version) \n
+  #QURT_HVX_V79_128B_VSIZE -- 128 x (32+4+1) + 4 (version)
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_system_hvx_regs_get(unsigned int thread_id, void *pBuf, size_t size);
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_HVX_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_int.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_int.h
new file mode 100755
index 0000000000000..386aeda1051eb
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_int.h
@@ -0,0 +1,509 @@
+#ifndef QURT_INT_H
+#define QURT_INT_H
+/**
+  @file qurt_int.h
+  @brief QuRT interrupt functions.
+ + + + Copyright (c) 2013-2021, 2023 Qualcomm Technologies, Inc. + All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ + + +/** @cond rest_reg_dist */ +/** @addtogroup interrupts_constants +@{ */ +#define SIG_INT_ABORT 0x80000000 /**< */ +#define QURT_INT_NON_DELAYED_ACK 0 +#define QURT_INT_DELAYED_ACK 1 +#define QURT_INT_ACK_DEFAULT QURT_INT_NON_DELAYED_ACK +#define QURT_INT_DRV_DEFAULT 0 +#define QURT_INT_PRIORITY_DEFAULT 0xFF + +/** QuRT interrupt property. */ +#define QURT_INT_CONFIGID_POLARITY 0x1U /**< */ +#define QURT_INT_CONFIGID_LOCK 0x2U /**< */ + +/** QuRT interrupt lock.*/ +#define QURT_INT_LOCK_DEFAULT 0x0 /**< Default. */ +#define QURT_INT_LOCK_DISABLE 0x0 /**< Interrupt can be enabled or disabled or deregistered. */ +#define QURT_INT_LOCK_ENABLE 0x1 /**< Interrupt is locked and cannot be enabled, disabled, or deregistered.*/ +/** @} */ /* end_addtogroup interrupts_constants */ + +/** @addtogroup Qurt_interrupt_type +@{ */ +/** Trigger type bit fields for a PDC interrupt:\n + @verbatim + Polarity Edge Output\n + 0 00 Level sensitive active low + 0 01 Rising edge sensitive + 0 10 Falling edge sensitive + 0 11 Dual edge sensitive + 1 00 Level sensitive active high + 1 01 Falling edge sensitive + 1 10 Rising edge sensitive + 1 11 Dual edge sensitive + @endverbatim +*/ +#define QURT_INT_TRIGGER_TYPE_SET(pol, edge) ((((pol) & 0x01U) << 2) | ((edge) & 0x03U)) /**< */ + +#define QURT_INT_TRIGGER_LEVEL_LOW QURT_INT_TRIGGER_TYPE_SET(0U, 0x00U) /**< */ +#define QURT_INT_TRIGGER_LEVEL_HIGH QURT_INT_TRIGGER_TYPE_SET(1U, 0x00U) /**< */ +#define QURT_INT_TRIGGER_RISING_EDGE QURT_INT_TRIGGER_TYPE_SET(1U, 0x02U) /**< */ +#define QURT_INT_TRIGGER_FALLING_EDGE QURT_INT_TRIGGER_TYPE_SET(0U, 0x02U) /**< */ +#define QURT_INT_TRIGGER_DUAL_EDGE QURT_INT_TRIGGER_TYPE_SET(0U, 0x03U) /**< */ +#define QURT_INT_TRIGGER_USE_DEFAULT 0xffU /**< */ +/** @} */ /* end_addtogroup Qurt_interrupt_type */ + +/*===================================================================== + Functions +======================================================================*/ + +/**@ingroup func_qurt_interrupt_register + @xreflabel{sec:interrupt_register} + Registers the interrupt.\n + Enables the specified interrupt and associates it with the specified QuRT signal object and + signal mask. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be waited on, and 0 indicates not to wait. + + When the interrupt occurs, the signal specified in the signal mask is set in the signal + object. An IST conventionally waits on that signal to + handle the interrupt. The thread that registers the interrupt is set as the IST. + + Up to 31 separate interrupts can be registered to a single signal object, as determined by + the number of individual signals the object can store. QuRT reserves signal 31. Thus a + single IST can handle several different interrupts. + + QuRT reserves some interrupts for internal use -- the remainder are available for use by + applications, and thus are valid interrupt numbers. If the specified interrupt number is + outside the valid range, the register operation returns the status value QURT_EINT. 
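+
+  For example, a minimal IST loop registers an interrupt, waits on the
+  signal, and acknowledges it (a sketch only; the interrupt number and
+  signal bit are illustrative, and qurt_anysignal_init(),
+  qurt_anysignal_get(), and qurt_anysignal_clear() are assumed from the
+  any-signal API declared in qurt_anysignal.h):
+
+  @code
+  void ist_thread(void)
+  {
+      qurt_anysignal_t sig;
+      qurt_anysignal_init(&sig);
+
+      /* Deliver interrupt 23 on signal bit 0. */
+      if (qurt_interrupt_register(23, &sig, 0x1) != QURT_EOK) {
+          return;
+      }
+      for (;;) {
+          (void)qurt_anysignal_wait(&sig, 0x1 | SIG_INT_ABORT);
+          unsigned int got = qurt_anysignal_get(&sig);
+          if (got & SIG_INT_ABORT) {
+              break; /* kernel asked this IST to quit */
+          }
+          /* ... service the interrupt ... */
+          (void)qurt_anysignal_clear(&sig, 0x1); /* clear before re-enabling */
+          (void)qurt_interrupt_acknowledge(23);  /* re-enable interrupt 23 */
+      }
+      (void)qurt_interrupt_deregister(23);
+  }
+  @endcode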
+
+  Only one thread can be registered at a time to a specific interrupt. Attempting to register
+  an already-registered interrupt returns the status value QURT_EVAL.
+
+  Only one signal bit in a signal object can be registered at a time to a specific interrupt.
+  Attempting to register multiple signal bits to an interrupt returns the status value
+  QURT_ESIG.
+
+  When a signal object is registered for an interrupt, QuRT can only set its signal bits
+  when receiving the interrupt. The QuRT signal API called from another
+  software thread cannot set the signal, even for unused signal bits.
+
+  @note1hang The valid range for an interrupt number can differ on target execution
+             environments other than the simulator. For more information, see the
+             appropriate hardware document.
+
+  @datatypes
+  #qurt_anysignal_t
+
+  @param[in] int_num     L2VIC interrupt to register; valid range is 0 to 1023.
+  @param[in] int_signal  Any-signal object to wait on (Section @xref{dox:any_signals}).
+  @param[in] signal_mask Signal mask value indicating signal to receive the interrupt.
+
+  @return
+  #QURT_EOK -- Interrupt successfully registered.\n
+  #QURT_EINT -- Invalid interrupt number. \n
+  #QURT_ESIG -- Invalid signal bitmask (cannot set more than one
+                signal at a time). \n
+  #QURT_EVAL -- Interrupt already registered.
+
+  @dependencies
+  None.
+*/
+ unsigned int qurt_interrupt_register(int int_num, qurt_anysignal_t *int_signal, int signal_mask);
+
+/**@ingroup func_qurt_interrupt_register2
+  @xreflabel{sec:interrupt_register2}
+  Registers the interrupt.\n
+  Enables the specified interrupt, associates it with the specified QuRT signal object and
+  signal mask, and sets interrupt flags.
+
+  Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1
+  indicates that a signal must be waited on, and 0 indicates not to wait.
+
+  When the interrupt occurs, the signal specified in the signal mask is set in the signal
+  object. An IST conventionally waits on that signal to
+  handle the interrupt. The thread that registers the interrupt is set as the IST.
+
+  Up to 31 separate interrupts can be registered to a single signal object, as determined by
+  the number of individual signals that the object can store. QuRT reserves signal 31. Thus a
+  single IST can handle several different interrupts.
+
+  QuRT reserves some interrupts for internal use -- the remainder are available for use by
+  applications, and thus are valid interrupt numbers. If the specified interrupt number is
+  outside the valid range, the register operation returns the status value #QURT_EINT.
+
+  Only one thread can be registered at a time to a specific interrupt. Attempting to register
+  an already-registered interrupt returns the status value #QURT_EVAL.
+
+  Only one signal bit in a signal object can be registered at a time to a specific interrupt.
+  Attempting to register multiple signal bits to an interrupt returns the status value
+  #QURT_ESIG.
+
+  When a signal object is registered for an interrupt, QuRT can only set its signal bits
+  when receiving the interrupt. The QuRT signal API called from another
+  software thread cannot set the signal, even for unused signal bits.
+
+  @note1hang The valid range for an interrupt number can differ on target execution
+             environments other than the simulator. For more information, see the
+             appropriate hardware document.
+
+  @datatypes
+  #qurt_anysignal_t
+
+  @param[in] int_num     L2VIC interrupt to register; valid range is 0 to 1023.
+  @param[in] int_signal  Any-signal object to wait on (Section @xref{dox:any_signals}).
+  @param[in] signal_mask Signal mask value indicating signal to receive the interrupt.
+  @param[in] flags       Defines the interrupt property; the supported property is interrupt lock enable/disable.
+                         Possible values for flags: \n
+                         - #QURT_INT_LOCK_ENABLE
+                         - #QURT_INT_LOCK_DISABLE @tablebulletend
+
+  @return
+  #QURT_EOK -- Interrupt successfully registered.\n
+  #QURT_EINT -- Invalid interrupt number. \n
+  #QURT_ESIG -- Invalid signal bitmask (cannot set more than one
+                signal at a time). \n
+  #QURT_EVAL -- Interrupt already registered.
+
+  @dependencies
+  None.
+*/
+ unsigned int qurt_interrupt_register2(int int_num, qurt_anysignal_t *int_signal, int signal_mask, unsigned int flags);
+/*
+ * Waits for registered interrupt signal
+
+ * Suspends the current thread until one of its registered interrupts occurs. The second input,
+ * mask, contains the interrupt signals that the IST expects to receive. The interrupt signals are
+ * registered with interrupts via the qurt_interrupt_register() API.
+ *
+ * The signals returned in the signal variable indicate which interrupts occurred. Use the function
+ * qurt_anysignal_get() to read the signals. The IST must locally maintain a table that maps a signal to
+ * a specific interrupt. The IST also checks whether the signal #SIG_INT_ABORT is received; if so, the IST
+ * must quit its interrupt-receiving loop.
+ *
+ * For detailed information on this API, see the QuRT User Manual, Section 4.2.5.
+ *
+ * Prototype
+ *
+ * unsigned int qurt_anysignal_wait(qurt_anysignal_t *int_signal, unsigned int mask)
+ */
+
+/**@ingroup func_qurt_interrupt_acknowledge
+  Acknowledges an interrupt after it has been processed.\n
+  Re-enables an interrupt and clears its pending status. This is done after an interrupt is
+  processed by an IST.
+
+  Interrupts are automatically disabled after they occur. To re-enable an interrupt, an IST
+  performs the acknowledge operation after it has finished processing the interrupt and
+  just before suspending itself (such as by waiting on the interrupt signal).
+
+  @note1hang To prevent losing or reprocessing subsequent occurrences of the interrupt,
+             an IST must clear the interrupt signal (Section @xref{sec:anysignal_clear}) before
+             acknowledging the interrupt.
+
+  @param[in] int_num Interrupt that is being re-enabled.
+
+  @return
+  #QURT_EOK -- Interrupt acknowledge was successful. \n
+  #QURT_EDEREGISTERED -- Interrupt is already deregistered.
+
+  @dependencies
+  None.
+*/
+int qurt_interrupt_acknowledge(int int_num);
+
+/**@ingroup func_qurt_interrupt_deregister
+  Disables the specified interrupt and disassociates it from a QuRT signal object.
+  If the specified interrupt was never registered (Section @xref{sec:interrupt_register}), the deregister operation
+  returns the status value #QURT_EINT.
+
+  @note1hang If an interrupt is deregistered while an IST waits
+             to receive it, the IST might wait indefinitely for the interrupt to occur. To avoid
+             this problem, the QuRT kernel sends the signal #SIG_INT_ABORT to awaken an
+             IST after determining that it has no interrupts registered.
+
+  @param[in] int_num L2VIC interrupt to deregister; valid range is 0 to 1023.
+
+  @return
+  #QURT_EOK -- Success.\n
+  #QURT_EINT -- Invalid interrupt number (not registered).
+
+  @dependencies
+  None.
+
+*/
+unsigned int qurt_interrupt_deregister(int int_num);
+/** @endcond */
+
+/**@ingroup func_qurt_interrupt_disable
+  Disables an interrupt with its interrupt number.\n
+  The interrupt must be registered prior to calling this function.
+  After qurt_interrupt_disable() returns, the Hexagon subsystem
+  can no longer send the corresponding interrupt to the Hexagon
+  core, until qurt_interrupt_enable() is called
+  for the same interrupt.
+
+  Avoid calling qurt_interrupt_disable() and qurt_interrupt_enable() frequently within
+  a short period of time.\n
+  - A pending interrupt can already be in the Hexagon core when qurt_interrupt_disable()
+    is called. Therefore, some time later, the pending interrupt is received on a Hexagon
+    hardware thread.\n
+  - After the Hexagon subsystem sends an interrupt to the Hexagon core, the Hexagon
+    hardware automatically disables the interrupt until kernel software re-enables the interrupt
+    at the interrupt acknowledgement stage. If qurt_interrupt_enable() is called from a certain
+    thread at an earlier time, the interrupt is re-enabled earlier and can trigger
+    sending a new interrupt to the Hexagon core while kernel software is still processing
+    the previous interrupt.
+
+  @param[in] int_num Interrupt number.
+
+  @return
+  #QURT_EOK -- Interrupt successfully disabled.\n
+  #QURT_EINT -- Invalid interrupt number.\n
+  #QURT_ENOTALLOWED -- Interrupt is locked. \n
+  #QURT_EVAL -- Interrupt is not registered.
+
+  @dependencies
+  None.
+*/
+ unsigned int qurt_interrupt_disable(int int_num);
+
+
+/**@ingroup func_qurt_interrupt_enable
+  Enables an interrupt with its interrupt number.\n
+  The interrupt must be registered prior to calling this function.
+
+  @param[in] int_num Interrupt number.
+
+  @return
+  #QURT_EOK -- Interrupt successfully enabled.\n
+  #QURT_EINT -- Invalid interrupt number.\n
+  #QURT_ENOTALLOWED -- Interrupt is locked. \n
+  #QURT_EVAL -- Interrupt is not registered.
+
+  @dependencies
+  None.
+
+*/
+ unsigned int qurt_interrupt_enable(int int_num);
+
+
+/**@ingroup func_qurt_interrupt_status
+  Returns a value that indicates the pending status of the specified interrupt.
+
+  @param[in] int_num Interrupt number that is being checked.
+  @param[out] status Interrupt status; 1 indicates that an interrupt is
+                     pending, 0 indicates that an interrupt is not pending.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EINT -- Failure; invalid interrupt number.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_interrupt_status(int int_num, int *status);
+
+
+/**@ingroup func_qurt_interrupt_get_status
+  Gets the status of the specified interrupt in L2VIC.
+
+  @param[in] int_num     Interrupt number that is being checked.
+  @param[in] status_type 0 -- interrupt pending status \n
+                         1 -- interrupt enabling status
+  @param[out] status     0 -- OFF \n
+                         1 -- ON
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EINT -- Failure; invalid interrupt number.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_interrupt_get_status(int int_num, int status_type, int *status);
+
+/** @cond rest_reg_dist */
+/**@ingroup func_qurt_interrupt_clear
+  Clears the pending status of the specified interrupt.
+
+  @note1hang This operation is intended for system-level use, and must be used with care.
+
+  @param[in] int_num Interrupt whose pending status is being cleared.
+
+  @return
+  #QURT_EOK -- Success.\n
+  #QURT_EINT -- Invalid interrupt number.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_interrupt_clear(int int_num);
+
+
+/**@ingroup func_qurt_interrupt_get_config
+  Gets the L2VIC interrupt configuration. \n
+  This function returns the type and polarity of the specified L2VIC interrupt.
+
+  @param[in] int_num    L2VIC interrupt that is being queried.
+  @param[out] int_type  Pointer to an interrupt type. \n
+                        0 -- Level-triggered interrupt \n
+                        1 -- Edge-triggered interrupt
+  @param[out] int_polarity Pointer to interrupt polarity.\n
+                        0 -- Active-high interrupt \n
+                        1 -- Active-low interrupt
+
+  @return
+  #QURT_EOK -- Configuration successfully returned.\n
+  #QURT_EINT -- Invalid interrupt number.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_interrupt_get_config(unsigned int int_num, unsigned int *int_type, unsigned int *int_polarity);
+
+/**@ingroup func_qurt_interrupt_set_config
+  Sets the type and polarity of the specified L2VIC interrupt.
+
+  @note1hang Deregister L2VIC interrupts before reconfiguring them.
+
+  @param[in] int_num      L2VIC interrupt to configure.
+  @param[in] int_type     Interrupt type. \n
+                          0 -- Level-triggered interrupt\n
+                          1 -- Edge-triggered interrupt
+  @param[in] int_polarity Interrupt polarity. \n
+                          0 -- Active-high interrupt \n
+                          1 -- Active-low interrupt
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_ENOTALLOWED -- Not allowed; the interrupt is currently registered.\n
+  #QURT_EINT -- Invalid interrupt number.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_interrupt_set_config(unsigned int int_num, unsigned int int_type, unsigned int int_polarity);
+
+/**@ingroup func_qurt_interrupt_set_config2
+  Sets the type and polarity of the specified L2VIC interrupt.
+
+  @note1hang L2VIC interrupts must be deregistered before they can be reconfigured.
+
+  @param[in] int_num  L2VIC interrupt to configure.
+  @param[in] int_type Notified to the hardware configuration callback function and used to
+                      modify the L2VIC type. Possible values: \n
+                      - #QURT_INT_TRIGGER_USE_DEFAULT \n
+                      - #QURT_INT_TRIGGER_LEVEL_HIGH \n
+                      - #QURT_INT_TRIGGER_LEVEL_LOW \n
+                      - #QURT_INT_TRIGGER_RISING_EDGE \n
+                      - #QURT_INT_TRIGGER_FALLING_EDGE \n
+                      - #QURT_INT_TRIGGER_DUAL_EDGE @tablebulletend
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_ENOTALLOWED -- Not allowed; the interrupt is currently registered.\n
+  #QURT_EINT -- Invalid interrupt number.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_interrupt_set_config2(unsigned int int_num, unsigned int int_type);
+
+/**@ingroup func_qurt_interrupt_set_config3
+  Sets the specified configuration value for the specified property of the specified L2VIC interrupt.
+
+  @note1hang L2VIC interrupts must be deregistered before they can be reconfigured for polarity.
+
+  @param[in] int_num    L2VIC interrupt to configure.
+  @param[in] config_id  Property to configure: \n
+                        - #QURT_INT_CONFIGID_POLARITY \n
+                        - #QURT_INT_CONFIGID_LOCK @tablebulletend
+  @param[in] config_val Dependent on the second argument config_id, specifies the value to set. \n
+                        Values for #QURT_INT_CONFIGID_POLARITY: \n
+                        - #QURT_INT_TRIGGER_USE_DEFAULT \n
+                        - #QURT_INT_TRIGGER_LEVEL_HIGH \n
+                        - #QURT_INT_TRIGGER_LEVEL_LOW \n
+                        - #QURT_INT_TRIGGER_RISING_EDGE \n
+                        - #QURT_INT_TRIGGER_FALLING_EDGE \n
+                        - #QURT_INT_TRIGGER_DUAL_EDGE \n
+
+                        Values for #QURT_INT_CONFIGID_LOCK: \n
+                        - #QURT_INT_LOCK_ENABLE\n
+                        - #QURT_INT_LOCK_DISABLE @tablebulletend
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_ENOTALLOWED -- Not allowed; the interrupt is currently registered or is locked for enable/disable.\n
+  #QURT_EINT -- Invalid interrupt number.
+
+  @dependencies
+  None.
+*/
+unsigned int qurt_interrupt_set_config3(unsigned int int_num, unsigned int config_id, unsigned int config_val);
+
+
+/**@ingroup func_qurt_interrupt_raise
+  Raises the interrupt. \n
+  This function triggers a level-triggered L2VIC
+  interrupt, and accepts interrupt numbers in the range of 0 to 1023.
+
+  @param[in] interrupt_num Interrupt number.
+
+  @return
+  #QURT_EOK -- Success \n
+  -1 -- Failure; the interrupt is not supported.
+
+  @dependencies
+  None.
+ */
+int qurt_interrupt_raise(unsigned int interrupt_num);
+
+/**@ingroup func_qurt_interrupt_raise2
+  Raises the interrupt and returns the current pcycle value.
+
+  @param[in] interrupt_num Interrupt number.
+
+  @return
+  0xFFFFFFFFFFFFFFFF -- Failure; the interrupt is not supported.\n
+  Other value -- pcycle count at the time the interrupt is raised.
+
+  @dependencies
+  None.
+ */
+unsigned long long qurt_interrupt_raise2(unsigned int interrupt_num);
+/** @endcond */
+
+/** @cond internal_only */
+/**@ingroup func_qurt_isr_subcall
+  Indicates whether the current function is called from a callback procedure (either short or long).
+
+  @return
+  #QURT_EOK -- TRUE \n
+  #QURT_EVAL -- FALSE.
+
+  @dependencies
+  None.
+ */
+int qurt_isr_subcall(void);
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_INT_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_island.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_island.h
new file mode 100755
index 0000000000000..f0c8ee27cf8b0
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_island.h
@@ -0,0 +1,122 @@
+#ifndef QURT_ISLAND_H
+#define QURT_ISLAND_H
+
+/**
+  @file qurt_island.h
+  @brief Prototypes of the power API.
+  The APIs allow entering and exiting Island mode, where memory
+  accesses are limited to local memory.
+
+  EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2018-2021,2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+
+=============================================================================*/
+
+#include
+#include
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup func_qurt_island_get_status
+  Gets Island mode status.
+
+  Returns a value that indicates whether the QuRT system executes in Island mode.
+
+  @return
+  0 - Normal mode. \n
+  1 - Island mode.
+
+  @dependencies
+  None.
+*/
+unsigned int qurt_island_get_status (void);
+
+/**@ingroup func_qurt_island_get_status2
+  Gets the Island mode status, differentiating between island partial exit and complete exit.
+
+  Returns a value that indicates the current state.
+
+  @note1hang The transition from NORMAL mode to ISLAND mode happens in single-threaded
+             mode, whereas transitions from ISLAND mode to other modes
+             happen in multi-threaded mode. Therefore, a thread that reads the island mode
+             status as NORMAL can assume the same status for as long as it continues to
+             run. A thread that reads the island mode status as ISLAND should
+             assume that the status may change to EXITING or NORMAL while it
+             runs. A thread that reads the island mode status as EXITING should
+             assume that the status may change to NORMAL while it runs. If
+             the thread goes into a wait state after reading the status, it should read
+             the island mode state again and not assume the previous state.
+  @note2hang This API returns more intrinsic states than qurt_island_get_status();
+             when qurt_island_get_status() returns 0, this API can return
+             QURT_ISLAND_MODE_EXITING or QURT_ISLAND_MODE_ISLAND.
+
+  @param[in/out] data Field reserved for future use. If a NULL pointer is passed,
+             the field is ignored. If a valid pointer is passed,
+             QuRT returns a bitmask that can be interpreted as follows:
+             data[31] - Valid bit.
Set to 1 to indicate data[30:0] are valid.
+             Otherwise set to 0.
+             data[30:0] -- Reserved for future definition.
+
+  @return
+  QURT_ISLAND_MODE_NORMAL - Normal mode \n
+  QURT_ISLAND_MODE_ISLAND - Island mode \n
+  QURT_ISLAND_MODE_EXITING - Exiting Island mode \n
+
+  @dependencies
+  None.
+*/
+unsigned int qurt_island_get_status2 (unsigned int *data);
+
+
+
+/**@ingroup func_qurt_island_get_exit_status
+  Gets the reason for the last Island mode exit.
+
+  @param[out] cause_code Pointer that returns the cause code of the last
+              island exit reason. \n
+              - #QURT_EISLANDUSEREXIT -- Island exit due to user call for island exit.\n
+              - #QURT_ENOISLANDENTRY -- API called before exiting island. \n
+              - #QURT_EISLANDINVALIDINT -- Island exit due to an invalid interrupt in Island mode. @tablebulletend
+
+  @param[out] int_num Pointer that holds the invalid interrupt number that caused
+              island exit when the cause code is #QURT_EISLANDINVALIDINT.
+              For other cases, it is -1.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_island_get_exit_status(unsigned int *cause_code, int *int_num);
+
+/**@ingroup func_qurt_island_get_enter_timestamp
+  Gets the most recent timestamp recorded when the system exited STM during island entry.
+
+  @param[out] island_enter_timestamp Returns a pointer to the recent timestamp
+  recorded after the system exits STM during island entry. If the system never
+  attempted to enter island, the island_enter_timestamp pointer holds a value
+  of zero.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_island_get_enter_timestamp(unsigned long long *island_enter_timestamp);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_ISLAND_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_isr.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_isr.h
new file mode 100755
index 0000000000000..db29ea2f265d7
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_isr.h
@@ -0,0 +1,177 @@
+#ifndef QURT_ISR_H
+#define QURT_ISR_H
+
+/*=====================================================================
+
+  @file qurt_isr.h
+
+  @brief Prototypes of QuRT ISR API functions
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2017, 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+  ======================================================================*/
+
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+ Functions
+=============================================================================*/
+
+
+/**@ingroup func_qurt_isr_set_hw_config_callback
+  Sets the callback function for the configuration related to interrupt hardware.
+  In a process, the callback function can be set only once.
+
+  @param[in] cb_addr Address of the callback function.
+
+  @return
+  #QURT_EOK -- The callback function is set successfully. \n
+  #QURT_EFAILED -- Failure. The callback function has been set before.
+
+  @dependencies
+  None.
+ */
+int qurt_isr_set_hw_config_callback(unsigned int cb_addr);
+
+
+/**@ingroup func_qurt_isr_set_hw_enable_callback
+  Sets the callback function for enabling the configuration related to interrupt hardware.
+  In a process, the callback function can be set only once.
+
+  @param[in] cb_addr Address of the callback function.
+
+  @return
+  #QURT_EOK -- The callback function is set successfully. \n
+  #QURT_EFAILED -- Failure. The callback function has been set before.
+
+  @dependencies
+  None.
+ */
+int qurt_isr_set_hw_enable_callback(unsigned int cb_addr);
+
+
+/**@ingroup func_qurt_isr_set_hw_disable_callback
+  Sets the callback function for disabling the configuration related to interrupt hardware.
+  In a process, the callback function can be set only once.
+
+  @param[in] cb_addr Address of the callback function.
+
+  @return
+  #QURT_EOK -- The callback function is set successfully. \n
+  #QURT_EFAILED -- Failure. The callback function has been set before.
+
+  @dependencies
+  None.
+ */
+int qurt_isr_set_hw_disable_callback(unsigned int cb_addr);
+
+
+/**@ingroup func_qurt_isr_create
+  Creates an ISR thread with the specified attributes, and makes it executable.
+
+  @datatypes
+  #qurt_thread_t \n
+  #qurt_thread_attr_t
+
+  @param[out] thread_id Returns a pointer to the thread identifier if the thread was
+                        successfully created.
+  @param[in]  pAttr     Pointer to the initialized thread attribute structure that specifies
+                        the attributes of the created thread.
+
+  @return
+  #QURT_EVAL -- Invalid arguments. \n
+  #QURT_EOK -- Thread created. \n
+  #QURT_EFAILED -- Thread not created.
+
+  @dependencies
+  None.
+ */
+int qurt_isr_create (qurt_thread_t *thread_id, qurt_thread_attr_t *pAttr);
+
+/**@ingroup func_qurt_isr_register2
+  Registers an interrupt service routine (ISR) callback, with the specified attributes, to an ISR thread.
+  The interrupt is enabled when this function returns success.
+
+  @datatypes
+  qurt_thread_t
+
+  @param[in] isr_thread_id ISR thread ID, returned from qurt_isr_create().
+  @param[in] int_num       The interrupt number.
+  @param[in] prio          Priority of the ISR.
+  @param[in] flags         Defines the ACK type. Values: \n
+                           QURT_INT_NON_DELAYED_ACK - ISR is acknowledged by the interrupt handling routine
+                           in the kernel.
+                           QURT_INT_DELAYED_ACK - The client chooses to acknowledge.
+  @param[in] int_type      Trigger type, notified to the registered function. Values: \n
+                           - QURT_INT_TRIGGER_USE_DEFAULT
+                           - QURT_INT_TRIGGER_LEVEL_HIGH
+                           - QURT_INT_TRIGGER_LEVEL_LOW
+                           - QURT_INT_TRIGGER_RISING_EDGE
+                           - QURT_INT_TRIGGER_FALLING_EDGE
+                           - QURT_INT_TRIGGER_DUAL_EDGE
+  @param[in] isr           Interrupt service routine with prototype void isr(void *arg, int int_num).
+  @param[in] arg           First argument passed to the ISR when it is called to service the interrupt.
+
+  @return
+  QURT_EOK -- Successfully registered the ISR for the interrupt. \n
+  QURT_EINT -- Interrupt not configured. \n
+  QURT_EINVALID -- Invalid thread ID. \n
+  QURT_EDISABLED -- The feature is disabled. \n
+  QURT_EDUPLICATE -- Interrupt is already registered.
+
+  @dependencies
+  The thread ID must be created using qurt_isr_create().
+ */
+int qurt_isr_register2 (qurt_thread_t isr_thread_id, int int_num, unsigned short prio, unsigned short flags, unsigned int int_type, void (*isr) (void *, int), void *arg);
+
+/**@ingroup func_qurt_isr_deregister2
+  Deregisters the ISR for the specified interrupt.
+  The interrupt is disabled when this function returns success.
+
+  @param[in] int_num The interrupt number.
+
+  @return
+  QURT_EOK -- ISR deregistered successfully. \n
+  QURT_ENOREGISTERED -- Interrupt with int_num is not registered.
+
+  @dependencies
+  None.
+ */
+int qurt_isr_deregister2 (int int_num);
+
+/**@ingroup func_qurt_isr_delete
+  Makes the ISR thread exit, and releases its kernel resources.
+
+  @note1hang The ISR thread must not be actively processing interrupts;
+             otherwise the call fails and returns an error.
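+
+  A typical lifecycle, sketched (the attributes, priority, and interrupt
+  number are illustrative; qurt_thread_attr_init() is assumed from
+  qurt_thread.h, and error handling is elided):
+
+  @code
+  static void my_isr(void *arg, int int_num)
+  {
+      (void)arg; (void)int_num; /* service the interrupt */
+  }
+
+  void isr_lifecycle_example(void)
+  {
+      qurt_thread_t tid;
+      qurt_thread_attr_t attr;
+      qurt_thread_attr_init(&attr);
+      /* ... set the stack and other attributes as required ... */
+
+      if (qurt_isr_create(&tid, &attr) == QURT_EOK) {
+          (void)qurt_isr_register2(tid, 23, 100, QURT_INT_ACK_DEFAULT,
+                                   QURT_INT_TRIGGER_USE_DEFAULT, my_isr, NULL);
+          /* ... interrupt 23 is serviced by my_isr() on the ISR thread ... */
+          (void)qurt_isr_deregister2(23);
+          (void)qurt_isr_delete(tid);
+      }
+  }
+  @endcode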
+
+  @param[in] isr_tid  Thread ID of the ISR thread to delete.
+
+  @return
+  QURT_ENOTALLOWED -- ISR thread is processing an interrupt. \n
+  QURT_EINVALID -- Invalid ISR thread ID. \n
+  QURT_EOK -- Success.
+
+  @dependencies
+  The thread ID must be created with qurt_isr_create().
+ */
+int qurt_isr_delete (qurt_thread_t isr_tid);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_ISR_H */
+
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_l2cfg.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_l2cfg.h
new file mode 100755
index 0000000000000..7e26b30a580d9
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_l2cfg.h
@@ -0,0 +1,98 @@
+#ifndef QURT_L2CFG_H
+#define QURT_L2CFG_H
+/**
+  @file qurt_l2cfg.h
+  @brief QuRT APIs for L2 configuration and system configuration
+
+EXTERNAL FUNCTIONS
+   qurt_l2cfg_set
+   qurt_l2cfg_get
+   qurt_system_config_get
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2019-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+        CONSTANTS AND MACROS
+=============================================================================*/
+
+/* Definition for system configuration */
+/** @addtogroup l2cfg_macros
+@{ */
+#define QURT_CORE_CFG_HMX_INT8_SPATIAL  0x78   /**< HMX fixed-point spatial size */
+#define QURT_CORE_CFG_HMX_INT8_DEPTH    0x7C   /**< HMX fixed-point output depth */
+/** @} */ /* end_addtogroup l2cfg_macros */
+/*=============================================================================
+        FUNCTIONS
+=============================================================================*/
+/**@ingroup func_qurt_l2cfg_set
+  Sets the value of an L2 configuration register. A register can be set *IFF* its
+  initial value is configured.
+
+  @param[in] offset  Offset of the L2 configuration register; must be a multiple of 4.
+  @param[in] value   Value to set the register to.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EFAILED -- Internal mapping that covers the L2CFG register file is absent; likely
+                   a configuration problem. \n
+  #QURT_EINVALID -- Argument error. \n
+  #QURT_ENOTALLOWED -- Setting this register is prohibited.
+
+  @dependencies
+  None.
+ */
+int qurt_l2cfg_set (unsigned short offset, unsigned int value);
+
+/**@ingroup func_qurt_l2cfg_get
+  Gets the value of an L2 configuration register.
+
+  @param[in]  offset  Offset of the L2 configuration register; must be a multiple of 4.
+  @param[out] value   Pointer to the value of the register.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EFAILED -- Internal mapping that covers the L2CFG register file is absent;
+                   likely a configuration problem. \n
+  #QURT_EINVALID -- Argument error.
+
+  @dependencies
+  None.
+
+ */
+int qurt_l2cfg_get (unsigned short offset, unsigned int * value);
+
+
+/**@ingroup func_qurt_system_config_get
+  Gets the system configuration information.
+
+  @param[in]  index  Index to the system configuration. Values:\n
+          - #QURT_CORE_CFG_HMX_INT8_SPATIAL \n
+          - #QURT_CORE_CFG_HMX_INT8_DEPTH @tablebulletend
+
+  @param[out] data  Pointer to a word for the returned data.
+
+  @return
+  #QURT_EOK -- Configuration data retrieved successfully. \n
+  Other values -- Failure (no such configuration available).
+
+  @dependencies
+  None.
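+
+  For illustration, a caller might query the HMX spatial size as in the
+  following sketch (only APIs and macros declared in this header are used):
+
+  @code
+  unsigned int hmx_spatial = 0;
+
+  if (qurt_system_config_get(QURT_CORE_CFG_HMX_INT8_SPATIAL, &hmx_spatial) == QURT_EOK) {
+      // hmx_spatial now holds the HMX fixed-point spatial size.
+  }
+  @endcode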
+
+ */
+int qurt_system_config_get(int index, unsigned int *data);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_L2CFG_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_lifo.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_lifo.h
new file mode 100755
index 0000000000000..dc399fccc5f0f
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_lifo.h
@@ -0,0 +1,71 @@
+#ifndef QURT_LIFO_H
+#define QURT_LIFO_H
+/**
+  @file qurt_lifo.h
+
+  @brief
+  Provides a lock-free last-in first-out (LIFO) algorithm, which can be used in a
+  variety of situations to allocate and free fixed-size buffers.
+  This implementation touches the first word of a FREED buffer. Although the
+  first word can be used freely while the buffer is allocated, avoid placing a
+  MAGIC number in the first field, because it will not hold the magic value
+  while the buffer is freed.
+
+  EXTERNALIZED FUNCTIONS
+  None
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None
+
+  Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ /*=====================================================================
+  Functions
+ ======================================================================*/
+
+/*======================================================================*/
+/**
+  Pops an element off the LIFO.
+
+  @param[in] freelist  Pointer to the head of the list.
+
+  @return
+  Top object from the list.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+void * qurt_lifo_pop(void *freelist);
+
+
+/*======================================================================*/
+/**
+  Pushes an element onto the LIFO.
+
+  @param[in] freelist  Pointer to the head of the list.
+  @param[in] buf       Pointer to the buffer to push onto the list.
+
+  @return
+  None.
+
+  @dependencies
+  None.
*/
+/* ======================================================================*/
+void qurt_lifo_push(void *freelist, void *buf);
+
+void qurt_lifo_remove(void *freelist, void *buf);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_LIFO_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_mailbox.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_mailbox.h
new file mode 100755
index 0000000000000..a6cd91c611782
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_mailbox.h
@@ -0,0 +1,176 @@
+#ifndef QURT_MAILBOX_H
+#define QURT_MAILBOX_H
+
+/**
+  @file qurt_mailbox.h
+  @brief Definitions, macros, and prototypes used for QuRT mailbox
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2015, 2021-2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+        CONSTANTS AND MACROS
+=============================================================================*/
+/* Definitions of typedefs and return values */
+
+#define QURT_MAILBOX_ID_NULL          0
+#define QURT_MAILBOX_ERROR           -1
+#define QURT_MAILBOX_ID_ERROR        -2
+#define QURT_MAILBOX_NON_VALID_DATA  -3
+#define QURT_MAILBOX_FULL            -4
+#define QURT_MAILBOX_DELETED         -5
+#define QURT_MAILBOX_RECEIVE_HALTED  -6
+#define QURT_MAILBOX_BANDWIDTH_LIMIT -7
+
+
+/*=============================================================================
+        FORWARD DECLARATIONS & TYPEDEFS
+=============================================================================*/
+
+#define QURT_MAILBOX_AT_QURTOS   0U   // Receiver is QuRT OS
+#define QURT_MAILBOX_AT_ROOTPD   1U   // Receiver is Root PD (ASID=0)
+#define QURT_MAILBOX_AT_USERPD   2U   // Receiver is User PD (ASID!=0)
+#define QURT_MAILBOX_AT_SECUREPD 3U   // Receiver is Secure PD
+
+typedef unsigned char qurt_mailbox_receiver_cfg_t;
+
+#define QURT_MAILBOX_SEND_OVERWRITE     0U   // When there is already valid content, overwrite it
+#define QURT_MAILBOX_SEND_NON_OVERWRITE 1U   // When there is already valid content, return failure
+
+typedef unsigned char qurt_mailbox_send_option_t;
+
+
+#define QURT_MAILBOX_RECV_WAITING          0U   // When there is no valid content, wait for it
+#define QURT_MAILBOX_RECV_NON_WAITING      1U   // When there is no valid content, return failure immediately
+#define QURT_MAILBOX_RECV_PEEK_NON_WAITING 2U   // Read the content, but do not remove it from the mailbox. No waiting.
+
+typedef unsigned char qurt_mailbox_recv_option_t;
+
+
+/*=============================================================================
+        EXTERNS & FUNCTIONS
+=============================================================================*/
+/* Function prototype */
+
+/**@ingroup qurt_mailbox_create
+  Creates a QuRT mailbox.
+
+  @param name      Mailbox name, up to 8 characters.
+  @param recv_opt  Configuration of the receiver process.
+
+  @return
+  Mailbox ID -- Mailbox identifier \n
+  #QURT_MAILBOX_ID_NULL -- NULL, failure to create the mailbox
+
+  @dependencies
+  None.
+*/
+unsigned long long qurt_mailbox_create(char *name, qurt_mailbox_receiver_cfg_t recv_opt);
+
+
+/**@ingroup qurt_mailbox_get_id
+  Gets a QuRT mailbox identifier.
+
+  @param name  Mailbox name, up to 8 characters.
+
+  @return
+  Mailbox ID -- Mailbox identifier \n
+  #QURT_MAILBOX_ID_NULL -- NULL, failure to get the mailbox ID
+
+  @dependencies
+  None.
+*/
+unsigned long long qurt_mailbox_get_id(char *name);
+
+
+/**@ingroup qurt_mailbox_send
+  Sends data to a QuRT mailbox.
+
+  @param mailbox_id  Mailbox identifier.
+  @param send_opt    Option for mailbox send.
+  @param data        Data to send.
+
+
+  @return
+  #QURT_EOK                      Success \n
+  #QURT_MAILBOX_ID_ERROR         Mailbox ID error.\n
+  #QURT_MAILBOX_ERROR            Other errors.\n
+  #QURT_MAILBOX_FULL             Valid data already exists, non-overwriting.\n
+  #QURT_MAILBOX_BANDWIDTH_LIMIT  Reached the bandwidth limit.
+
+  @dependencies
+  None.
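+
+  For illustration only, a minimal round trip through a mailbox might look like
+  the following sketch; the mailbox name "my_mbox" and the receiver
+  configuration are placeholders:
+
+  @code
+  void mailbox_example(void)
+  {
+      unsigned long long id = qurt_mailbox_create("my_mbox", QURT_MAILBOX_AT_USERPD);
+      unsigned long long value = 0;
+
+      if (id == QURT_MAILBOX_ID_NULL) return;
+
+      (void)qurt_mailbox_send(id, QURT_MAILBOX_SEND_NON_OVERWRITE, 0x1234ULL);
+
+      if (qurt_mailbox_receive(id, QURT_MAILBOX_RECV_WAITING, &value) == QURT_EOK) {
+          // value now holds the 64-bit payload.
+      }
+      (void)qurt_mailbox_delete(id);
+  }
+  @endcode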
+*/
+int qurt_mailbox_send(unsigned long long mailbox_id, qurt_mailbox_send_option_t send_opt, unsigned long long data);
+
+
+/**@ingroup qurt_mailbox_receive
+  Receives data from a QuRT mailbox.
+
+  @param mailbox_id  Mailbox identifier.
+  @param recv_opt    Option for mailbox receive.
+  @param data        Pointer to the data buffer for receiving.
+
+  @return
+  #QURT_EOK                     Success \n
+  #QURT_MAILBOX_ID_ERROR        Mailbox ID error. \n
+  #QURT_MAILBOX_ERROR           Other errors. \n
+  #QURT_MAILBOX_NON_VALID_DATA  No current valid data; the previous content is placed in the buffer. \n
+  #QURT_MAILBOX_RECEIVE_HALTED  Receive halted, the waiting thread is woken up. \n
+  #QURT_MAILBOX_DELETED         Mailbox is deleted, and the waiting thread is woken up.
+
+  @dependencies
+  None.
+*/
+int qurt_mailbox_receive(unsigned long long mailbox_id, qurt_mailbox_recv_option_t recv_opt, unsigned long long *data);
+
+
+/**@ingroup qurt_mailbox_delete
+  Deletes a QuRT mailbox.
+
+  A mailbox can only be deleted from the process that created the mailbox.
+
+  @param mailbox_id  Mailbox identifier.
+
+  @return
+  #QURT_EOK               Success. \n
+  #QURT_MAILBOX_ID_ERROR  Mailbox ID error. \n
+  #QURT_MAILBOX_ERROR     Other errors.
+
+  @dependencies
+  None.
+*/
+int qurt_mailbox_delete(unsigned long long mailbox_id);
+
+
+/**@ingroup qurt_mailbox_receive_halt
+  Halts QuRT mailbox receiving and wakes up waiting threads.
+
+  @param mailbox_id  Mailbox identifier.
+
+  @return
+  #QURT_EOK               Success. \n
+  #QURT_MAILBOX_ID_ERROR  Mailbox ID error.\n
+  #QURT_MAILBOX_ERROR     Other errors.
+
+  @dependencies
+  None.
+*/
+int qurt_mailbox_receive_halt(unsigned long long mailbox_id);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif  // QURT_MAILBOX_H
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_memory.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_memory.h
new file mode 100755
index 0000000000000..90ce2586fec50
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_memory.h
@@ -0,0 +1,1487 @@
+#ifndef QURT_MEMORY_H
+#define QURT_MEMORY_H
+/**
+  @file qurt_memory.h
+  @brief Prototypes of kernel memory API functions.
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) Qualcomm Technologies, Inc.
+  All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+
+#include
+#include
+//#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup memory_management_macros
+@{ */
+#define QURT_SYSTEM_ALLOC_VIRTUAL 1  /**< Allocates available virtual memory in the address space of all
+                                          processes.*/
+/** @} */ /* end_addtogroup memory_management_macros */
+/**@cond rest_reg_dist */
+/** @addtogroup memory_management_types
+@{ */
+/** @xreflabel{hdr:qurt_mem_default_pool} */
+extern qurt_mem_pool_t qurt_mem_default_pool __attribute__((section(".data")));  /**< Memory pool object.*/
+/** @} */ /* end_addtogroup memory_management_types */
+
+/** @cond rest_reg_dist */
+/** Mapping attribute information*/
+typedef struct{
+    qurt_paddr_64_t paddr;
+    qurt_size_t size;
+    qurt_mem_cache_mode_t cache_mode;
+    qurt_perm_t perms;
+}qurt_mapping_attr_t;
+/** @endcond */
+/** @} */ /* end_addtogroup mapping_attribute_types*/
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+/**@ingroup func_qurt_mem_cache_clean
+  Performs a cache clean operation on the data stored in the specified memory area.
+  Performs a syncht on all the data cache operations when the Hexagon processor version is V60 or greater.
+
+  @note1hang Perform the flush all operation only on the data cache.
+
+  @note1cont This operation flushes and invalidates the contents of all cache lines from the start address
+             to the end address (start address + size). The contents of an adjoining buffer can be
+             flushed and invalidated if they fall in any of the cache lines.
+
+  @datatypes
+  #qurt_addr_t \n
+  #qurt_size_t \n
+  #qurt_mem_cache_op_t \n
+  #qurt_mem_cache_type_t
+
+  @param[in] addr    Address of data to flush.
+  @param[in] size    Size (in bytes) of data to flush.
+  @param[in] opcode  Type of cache clean operation. Values:
+                     - #QURT_MEM_CACHE_FLUSH
+                     - #QURT_MEM_CACHE_INVALIDATE
+                     - #QURT_MEM_CACHE_FLUSH_INVALIDATE
+                     - #QURT_MEM_CACHE_FLUSH_ALL\n
+                     @note1 #QURT_MEM_CACHE_FLUSH_ALL is valid only when the type is #QURT_MEM_DCACHE @tablebulletend
+  @param[in] type    Cache type. Values:
+                     - #QURT_MEM_ICACHE
+                     - #QURT_MEM_DCACHE @tablebulletend
+
+  @return
+  #QURT_EOK -- Cache operation performed successfully.\n
+  #QURT_EVAL -- Invalid cache type.\n
+
+  @dependencies
+  None.
+*/
+int qurt_mem_cache_clean(qurt_addr_t addr, qurt_size_t size, qurt_mem_cache_op_t opcode, qurt_mem_cache_type_t type);
+
+/**@ingroup func_qurt_mem_cache_clean2
+  Performs a data cache clean operation on the data stored in the specified memory area.
+
+  This API only performs the following data cache operations:\n
+  - #QURT_MEM_CACHE_FLUSH\n
+  - #QURT_MEM_CACHE_INVALIDATE\n
+  - #QURT_MEM_CACHE_FLUSH_INVALIDATE -- flushes/invalidates the contents of all cache lines from the start address
+    to the end address (start address + size). The contents of an adjoining buffer can be
+    flushed/invalidated if they fall in any of the cache lines.
+
+  @datatypes
+  #qurt_addr_t \n
+  #qurt_size_t \n
+  #qurt_mem_cache_op_t \n
+  #qurt_mem_cache_type_t
+
+  @param[in] addr    Address of data to flush.
+  @param[in] size    Size (in bytes) of data to flush.
+  @param[in] opcode  Type of cache clean operation. Values:\n #QURT_MEM_CACHE_FLUSH\n #QURT_MEM_CACHE_INVALIDATE\n
+                     #QURT_MEM_CACHE_FLUSH_INVALIDATE
+  @param[in] type    Cache type. Values: \n #QURT_MEM_DCACHE
+
+  @return
+  #QURT_EOK -- Cache operation performed successfully.\n
+  #QURT_EVAL -- Invalid cache type.
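+
+  For illustration, flushing a buffer before handing it to a peripheral might
+  look like the following sketch (the buffer name, size, and alignment are
+  placeholders):
+
+  @code
+  static unsigned char dma_buf[256] __attribute__((aligned(32)));
+
+  int rc = qurt_mem_cache_clean2((qurt_addr_t)dma_buf, sizeof(dma_buf),
+                                 QURT_MEM_CACHE_FLUSH, QURT_MEM_DCACHE);
+  // rc is QURT_EOK on success, QURT_EVAL for an invalid cache type.
+  @endcode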
+ + @dependencies + None. +*/ +int qurt_mem_cache_clean2(qurt_addr_t addr, qurt_size_t size, qurt_mem_cache_op_t opcode, qurt_mem_cache_type_t type); + +/**@ingroup func_qurt_mem_cache_phys_clean + Performs a cache clean operation on the data stored in the specified memory area based on address match and mask. + Operate on a cache line when (LINE.PhysicalPageNumber & mask) == addrmatch. + + @note1hang The addrmatch value should be the upper 24-bit physical address to match against. + + @datatypes + #qurt_mem_cache_op_t \n + + @param[in] mask 24-bit address mask. + @param[in] addrmatch Physical page number (24 bits) of memory to use as an address match. + @param[in] opcode Type of cache clean operation. Values: + - #QURT_MEM_CACHE_FLUSH + - #QURT_MEM_CACHE_INVALIDATE @tablebulletend + + @return + #QURT_EOK -- Cache operation performed successfully.\n + #QURT_EVAL -- Invalid operation + + @dependencies + None. +*/ + +int qurt_mem_cache_phys_clean(unsigned int mask, unsigned int addrmatch, qurt_mem_cache_op_t opcode); + +/**@ingroup func_qurt_mem_l2cache_line_lock + Performs an L2 cache line locking operation. This function locks selective lines in the L2 cache memory. + + @note1hang Perform the line lock operation only on the 32-byte aligned size and address. + + @datatypes + #qurt_addr_t \n + #qurt_size_t + + @param[in] addr Address of the L2 cache memory line to lock; the address must be 32-byte aligned. + @param[in] size Size (in bytes) of L2 cache memory to line lock; size must be a multiple of 32 bytes. + + @return + #QURT_EOK -- Success.\n + #QURT_EALIGN -- Data alignment or address failure. + #QURT_EINVALID -- Improper addr and size passed (e.g. integer overflow due to addr + size) + #QURT_EFAILED -- Failed to lock cache line as all the ways were locked for the corresponding set of an address + in the range of addr and addr+size or the address range is not L2 cacheable + @dependencies + None. +*/ +int qurt_mem_l2cache_line_lock(qurt_addr_t addr, qurt_size_t size); + +/**@ingroup func_qurt_mem_l2cache_line_unlock + Performs an L2 cache line unlocking operation. This function unlocks selective lines in the L2 cache memory. + + @note1hang Perform the line unlock operation only on a 32-byte aligned size and address. + + @datatypes + #qurt_addr_t \n + #qurt_size_t + + @param[in] addr Address of the L2 cache memory line to unlock; the address must be 32-byte aligned. + @param[in] size Size (in bytes) of the L2 cache memory line to unlock; size must be a multiple of 32 bytes. + + @return + #QURT_EOK -- Success. \n + #QURT_EALIGN -- Aligning data or address failure. \n + #QURT_EFAILED -- Operation failed, cannot find the matching tag. + + @dependencies + None. +*/ +int qurt_mem_l2cache_line_unlock(qurt_addr_t addr, qurt_size_t size); + +/**@ingroup func_qurt_mem_region_attr_init + @xreflabel{sec:qurt_mem_region_attr_init} + Initializes the specified memory region attribute structure with default attribute values: \n + - Mapping -- #QURT_MEM_MAPPING_VIRTUAL \n + - Cache mode -- #QURT_MEM_CACHE_WRITEBACK \n + - Physical address -- -1 \n + - Virtual address -- -1 \n + - Memory type -- #QURT_MEM_REGION_LOCAL \n + - Size -- -1 + + @note1hang The memory physical address attribute must be explicitly set by calling the + qurt_mem_region_attr_set_physaddr() function. The size and pool attributes are set directly + as parameters in the memory region create operation. + + @datatypes + #qurt_mem_region_attr_t + + @param[in,out] attr Pointer to the destination structure for the memory region attributes. 
+ + @return + None. + + @dependencies + None. + */ +void qurt_mem_region_attr_init(qurt_mem_region_attr_t *attr); + +/**@ingroup func_qurt_mem_pool_attach + Initializes a memory pool object to attach to a pool predefined in the system + configuration file. + + Memory pool objects assign memory regions to physical memory in different + Hexagon memory units. They are specified in memory region create operations + (Section @xref{sec:mem_region_create}). + + @note1hang QuRT predefines the memory pool object #qurt_mem_default_pool + (Section @xref{dox:mem_management}) for allocation memory regions in SMI memory. The pool attach + operation is necessary only when allocating memory regions in nonstandard + memory units such as TCM. + + @datatypes + #qurt_mem_pool_t + + @param[in] name Pointer to the memory pool name. + @param[out] pool Pointer to the memory pool object. + + @return + #QURT_EOK -- Attach operation successful. + + @dependencies + None. +*/ +int qurt_mem_pool_attach(char *name, qurt_mem_pool_t *pool); + +/**@ingroup func_qurt_mem_pool_attach2 + Gets the identifier that corresponds to a pool object created specifically for a client, for example, HLOS_PHYSPOOL. + The client_handle is used to look up the client specific pool. + + Memory pool objects assign memory regions to physical memory in different + Hexagon memory units. Memory pool objects are specified during mapping creation operations + (qurt_mem_mmap() and qurt_mem_region_create()). + + @note1hang QuRT predefines the memory pool object #qurt_mem_default_pool + (Section @xref{dox:mem_management}) for allocation memory regions in SMI memory. The pool_attach2 + operation is necessary only when allocating memory regions in memory units specific to the client. + + @datatypes + #qurt_mem_pool_t + + @param[in] client_handle Client identifier used by the OS to lookup the identifier + for client specific pool + @param[in] name Pointer to the memory pool name. + @param[out] pool Pointer to the memory pool object. + + @return + #QURT_EOK -- Attach operation successful. + + @dependencies + None. +*/ +int qurt_mem_pool_attach2(int client_handle, char *name, qurt_mem_pool_t *pool); + +/**@ingroup func_qurt_mem_pool_create + @xreflabel{hdr:qurt_mem_pool_create} + Dynamically creates a memory pool object from a physical address range. + + The pool is assigned a single memory region with the specified base address and size. + + The base address and size values passed to this function must be aligned to 4K byte + boundaries, and must be expressed as the actual base address and size values divided by 4K. + + For example, the function call: + @code + qurt_mem_pool_create ("TCM_PHYSPOOL", 0xd8020, 0x20, &pool) + @endcode + ... is equivalent to the following static pool definition in the QuRT system configuration file: + @code + + + + @endcode + + @cond rest_dist For more information on the system configuration file, see @xhyperref{80VB41979,80-VB419-79}. @endcond + + @note1hang Dynamically created pools are not identical to static pools. In particular, + qurt_mem_pool_attr_get() is not valid with dynamically created pools. + + @note1cont Dynamic pool creation permanently consumes system resources, and cannot be undone. + + @datatypes + #qurt_mem_pool_t + + @param[in] name Pointer to the memory pool name. + @param[in] base Base address of the memory region (divided by 4K). + @param[in] size Size (in bytes) of the memory region (divided by 4K). + @param[out] pool Pointer to the memory pool object. + + @return + #QURT_EOK -- Success. 
+ + @dependencies + None. +*/ +int qurt_mem_pool_create(char *name, unsigned base, unsigned size, qurt_mem_pool_t *pool); + +/**@ingroup func_qurt_mem_pool_add_pages + Adds a physical address range to the specified memory pool object.\n + + @note1hang Call this operation only with root privileges (guest OS mode). + + @datatypes + #qurt_mem_pool_t + + @param[in] pool Memory pool object. + @param[in] first_pageno First page number of the physical address range (equivalent to address >> 12) + @param[in] size_in_pages Number of pages in the physical address range (equivalent to size >> 12) + + @return + #QURT_EOK -- Pages successfully added. + + @dependencies + None. +*/ +int qurt_mem_pool_add_pages(qurt_mem_pool_t pool, + unsigned first_pageno, + unsigned size_in_pages); + +/**@ingroup func_qurt_mem_pool_remove_pages + Removes a physical address range from the specified memory pool object. + + If any part of the address range is in use, this operation returns an + error without changing the state. + + @note1hang Call this operation only with root privileges (guest-OS mode). + + @note1cont In the future, this operation will support (via the flags parameter) the + removal of a physical address range when part of the range is in use. + + @datatypes + #qurt_mem_pool_t + + @param[in] pool Memory pool object. + @param[in] first_pageno First page number of the physical address range (equivalent to address >> 12) + @param[in] size_in_pages Number of pages in the physical address range (equivalent to size >> 12) + @param[in] flags Remove options. Values: \n + - 0 -- Skip holes in the range that are not part of the pool (default) \n + - #QURT_POOL_REMOVE_ALL_OR_NONE -- Pages are removed only if the specified + physical address range is entirely contained (with no holes) in the + pool free space. @tablebulletend + @param[in] callback Callback procedure called when pages were successfully removed. + Not called if the operation failed. Passing 0 as the parameter + value causes the callback to not be called. + @param[in] arg Value passed as an argument to the callback procedure. + + @return + #QURT_EOK -- Pages successfully removed. + + @dependencies + None. +*/ +int qurt_mem_pool_remove_pages(qurt_mem_pool_t pool, + unsigned first_pageno, + unsigned size_in_pages, + unsigned flags, + void (*callback)(void *), + void *arg); +/**@ingroup memory_management_types*/ +#define QURT_POOL_REMOVE_ALL_OR_NONE 1 /**< */ + +/**@ingroup func_qurt_mem_pool_attr_get + Gets the memory pool attributes. \n + Retrieves pool configurations based on the pool handle, and fills in + the attribute structure with configuration values. + + @datatypes + #qurt_mem_pool_t \n + #qurt_mem_pool_attr_t + + @param[in] pool Pool handle obtained from qurt_mem_pool_attach(). + @param[out] attr Pointer to the memory region attribute structure. + + @return + 0 -- Success. \n + #QURT_EINVALID -- Corrupt handle; pool handle is invalid. +*/ +int qurt_mem_pool_attr_get (qurt_mem_pool_t pool, qurt_mem_pool_attr_t *attr); + +/**@ingroup func_qurt_mem_pool_attr_get_size + Gets the size of the specified memory pool range. + + @datatypes + #qurt_mem_pool_attr_t \n + #qurt_size_t + + @param[in] attr Pointer to the memory pool attribute structure. + @param[in] range_id Memory pool range key. + @param[out] size Pointer to the destination variable for the range size. + + @return + 0 -- Success. \n + #QURT_EINVALID -- Range is invalid. + + @dependencies + None. 
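+
+  For illustration, pool ranges might be enumerated as in the following sketch;
+  "ADSP_DDR_POOL" is a placeholder pool name from the system configuration:
+
+  @code
+  qurt_mem_pool_t pool;
+  qurt_mem_pool_attr_t pattr;
+  qurt_size_t range_size;
+
+  if (qurt_mem_pool_attach("ADSP_DDR_POOL", &pool) == QURT_EOK &&
+      qurt_mem_pool_attr_get(pool, &pattr) == 0) {
+      for (int i = 0; i < MAX_POOL_RANGES; i++) {
+          if (qurt_mem_pool_attr_get_size(&pattr, i, &range_size) == QURT_EOK &&
+              range_size != 0) {
+              // Range i spans range_size bytes.
+          }
+      }
+  }
+  @endcode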
+*/ +static inline int qurt_mem_pool_attr_get_size (qurt_mem_pool_attr_t *attr, int range_id, qurt_size_t *size){ + if ((range_id >= MAX_POOL_RANGES) || (range_id < 0)){ + (*size) = 0; + return QURT_EINVALID; + } + else { + (*size) = attr->ranges[range_id].size; + } + return QURT_EOK; +} + +/**@ingroup func_qurt_mem_pool_attr_get_addr + Gets the start address of the specified memory pool range. + + @datatypes + #qurt_mem_pool_attr_t \n + #qurt_addr_t + + @param[in] attr Pointer to the memory pool attribute structure. + @param[in] range_id Memory pool range key. + @param[out] addr Pointer to the destination variable for range start address. + + @return + 0 -- Success. \n + #QURT_EINVALID -- Range is invalid. + + @dependencies + None. +*/ +static inline int qurt_mem_pool_attr_get_addr (qurt_mem_pool_attr_t *attr, int range_id, qurt_addr_t *addr){ + if ((range_id >= MAX_POOL_RANGES) || (range_id < 0)){ + (*addr) = 0; + return QURT_EINVALID; + } + else { + (*addr) = (attr->ranges[range_id].start)<<12; + } + return QURT_EOK; +} + +/**@ingroup func_qurt_mem_pool_attr_get_addr_64 + Gets the 64 bit start address of the specified memory pool range. + + @datatypes + #qurt_mem_pool_attr_t \n + #qurt_addr_64_t + + @param[in] attr Pointer to the memory pool attribute structure. + @param[in] range_id Memory pool range key. + @param[out] addr Pointer to the destination variable for range start address. + + @return + 0 -- Success. \n + #QURT_EINVALID -- Range is invalid. + + @dependencies + None. +*/ +static inline int qurt_mem_pool_attr_get_addr_64 (qurt_mem_pool_attr_t *attr, int range_id, qurt_addr_64_t *addr){ +if ((range_id >= MAX_POOL_RANGES) || (range_id < 0)){ + (*addr) = 0; + return QURT_EINVALID; +} +else { + (*addr) = ((qurt_addr_64_t)attr->ranges[range_id].start)<<12; + } + return QURT_EOK; + } + + +/**@ingroup func_qurt_mem_pool_status_get + Gets the memory pool status. \n + Based on the pool handle, retrieves largest contiguous free memory, + total free memory, and total memory declared for the pool in bytes. Fills in + the memory status structure with the values. + + @datatypes + #qurt_mem_pool_t \n + #qurt_mem_pool_status_t + + @param[in] pool Pool handle. + @param[out] status Pointer to the memory pool status structure. + + @return + #QURT_EOK -- Success. \n + #QURT_EINVALID -- Corrupt handle; pool handle is invalid. +*/ +int qurt_mem_pool_status_get (qurt_mem_pool_t pool, qurt_mem_pool_status_t *status); + + +/**@ingroup func_qurt_mem_pool_is_available + Checks whether the number of pages that the page_count argument indicates + can be allocated from the specified pool. + + @datatypes + #qurt_mem_pool_attr_t \n + #qurt_mem_mapping_t \n + + @param[in] pool Pool handle obtained from qurt_mem_pool_attach(). + @param[in] page_count Number of 4K pages. + @param[in] mapping_type Variable of type qurt_mem_mapping_t. + + @return + 0 -- Success. \n + #QURT_EINVALID -- Mapping_type is invalid. \n + #QURT_EMEM -- Specified pages cannot be allocated from the pool. + + @dependencies + None. +*/ +int qurt_mem_pool_is_available(qurt_mem_pool_t pool, int page_count, qurt_mem_mapping_t mapping_type); + + +/**@ingroup func_qurt_mem_region_create + @xreflabel{sec:mem_region_create} + Creates a memory region with the specified attributes. + + The application initializes the memory region attribute structure with + qurt_mem_region_attr_init() and qurt_mem_region_attr_set_bus_attr(). 
+
+  If the virtual address attribute is set to its default value
+  (Section @xref{sec:qurt_mem_region_attr_init}), the virtual address of the memory region is
+  automatically assigned any available virtual address value.
+
+  If the memory mapping attribute is set to virtual mapping, the physical address of the memory region
+  is also automatically assigned.\n
+
+  @note1hang The physical address attribute is explicitly set in the attribute structure only
+             for memory regions with physical-contiguous-mapped mapping.
+
+  Memory regions are always assigned to memory pools. The pool value specifies the memory pool
+  that the memory region is assigned to.
+
+  @note1hang If attr is specified as NULL, the memory region is created with default
+             attribute values (Section @xref{sec:qurt_mem_region_attr_init}).
+             QuRT predefines the memory pool object #qurt_mem_default_pool
+             (Section @xref{dox:mem_management}), which allocates memory regions in SMI memory.
+
+  @datatypes
+  #qurt_mem_region_t \n
+  #qurt_size_t \n
+  #qurt_mem_pool_t \n
+  #qurt_mem_region_attr_t
+
+  @param[out] region  Pointer to the memory region object.
+  @param[in]  size    Memory region size (in bytes). If size is not an integral multiple of 4K,
+                      it is rounded up to a 4K boundary.
+  @param[in]  pool    Memory pool of the region.
+  @param[in]  attr    Pointer to the memory region attribute structure.
+
+  @return
+  #QURT_EOK -- Memory region successfully created.\n
+  #QURT_EMEM -- Not enough memory to create the region. \n
+  #QURT_EINVALID -- Invalid cache attributes/permissions provided in the attribute.
+
+  @dependencies
+  None.
+*/
+int qurt_mem_region_create(qurt_mem_region_t *region, qurt_size_t size, qurt_mem_pool_t pool, qurt_mem_region_attr_t *attr);
+
+/**@ingroup func_qurt_mem_region_delete
+  Deletes the specified memory region.
+
+  If the caller application created the memory region, it is removed and the system reclaims its
+  assigned memory.
+
+  If a different application created the memory region (and shared it with the caller
+  application), only the local memory mapping to the region is removed; the system does
+  not reclaim the memory.
+
+  @datatypes
+  #qurt_mem_region_t
+
+  @param[in] region  Memory region object.
+
+  @return
+  #QURT_EOK -- Region successfully deleted. \n
+  #QURT_ELOCKED -- Buffer is locked. Mapping delete failed.
+
+  @dependencies
+  None.
+*/
+int qurt_mem_region_delete(qurt_mem_region_t region);
+
+
+/**@ingroup func_qurt_mem_region_attr_get
+  @xreflabel{sec:mem_region_attr_get}
+  Gets the memory attributes of the specified memory region.
+  After a memory region is created, its attributes cannot be changed.
+
+  @datatypes
+  #qurt_mem_region_t \n
+  #qurt_mem_region_attr_t
+
+  @param[in]  region  Memory region object.
+  @param[out] attr    Pointer to the destination structure for memory region attributes.
+
+  @return
+  #QURT_EOK -- Operation successfully performed. \n
+  Error code -- Failure.
+
+  @dependencies
+  None.
+*/
+int qurt_mem_region_attr_get(qurt_mem_region_t region, qurt_mem_region_attr_t *attr);
+
+
+/**@ingroup func_qurt_mem_region_attr_set_type
+  Sets the memory type in the specified memory region attribute structure.
+
+  The type indicates whether the memory region is local to an application or shared between
+  applications.
+  @cond rest_dist For more information, see @xhyperref{80VB41992,80-VB419-92}. @endcond
+
+  @datatypes
+  #qurt_mem_region_attr_t \n
+  #qurt_mem_region_type_t
+
+  @param[in,out] attr  Pointer to the memory region attribute structure.
+  @param[in]     type  Memory type.
Values: \n + - #QURT_MEM_REGION_LOCAL \n + - #QURT_MEM_REGION_SHARED @tablebulletend + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_type(qurt_mem_region_attr_t *attr, qurt_mem_region_type_t type){ + attr->type = type; +} + +/**@ingroup func_qurt_mem_region_attr_get_size + Gets the memory region size from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_size_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] size Pointer to the destination variable for memory region size. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_size(qurt_mem_region_attr_t *attr, qurt_size_t *size){ + (*size) = attr->size; +} + +/**@ingroup func_qurt_mem_region_attr_get_type + Gets the memory type from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_region_type_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] type Pointer to the destination variable for the memory type. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_type(qurt_mem_region_attr_t *attr, qurt_mem_region_type_t *type){ + (*type) = attr->type; +} + +/**@ingroup func_qurt_mem_region_attr_set_physaddr + Sets the memory region 32-bit physical address in the specified memory attribute structure. + + @note1hang The physical address attribute is explicitly set only for memory regions with + physical contiguous mapping. Otherwise QuRT automatically sets it + when the memory region is created. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_paddr_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] addr Memory region physical address. + + @return + None. + */ +static inline void qurt_mem_region_attr_set_physaddr(qurt_mem_region_attr_t *attr, qurt_paddr_t addr){ + attr->ppn = (unsigned)(((unsigned)(addr))>>12); +} + +/**@ingroup func_qurt_mem_region_attr_get_physaddr + Gets the memory region physical address from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] addr Pointer to the destination variable for memory region physical address. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_physaddr(qurt_mem_region_attr_t *attr, unsigned int *addr){ + (*addr) = (unsigned)(((unsigned) (attr->ppn))<<12); +} + +/**@ingroup func_qurt_mem_region_attr_set_virtaddr + Sets the memory region virtual address in the specified memory attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_addr_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] addr Memory region virtual address. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_virtaddr(qurt_mem_region_attr_t *attr, qurt_addr_t addr){ + attr->virtaddr = addr; +} + +/**@ingroup func_qurt_mem_region_attr_get_virtaddr + Gets the memory region virtual address from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] addr Pointer to the destination variable for the memory region virtual address. + + @return + None. + + @dependencies + None. 
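+
+  For illustration, a typical create-and-inspect flow using only APIs declared
+  in this header might look like the following sketch (the size and cache mode
+  are arbitrary):
+
+  @code
+  qurt_mem_region_t region;
+  qurt_mem_region_attr_t rattr;
+  unsigned int va = 0;
+
+  qurt_mem_region_attr_init(&rattr);
+  qurt_mem_region_attr_set_mapping(&rattr, QURT_MEM_MAPPING_VIRTUAL);
+  qurt_mem_region_attr_set_cache_mode(&rattr, QURT_MEM_CACHE_WRITEBACK);
+
+  if (qurt_mem_region_create(&region, 0x4000, qurt_mem_default_pool, &rattr) == QURT_EOK) {
+      if (qurt_mem_region_attr_get(region, &rattr) == QURT_EOK) {
+          qurt_mem_region_attr_get_virtaddr(&rattr, &va);
+          // va holds the automatically assigned virtual address.
+      }
+      (void)qurt_mem_region_delete(region);
+  }
+  @endcode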
+ */ +static inline void qurt_mem_region_attr_get_virtaddr(qurt_mem_region_attr_t *attr, unsigned int *addr){ + (*addr) = (unsigned int)(attr->virtaddr); +} + +/**@ingroup func_qurt_mem_region_attr_set_mapping + Sets the memory mapping in the specified memory region attribute structure. + + The mapping value indicates how the memory region is mapped in virtual memory. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_mapping_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] mapping Mapping. Values: + - #QURT_MEM_MAPPING_VIRTUAL + - #QURT_MEM_MAPPING_PHYS_CONTIGUOUS + - #QURT_MEM_MAPPING_IDEMPOTENT + - #QURT_MEM_MAPPING_VIRTUAL_FIXED + - #QURT_MEM_MAPPING_NONE + - #QURT_MEM_MAPPING_VIRTUAL_RANDOM + - #QURT_MEM_MAPPING_INVALID @tablebulletend + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_mapping(qurt_mem_region_attr_t *attr, qurt_mem_mapping_t mapping){ + attr->mapping_type = mapping; +} + +/**@ingroup func_qurt_mem_region_attr_get_mapping + Gets the memory mapping from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_mapping_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] mapping Pointer to the destination variable for memory mapping. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_mapping(qurt_mem_region_attr_t *attr, qurt_mem_mapping_t *mapping){ + (*mapping) = attr->mapping_type; +} + +/**@ingroup func_qurt_mem_region_attr_set_cache_mode + Sets the cache operation mode in the specified memory region attribute structure. + + @cond rest_dist For more information on the cache, see @xhyperref{80VB41992,80-VB419-92}.@endcond + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_cache_mode_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] mode Cache mode. Values: \n + - #QURT_MEM_CACHE_WRITEBACK \n + - #QURT_MEM_CACHE_WRITETHROUGH\n + - #QURT_MEM_CACHE_WRITEBACK_NONL2CACHEABLE\n + - #QURT_MEM_CACHE_WRITETHROUGH_NONL2CACHEABLE\n + - #QURT_MEM_CACHE_WRITEBACK_L2CACHEABLE\n + - #QURT_MEM_CACHE_WRITETHROUGH_L2CACHEABLE\n + - #QURT_MEM_CACHE_NONE @tablebulletend + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_cache_mode(qurt_mem_region_attr_t *attr, qurt_mem_cache_mode_t mode){ + QURT_PGATTR_C_SET(attr->pga, (unsigned)mode); +} + +/**@ingroup func_qurt_mem_region_attr_get_cache_mode + Gets the cache operation mode from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_cache_mode_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] mode Pointer to the destination variable for cache mode. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_cache_mode(qurt_mem_region_attr_t *attr, qurt_mem_cache_mode_t *mode){ + unsigned int mode_temp = QURT_PGATTR_C_GET(attr->pga); + (*mode) = (qurt_mem_cache_mode_t)mode_temp; +} + +/**@ingroup func_qurt_mem_region_attr_set_bus_attr + Sets the (A1, A0) bus attribute bits in the specified memory region attribute structure. + + @cond rest_dist For more information on the bus attribute bits, see the @xhyperref{80VB41992,80-VB419-92}. @endcond + + @datatypes + #qurt_mem_region_attr_t + + @param[in,out] attr Pointer to the memory region attribute structure. 
+ @param[in] abits The (A1, A0) bits to use with the memory region, expressed as a 2-bit binary number. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_bus_attr(qurt_mem_region_attr_t *attr, unsigned abits){ + QURT_PGATTR_A_SET(attr->pga, abits); +} + +/**@ingroup func_qurt_mem_region_attr_get_bus_attr + Gets the (A1, A0) bus attribute bits from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] pbits Pointer to an unsigned integer that is filled in with + the (A1, A0) bits from the memory region attribute structure, expressed as a 2-bit binary number. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_bus_attr(qurt_mem_region_attr_t *attr, unsigned *pbits){ + (*pbits) = QURT_PGATTR_A_GET(attr->pga); +} + +void qurt_mem_region_attr_set_owner(qurt_mem_region_attr_t *attr, int handle); +void qurt_mem_region_attr_get_owner(qurt_mem_region_attr_t *attr, int *p_handle); +void qurt_mem_region_attr_set_perms(qurt_mem_region_attr_t *attr, unsigned perms); +void qurt_mem_region_attr_get_perms(qurt_mem_region_attr_t *attr, unsigned *p_perms); + +/**@ingroup func_qurt_mem_map_static_query + Determines whether a memory page is statically mapped. + Pages are specified by the following attributes: physical address, page size, cache mode, + and memory permissions. \n + - If the specified page is statically mapped, vaddr returns the virtual + address of the page. \n + - If the page is not statically mapped (or if it does not exist as specified), vaddr + returns -1 as the virtual address value.\n + The system configuration file defines QuRT memory maps. + + @datatypes + #qurt_addr_t \n + #qurt_mem_cache_mode_t \n + #qurt_perm_t + + @param[out] vaddr Virtual address corresponding to paddr. + @param[in] paddr Physical address. + @param[in] page_size Size of the mapped memory page. + @param[in] cache_attribs Cache mode (writeback, and so on). + @param[in] perm Access permissions. + + @return + #QURT_EOK -- Specified page is statically mapped, vaddr returns the virtual address. \n + #QURT_EMEM -- Specified page is not statically mapped, vaddr returns -1. \n + #QURT_EVAL -- Specified page does not exist. + + @dependencies + None. + */ +int qurt_mem_map_static_query(qurt_addr_t *vaddr, qurt_addr_t paddr, unsigned int page_size, qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perm); + + +/**@ingroup func_qurt_mem_region_query + Queries a memory region. \n + This function determines whether a dynamically-created memory region (Section @xref{sec:mem_region_create}) exists for the + specified virtual or physical address. + When a memory region has been determined to exist, its attributes are + accessible (Section @xref{sec:mem_region_attr_get}). + + @note1hang This function returns #QURT_EFATAL if #QURT_EINVALID is passed to both + vaddr and paddr (or to neither). + + @datatypes + #qurt_mem_region_t \n + #qurt_paddr_t + + @param[out] region_handle Pointer to the memory region object (if it exists). + @param[in] vaddr Virtual address to query; if vaddr is specified, paddr must be set to + the value #QURT_EINVALID. + @param[in] paddr Physical address to query; if paddr is specified, vaddr must be set to + the value #QURT_EINVALID. + + @return + #QURT_EOK -- Query successfully performed. \n + #QURT_EMEM -- Region not found for the specified address. \n + #QURT_EFATAL -- Invalid input parameters. 
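+
+  For illustration, querying by virtual address only might look like the
+  following sketch (vaddr is a placeholder supplied by the caller):
+
+  @code
+  void query_example(qurt_addr_t vaddr)
+  {
+      qurt_mem_region_t handle;
+
+      // paddr must be QURT_EINVALID when querying by virtual address.
+      if (qurt_mem_region_query(&handle, vaddr, (qurt_paddr_t)QURT_EINVALID) == QURT_EOK) {
+          // handle now refers to the region containing vaddr.
+      }
+  }
+  @endcode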
+ + @dependencies + None. + */ +int qurt_mem_region_query(qurt_mem_region_t *region_handle, qurt_addr_t vaddr, qurt_paddr_t paddr); + + +/**@ingroup func_qurt_mapping_create + @xreflabel{hdr:qurt_mapping_create} + Creates a memory mapping in the page table. + Not supported if called from a user process, always returns QURT_EMEM. + + @datatypes + #qurt_addr_t \n + #qurt_size_t \n + #qurt_mem_cache_mode_t \n + #qurt_perm_t + + @param[in] vaddr Virtual address. + @param[in] paddr Physical address. + @param[in] size Size (4K-aligned) of the mapped memory page. + @param[in] cache_attribs Cache mode (writeback, and so on). + @param[in] perm Access permissions. + + @return + #QURT_EOK -- Mapping created. \n + #QURT_EMEM -- Failed to create mapping. + #QURT_EINVALID -- Invalid cache attributes / permissions provided. + + @dependencies + None. +*/ +int qurt_mapping_create(qurt_addr_t vaddr, qurt_addr_t paddr, qurt_size_t size, + qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perm); + +/**@ingroup func_qurt_mapping_remove + @xreflabel{hdr:qurt_mapping_remove} + Deletes the specified memory mapping from the page table. + + @datatypes + #qurt_addr_t \n + #qurt_size_t + + @param[in] vaddr Virtual address. + @param[in] paddr Physical address. + @param[in] size Size of the mapped memory page (4K-aligned). + + @return + #QURT_EOK -- Mapping created. + #QURT_ELOCKED -- Buffer is locked. Mapping delete failed. + + @dependencies + None. + + */ +int qurt_mapping_remove(qurt_addr_t vaddr, qurt_addr_t paddr, qurt_size_t size); + +/**@ingroup func_qurt_lookup_physaddr + Translates a virtual memory address to the physical memory address to which it maps. \n + The lookup occurs in the process of the caller. Use qurt_lookup_physaddr2() to lookup the + physical address of another process. + + + @datatypes + #qurt_addr_t \n + #qurt_paddr_t + + @param[in] vaddr Virtual address. + + @return + Nonzero -- Physical address to which the virtual address is mapped.\n + 0 -- Virtual address not mapped. + + @dependencies + None. +*/ +qurt_paddr_t qurt_lookup_physaddr (qurt_addr_t vaddr); + +/**@ingroup func_qurt_mem_region_attr_set_physaddr_64 + Sets the memory region 64-bit physical address in the specified memory attribute structure. + + @note1hang The physical address attribute is explicitly set only for memory regions with + physical contiguous mapping. Otherwise it is automatically set by + QuRT when the memory region is created. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_paddr_64_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] addr_64 Memory region 64-bit physical address. + + @return + None. + */ +static inline void qurt_mem_region_attr_set_physaddr_64(qurt_mem_region_attr_t *attr, qurt_paddr_64_t addr_64){ + attr->ppn = (unsigned)(((unsigned long long)(addr_64))>>12); +} + +/**@ingroup func_qurt_mem_region_attr_get_physaddr_64 + Gets the memory region 64-bit physical address from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_paddr_64_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] addr_64 Pointer to the destination variable for the memory region 64-bit physical address. + + @return + None. + + @dependencies + None. 
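+
+  For illustration, pairing the 64-bit setter and getter might look like the
+  following sketch; the physical address is a placeholder and must be
+  4K-aligned, since the value is stored internally as a 4K page number:
+
+  @code
+  qurt_mem_region_attr_t rattr;
+  qurt_paddr_64_t pa = 0;
+
+  qurt_mem_region_attr_init(&rattr);
+  qurt_mem_region_attr_set_physaddr_64(&rattr, 0x1C0000000ULL);
+  qurt_mem_region_attr_get_physaddr_64(&rattr, &pa);
+  // pa == 0x1C0000000ULL for a 4K-aligned address.
+  @endcode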
+ */ +static inline void qurt_mem_region_attr_get_physaddr_64(qurt_mem_region_attr_t *attr, qurt_paddr_64_t *addr_64){ + (*addr_64) = (unsigned long long)(((unsigned long long)(attr->ppn))<<12); +} + +/**@ingroup func_qurt_mem_map_static_query_64 + Determines if a memory page is statically mapped. + The following attributes specify pages: 64-bit physical address, page size, cache mode, + and memory permissions. \n + If the specified page is statically mapped, vaddr returns the virtual + address of the page. + If the page is not statically mapped (or if it does not exist as specified), vaddr + returns -1 as the virtual address value.\n + QuRT memory maps are defined in the system configuration file. + + @datatypes + #qurt_addr_t \n + #qurt_paddr_64_t \n + #qurt_mem_cache_mode_t \n + #qurt_perm_t + + @param[out] vaddr Virtual address corresponding to paddr. + @param[in] paddr_64 64-bit physical address. + @param[in] page_size Size of the mapped memory page. + @param[in] cache_attribs Cache mode (writeback, and so on). + @param[in] perm Access permissions. + + @return + #QURT_EOK -- Specified page is statically mapped; a virtual address is returned in vaddr. \n + #QURT_EMEM -- Specified page is not statically mapped; -1 is returned in vaddr. \n + #QURT_EVAL -- Specified page does not exist. + + @dependencies + None. + */ +int qurt_mem_map_static_query_64(qurt_addr_t *vaddr, qurt_paddr_64_t paddr_64, unsigned int page_size, qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perm); + +/**@ingroup func_qurt_mem_region_query_64 + Determines whether a dynamically created memory region (Section @xref{sec:mem_region_create}) exists for the + specified virtual or physical address. When a memory region has been determined to exist, its attributes are + accessible (Section @xref{sec:mem_region_attr_get}). + + @note1hang This function returns QURT_EFATAL if #QURT_EINVALID is passed to both + vaddr and paddr (or to neither). + + @datatypes + #qurt_mem_region_t \n + #qurt_addr_t \n + #qurt_paddr_64_t + + @param[out] region_handle Pointer to the memory region object (if it exists). + @param[in] vaddr Virtual address to query; if vaddr is specified, paddr must be set to + the value #QURT_EINVALID. + @param[in] paddr_64 64-bit physical address to query; if paddr is specified, vaddr must be set to + the value #QURT_EINVALID. + + @return + #QURT_EOK -- Success. \n + #QURT_EMEM -- Region not found for the specified address. \n + #QURT_EFATAL -- Invalid input parameters. + + @dependencies + None. + */ +int qurt_mem_region_query_64(qurt_mem_region_t *region_handle, qurt_addr_t vaddr, qurt_paddr_64_t paddr_64); + +/**@ingroup func_qurt_mapping_create_64 + @xreflabel{hdr:qurt_mapping_create_64} + Creates a memory mapping in the page table. + Not supported if called from a user process, always returns QURT_EMEM. + + @datatypes + #qurt_addr_t \n + #qurt_paddr_64_t \n + #qurt_size_t \n + #qurt_mem_cache_mode_t \n + #qurt_perm_t + + @param[in] vaddr Virtual address. + @param[in] paddr_64 64-bit physical address. + @param[in] size Size (4K-aligned) of the mapped memory page. + @param[in] cache_attribs Cache mode (writeback, and so on). + @param[in] perm Access permissions. + + @return + #QURT_EOK -- Success. \n + #QURT_EMEM -- Failure. + #QURT_EINVALID -- Invalid cache attributes / permissions provided. + + @dependencies + None. 
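+
+  For illustration, creating and removing a 64-bit mapping from a root-process
+  context might look like the following sketch. The addresses are placeholders
+  and must be 4K-aligned, and the QURT_PERM_READ/QURT_PERM_WRITE permission
+  flags are assumed to come from the QuRT types header:
+
+  @code
+  qurt_addr_t va = 0x20000000;
+  qurt_paddr_64_t pa64 = 0x1C0000000ULL;
+  qurt_size_t sz = 0x1000;
+
+  if (qurt_mapping_create_64(va, pa64, sz, QURT_MEM_CACHE_WRITEBACK,
+                             QURT_PERM_READ | QURT_PERM_WRITE) == QURT_EOK) {
+      // ... use the mapping ...
+      (void)qurt_mapping_remove_64(va, pa64, sz);
+  }
+  @endcode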
+*/ +int qurt_mapping_create_64(qurt_addr_t vaddr, qurt_paddr_64_t paddr_64, qurt_size_t size, + qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perm); + +/**@ingroup func_qurt_mapping_remove_64 + @xreflabel{hdr:qurt_mapping_remove_64} + Deletes the specified memory mapping from the page table. + + @datatypes + #qurt_addr_t \n + #qurt_paddr_64_t \n + #qurt_size_t + + @param[in] vaddr Virtual address. + @param[in] paddr_64 64-bit physical address. + @param[in] size Size of the mapped memory page (4K-aligned). + + @return + #QURT_EOK -- Success. + #QURT_ELOCKED -- Buffer is locked. Mapping delete failed. + + @dependencies + None. + + */ +int qurt_mapping_remove_64(qurt_addr_t vaddr, qurt_paddr_64_t paddr_64, qurt_size_t size); + +/**@ingroup func_qurt_lookup_physaddr_64 + Translates a virtual memory address to the 64-bit physical memory address it is mapped to. \n + The lookup occurs in the process of the caller. Use qurt_lookup_physaddr2() to lookup the physical + address of another process. + + @datatypes + #qurt_paddr_64_t \n + #qurt_addr_t + + @param[in] vaddr Virtual address. + + @return + Nonzero -- 64-bit physical address to which the virtual address is mapped. \n + 0 -- Virtual address has not been mapped. + + @dependencies + None. +*/ +qurt_paddr_64_t qurt_lookup_physaddr_64 (qurt_addr_t vaddr); +/** @endcond */ + +/** @cond internal_only */ +/**@ingroup func_qurt_mapping_reclaim + Deallocates all QuRT resources associated with the specified virtual + memory area, making it available for user memory management:\n + - The associated physical memory areas are freed and added to the + specified physical pool.\n + - The associated TLB entries are deleted and made available for TLB + management.\n + - The virtual memory area is not freed -- it is left in + place as allocated, but unmapped virtual memory. Access to this + memory area generates an exception.\n + + The virtual memory area must be statically allocated. + If no pool is specified, the freed physical memory is not added to any pool. + + @note1hang The virtual memory area is restricted to being filled with locked + TLB entries that are contiguous within the memory area, and contained by it. + + @datatypes + #qurt_addr_t \n + #qurt_size_t \n + #qurt_mem_pool_t + + @param[in] vaddr Virtual address of the memory area to free. + @param[in] vsize Size (in bytes) of the memory area to free. + @param[in] pool Handle to the physical pool where freed physical memory is added. + If set to 0, freed physical memory is not added to any pool. + + @return + 0 -- Success. \n + Nonzero -- Failure that indicates a partial success, or that the request was malformed. \n @note1hang The expected behavior is that + QuRT logs messages related to the failure, and callers are free to ignore the return value. + + @dependencies + None. +*/ +int qurt_mapping_reclaim(qurt_addr_t vaddr, qurt_size_t vsize, qurt_mem_pool_t pool); +/** @endcond */ +/** @cond rest_reg_dist */ +/**@ingroup func_qurt_mem_configure_cache_partition + Configures the Hexagon cache partition at the system level. + + A partition size value of #SEVEN_EIGHTHS_SIZE is applicable only to the L2 cache. + + The L1 cache partition is not supported in Hexagon processor version V60 or greater. + + @note1hang Call this operation only with QuRT OS privilege. + + @datatypes + #qurt_cache_type_t \n + #qurt_cache_partition_size_t + + @param[in] cache_type Cache type for partition configuration. 
Values: \n + - #HEXAGON_L1_I_CACHE \n + - #HEXAGON_L1_D_CACHE \n + - #HEXAGON_L2_CACHE @tablebulletend + + @param[in] partition_size Cache partition size. Values: \n + - #FULL_SIZE \n + - #HALF_SIZE \n + - #THREE_QUARTER_SIZE \n + - #SEVEN_EIGHTHS_SIZE @tablebulletend + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Error. + + @dependencies + None. + */ +int qurt_mem_configure_cache_partition(qurt_cache_type_t cache_type, qurt_cache_partition_size_t partition_size); + + +/**@ingroup func_qurt_mem_syncht + @xreflabel{hdr:qurt_mem_syncht} + Performs heavy-weight synchronization of memory transactions. + + This operation does not return until all previous memory transactions (cached and uncached load/store, + mem_locked, and so on) that originated from the current thread are complete and globally observable. + + @note1hang This operation is implemented as a wrapper for the Hexagon syncht instruction. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_syncht(void){ + #ifdef __HEXAGON_ARCH__ + __asm__ __volatile__ (" SYNCHT \n"); + #endif +} + +/**@ingroup func_qurt_mem_barrier + @xreflabel{hdr:qurt_mem_barrier} + Creates a barrier for memory transactions. + + This operation ensures that all previous memory transactions are globally observable before any + future memory transactions are globally observable. + + @note1hang This operation is implemented as a wrapper for the Hexagon barrier instruction. + @return + None + + @dependencies + None. + */ +static inline void qurt_mem_barrier(void){ + #ifdef __HEXAGON_ARCH__ + __asm__ __volatile__ (" BARRIER \n"); + #endif +} +/** @endcond */ + +/** @cond internal_only */ +/**@ingroup func_qurt_system_mem_alloc + Requests that the kernel allocates memory from the kernel-owned pool. + + @param[in] size Size in bytes (aligned to 4K) to allocate. + @param[in] align Any alignment that must be considered for the allocation. + @param[in] flags Supports the #QURT_SYSTEM_ALLOC_VIRTUAL flag; allocates + available virtual memory in the address space of all processes. + + @return + #QURT_EFATAL -- Allocation failed \n + Start address of the successful allocation. + + @dependencies + None. +*/ +unsigned qurt_system_mem_alloc(unsigned size, unsigned align, unsigned flags); +/** @endcond */ +/** @cond rest_reg_dist*/ +/**@ingroup func_qurt_lookup_physaddr2 + Translates the virtual memory address of the specified process to the 64-bit + physical memory address to which it is mapped. + + @datatypes + #qurt_addr_t \n + #qurt_paddr_64_t + + @param[in] vaddr Virtual address. + @param[in] pid PID. + + @return + Nonzero -- 64-bit physical address to which the virtual address is mapped. \n + 0 -- Virtual address is not mapped. + + @dependencies + None. +*/ +qurt_paddr_64_t qurt_lookup_physaddr2(qurt_addr_t vaddr, unsigned int pid); +/** @endcond */ + +/**@ingroup func_qurt_mapping_attr_get + Gets the mapping attributes for a given virtual address and PID + + @datatypes + #qurt_addr_t \n + #qurt_mapping_attr_t + + @param[in] vaddr virtual address for which the attributes are required. + @param[in] pid process id for the target process + @param[out] attr Pointer to the mapping attribute structure. + + @return + 0 -- Success. \n + #QURT_EINVALID -- Incorrect virtual address or pid +*/ +int qurt_mapping_attr_get(qurt_addr_t vaddr, unsigned int pid, qurt_mapping_attr_t *attr); + + +/**@ingroup func_qurt_mapping_attr_get_cache_mode + Gets the cache operation mode in the specified memory mapping attribute structure. 
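+
+  For illustration, this getter and its siblings are typically applied to an
+  attribute structure filled in by qurt_mapping_attr_get(), as in the following
+  sketch (vaddr and pid are placeholders supplied by the caller):
+
+  @code
+  void show_cache_mode(qurt_addr_t vaddr, unsigned int pid)
+  {
+      qurt_mapping_attr_t mattr;
+      qurt_mem_cache_mode_t cmode;
+
+      if (qurt_mapping_attr_get(vaddr, pid, &mattr) == 0) {
+          qurt_mapping_attr_get_cache_mode(&mattr, &cmode);
+          // cmode is the cache mode of the TLB entry covering vaddr.
+      }
+  }
+  @endcode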
+ + + @datatypes + #qurt_mapping_attr_t \n + #qurt_mem_cache_mode_t + + @param[in] attr Pointer to the memory mapping attribute structure. + @param[out] cache_mode Pointer to the destination variable for cache mode. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mapping_attr_get_cache_mode(qurt_mapping_attr_t *attr, qurt_mem_cache_mode_t *cache_mode) +{ + (*cache_mode) = attr->cache_mode; +} + +/**@ingroup func_qurt_mapping_attr_get_physaddr + Gets the physical memory address in the specified memory mapping attribute structure. + + + @datatypes + #qurt_mapping_attr_t \n + #qurt_paddr_64_t + + @param[in] attr Pointer to the memory mapping attribute structure. + @param[out] physaddr Pointer to the destination variable for physical address. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mapping_attr_get_physaddr(qurt_mapping_attr_t *attr, qurt_paddr_64_t *physaddr) +{ + (*physaddr) = attr->paddr; +} + +/**@ingroup func_qurt_mapping_attr_get_perms + Gets the permissions in the specified memory mapping attribute structure. + + + @datatypes + #qurt_mapping_attr_t \n + #qurt_perm_t + + @param[in] attr Pointer to the memory mapping attribute structure. + @param[out] perms Pointer to the destination variable for permissions. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mapping_attr_get_perms(qurt_mapping_attr_t *attr, qurt_perm_t *perms) +{ + (*perms) = attr->perms; +} + +/**@ingroup func_qurt_mapping_attr_get_size + Gets the size in the specified memory mapping attribute structure.This represents size of the + TLB entry which covers the virtual address. + + + @datatypes + #qurt_mapping_attr_t \n + #unsigned int + + @param[in] attr Pointer to the memory mapping attribute structure. + @param[out] size Pointer to the destination variable for size. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_mapping_attr_get_size(qurt_mapping_attr_t *attr, unsigned int *size) +{ + (*size) = attr->size; +} + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_MEMORY_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_mmap.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_mmap.h new file mode 100755 index 0000000000000..c3bd875910af7 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_mmap.h @@ -0,0 +1,359 @@ +#ifndef QURT_MMAP_H +#define QURT_MMAP_H +/** + @file qurt_mmap.h + @brief Prototypes of memory mapping/unmapping APIs. + The APIs allow the user to map, un-map, and change permissions + on memory regions. + + EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2018-2021, 2022, 2023 Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup func_qurt_mem_mmap + Creates a memory mapping with the specified attributes. + This API allows the root process caller to create mapping on behalf of a user + process. If the client_handle belongs to a valid user process, the resulting + mapping is created for the process. + If -1 is passed in place of client_handle, the API creates mapping + for the underlying process of the caller. + + @note1hang If the specified attributes are not valid, an error result is returned. 
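+
+  Example -- an anonymous-allocation sketch for the caller's own process
+  (the pool handle, the 4 KB length, and the flag/permission composition are
+  illustrative assumptions only; error handling is omitted):
+  @code
+  qurt_mem_pool_t pool = 0;  // assumption: a valid pool handle obtained elsewhere
+  void *va = qurt_mem_mmap(-1, pool, NULL, NULL, 0x1000,
+                           QURT_PROT_READ | QURT_PROT_WRITE,
+                           QURT_MAP_ANON, -1, 0);
+  if (va == QURT_MAP_FAILED) {
+      // handle mapping failure
+  }
+  @endcode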
+ + @param[out] client_handle Client handle to use for this mapping (optional). + @param[in] pool Optional argument that specifies a pool handle + if the user wants to allocate memory from a specific pool. + The default value for this argument is NULL. + @param[in] pRegion Map region. This argument is unused, and the default value is NULL. + @param[in] addr Virtual memory address. + @param[in] length Size of mapping in bytes. + @param[in] prot Mapping access permissions (R/W/X). + @param[in] flags Mapping modes.\n + - #QURT_MAP_NAMED_MEMSECTION + - #QURT_MAP_FIXED \n + - #QURT_MAP_NONPROCESS_VPOOL \n + - #QURT_MAP_TRYFIXED \n + - #QURT_MAP_ANON \n + - #QURT_MAP_PHYSADDR \n + - #QURT_MAP_VA_ONLY @tablebulletend + @param[in] fd File designator. + @param[in] offset Offset in file. + + @return + Valid virtual address -- Success.\n + #QURT_MAP_FAILED -- Mapping creation failed. + */ +void *qurt_mem_mmap(int client_handle, + qurt_mem_pool_t pool, + qurt_mem_region_t *pRegion, + void *addr, + size_t length, + int prot, + int flags, + int fd, + unsigned long long offset); + +/**@ingroup func_qurt_mem_mmap2 + Creates a memory mapping with the specified attributes. Returns a more descriptive + error code in case of failure. + This API allows the root process caller to create mapping on behalf of a user + process. If the client_handle belongs to a valid user process, the resulting + mapping is created for the process. + If -1 is passed in place of client_handle, the API creates mapping + for the underlying process of the caller. + + @note1hang If the specified attributes are not valid, an error result is returned. + + @param[out] client_handle Client handle to use for this mapping (optional). + @param[in] pool Optional argument that allows the user to specify a pool handle + when the user wants to allocate memory from a specific pool. + Default value for this argument is NULL. + @param[in] pRegion Map region (unused argument); default value is NULL. + @param[in] addr Virtual memory address. + @param[in] length Size of mapping in bytes. + @param[in] prot Mapping access permissions (R/W/X). + Cache attributes, bus attributes, User mode. + @param[in] flags Mapping modes; + Shared, Private, or Anonymous. + @param[in] fd File designator. + @param[in] offset Offset in file. + + @return + Valid virtual address -- Success.\n + #QURT_EMEM -- Physical address is not available. \n + #QURT_EFAILED -- VA is not available or mapping failed.\n + #QURT_EINVALID -- Invalid argument was passed (for example, an unaligned VA/PA). + */ +void *qurt_mem_mmap2(int client_handle, + qurt_mem_pool_t pool, + qurt_mem_region_t *pRegion, + void *addr, + size_t length, + int prot, + int flags, + int fd, + unsigned long long offset); + +/**@ingroup func_qurt_mem_mmap_by_name + Creates a memory mapping for a named-memsection using the specified attributes. + The named memsection should be specified in cust_config.xml. + + @note1hang If the specified attributes are not valid or the named memsection is not found, + an error result is returned. + + @param[in] name Name of the memsection in cust_config.xml that specifies + this mapping. Should be less than 25 characters. + @param[in] addr Virtual memory address. + @param[in] length Size of mapping in bytes. + @param[in] prot Mapping access permissions (R/W/X). + Cache attributes, bus attributes, User mode + @param[in] flags Mapping modes, such as + Shared, Private, or Anonymous. + @param[in] offset Offset relative to the physical address range specified in memsection. 
+                             If offset + length exceeds the size of the memsection, failure is
+                             returned.
+   @return
+   Valid virtual address -- Success.\n
+   #QURT_MAP_FAILED -- Mapping creation failed.
+ */
+void *qurt_mem_mmap_by_name(const char* name,
+                    void *addr,
+                    size_t length,
+                    int prot,
+                    int flags,
+                    unsigned long long offset);
+
+/**@ingroup func_qurt_mem_mprotect2
+  Changes access permissions and attributes on an existing mapping based on the client_handle argument.
+
+  @note1hang If the specified virtual address is not found or invalid attributes are passed,
+             an error code is returned.
+
+  @note2 When an error is returned, it is possible that attributes/permissions were changed for part of the
+  mapping while the rest was left unchanged. Clients should not use such mappings further.
+
+  @param[in] client_handle  Obtained from the current invocation function (Section 3.4.1).
+  @param[in] addr           Virtual memory address.
+  @param[in] length         Size of mapping in bytes.
+  @param[in] prot           Mapping access permissions (R/W/X).
+                            Cache attributes, Bus attributes, User mode.
+  @return
+  #QURT_EOK -- Successfully changed permissions on the mapping.\n
+  #QURT_EFATAL -- Failed to change permissions on the mapping. \n
+  #QURT_EINVALID -- Attributes / permissions requested are invalid.
+  */
+int qurt_mem_mprotect2(int client_handle, const void *addr,
+                      size_t length,
+                      int prot);
+
+/**@ingroup func_qurt_mem_mprotect
+  Changes access permissions and attributes on an existing mapping.
+
+  @note1hang If the specified virtual address is not found or invalid attributes are passed,
+             an error code is returned.\n
+
+  @note2 When an error is returned, it is possible that attributes/permissions were changed for part of the
+  mapping while the rest was left unchanged. Clients should not use such mappings further.
+
+  @param[in] addr    Virtual memory address.
+  @param[in] length  Size of mapping in bytes.
+  @param[in] prot    Mapping access permissions (R/W/X).
+                     Cache attributes, Bus attributes, User mode.
+  @return
+  #QURT_EOK -- Successfully changed permissions on the mapping. \n
+  #QURT_EFATAL -- Failed to change permissions on the mapping. \n
+  #QURT_EINVALID -- Attributes / permissions requested are invalid.
+  */
+int qurt_mem_mprotect(const void *addr,
+                      size_t length,
+                      int prot);
+
+/**@ingroup func_qurt_mem_munmap
+  Removes an existing mapping.
+
+  @note1hang If the specified mapping is not found in the context of the caller process
+             or invalid attributes are passed, an error code is returned.
+
+  @param[in] addr    Virtual memory address.
+  @param[in] length  Size of mapping in bytes.
+
+  @return
+  #QURT_EOK -- Successfully removed the mapping. \n
+  #QURT_EFATAL -- Failed to remove the mapping. \n
+  #QURT_ELOCKED -- Buffer is locked. Mapping delete failed.
+  */
+int qurt_mem_munmap(void *addr,
+                    size_t length);
+
+/**@ingroup func_qurt_mem_munmap2
+  Removes an existing mapping for a specified process.
+
+  @note1hang This API allows a root process entity, such as a driver, to remove a mapping
+  that was created for a user process. If the specified mapping is not found in the context
+  of the client handle or invalid attributes are passed, an error code is returned.
+
+  @param[in] client_handle  Client handle of the user process that owns this mapping.
+  @param[in] addr           Virtual memory address.
+  @param[in] length         Size of mapping in bytes.
+
+  @return
+  #QURT_EOK -- Successfully removed the mapping. \n
+  #QURT_EFATAL -- Failed to remove the mapping. \n
+  #QURT_ELOCKED -- Buffer is locked. Mapping delete failed.
+  */
+int qurt_mem_munmap2(int client_handle,
+                     void *addr,
+                     size_t length);
+
+/**@ingroup func_qurt_mem_munmap3
+  Removes an existing mapping or reservation for a specified process.
+
+  @param[in] client_handle  Client handle of the user process that owns this mapping.
+  @param[in] addr           Pointer to a virtual memory address.
+  @param[in] length         Size of mapping in bytes.
+  @param[in] flags          Specifies the flag.
+
+  @return
+  #QURT_EOK -- Successfully removed the mapping or reservation. \n
+  #QURT_EFATAL -- Failed to remove the mapping or reservation. \n
+  #QURT_ELOCKED -- Buffer is locked. Mapping delete failed.
+  */
+int qurt_mem_munmap3(int client_handle,
+                     void *addr,
+                     size_t length,
+                     int flags);
+
+/*
+|| The macros here follow the style of the standard mmap() macros, but with
+|| QURT_ prepended to avoid name conflicts, and to avoid having a dependency
+|| on sys/mman.h.
+||
+|| Wherever possible, any values here that are also present in sys/mman.h
+|| should have the same value in both places so that we can accept "mmap"
+|| calls without having to remap parameters to new values.
+||
+|| In the future, it would be desirable to have a regression test that
+|| checks, for instance, that these macros match. Example:
+||
+||    assert(QURT_MAP_FAILED == MAP_FAILED);
+||    ... repeat as needed ...
+*/
+
+/** @addtogroup memory_mapping_macros
+@{ */
+/** @cond */
+#define QURT_PROT_NONE       0x00U   /**< */
+#define QURT_PROT_READ       0x01U   /**< */
+#define QURT_PROT_WRITE      0x02U   /**< */
+#define QURT_PROT_EXEC       0x04U   /**< */
+#define QURT_PROT_NODUMP     0x08U   /**< Skip dumping the mapping. During PD dump, must skip
+                                          some mappings on host memory to avoid a race condition
+                                          where the memory is removed from the host and the DSP process
+                                          crashes before the mapping is removed.*/
+#define QURT_PROT_ISLAND     0x10U   /**< Island mapping. */
+
+#define QURT_MAP_SHARED      0x0001U /**< Shared. */
+#define QURT_MAP_PRIVATE     0x0002U /**< Private. */
+/** @endcond */
+#define QURT_MAP_NAMED_MEMSECTION 0x0004U /**< Named memsection. */
+#define QURT_MAP_FIXED       0x0010U /**< Fixed virtual address. */
+#define QURT_MAP_RENAME      0x0020U /**< Rename. */
+#define QURT_MAP_NORESERVE   0x0040U /**< No reserve. */
+#define QURT_MAP_INHERIT     0x0080U /**< Inherit. */
+#define QURT_MAP_NONPROCESS_VPOOL 0x0100U /**< Use a virtual address outside of the default range of the
+                                               processes. This option is only supported in the root process
+                                               and only when virtual memory split is enabled in the XML.
+                                               The root process can use this flag to create mapping for a
+                                               user process, for example, if the virtual address is configured
+                                               for a 3G/1G split, the root process can use this flag to create
+                                               mapping in the top 1 GB area for the user process or the
+                                               lower 3 GB area for the root process. This is useful for
+                                               shared buffer use cases. */
+#define QURT_MAP_HASSEMAPHORE 0x0200U /**< Has semaphore. */
+#define QURT_MAP_TRYFIXED    0x0400U /**< Try to create a mapping for a virtual address that was passed.
+                                          If the passed virtual address fails, use a random virtual address. */
+#define QURT_MAP_WIRED       0x0800U /**< Wired. */
+#define QURT_MAP_FILE        0x0000U /**< File. */
+#define QURT_MAP_ANON        0x1000U /**< Allocate physical memory from the pool that was passed.
+                                          By default, memory is allocated from the default physpool. */
+#define QURT_MAP_VA_ONLY     0x2000U /**< Reserve a virtual address without
+                                          mapping it.
*/ + +/** @cond */ +#define QURT_MAP_ALIGNED(n) ((n) << QURT_MAP_ALIGNMENT_SHIFT) +#define QURT_MAP_ALIGNMENT_SHIFT 24 + + +#define QURT_MAP_ALIGNMENT_MASK QURT_MAP_ALIGNED(0xff) /**< */ +#define QURT_MAP_ALIGNMENT_64KB QURT_MAP_ALIGNED(16) /**< */ +#define QURT_MAP_ALIGNMENT_16MB QURT_MAP_ALIGNED(24) /**< */ +#define QURT_MAP_ALIGNMENT_4GB QURT_MAP_ALIGNED(32) /**< */ +#define QURT_MAP_ALIGNMENT_1TB QURT_MAP_ALIGNED(40) /**< */ +#define QURT_MAP_ALIGNMENT_256TB QURT_MAP_ALIGNED(48) /**< */ +#define QURT_MAP_ALIGNMENT_64PB QURT_MAP_ALIGNED(56) /**< */ +/** @endcond */ +#define QURT_MAP_FAILED ((void *) -1) /**< Mapping creation failed. */ + +/* +|| The macros below are extensions beyond the standard mmap flags, but follow +|| the style of the mmap flags. +*/ +/** @cond */ +// Describe bitfields in (prot) +#define QURT_PROT_CACHE_BOUNDS 16U,19U,7U /**< Bits 16 through 19 are cache attribute, default is 0. */ +#define QURT_PROT_BUS_BOUNDS 20U,21U,0U /**< Bits 20 through 21 are bus attributes, default is 0. */ +#define QURT_PROT_USER_BOUNDS 22U,23U,3U /**< Bits 22 through 23 are user mode, default is 3; + default of 3 means to derive user mode setting from the + default mode of the client. */ + +// Describe bitfields in (flags) +#define QURT_MAP_PHYSADDR_BOUNDS 15U,15U,0U /**< Bits 15 through 15 are physaddr, default is 0. */ +#define QURT_MAP_TYPE_BOUNDS 16U,19U,0U /**< Bits 16 through 19 are mapping type, default is 0. */ +#define QURT_MAP_REGION_BOUNDS 20U,23U,0U /**< Bits 20 through 23 are region type, default is 0. */ +/** @endcond */ + +// These macros get OR'ed into (prot) +#define QURT_PROT_CACHE_MODE(n) QURT_MMAP_BUILD(QURT_PROT_CACHE_BOUNDS,(n)) /**< */ +#define QURT_PROT_BUS_ATTR(n) QURT_MMAP_BUILD(QURT_PROT_BUS_BOUNDS,(n)) /**< */ +#define QURT_PROT_USER_MODE(n) QURT_MMAP_BUILD(QURT_PROT_USER_BOUNDS,(n)) /**< */ +// These macros get OR'ed into (flags) + +#define QURT_MAP_PHYSADDR QURT_MMAP_BUILD(QURT_MAP_PHYSADDR_BOUNDS,1U) /**< Use the physical address that was passed in offset field. + This is allowed only for root process. */ +#define QURT_MAP_TYPE(n) QURT_MMAP_BUILD(QURT_MAP_TYPE_BOUNDS,(n)) /**< */ +#define QURT_MAP_REGION(n) QURT_MMAP_BUILD(QURT_MAP_REGION_BOUNDS,(n)) /**< */ +/** @} */ /* end_addtogroup memory_mapping_macros */ +/** @cond */ +// These macros extract fields from (prot) +#define QURT_PROT_GET_CACHE_MODE(n) QURT_MMAP_EXTRACT(QURT_PROT_CACHE_BOUNDS,(n)) /**< */ +#define QURT_PROT_GET_BUS_ATTR(n) QURT_MMAP_EXTRACT(QURT_PROT_BUS_BOUNDS,(n)) /**< */ +#define QURT_PROT_GET_USER_MODE(n) QURT_MMAP_EXTRACT(QURT_PROT_USER_BOUNDS,(n)) /**< */ + +// These macros extract fields from (flags) +#define QURT_MAP_GET_TYPE(n) QURT_MMAP_EXTRACT(QURT_MAP_TYPE_BOUNDS,(n)) /**< */ +#define QURT_MAP_GET_REGION(n) QURT_MMAP_EXTRACT(QURT_MAP_REGION_BOUNDS,(n)) /**< */ + +// Macros for bitfield insertion and extraction +#define QURT_MMAP_MASK(lo,hi) (~((~0u) << ((hi)-(lo)+1U))) /**< Mask of same size as [lo..hi]. 
 */
+#define QURT_MMAP_BUILD_(lo,hi,def,n)   ((((n)^(def))&QURT_MMAP_MASK((lo),(hi)))<<(lo)) /**< */
+#define QURT_MMAP_EXTRACT_(lo,hi,def,n) ((((n)>>(lo))&QURT_MMAP_MASK((lo),(hi)))^(def)) /**< */
+#define QURT_MMAP_BUILD(a,b)            QURT_MMAP_BUILD_(a,b) /**< */
+#define QURT_MMAP_EXTRACT(a,b)          QURT_MMAP_EXTRACT_(a,b) /**< */
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_mq.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_mq.h
new file mode 100755
index 0000000000000..580c83d3de41a
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_mq.h
@@ -0,0 +1,458 @@
+#ifndef QURT_MQ_H
+#define QURT_MQ_H
+/**
+  @file qurt_mq.h
+
+  @brief  Prototypes of secure message queues API functions.
+
+  EXTERNALIZED FUNCTIONS
+  None
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None
+
+  Copyright (c) 2019-2023 by Qualcomm Technologies, Inc.  All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+======================================================================*/
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+                        CONSTANTS AND MACROS
+=============================================================================*/
+#define QURT_MQ_NAME_MAXLEN  16U  /**< Maximum name length. */
+
+
+/*=============================================================================
+                        FORWARD DECLARATIONS & TYPEDEFS
+=============================================================================*/
+/* This enum must be generated in accordance with the process class numbers.
+   For now it is made to match the generated version; do not change this unless
+   there is a corresponding change in process_class.py. Indices start from 0,
+   basically: QURT_MQ_SECURITY_SCOPE_ = (1 << QURTK_process_class_index_)
+*/
+typedef enum {
+    QURT_MQ_SECURITY_SCOPE_KERNEL       = ( 1U << 0 ),
+    QURT_MQ_SECURITY_SCOPE_SRM          = ( 1U << 1 ),
+    QURT_MQ_SECURITY_SCOPE_SECURE       = ( 1U << 2 ),
+    QURT_MQ_SECURITY_SCOPE_CPZ          = ( 1U << 3 ),
+    QURT_MQ_SECURITY_SCOPE_ROOT         = ( 1U << 4 ),
+    QURT_MQ_SECURITY_SCOPE_SIGNED       = ( 1U << 5 ),
+    QURT_MQ_SECURITY_SCOPE_UNSIGNED     = ( 1U << 6 ),
+    QURT_MQ_SECURITY_SCOPE_SECURE_ROOT  = ( 1U << 7 )
+} qurt_mq_security_scope_t;
+
+typedef enum {
+    QURT_MQ_CARDINALITY_PTP = (1U << 0),
+    QURT_MQ_CARDINALITY_MTO = (1U << 1)
+}qurt_mq_cardinality_t;
+
+typedef unsigned int qurt_mqd_t;
+
+typedef union{
+    struct {
+        unsigned int perms:2;
+        unsigned int cardinality:1;
+        unsigned int blocking:1;
+
+        qurt_mq_security_scope_t creator_scope: 8;
+        qurt_mq_security_scope_t allowed_scope: 8; // can be a bitmask in the MTO case
+        unsigned int queue_closed: 1;
+        unsigned int reserved: 11;
+    }; /* anonymous struct */
+    unsigned int raw;
+} qurt_mq_flags_t;
+
+
+/* Permissions are from qurt_types.h, though X (execute) is blocked. */
+#if 0
+/** Memory access permission. */
+typedef enum {
+        QURT_PERM_READ=0x1U, /**< */
+        QURT_PERM_WRITE=0x2U, /**< */
+        QURT_PERM_EXECUTE=0x4U, /**< */
+        QURT_PERM_FULL=QURT_PERM_READ|QURT_PERM_WRITE|QURT_PERM_EXECUTE, /**< */
+} qurt_perm_t;
+#endif
+
+struct qurt_mq_attr {
+    unsigned flags;      /**< Configured flags. Only meaningful with get_attr(), only used for qurt_mq_flags_t.perms. */
+    unsigned mq_maxmsg;  /**< Maximum number of messages. Used with create() and get_attr.
 */
+    unsigned short  mq_send_msgsize;   /**< Maximum size (bytes) of message in receiver facing queue,
+                                            from sender to receiver. */
+    unsigned short  mq_recv_msgsize;   /**< Maximum size (bytes) of message in sender facing queue,
+                                            from receiver to sender. */
+    unsigned client_pid;               /**< Process ID of client that is allowed to open the message queue
+                                            that was created using qurt_mq_create(). */
+    qurt_mq_cardinality_t cardinality; /**< Cardinality of message queue connection, see below. */
+    qurt_mq_security_scope_t scope;    /**< Security scope of the senders to the queue. */
+};
+
+
+/*=============================================================================
+                        EXTERNS & FUNCTIONS
+=============================================================================*/
+/**@ingroup func_qurt_mq_attr_init
+  Initializes attributes to the default values used for creating the queue.
+
+  The initialize operation sets the following default attribute values: \n
+  - flags - QURT_PERM_READ | QURT_PERM_WRITE \n
+  - maxmsg - 1 \n
+  - mq_send_msgsize - 8 \n
+  - mq_recv_msgsize - 8 \n
+  - client_pid - -1 \n
+  - cardinality - QURT_MQ_CARDINALITY_PTP \n
+  - scope - QURT_MQ_SECURITY_SCOPE_SIGNED \n
+
+  @datatypes
+  #qurt_mq_attr
+
+  @param[in,out] attr  Pointer to the message queue attribute structure to initialize.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_mq_attr_init(struct qurt_mq_attr * attr);
+
+/**@ingroup qurt_mq_attr_set_send_msgsize
+  Sets the maximum message size, in bytes, that the sender can send.
+  The maximum message length is configurable using the XML configuration; however, it is limited to a maximum of 62 bytes.
+
+  @datatypes
+  #qurt_mq_attr
+
+  @param[in,out] attr  Pointer to the message queue object.
+  @param[in]     len   Length of message in bytes.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_mq_attr_set_send_msgsize (struct qurt_mq_attr *attr, size_t len);
+
+/**@ingroup qurt_mq_attr_set_recv_msgsize
+  Sets the maximum message size, in bytes, that the receiver can read.
+  The maximum message length is configurable using the XML configuration; however, it is limited to a maximum of 62 bytes.
+
+  @datatypes
+  #qurt_mq_attr
+
+  @param[in,out] attr  Pointer to the message queue object.
+  @param[in]     len   Length of message in bytes.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_mq_attr_set_recv_msgsize (struct qurt_mq_attr *attr, size_t len);
+
+/**@ingroup qurt_mq_attr_set_maxmsg
+  Sets the maximum number of messages that can be queued in the message queue.
+  The message depth is configurable using the XML configuration.
+
+  @datatypes
+  #qurt_mq_attr
+
+  @param[in,out] attr   Pointer to the message queue object.
+  @param[in]     depth  Maximum number of messages that can be queued.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_mq_attr_set_maxmsg (struct qurt_mq_attr *attr, unsigned int depth);
+
+/**@ingroup qurt_mq_attr_set_scope
+  Sets the scope of the message queue. A message queue created with a security
+  scope allows only a process class of that scope to open the message queue.
+
+  @datatypes
+  #qurt_mq_attr \n
+  #qurt_mq_security_scope_t
+
+  @param[in,out] attr   Pointer to the message queue object.
+  @param[in]     scope  Scope of the message queue: \n
+         #QURT_MQ_SECURITY_SCOPE_KERNEL \n
+         #QURT_MQ_SECURITY_SCOPE_SRM \n
+         #QURT_MQ_SECURITY_SCOPE_SECURE \n
+         #QURT_MQ_SECURITY_SCOPE_CPZ \n
+         #QURT_MQ_SECURITY_SCOPE_ROOT \n
+         #QURT_MQ_SECURITY_SCOPE_SIGNED \n
+         #QURT_MQ_SECURITY_SCOPE_UNSIGNED
+
+  @return
+  None.
+
+  @dependencies
+  None.
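+
+  Example -- a minimal attribute-configuration sketch using the setters above
+  (the sizes, depth, and scope values are illustrative assumptions only):
+  @code
+  struct qurt_mq_attr attr;
+  qurt_mq_attr_init(&attr);
+  qurt_mq_attr_set_send_msgsize(&attr, 32);
+  qurt_mq_attr_set_recv_msgsize(&attr, 32);
+  qurt_mq_attr_set_maxmsg(&attr, 4);
+  qurt_mq_attr_set_scope(&attr, QURT_MQ_SECURITY_SCOPE_SIGNED);
+  @endcode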
+*/
+void qurt_mq_attr_set_scope (struct qurt_mq_attr *attr, qurt_mq_security_scope_t scope);
+
+
+/**@ingroup qurt_mq_attr_set_client_pid
+  Sets the client PID that is allowed to open this message queue.
+  If client_pid is set, the allowed scope is not considered when opening the MQ.
+
+  @datatypes
+  #qurt_mq_attr
+
+  @param[in,out] attr        Pointer to the message queue object.
+  @param[in]     client_pid  Valid PID for the client process.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_mq_attr_set_client_pid (struct qurt_mq_attr *attr, unsigned client_pid);
+
+/**@ingroup qurt_mq_attr_set_flags
+  Sets the properties of the message queue.
+  The current implementation only uses the flag attribute to set the permissions for the message queue.
+  The default is #QURT_PERM_READ | #QURT_PERM_WRITE; explicit permissions are not implemented.
+
+  @datatypes
+  #qurt_mq_attr
+
+  @param[in,out] attr   Pointer to the message queue object.
+  @param[in]     flags  Permissions for the message queue.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_mq_attr_set_flags (struct qurt_mq_attr *attr, unsigned int flags);
+
+/**@ingroup qurt_mq_create
+  Creates a message queue with the provided name and attributes.
+  The calling process becomes the owner of the queue.
+  The name of the message queue is limited to 16 characters, including the NULL terminator.
+
+  @datatypes
+  #qurt_mq_attr \n
+  #qurt_mqd_t
+
+  @param[out] mqd   Returns a pointer to the message queue identifier if
+                    the message queue was successfully created.
+  @param[in]  name  String identifier of the message queue.
+  @param[in]  attr  Pointer to the initialized message queue attribute
+                    structure that specifies the attributes of the created message queue.
+
+  @return
+  #QURT_EOK -- Message queue created. \n
+  #QURT_EINVALID -- Invalid arguments. \n
+  #QURT_ENOSPC -- Maximum number of queues in the system was exceeded.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_create(qurt_mqd_t *mqd, const char *name, struct qurt_mq_attr * attr);
+
+/**@ingroup qurt_mq_open
+  Opens a message queue connection between a process and a created message queue.
+
+  @datatypes
+  #qurt_mq_attr \n
+  #qurt_mqd_t
+
+  @param[out] mqd    Returns a pointer to the message queue
+                     identifier if the message queue was successfully opened.
+  @param[in]  name   String identifier of the message queue.
+  @param[in]  flags  Flags that contain the properties that define the behavior of the message queue connection.
+                     Permissions:\n
+                     #QURT_PERM_READ \n
+                     #QURT_PERM_WRITE \n
+                     #QURT_PERM_READ | QURT_PERM_WRITE @tablebulletend
+                     Default is QURT_PERM_READ | QURT_PERM_WRITE; explicit permissions are not implemented. \n
+                     Cardinality: \n
+                     #QURT_MQ_CARDINALITY_PTP (default) \n
+                     #QURT_MQ_CARDINALITY_MTO (not implemented) \n
+                     Blocking: suspend the thread until the message queue with the specified name is created. \n
+                     Scope: security boundary to which the message queue and its users are constrained.
+                     It is coupled with the process privilege level/scope.\n
+                     #QURT_MQ_SECURITY_SCOPE_KERNEL \n
+                     #QURT_MQ_SECURITY_SCOPE_SRM \n
+                     #QURT_MQ_SECURITY_SCOPE_SECURE \n
+                     #QURT_MQ_SECURITY_SCOPE_CPZ \n
+                     #QURT_MQ_SECURITY_SCOPE_ROOT \n
+                     #QURT_MQ_SECURITY_SCOPE_SIGNED \n
+                     #QURT_MQ_SECURITY_SCOPE_UNSIGNED @tablebulletend
+
+  @return
+  #QURT_EOK -- Message queue connection successfully opened. \n
+  #QURT_EFAILED -- Message queue connection failed, if the message queue is non-blocking. \n
+  #QURT_ENOTALLOWED -- Open failed due to security scope mismatch.
+
+  @dependencies
+  None.
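+
+  Example -- a minimal open sketch (the queue name and flag-field values are
+  illustrative assumptions based on the qurt_mq_flags_t layout above; error
+  handling is omitted):
+  @code
+  qurt_mqd_t mqd;
+  qurt_mq_flags_t flags;
+  flags.raw = 0;
+  flags.perms = QURT_PERM_READ | QURT_PERM_WRITE;
+  flags.cardinality = QURT_MQ_CARDINALITY_PTP;
+  int ret = qurt_mq_open(&mqd, "my_mq", flags);
+  @endcode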
+*/
+int qurt_mq_open (qurt_mqd_t *mqd, const char *name, qurt_mq_flags_t flags);
+
+/**@ingroup qurt_mq_send
+  Sends a message over a message queue.\n
+  - If the message queue is full, the calling thread shall be
+    suspended until space becomes available to enqueue the message. \n
+  - If there exists a thread suspended on an empty queue
+    to receive a message, qurt_mq_send shall resume that thread.
+
+  @datatypes
+  #qurt_mqd_t
+
+  @param[in] mqd      Message queue identifier.
+  @param[in] msg_ptr  Pointer to the message buffer.
+  @param[in] msg_len  Length of the message buffer in bytes.
+
+  @return
+  #QURT_EOK -- Message queue send was successful.\n
+  #QURT_EMSGSIZE -- Message size in the msg_len field is greater than the max_message_len specified during queue creation.\n
+  #QURT_ENOTALLOWED -- Send failed due to security scope mismatch.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_send(qurt_mqd_t mqd, const char *msg_ptr, size_t msg_len);
+
+/**@ingroup qurt_mq_send_timed
+  Sends a message over a message queue.\n
+  - If the message queue is full, the calling thread shall be
+    suspended until space becomes available to enqueue the message or until the timeout is reached. \n
+  - If there exists a thread suspended on an empty queue
+    to receive a message, qurt_mq_send_timed shall resume that thread.\n
+  - If the timeout is reached, qurt_mq_send_timed shall return #QURT_ETIMEDOUT.
+
+  @datatypes
+  #qurt_mqd_t
+
+  @param[in] mqd       Message queue identifier.
+  @param[in] msg_ptr   Pointer to the message buffer.
+  @param[in] duration  Interval (in microseconds); the duration value must be
+                       between #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+  @param[in] msg_len   Length of the message buffer in bytes.
+
+  @return
+  #QURT_EOK -- Message queue send was successful. \n
+  #QURT_EMSGSIZE -- Message size in the msg_len field is greater than the max_message_len specified during queue creation.\n
+  #QURT_ENOTALLOWED -- Send failed due to security scope mismatch. \n
+  #QURT_ETIMEDOUT -- Timeout.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_send_timed(qurt_mqd_t mqd, const char *msg_ptr, unsigned long long int duration, size_t msg_len);
+
+ /**@ingroup qurt_mq_recv
+  Receives a message from the message queue. \n
+  - If the message queue is empty, the calling thread shall be
+    suspended until a message is enqueued in the message queue. \n
+  - If there exists a thread suspended on a full queue to
+    send a message, qurt_mq_recv shall resume that thread.
+
+  @datatypes
+  #qurt_mqd_t
+
+  @param[in]     mqd      Message queue identifier.
+  @param[out]    msg_ptr  Pointer to the message buffer.
+  @param[in,out] msg_len  Pointer to the length of the message buffer.
+
+  @return
+  #QURT_EOK -- Message received.\n
+  #QURT_EINVALID -- Message pointer or msg_len pointer is NULL. \n
+  #QURT_EBADR -- Message queue descriptor (mqd) is invalid. \n
+  #QURT_EBADF -- Sender closed the message queue.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_recv(qurt_mqd_t mqd, unsigned char *msg_ptr, size_t *msg_len);
+
+ /**@ingroup qurt_mq_recv_timed
+  Receives a message from the message queue. \n
+  - If the message queue is empty, the calling thread shall be
+    suspended until a message is enqueued in the message queue or until the timeout is reached.\n
+  - If there exists a thread suspended on a full queue to
+    send a message, qurt_mq_recv_timed shall resume that thread.\n
+  - If the timeout is reached, qurt_mq_recv_timed shall return #QURT_ETIMEDOUT.
+
+  @datatypes
+  #qurt_mqd_t
+
+  @param[in]     mqd       Message queue identifier.
+  @param[out]    msg_ptr   Pointer to the message buffer.
+  @param[in]     duration  Interval (in microseconds); the duration value must be
+                           between #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+  @param[in,out] msg_len   Pointer to the length of the message buffer.
+
+  @return
+  #QURT_EOK -- Message received.\n
+  #QURT_EINVALID -- Message pointer or msg_len pointer is NULL. \n
+  #QURT_EBADR -- Message queue descriptor (mqd) is invalid.\n
+  #QURT_EBADF -- Sender closed the message queue. \n
+  #QURT_ETIMEDOUT -- Timeout.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_recv_timed(qurt_mqd_t mqd, unsigned char *msg_ptr, unsigned long long int duration, size_t *msg_len);
+
+ /**@ingroup qurt_mq_close
+  Closes the message queue and disassociates the calling process (client) from the message queue
+  under this descriptor. Marks the queue as closed for the receiver.
+  This function is expected to be called from the client side. If called
+  from the server side, the function reduces to a no-op and returns success.
+
+  @datatypes
+  #qurt_mqd_t
+
+  @param[in] mqd  Message queue identifier.
+
+  @return
+  #QURT_EOK -- Message queue closed successfully.\n
+  #QURT_EBADR -- Invalid descriptor.\n
+  #QURT_ENOTALLOWED -- Message queue close was not called from the client side.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_close(qurt_mqd_t mqd);
+
+ /**@ingroup qurt_mq_destroy
+  Destroys the message queue. This function must be
+  called from the process that called qurt_mq_create().
+
+  @datatypes
+  #qurt_mqd_t
+
+  @param[in] mqd  Message queue identifier.
+
+  @return
+  #QURT_EOK -- Message queue destroyed successfully.\n
+  #QURT_EBADR -- Invalid descriptor.\n
+  #QURT_ENOTALLOWED -- Message queue destroy was not called from the process that created the queue.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_destroy(qurt_mqd_t mqd);
+
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+#endif //QURT_MQ_H
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_mutex.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_mutex.h
new file mode 100755
index 0000000000000..4ad6b270cdde6
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_mutex.h
@@ -0,0 +1,211 @@
+#ifndef QURT_MUTEX_H
+#define QURT_MUTEX_H
+/**
+  @file qurt_mutex.h
+  @brief  Prototypes of mutex API.
+  This is mostly a user space mutex, but calls the
+  kernel to block if the mutex is taken.
+
+EXTERNAL FUNCTIONS
+   None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup mutex_types
+@{ */
+/*=============================================================================
+                        TYPEDEFS
+=============================================================================*/
+
+/** QuRT mutex type.
+
+   Both non-recursive mutex lock and unlock, and recursive
+   mutex lock and unlock can be applied to this type.
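+
+   Example -- a typical lifecycle sketch (illustrative only):
+   @code
+   qurt_mutex_t m;
+   qurt_mutex_init(&m);
+   qurt_mutex_lock(&m);
+   // ... access the shared resource ...
+   qurt_mutex_unlock(&m);
+   qurt_mutex_destroy(&m);
+   @endcode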
+ */ +typedef union qurt_mutex_aligned8{ + /** @cond */ + struct { + unsigned int holder; + unsigned int count; + unsigned int queue; + unsigned int wait_count; + }; + unsigned long long int raw; + /** @endcond */ +} qurt_mutex_t; +/** @} */ /* end_addtogroup mutex_types */ +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/* @addtogroup mutex_const_macros +@{ */ +#define MUTEX_MAGIC 0xfe /**< */ +#define QURTK_FUTEX_FREE_MAGIC 0x1F // 11111 /**< */ +#define QURT_MUTEX_INIT {{MUTEX_MAGIC, 0, QURTK_FUTEX_FREE_MAGIC,0}} /**< Suitable as an initializer for a + variable of type qurt_mutex_t. */ +/* @} */ /* end_addtogroup mutex_const_macros */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ +/**@ingroup func_qurt_mutex_init + Initializes a mutex object. + The mutex is initially unlocked. + + @note1hang Each mutex-based object has one or more kernel resources associated with it; + to prevent resource leaks, call qurt_mutex_destroy() + when this object is not used anymore + @datatypes + #qurt_mutex_t + + @param[out] lock Pointer to the mutex object. Returns the initialized object. + + @return + None. + + @dependencies + None. + + */ +void qurt_mutex_init(qurt_mutex_t *lock); + +/**@ingroup func_qurt_mutex_destroy + Destroys the specified mutex. + + @note1hang Mutexes must be destroyed when they are no longer in use. Failure to do this + causes resource leaks in the QuRT kernel.\n + @note1cont Mutexes must not be destroyed while they are still in use. If this occurs, the + behavior of QuRT is undefined. + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the mutex object to destroy. + + @return + None. + + @dependencies + None. + + */ +void qurt_mutex_destroy(qurt_mutex_t *lock); + +/**@ingroup func_qurt_mutex_lock + Locks the specified mutex. + If a thread performs a lock operation on a mutex that is not in use, the thread gains + access to the shared resource that is protected by the mutex, and continues executing. + + If a thread performs a lock operation on a mutex that is already in use by another + thread, the thread is suspended. When the mutex becomes available again (because the + other thread has unlocked it), the thread is awakened and given access to the shared + resource. + + @note1hang A thread is suspended indefinitely if it locks a mutex that it has already + locked. Avoid this by using recursive mutexes (Section @xref{dox:recursive_mutexes}). + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the mutex object. Specifies the mutex to lock. + + @return + None. + + @dependencies + None. + */ +void qurt_mutex_lock(qurt_mutex_t *lock); /* blocking */ + +/**@ingroup func_qurt_mutex_lock_timed + Locks the specified mutex. + When a thread performs a lock operation on a mutex that is not in use, the thread gains + access to the shared resource that is protected by the mutex, and continues executing. + + When a thread performs a lock operation on a mutex that is already in use by another + thread, the thread is suspended. When the mutex becomes available again (because the + other thread has unlocked it), the thread is awakened and given access to the shared + resource. If the duration of suspension exceeds the timeout duration, wait is + terminated and no access to mutex is granted. 
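+
+   Example -- a bounded-wait sketch (the 1000-microsecond duration is an
+   illustrative assumption; it must lie between #QURT_TIMER_MIN_DURATION and
+   #QURT_TIMER_MAX_DURATION):
+   @code
+   if (qurt_mutex_lock_timed(&m, 1000uLL) == QURT_EOK) {
+       // ... critical section ...
+       qurt_mutex_unlock(&m);
+   }
+   @endcode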
+ + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the mutex object; specifies the mutex to lock. + @param[in] duration Interval (in microseconds) that the duration value must be between #QURT_TIMER_MIN_DURATION and + #QURT_TIMER_MAX_DURATION + + @return + #QURT_EOK -- Success \n + #QURT_ETIMEDOUT -- Timeout + + @dependencies + None. + */ +int qurt_mutex_lock_timed (qurt_mutex_t * lock, unsigned long long int duration); + +/**@ingroup func_qurt_mutex_unlock + Unlocks the specified mutex. \n + More than one thread can be suspended on a mutex. When the mutex is unlocked, only the + highest-priority thread waiting on the mutex is awakened. If the awakened thread has + higher priority than the current thread, a context switch occurs. + + @note1hang The behavior of QuRT is undefined if a thread unlocks a mutex it did not first + lock. + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the mutex object. Specifies the mutex to unlock. + + @return + None. + + @dependencies + None. + */ +void qurt_mutex_unlock(qurt_mutex_t *lock); /* unlock */ + +/**@ingroup func_qurt_mutex_try_lock + @xreflabel{hdr:qurt_mutex_try_lock} + Attempts to lock the specified mutex. + If a thread performs a try_lock operation on a mutex that is not in use, the thread gains + access to the shared resource that is protected by the mutex, and continues executing. + + @note1hang If a thread performs a try_lock operation on a mutex that it has already locked + or is in use by another thread, qurt_mutex_try_lock immediately returns with a + nonzero result value. + + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the mutex object. Specifies the mutex to lock. + + @return + 0 -- Success. \n + Nonzero -- Failure. + + @dependencies + None. + */ +int qurt_mutex_try_lock(qurt_mutex_t *lock); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_MUTEX_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_os_services.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_os_services.h new file mode 100755 index 0000000000000..cbc4c239e9620 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_os_services.h @@ -0,0 +1,24 @@ +/*============================================================================= + + qurt_os_services.c + +GENERAL DESCRIPTION + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2023 by Qualcomm Technologies, Inc. All Rights Reserved. 
+=============================================================================*/
+
+#define QURT_OS_SERVICE_THREAD     "/os/thread"      /**< Thread service */
+#define QURT_OS_SERVICE_FS_HUB     "/os/fs_hub"      /**< File-system hub service */
+#define QURT_OS_SERVICE_CALLBACK   "/os/callback"    /**< QDI callback service */
+#define QURT_OS_SERVICE_INTERRUPTS "/os/interrupt"   /**< Interrupt service */
+#define QURT_OS_SERVICE_PROXY      "/os/proxy"       /**< QDI proxy service */
+#define QURT_OS_SERVICE_MEMORY     "/os/memory"      /**< Memory management service */
+#define QURT_OS_SERVICE_MEMPOOL    "/os/mempool"     /**< Pool management service */
+#define QURT_OS_SERVICE_PROCESS    "/os/process"     /**< Process management service */
+#define QURT_OS_SERVICE_MMAP       "/os/mem_mapper"  /**< mmapper service */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_pimutex.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_pimutex.h
new file mode 100755
index 0000000000000..61aee5cba7ce8
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_pimutex.h
@@ -0,0 +1,200 @@
+#ifndef QURT_PIMUTEX_H
+#define QURT_PIMUTEX_H 1
+/**
+  @file qurt_pimutex.h
+  @brief  Prototypes of qurt_pimutex API.
+
+EXTERNAL FUNCTIONS
+   None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+/**@ingroup func_qurt_pimutex_init
+  Initializes a priority inheritance mutex object.
+  The priority inheritance mutex is initially unlocked.
+
+  This function works the same as qurt_mutex_init().
+
+  @note1hang Each pimutex-based object has one or more kernel resources associated with it;
+             to prevent resource leaks, call qurt_pimutex_destroy()
+             when this object is no longer in use.
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[out] lock  Pointer to the priority inheritance mutex object.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_pimutex_init(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_pimutex_destroy
+  Destroys the specified priority inheritance mutex.
+
+  @note1hang Priority inheritance mutexes must be destroyed when they are no longer in
+             use. Failure to do this causes resource leaks in the QuRT kernel.\n
+  @note1cont Priority inheritance mutexes must not be destroyed while they are still in use.
+             If this occurs, the behavior of QuRT is undefined.
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[in] lock  Pointer to the priority inheritance mutex object to destroy.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_pimutex_destroy(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_pimutex_lock
+  Requests access to a shared resource. If a thread performs a lock operation on a mutex
+  that is not in use, the thread gains access to the shared resource that the mutex protects,
+  and continues executing.
+
+  If a thread performs a lock operation on a mutex that is already in use by another
+  thread, the thread is suspended. When the mutex becomes available again (because the
+  other thread has unlocked it), the thread is awakened and given access to the shared resource.
+
+  If a thread is suspended on a priority inheritance mutex, and the priority of the suspended
+  thread is higher than the priority of the thread that has locked the mutex, the thread
+  with the mutex acquires the higher priority of the suspended thread. The locker thread blocks
+  until the lock is available.
+
+  @note1hang A thread is not suspended if it locks a priority inheritance mutex that it has
+             already locked. However, the mutex does not become available to other
+             threads until the thread performs a balanced number of unlocks on the mutex.\n
+  @note1cont When multiple threads compete for a mutex, the lock operation for a priority
+             inheritance mutex is slower than it is for a recursive mutex.
+             In particular, it is about 10 times slower when the mutex is available for locking,
+             and slower (with greatly varying times) when the mutex is already locked.
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[in] lock  Pointer to the priority inheritance mutex object to lock.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_pimutex_lock(qurt_mutex_t *lock);
+
+
+/**@ingroup func_qurt_pimutex_lock_timed
+  Locks a priority inheritance mutex with a timeout.
+
+  A thread can lock a priority inheritance mutex multiple times. The mutex is not
+  available to other threads until the thread performs the same number of mutex unlock
+  operations.
+
+  If a thread performs a lock operation on a mutex that is already locked by another thread,
+  the thread is moved to the waiting state. When the mutex becomes available again (because the
+  other thread has unlocked the mutex), the thread is awakened and tries to lock the mutex.
+
+  If a thread is waiting on a priority inheritance mutex, and the priority of the waiting thread
+  is higher than the priority of the thread that has locked the mutex, the priority of the thread
+  that has locked the mutex is raised to the priority of the waiting thread.
+
+  If the duration of waiting exceeds the timeout duration, the waiting is terminated, and
+  the function returns QURT_ETIMEDOUT as a failure of the mutex lock.
+
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[in] lock      Pointer to the mutex object to lock.
+  @param[in] duration  Duration (in microseconds) to wait. The duration value must be between
+                       #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+
+  @return
+  #QURT_EOK -- Success \n
+  #QURT_ETIMEDOUT -- Timeout \n
+  #QURT_EINVALID -- Duration is out of range
+
+  @dependencies
+  None.
+
+ */
+int qurt_pimutex_lock_timed(qurt_mutex_t *lock, unsigned long long int duration);
+
+
+/**@ingroup func_qurt_pimutex_unlock
+  Releases access to a shared resource; unlocks the specified priority inheritance mutex. \n
+  More than one thread can be suspended on a priority inheritance mutex. When the mutex
+  is unlocked, only the highest-priority thread waiting on the mutex is awakened. If the
+  awakened thread has higher priority than the current thread, a context switch occurs.
+
+  When a thread unlocks a priority inheritance mutex, its thread priority is restored to its
+  original value from any higher priority value that it acquired from another thread
+  suspended on the mutex.
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[in] lock  Pointer to the priority inheritance mutex object to unlock.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_pimutex_unlock(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_pimutex_try_lock
+  Requests access to a shared resource (without suspend).
Attempts to lock the specified priority inheritance mutex.\n + If a thread performs a try_lock operation on a priority inheritance mutex that is not in + use, the thread gains access to the shared resource that is protected by the mutex, and + continues executing. + If a thread performs a try_lock operation on a priority inheritance mutex that is already + in use by another thread, qurt_pimutex_try_lock immediately returns with a + nonzero result value. + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the priority inheritance mutex object to lock. + + @return + 0 -- Success. \n + Nonzero -- Failure. + + @dependencies + None. + */ +int qurt_pimutex_try_lock(qurt_mutex_t *lock); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_PIMUTEX_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_pimutex2.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_pimutex2.h new file mode 100755 index 0000000000000..b809f163cbfd2 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_pimutex2.h @@ -0,0 +1,162 @@ +#ifndef QURT_PIMUTEX2_H +#define QURT_PIMUTEX2_H +/** + @file qurt_pimutex2.h + @brief Prototypes of pimutex2 API + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2013, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + + +#include +#include +#include + +/*============================================================================= + FUNCTIONS +=============================================================================*/ +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup func_qurt_pimutex2_init + Initializes a recursive mutex object. + + @deprecated use #qurt_pimutex_init instead. + + The recursive mutex is initially unlocked. + + Objects of type pimutex2 solve a potential race condition between + unlock() and destroy() operations. + + @datatypes + #qurt_rmutex2_t + + @param[out] lock Pointer to the recursive mutex object. + + @return + None. + + @dependencies + None. + + */ +void qurt_pimutex2_init(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_pimutex2_destroy + + @deprecated use #qurt_pimutex_destroy instead. + + Destroys the specified recursive mutex. \n + @note1cont Recursive mutexes must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + @note1cont In general, application code should destroy an pimutex2 object prior to + deallocating it; calling qurt_pimutex2_destroy() before deallocating it ensures + that all qurt_pimutex2_unlock() calls complete. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to destroy. + + @return + None. + + @dependencies + None. + + */ +void qurt_pimutex2_destroy(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_pimutex2_lock + + @deprecated use #qurt_pimutex_lock instead. + + Locks the specified recursive mutex. \n + + If a thread performs a lock operation on a recursive mutex that is not being used, the + thread gains access to the shared resource that is protected by the mutex, and continues + executing. + + If a thread performs a lock operation on a recursive mutex that is already being used by + another thread, the thread is suspended. 
When the mutex becomes available again + (because the other thread has unlocked it), the thread is awakened and given access to the + shared resource. + + @note1hang A thread is not suspended if it locks a recursive mutex that it has already + locked, but the mutex does not become available until the thread performs a + balanced number of unlocks on the mutex. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to lock. + + @return + None. + + @dependencies + None. + + */ +void qurt_pimutex2_lock(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_pimutex2_unlock + + @deprecated use #qurt_pimutex_unlock instead. + + Unlocks the specified recursive mutex. \n + More than one thread can be suspended on a recursive mutex. When the mutex is + unlocked, only the highest-priority thread waiting on the mutex is awakened. If the + awakened thread has higher priority than the current thread, a context switch occurs. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to unlock. + + @return + None. + + @dependencies + None. + + */ +void qurt_pimutex2_unlock(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_rmutex2_try_lock + + @deprecated use #qurt_pimutex_try_lock instead. + + Attempts to lock the specified recursive mutex.\n + + Non-blocking version of qurt_pimutex2_lock(). If a call to qurt_pimutex2_lock() would + succeed immediately, this function behaves similarly, and returns 0 for success. + If a call to qurt_pimutex2_lock() would not succeed immediately, this function has + no effect and returns non-zero for failure. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to lock. + + @return + 0 -- Success. \n + Nonzero -- Failure. + + */ +int qurt_pimutex2_try_lock(qurt_rmutex2_t *lock); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_PIMUTEX2_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_pipe.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_pipe.h new file mode 100755 index 0000000000000..6bdaa044f8640 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_pipe.h @@ -0,0 +1,479 @@ +#ifndef QURT_PIPE_H +#define QURT_PIPE_H +/** + @file qurt_pipe.h + @brief Prototypes of the pipe interface API + This is a pipe or message queue + It blocks when too full (send) or empty (receive). + Unless using a nonblocking option, all datagrams are 64 bits. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2021,2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup pipe_types +@{ */ +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +#define QURT_PIPE_MAGIC 0xF1FEF1FE /**< Magic. */ +#define QURT_PIPE_ATTR_MEM_PARTITION_RAM 0 /**< RAM. */ +#define QURT_PIPE_ATTR_MEM_PARTITION_TCM 1 /**< TCM. */ + +/*============================================================================= + TYPEDEFS +=============================================================================*/ +/** QuRT pipe data values type. 
*/ +typedef unsigned long long int qurt_pipe_data_t; + +/** QuRT pipe type.*/ +typedef struct { + /** @cond */ + qurt_mutex_t pipe_lock; + qurt_sem_t senders; + qurt_sem_t receiver; + unsigned int size; + unsigned int sendidx; + unsigned int recvidx; + void (*lock_func)(qurt_mutex_t *); + void (*unlock_func)(qurt_mutex_t *); + int (*try_lock_func)(qurt_mutex_t *); + void (*destroy_lock_func)(qurt_mutex_t *); + unsigned int magic; + qurt_pipe_data_t *data; + /** @endcond */ +} qurt_pipe_t; + +/** QuRT pipe attributes type. */ +typedef struct { + /** @cond */ + qurt_pipe_data_t *buffer; + unsigned int elements; + unsigned char mem_partition; + /** @endcond */ +} qurt_pipe_attr_t; + +/** @} */ /* end_addtogroup pipe_types */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ +/**@ingroup func_qurt_pipe_attr_init + @xreflabel{hdr:qurt_pipe_attr_init} + Initializes the structure that sets the pipe attributes when a pipe is created. + + After an attribute structure is initialized, the individual attributes in the structure are + explicitly set using the pipe attribute operations. + + The attribute structure is assigned the following default values: \n + - buffer -- 0 \n + - elements -- 0 \n + - mem_partition -- #QURT_PIPE_ATTR_MEM_PARTITION_RAM + + @datatypes + #qurt_pipe_attr_t + + @param[in,out] attr Pointer to the pipe attribute structure. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_pipe_attr_init(qurt_pipe_attr_t *attr) +{ + attr->buffer = NULL; + attr->elements = 0; + attr->mem_partition = QURT_PIPE_ATTR_MEM_PARTITION_RAM; +} + +/**@ingroup func_qurt_pipe_attr_set_buffer + @xreflabel{sec:qurt_pipe_attr_set_buffer} + Sets the pipe buffer address attribute.\n + Specifies the base address of the memory area to use for the data buffer of a pipe. + + The base address and size (Section @xref{sec:qurt_pipe_attr_set_elements}) specify the + memory area used as a pipe data buffer. The user is responsible for allocating the + memory area used for the buffer. + + @datatypes + #qurt_pipe_attr_t \n + #qurt_pipe_data_t + + @param[in,out] attr Pointer to the pipe attribute structure. + @param[in] buffer Pointer to the buffer base address. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_pipe_attr_set_buffer(qurt_pipe_attr_t *attr, qurt_pipe_data_t *buffer) +{ + attr->buffer = buffer; +} + +/**@ingroup func_qurt_pipe_attr_set_elements + @xreflabel{sec:qurt_pipe_attr_set_elements} + Specifies the length of the memory area to use for the data buffer of a pipe. + + The length is expressed in terms of the number of 64-bit data elements that + can be stored in the buffer. + + The base address (Section @xref{sec:qurt_pipe_attr_set_buffer}) and size specify + the memory area used as a pipe data buffer. The user is responsible for + allocating the memory area used for the buffer. + + @datatypes + #qurt_pipe_attr_t + + @param[in,out] attr Pointer to the pipe attribute structure. + @param[in] elements Pipe length (64-bit elements). + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_pipe_attr_set_elements(qurt_pipe_attr_t *attr, unsigned int elements) +{ + attr->elements = elements; +} + +/**@ingroup func_qurt_pipe_attr_set_buffer_partition + @xreflabel{sec:qurt_pipe_attr_set_buffer_partition} + Specifies the memory type where a pipe's buffer is allocated. + Allocate pipes in RAM or TCM/LPM. 
+
+  @note1hang If a pipe is specified as allocated in TCM/LPM, it must be created
+  with the qurt_pipe_init() operation. The qurt_pipe_create() operation results in an error.
+
+  @datatypes
+  #qurt_pipe_attr_t
+
+  @param[in,out] attr          Pointer to the pipe attribute structure.
+  @param[in]     mem_partition Pipe memory partition. Values: \n
+                 - #QURT_PIPE_ATTR_MEM_PARTITION_RAM -- Pipe resides in RAM \n
+                 - #QURT_PIPE_ATTR_MEM_PARTITION_TCM -- Pipe resides in TCM/LPM @tablebulletend
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_pipe_attr_set_buffer_partition(qurt_pipe_attr_t *attr, unsigned char mem_partition)
+{
+    attr->mem_partition = mem_partition;
+}
+
+/**@ingroup func_qurt_pipe_create
+  Creates a pipe.\n
+  Allocates a pipe object and its associated data buffer, and initializes the pipe object.
+
+  @note1hang The buffer address and size stored in the attribute structure specify how the
+             pipe data buffer is allocated.
+
+  @note1cont If a pipe is specified as allocated in TCM/LPM, it must be created
+             using the qurt_pipe_init() operation. The qurt_pipe_create() operation results in an error.
+
+  @datatypes
+  #qurt_pipe_t \n
+  #qurt_pipe_attr_t
+
+  @param[out] pipe Pointer to the created pipe object.
+  @param[in]  attr Pointer to the attribute structure used to create the pipe.
+
+  @return
+  #QURT_EOK -- Pipe created. \n
+  #QURT_EFAILED -- Pipe not created. \n
+  #QURT_ENOTALLOWED -- Pipe cannot be created in TCM/LPM.
+
+  @dependencies
+  None.
+ */
+int qurt_pipe_create(qurt_pipe_t **pipe, qurt_pipe_attr_t *attr);
+
+/**@ingroup func_qurt_pipe_init
+  Initializes a pipe object using an existing data buffer.
+
+  @note1hang The buffer address and size stored in the attribute structure must
+             specify a data buffer that the user has already allocated.
+
+  @datatypes
+  #qurt_pipe_t \n
+  #qurt_pipe_attr_t
+
+  @param[out] pipe Pointer to the pipe object to initialize.
+  @param[in]  attr Pointer to the pipe attribute structure used to initialize the pipe.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EFAILED -- Failure.
+
+  @dependencies
+  None.
+ */
+int qurt_pipe_init(qurt_pipe_t *pipe, qurt_pipe_attr_t *attr);
+
+/**@ingroup func_qurt_pipe_destroy
+  @xreflabel{sec:qurt_pipe_destroy}
+  Destroys the specified pipe.
+
+  @note1hang Pipes must be destroyed when they are no longer in use. Failure
+             to do this causes resource leaks in the QuRT kernel.
+             Pipes must not be destroyed while they are still in use. If this
+             occurs, the behavior of QuRT is undefined.
+
+  @datatypes
+  #qurt_pipe_t
+
+  @param[in] pipe Pointer to the pipe object to destroy.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_pipe_destroy(qurt_pipe_t *pipe);
+
+/**@ingroup func_qurt_pipe_delete
+  Deletes the pipe.\n
+  Destroys the specified pipe (Section @xref{sec:qurt_pipe_destroy}) and deallocates the pipe object and its
+  associated data buffer.
+
+  @note1hang Delete pipes only if they were created using qurt_pipe_create()
+             (and not qurt_pipe_init()). Otherwise the behavior of QuRT is undefined. \n
+  @note1cont Pipes must be deleted when they are no longer in use. Failure to do this
+             causes resource leaks in the QuRT kernel.\n
+  @note1cont Pipes must not be deleted while they are still in use. If this occurs, the
+             behavior of QuRT is undefined.
+
+  @datatypes
+  #qurt_pipe_t
+
+  @param[in] pipe Pointer to the pipe object to delete.
+
+  @return
+  None.
+
+  @dependencies
+  None.
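+
+  The following is an illustrative usage sketch (not part of the original
+  header): it assumes qurt_pipe_create() allocates the internal buffer when
+  only an element count is supplied, and omits error handling for brevity.
+
+  @code
+  qurt_pipe_t *pipe;
+  qurt_pipe_attr_t attr;
+
+  qurt_pipe_attr_init(&attr);
+  qurt_pipe_attr_set_elements(&attr, 16);            // room for 16 64-bit items
+  if (qurt_pipe_create(&pipe, &attr) == QURT_EOK) {
+      qurt_pipe_send(pipe, 0x1234ULL);               // blocking write
+      qurt_pipe_data_t d = qurt_pipe_receive(pipe);  // blocking read
+      (void)d;
+      qurt_pipe_delete(pipe);                        // frees object and buffer
+  }
+  @endcode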
+ */
+void qurt_pipe_delete(qurt_pipe_t *pipe);
+
+/**@ingroup func_qurt_pipe_send
+  Writes a data item to the specified pipe. \n
+  If a thread writes to a full pipe, it is suspended on the pipe. When another thread reads
+  from the pipe, the suspended thread is awakened and can then write data to the pipe.
+
+  Pipe data items are defined as 64-bit values. Pipe writes are limited to transferring a single
+  64-bit data item per operation.
+
+  @note1hang Transfer data items larger than 64 bits by reading and writing
+  pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+  @datatypes
+  #qurt_pipe_t \n
+  #qurt_pipe_data_t
+
+  @param[in] pipe Pointer to the pipe object to write to.
+  @param[in] data Data item to write.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_pipe_send(qurt_pipe_t *pipe, qurt_pipe_data_t data);
+
+/**@ingroup func_qurt_pipe_receive
+  Reads a data item from the specified pipe.
+
+  If a thread reads from an empty pipe, it is suspended on the pipe. When another thread
+  writes to the pipe, the suspended thread is awakened and can then read data from the pipe.
+  Pipe data items are defined as 64-bit values. Pipe reads are limited to transferring a single
+  64-bit data item per operation.
+
+  @note1hang Transfer data items larger than 64 bits by reading and writing
+  pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+  @datatypes
+  #qurt_pipe_t
+
+  @param[in] pipe Pointer to the pipe object to read from.
+
+  @return
+  Integer containing the 64-bit data item from the pipe.
+
+  @dependencies
+  None.
+*/
+qurt_pipe_data_t qurt_pipe_receive(qurt_pipe_t *pipe);
+
+/**@ingroup func_qurt_pipe_try_send
+  Writes a data item to the specified pipe (without suspending the thread if the pipe is full).\n
+
+  If a thread writes to a full pipe, the operation returns immediately with a value of -1.
+  Otherwise, it returns 0 to indicate a successful write operation.
+
+  Pipe data items are defined as 64-bit values. Pipe writes are limited to transferring a single
+  64-bit data item per operation.
+
+  @note1hang Transfer data items larger than 64 bits by reading and writing
+  pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+  @datatypes
+  #qurt_pipe_t \n
+  #qurt_pipe_data_t
+
+  @param[in] pipe Pointer to the pipe object to write to.
+  @param[in] data Data item to write.
+
+  @return
+  0 -- Success. \n
+  -1 -- Failure (pipe full).
+
+  @dependencies
+  None.
+*/
+int qurt_pipe_try_send(qurt_pipe_t *pipe, qurt_pipe_data_t data);
+
+/**@ingroup func_qurt_pipe_try_receive
+  Reads a data item from the specified pipe (without suspending the thread if the pipe is
+  empty).\n
+  If a thread reads from an empty pipe, the operation returns immediately with success set
+  to -1. Otherwise, success is always set to 0 to indicate a successful read operation.\n
+
+  Pipe data items are defined as 64-bit values. Pipe reads are limited to transferring a single
+  64-bit data item per operation.
+
+  @note1hang Transfer data items larger than 64 bits by reading and writing
+  pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+  @datatypes
+  #qurt_pipe_t
+
+  @param[in]  pipe    Pointer to the pipe object to read from.
+  @param[out] success Pointer to the operation status result.
+
+  @return
+  Integer containing a 64-bit data item from the pipe.
+
+  @dependencies
+  None.
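+
+  A minimal polling sketch (illustrative only; pipe is assumed to have been
+  created earlier):
+
+  @code
+  int ok;
+  qurt_pipe_data_t d = qurt_pipe_try_receive(pipe, &ok);
+  if (ok == 0) {
+      // d holds a valid 64-bit item
+  } else {
+      // pipe was empty; no data was read
+  }
+  @endcode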
+*/
+qurt_pipe_data_t qurt_pipe_try_receive(qurt_pipe_t *pipe, int *success);
+
+/**@ingroup func_qurt_pipe_receive_cancellable
+  Reads a data item from the specified pipe (with suspend), cancellable.
+
+  If a thread reads from an empty pipe, it is suspended on the pipe. When another thread
+  writes to the pipe, the suspended thread is awakened and can then read data from the pipe.
+  The operation is canceled if the user process of the calling thread is killed,
+  or if the calling thread must finish its current QDI invocation and return to user space.
+  A root PD thread can use this API to wait on the pipe for receiving; it is resumed with
+  #QURT_EDESTROY if the pipe is destroyed.
+  Pipe data items are defined as 64-bit values. Pipe reads are limited to transferring a single
+  64-bit data item per operation.
+
+  @note1hang Transfer data items larger than 64 bits by reading and writing
+  pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+  @datatypes
+  #qurt_pipe_t \n
+  #qurt_pipe_data_t
+
+  @param[in]  pipe   Pointer to the pipe object to read from.
+  @param[out] result Pointer to the integer that receives the 64-bit data item from the pipe.
+
+  @return
+  #QURT_EOK -- Receive completed. \n
+  #QURT_ECANCEL -- Receive canceled. \n
+  #QURT_EDESTROY -- Pipe destroyed while waiting. \n
+  #QURT_ENOTALLOWED -- Pipe is not initialized.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+int qurt_pipe_receive_cancellable(qurt_pipe_t *pipe, qurt_pipe_data_t *result);
+
+/**@ingroup func_qurt_pipe_send_cancellable
+  @xreflabel{hdr:qurt_pipe_send_cancellable}
+  Writes a data item to the specified pipe (with suspend), cancellable. \n
+  If a thread writes to a full pipe, it is suspended on the pipe. When another thread reads
+  from the pipe, the suspended thread is awakened and can then write data to the pipe.
+  The operation is canceled if the user process of the calling thread is killed, or if the
+  calling thread must finish its current QDI invocation and return to user space.
+  A root PD thread can use this API to wait on the pipe for sending; it is resumed with
+  #QURT_EDESTROY if the pipe is destroyed.
+
+  Pipe data items are defined as 64-bit values. Pipe writes are limited to transferring a single
+  64-bit data item per operation.
+
+  @note1hang Transfer data items larger than 64 bits by reading and writing
+  pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+  @datatypes
+  #qurt_pipe_t \n
+  #qurt_pipe_data_t
+
+  @param[in] pipe Pointer to the pipe object to write to.
+  @param[in] data Data item to write.
+
+  @return
+  #QURT_EOK -- Send completed. \n
+  #QURT_ECANCEL -- Send canceled. \n
+  #QURT_EDESTROY -- Pipe destroyed while waiting. \n
+  #QURT_ENOTALLOWED -- Pipe is not initialized.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+int qurt_pipe_send_cancellable(qurt_pipe_t *pipe, qurt_pipe_data_t data);
+
+/**@ingroup func_qurt_pipe_is_empty
+  Returns a value indicating whether the specified pipe contains any data.
+
+  @datatypes
+  #qurt_pipe_t
+
+  @param[in] pipe Pointer to the pipe object to check.
+
+  @return
+  1 -- Pipe contains no data. \n
+  0 -- Pipe contains data.
+
+  @dependencies
+  None.
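+
+  For example (illustrative sketch; process_item() is a hypothetical
+  consumer), a drain loop can combine this check with the non-blocking read:
+
+  @code
+  while (!qurt_pipe_is_empty(pipe)) {
+      int ok;
+      qurt_pipe_data_t d = qurt_pipe_try_receive(pipe, &ok);
+      if (ok != 0) break;   // another reader drained the pipe first
+      process_item(d);
+  }
+  @endcode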
+*/
+int qurt_pipe_is_empty(qurt_pipe_t *pipe);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PIPE_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_pmem_manager.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_pmem_manager.h
new file mode 100755
index 0000000000000..8c8da985228b9
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_pmem_manager.h
@@ -0,0 +1,82 @@
+#ifndef QURT_PMEM_MANAGER_H
+#define QURT_PMEM_MANAGER_H
+/**
+  @file qurt_pmem_manager.h
+  Prototypes of kernel physical memory manager APIs
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=====================================================================
+ Constants and macros
+ ======================================================================*/
+
+/* Physical memory API return error codes */
+#define QURT_PMEM_SUCCESS             0
+#define QURT_PMEM_NO_PRIV             1
+#define QURT_PMEM_RETRY               2
+#define QURT_PMEM_OVERLAP             3
+#define QURT_PMEM_NOT_EXIST           4
+#define QURT_PMEM_INIT_FAILURE        5
+#define QURT_PMEM_OUTSTANDING_MAPPING 6
+#define QURT_PMEM_GENERIC_FAILURE     7
+#define QURT_PMEM_ENTRY_FOUND         8
+#define QURT_PMEM_REACH_END           9
+#define QURT_PMEM_UNCLAIMED           10
+#define QURT_PMEM_ALREADY_CLAIMED     11
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+/**@ingroup func_qurt_pmem_acquire
+  Acquires the ownership of a specific physical memory region.
+
+  @note1hang On success, the caller becomes the owner of the region.
+
+  @param[in] ppage Starting physical page number
+  @param[in] pnum  Number of physical pages
+
+  @return
+  #QURT_PMEM_NO_PRIV -- Caller has no privilege to claim the ownership. \n
+  #QURT_PMEM_OVERLAP -- All or part of the range is already owned. \n
+  #QURT_PMEM_SUCCESS -- Ownership successfully claimed.
+
+  @dependencies
+  None.
+*/
+int qurt_pmem_acquire(unsigned int ppage, unsigned int pnum);
+
+/**@ingroup func_qurt_pmem_release
+  Releases the ownership of a specific physical memory region.
+
+  @param[in] ppage Starting physical page number
+  @param[in] pnum  Number of physical pages
+
+  @return
+  #QURT_PMEM_NO_PRIV -- Caller has no privilege to release the ownership. \n
+  #QURT_PMEM_NOT_EXIST -- The physical memory range is not usable. \n
+  #QURT_PMEM_OUTSTANDING_MAPPING -- An outstanding mapping exists in this range. \n
+  #QURT_PMEM_SUCCESS -- Ownership successfully released.
+
+  @dependencies
+  None.
+ */
+int qurt_pmem_release(unsigned int ppage, unsigned int pnum);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PMEM_MANAGER_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_pmu.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_pmu.h
new file mode 100755
index 0000000000000..73ea8eba04abf
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_pmu.h
@@ -0,0 +1,121 @@
+#ifndef QURT_PMU_H
+#define QURT_PMU_H
+/**
+  @file qurt_pmu.h
+  Prototypes of the PMU (performance monitoring unit) interface API.
+
+  EXTERNAL FUNCTIONS
+  None.
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+  Copyright (c) 2021 Qualcomm Technologies, Inc.
+  All rights reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+        FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_pmu_set
+  Sets the value of the specified PMU register.
+
+  @note1hang Setting PMUEVTCFG automatically clears the PMU registers PMUCNT0
+  through PMUCNT3.
+
+  @param[in] reg_id PMU register. Values:
+  - #QURT_PMUCNT0
+  - #QURT_PMUCNT1
+  - #QURT_PMUCNT2
+  - #QURT_PMUCNT3
+  - #QURT_PMUCFG
+  - #QURT_PMUEVTCFG
+  - #QURT_PMUCNT4
+  - #QURT_PMUCNT5
+  - #QURT_PMUCNT6
+  - #QURT_PMUCNT7
+  - #QURT_PMUEVTCFG1 @tablebulletend
+
+  @param[in] reg_value Register value.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_pmu_set (int reg_id, unsigned int reg_value);
+
+/**@ingroup func_qurt_pmu_get
+  Gets the PMU register.\n
+  Returns the current value of the specified PMU register.
+
+  @param[in] reg_id PMU register. Values:
+  - #QURT_PMUCNT0
+  - #QURT_PMUCNT1
+  - #QURT_PMUCNT2
+  - #QURT_PMUCNT3
+  - #QURT_PMUCFG
+  - #QURT_PMUEVTCFG
+  - #QURT_PMUCNT4
+  - #QURT_PMUCNT5
+  - #QURT_PMUCNT6
+  - #QURT_PMUCNT7
+  - #QURT_PMUEVTCFG1 @tablebulletend
+
+  @return
+  Integer -- Current value of the specified PMU register.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_pmu_get (int reg_id);
+
+/**@ingroup func_qurt_pmu_enable
+  Enables or disables the Hexagon processor PMU.
+  Profiling is disabled by default.
+
+  @note1hang Enabling profiling does not automatically reset the count registers -- this must
+  be done explicitly before starting event counting.
+
+  @param[in] enable Performance monitor. Values: \n
+  - 0 -- Disable performance monitor \n
+  - 1 -- Enable performance monitor @tablebulletend
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_pmu_enable (int enable);
+
+/**@ingroup func_qurt_pmu_get_pmucnt
+  Reads PMU counters in a single trap.
+
+  @param[out] buf Pointer to a buffer to save values read from PMU counters.
+                  The buffer size must be at least 32 bytes to read all eight PMU counters.
+
+  @return
+  #QURT_EOK -- Successful read.\n
+  #QURT_EFATAL -- Failure.
+
+  @dependencies
+  None.
+ */
+int qurt_pmu_get_pmucnt (void * buf);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PMU_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_power.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_power.h
new file mode 100755
index 0000000000000..2ee4d29a73976
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_power.h
@@ -0,0 +1,140 @@
+#ifndef QURT_POWER_H
+#define QURT_POWER_H
+/**
+  @file qurt_power.h
+  @brief Prototypes of power API
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2018-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+/*=============================================================================
+
+                        EDIT HISTORY FOR MODULE
+
+ This section contains comments describing changes made to the module.
+ Notice that changes are listed in reverse chronological order.
+
+
+when     who   what, where, why
+-------- ---   ------------------------------------------------------------
+03/03/11 op    Add header file
+12/12/12 cm    (Tech Pubs) Edited/added Doxygen comments and markup.
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @cond */
+/**@ingroup func_qurt_power_shutdown_fail_exit
+  Returns from Power Collapse mode when power collapse cannot proceed.
+
+  This function unmasks the global interrupt. This operation is used only when the thread is
+  recovering from a failed power collapse operation (Section @xref{sec:powerShutdownEnter}).
+
+  @return
+  #QURT_EOK -- Operation was successfully performed.
+
+  @dependencies
+  None.
+ */
+#define qurt_power_shutdown_fail_exit qurt_power_exit
+
+/**@ingroup func_qurt_power_shutdown_exit
+  Undoes state changes made preparing for power collapse.\n
+  This function unmasks the global interrupts.
+
+  @return
+  #QURT_EOK -- Operation was successfully performed.
+
+  @dependencies
+  None.
+ */
+#define qurt_power_shutdown_exit qurt_power_exit
+/**@endcond */
+
+/**@ingroup func_qurt_system_ipend_get
+  Gets the IPEND register.\n
+
+  @note1hang Returns the current value of the Hexagon processor IPEND register. The return value
+  is a mask value that identifies the individual interrupts that are pending. \n
+
+  @note1hang The bit order of the mask value is identical to the order defined for the IPEND register. A
+  mask bit value of 1 indicates that the corresponding interrupt is pending, and 0 indicates that the
+  corresponding interrupt is not pending. \n
+
+  @return
+  Returns the IPEND register value.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_system_ipend_get (void);
+
+
+/**@ingroup func_qurt_system_vid_get
+  Gets the VID register. \n
+
+  @note1hang Returns the current value of the Hexagon processor VID register. The return value is
+  the vector number of a second-level interrupt that has been accepted by the Hexagon
+  processor core.\n
+
+  @return
+  Returns the VID register value, that is, the L2 VIC interrupt number accepted by the processor.
+  Valid range is 0 to 1023.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_system_vid_get(void);
+
+/**@ingroup func_qurt_power_shutdown_get_pcycles
+  Gets the number of power collapses and processor cycles for entering and exiting the most
+  recent power collapse.
+
+  @note1hang If no power collapse has occurred yet, the processor cycle numbers are zero.
+
+  @param[out] enter_pcycles Number of processor cycles for entering the most
+                            recent power collapse.
+  @param[out] exit_pcycles  Number of processor cycles for exiting the most
+                            recent power collapse.
+  @return
+  Zero -- No power collapses have occurred. \n
+  Nonzero -- Number of power collapses that have occurred since
+  the processor was reset.
+
+  @dependencies
+  None.
+ */
+int qurt_power_shutdown_get_pcycles( unsigned long long *enter_pcycles, unsigned long long *exit_pcycles );
+
+/**@ingroup func_qurt_system_tcm_set_size
+  Sets the size of TCM to save during full power collapse.
+
+  @note1hang The size aligns to 32 bytes. If the size passed is greater than the maximum size defined in
+  XML, the size is truncated to the size defined in XML.
+
+  @param[in] new_size Size of TCM to save.
+
+  @return
+  Zero -- Size successfully set. \n
+  -1 -- A size of 0 was passed.
+
+  @dependencies
+  None.
+ */
+int qurt_system_tcm_set_size(unsigned int new_size);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_POWER_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_printf.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_printf.h
new file mode 100755
index 0000000000000..a775d8a815918
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_printf.h
@@ -0,0 +1,44 @@
+#ifndef QURT_PRINTF_H
+#define QURT_PRINTF_H
+
+#include <stdarg.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+  @file qurt_printf.h
+  Prototypes of printf API.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+
+/*=============================================================================
+        CONSTANTS AND MACROS
+=============================================================================*/
+/** @addtogroup chapter_function_tracing
+@{ */
+
+int qurt_printf(const char* format, ...);
+
+int qurt_vprintf(const char* format, va_list args);
+
+/** @} */ /* end_addtogroup chapter_function_tracing */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PRINTF_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_process.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_process.h
new file mode 100755
index 0000000000000..0df9ddc2d4a70
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_process.h
@@ -0,0 +1,995 @@
+#ifndef QURT_PROCESS_H
+#define QURT_PROCESS_H
+/**
+  @file qurt_process.h
+  @brief Prototypes of QuRT process control APIs.
+
+  EXTERNALIZED FUNCTIONS
+  None
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None
+
+  Copyright (c) 2009-2013, 2021-2023 Qualcomm Technologies, Inc.
+  All rights reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+#include "qurt_callback.h"
+#include "qurt_consts.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup process_types
+@{ */
+#define QURT_PROCESS_ATTR_NAME_MAXLEN     QURT_MAX_NAME_LEN /**< Maximum length of the process name. */
+#define QURT_PROCESS_ATTR_BIN_PATH_MAXLEN 128               /**< Maximum length of the path of binary/ELF for this process. */
+#define QURT_PROCESS_ATTR_CAP_MAXLEN      128               /**< Maximum length for a resource name. */
+
+/** QuRT process capability wildcard strings */
+#define QURT_PROCESS_ATTR_CAP_ALLOW_ALL  "ALLOW_ALL"  /**< Capability wild-card for full access */
+#define QURT_PROCESS_ATTR_CAP_ALLOW_NONE "ALLOW_NONE" /**< Capability wild-card for no access */
+
+/** QuRT process capability states */
+#define QURT_PROCESS_ATTR_CAP_ENABLED  0x1 /**< Capability enabled */
+#define QURT_PROCESS_ATTR_CAP_DISABLED 0x0 /**< Capability disabled */
+
+/* QuRT process thread attributes. */
+#define QURT_PROCESS_DEFAULT_CEILING_PRIO 0  /**< Default ceiling priority of the threads in the new process. */
+#define QURT_PROCESS_DEFAULT_MAX_THREADS  -1 /**< Default number of threads in the new process.
+                                                  -1 indicates that the limit is set to the maximum supported by the system. */
+
+/* QuRT process flags. */
+#define QURT_PROCESS_SUSPEND_ON_STARTUP  (1U)      /**< Suspend the new process just before calling main(). */
+#define QURT_PROCESS_NON_SYSTEM_CRITICAL (1u << 1) /**< Starts the new process as non-system-critical. */
+#define QURT_PROCESS_ISLAND_RESIDENT     (1u << 2) /**< Process is island resident. */
+#define QURT_PROCESS_RESTARTABLE         (1u << 3) /**< Indicates that the process is restartable. */
+#define QURT_PROCESS_UNTRUSTED           (1u << 7) /**< Starts the new process as an unsigned process. */
+
+/* QuRT process debugging session status. */
+#define QURT_DEBUG_NOT_START 0 /**< Debug is not started. */
+#define QURT_DEBUG_START     1 /**< Debug has started. */
+
+/** Process suspend options */
+#define QURT_PROCESS_SUSPEND_DEFAULT 0
+
+/** Process resume options */
+#define QURT_PROCESS_RESUME_DEFAULT 0
+
+
+/* QuRT process types. */
+typedef enum {
+    QURT_PROCESS_TYPE_RESERVED, /**< Process type is reserved. \n */
+    QURT_PROCESS_TYPE_KERNEL,   /**< Kernel process. \n*/
+    QURT_PROCESS_TYPE_SRM,      /**< SRM process. \n*/
+    QURT_PROCESS_TYPE_SECURE,   /**< Secure process. \n*/
+    QURT_PROCESS_TYPE_ROOT,     /**< Root process. \n*/
+    QURT_PROCESS_TYPE_USER,     /**< User process. */
+}qurt_process_type_t;
+
+/** QuRT process callback types. */
+typedef enum {
+    QURT_PROCESS_DUMP_CB_ROOT,   /**< Register the callback that executes in the
+                                      root process context. \n */
+    QURT_PROCESS_DUMP_CB_ERROR,  /**< Register the user process callback that is
+                                      called after threads in the process are frozen. \n */
+    QURT_PROCESS_DUMP_CB_PRESTM, /**< Register the user process callback that is
+                                      called before threads in the process are frozen. \n*/
+    QURT_PROCESS_DUMP_CB_MAX     /**< Reserved for error checking. */
+}qurt_process_dump_cb_type_t;
+
+/** QuRT process dump attributes. */
+typedef struct _qurt_pd_dump_attr{
+   /** @cond */
+   unsigned int enabled;  /**< Process dump is enabled. */
+   const char *path;      /**< Process dump path. */
+   unsigned int path_len; /**< Length of process dump path. */
+   /** @endcond */
+}qurt_pd_dump_attr_t;
+
+/** QuRT process capability resource type */
+enum qurt_process_cap_type_t {
+    QURT_PROCESS_CAP_TYPE_NUM_ENTRIES=0, /**< Number of entries in the capability structure */
+    QURT_PROCESS_CAP_TYPE_DRIVER=1,      /**< Driver resource */
+    QURT_PROCESS_CAP_TYPE_MAX            /**< Maximum identifier */
+};
+
+/** QuRT process capability structure */
+typedef struct _qurt_capability {
+   enum qurt_process_cap_type_t type;       /**< Resource type */
+   char name[QURT_PROCESS_ATTR_CAP_MAXLEN]; /**< Resource name */
+   unsigned long long cap;                  /**< Capabilities allowed for this resource */
+}qurt_capability_t;
+
+/** QuRT process attributes. */
+typedef struct _qurt_process_attr {
+    /** @cond */
+    char name[QURT_PROCESS_ATTR_NAME_MAXLEN];         /**< Name of the new process. */
+    char path[QURT_PROCESS_ATTR_BIN_PATH_MAXLEN];     /**< Path of the binary for the new process. */
+    char dtb_path[QURT_PROCESS_ATTR_BIN_PATH_MAXLEN]; /**< Path of the DTB ELF for the new process. */
+    int flags;                                        /**< Flags as indicated by QuRT process flags. */
+    unsigned int sw_id;                               /**< Software ID of the process to be loaded. */
+    unsigned sid;                                     /**< Stream ID of the process being spawned. */
+    unsigned max_threads;                             /**< Maximum number of threads that the new process can create. */
+    unsigned short ceiling_prio;                      /**< Maximum priority at which threads can be
+                                                           created by the new process. */
+    qurt_process_type_t type;                         /**< Process type as indicated by
+                                                           #qurt_process_type_t. */
+    qurt_pd_dump_attr_t dump_attr;                    /**< Process dump attributes for the new process
+                                                           as indicated by #qurt_pd_dump_attr_t. */
+    qurt_capability_t *capabilities;                  /**< Pointer to an array of structures of type
+                                                           qurt_capability_t */
+    /** @endcond */
+} qurt_process_attr_t;
+
+/** @} */ /* end_addtogroup process_types */
+
+/*=============================================================================
+FUNCTIONS
+=============================================================================*/
+ /** @cond rest_reg_dist */
+/**@ingroup func_qurt_process_create
+  Creates a process with the specified attributes, and starts the process.
+
+  The process executes the code in the specified executable ELF file.
+
+  @datatypes
+  #qurt_process_attr_t
+
+  @param[in] attr Accepts an initialized process attribute structure, which specifies
+                  the attributes of the created process.
+
+  @return
+  Positive return value -- Process ID. \n
+  Negative return value -- One of the following errors: \n
+  #-QURT_EPRIVILEGE -- Caller does not have privilege for this operation \n
+  #-QURT_EMEM -- Not enough memory to perform the operation \n
+  #-QURT_EFAILED -- Operation failed \n
+  #-QURT_ENOTALLOWED -- Operation not allowed \n
+  #-QURT_ENOREGISTERED -- Not registered \n
+  #-QURT_ENORESOURCE -- Resource exhaustion \n
+  #-QURT_EINVALID -- Invalid argument value \n
+  #QURT_EFATAL -- attr is NULL
+
+  @dependencies
+  None.
+*/
+int qurt_process_create (qurt_process_attr_t *attr);
+
+/**@ingroup func_qurt_process_get_id
+  Returns the process identifier for the current thread.
+
+  @return
+  Process identifier for the current thread.
+
+  @dependencies
+  None.
+*/
+int qurt_process_get_id (void);
+/** @endcond */
+
+/** @cond internal_only*/
+/**@ingroup func_qurt_process_get_uid
+  Returns the user identifier for the current thread.
+
+  @return
+  User identifier for the current thread.
+
+  @dependencies
+  None.
+*/
+int qurt_process_get_uid (void);
+/** @endcond */
+/** @cond rest_reg_dist */
+/**@ingroup func_qurt_process_attr_init
+  Initializes the structure that sets the process attributes when a process is created.
+
+  After an attribute structure is initialized, the individual attributes in the structure can
+  be explicitly set using the process attribute operations.
+
+  Table @xref{tbl:processAttrDefaults} lists the default attribute values set by the initialize
+  operation.
+
+  @inputov{table_process_attribute_defaults}
+
+  @datatypes
+  #qurt_process_attr_t
+
+  @param[out] attr Pointer to the structure to initialize.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_process_attr_init (qurt_process_attr_t *attr)
+{
+    attr->name[0] = '\0';
+    attr->path[0] = '\0';
+    attr->dtb_path[0] = '\0';
+    attr->flags = 0;
+    attr->sw_id = 0;
+    attr->sid = 0;
+    attr->max_threads = (unsigned)QURT_PROCESS_DEFAULT_MAX_THREADS;
+    attr->ceiling_prio = QURT_PROCESS_DEFAULT_CEILING_PRIO;
+    attr->type = QURT_PROCESS_TYPE_RESERVED;
+    attr->dump_attr.enabled = 0;
+    attr->dump_attr.path = NULL;
+    attr->dump_attr.path_len = 0;
+    attr->capabilities = NULL;
+}
+
+/**@ingroup func_qurt_process_attr_set_executable
+  Sets the process name in the specified process attribute structure.
+
+  Process names identify process objects that are already
+  loaded in memory as part of the QuRT system.
+
+  @note1hang Process objects are incorporated into the QuRT system at build time.
+
+  @note1hang The maximum length of the name string is limited to QURT_PROCESS_ATTR_NAME_MAXLEN - 1.
+ + @datatypes + #qurt_process_attr_t + + @param[in] attr Pointer to the process attribute structure. + @param[in] name Pointer to the process name. + + @return + None. + + @dependencies + None. +*/ +void qurt_process_attr_set_executable (qurt_process_attr_t *attr, const char *name); + +/**@ingroup func_qurt_process_attr_set_binary_path + Sets the binary path for the process loading in the specified process attribute structure. + + Path specifies the binary to load for this process. + + @note1hang Max length of path string is limited to QURT_PROCESS_ATTR_BIN_PATH_MAXLEN-1. + + @datatypes + #qurt_process_attr_t + + @param[in] attr Pointer to the process attribute structure. + @param[in] path Pointer to the binary path. + + @return + None. + + @dependencies + None. +*/ +void qurt_process_attr_set_binary_path(qurt_process_attr_t *attr, char *path); + +/**@ingroup func_qurt_process_attr_set_dtb_path + Sets the DTB binary path for the process loading in the specified process attribute structure. + + Path specifies the DTB binary to load for this process. + + @note1hang Max length of path string is limited to QURT_PROCESS_ATTR_BIN_PATH_MAXLEN-1. + + @datatypes + #qurt_process_attr_t + + @param[in] attr Pointer to the process attribute structure. + @param[in] path Pointer to the binary path. + + @return + None. + + @dependencies + None. +*/ +void qurt_process_attr_set_dtb_path(qurt_process_attr_t *attr, char *path); + +/**@ingroup func_qurt_process_attr_set_flags +Sets the process properties in the specified process attribute structure. +Process properties are represented as defined symbols that map into bits +0 through 31 of the 32-bit flag value. Multiple properties are specified by OR'ing +together the individual property symbols. + +@datatypes +#qurt_process_attr_t + +@param[in] attr Pointer to the process attribute structure. +@param[in] flags QURT_PROCESS_NON_SYSTEM_CRITICAL Process is considered as non system-critical. + This attribute will be used by error services, + to decide whether to kill user pd or whole subsystem. + QURT_PROCESS_ISLAND_RESIDENT Process will be marked as island resident. + QURT_PROCESS_RESTARTABLE Process will be marked as restartable. + QURT_PROCESS_UNTRUSTED Process will be marked as unsigned process. +@return +None. + +@dependencies +None. +*/ +static inline void qurt_process_attr_set_flags (qurt_process_attr_t *attr, int flags) +{ + attr->flags = flags; +} +/** @endcond */ +/** @cond internal_only*/ +/**@ingroup func_qurt_process_attr_set_sid +Sets the process streamID in the specified process attribute structure. + +@datatypes +#qurt_process_attr_t + +@param[in] attr Pointer to the process attribute structure. +@param[in] sid streamID to set for this process. + +@return +None. + +@dependencies +None. +*/ +static inline void qurt_process_attr_set_sid (qurt_process_attr_t *attr, unsigned sid) +{ + attr->sid = sid; +} +/** @endcond */ +/** @cond rest_reg_dist */ +/**@ingroup func_qurt_process_attr_set_max_threads +Sets the maximum number of threads allowed in the specified process attribute structure. + +@datatypes +#qurt_process_attr_t + +@param[in] attr Pointer to the process attribute structure. +@param[in] max_threads Maximum number of threads allowed for this process. + +@return +None. + +@dependencies +None. 
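+
+A typical setup sketch (illustrative only; the executable name and thread
+limit below are placeholders, not values from this header):
+
+@code
+qurt_process_attr_t attr;
+
+qurt_process_attr_init(&attr);
+qurt_process_attr_set_executable(&attr, "my_dsp_app");  // hypothetical ELF name
+qurt_process_attr_set_max_threads(&attr, 8);            // cap the process at 8 threads
+int pid = qurt_process_create(&attr);
+if (pid < 0) {
+    // negative value is one of the -QURT_E* error codes
+}
+@endcode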
+*/
+static inline void qurt_process_attr_set_max_threads (qurt_process_attr_t *attr, unsigned max_threads)
+{
+    attr->max_threads = max_threads;
+}
+
+/**@ingroup func_qurt_process_attr_set_sw_id
+Sets the software ID of the process to load in the specified process attribute structure.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr  Pointer to the process attribute structure.
+@param[in] sw_id Software ID of the process, used in authentication.
+
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_sw_id(qurt_process_attr_t *attr, unsigned int sw_id)
+{
+    attr->sw_id = sw_id;
+}
+
+/**@ingroup func_qurt_process_attr_set_ceiling_prio
+Sets the highest thread priority allowed in the specified process attribute structure.
+Refer to qurt_thread.h for priority ranges.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr Pointer to the process attribute structure.
+@param[in] prio Priority.
+
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_ceiling_prio (qurt_process_attr_t *attr, unsigned short prio)
+{
+    attr->ceiling_prio = prio;
+}
+/** @endcond */
+
+/** @cond internal_only*/
+/**@ingroup func_qurt_process_attr_set_dump_status
+Sets the process domain dump-enabled field in the process domain dump attributes.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr    Pointer to the process attribute structure.
+@param[in] enabled 1 -- Process domain dump is collected \n
+                   0 -- Process domain dump is not collected
+
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_dump_status(qurt_process_attr_t *attr, unsigned int enabled)
+{
+    attr->dump_attr.enabled = enabled;
+}
+
+/**@ingroup func_qurt_process_attr_set_dump_path
+Sets the process domain dump path and type.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr     Pointer to the process attribute structure.
+@param[in] path     Path where the process domain dumps must be saved.
+@param[in] path_len Length of the path string.
+
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_dump_path(qurt_process_attr_t *attr, const char *path, int path_len)
+{
+    attr->dump_attr.path = path;
+    attr->dump_attr.path_len = (unsigned int)path_len;
+}
+
+/**@ingroup func_qurt_process_attr_set_capabilities
+Sets the list of capabilities available to this process.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr         Pointer to the process attribute structure.
+@param[in] capabilities Pointer to an array of structures of type qurt_capability_t defining
+                        resources and capabilities.
+
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_capabilities(qurt_process_attr_t *attr, qurt_capability_t *capabilities)
+{
+    attr->capabilities = capabilities;
+}
+
+/** @endcond */
+/** @cond rest_reg_dist */
+/**@ingroup func_qurt_process_cmdline_get
+Gets the command line string associated with the current process.
+The Hexagon simulator command line arguments are retrieved using
+this function, as long as the call is made from a process of the QuRT
+installation and the program runs in a simulation environment.
+
+If the function modifies the provided buffer, it zero-terminates
+the string. It is possible that the function does not modify the
+provided buffer, so the caller must set buf[0] to a NULL
+byte before making the call. A truncated command line is returned when
+the command line is longer than the provided buffer.
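+
+Example (sketch): pre-terminate the buffer, then read the command line.
+
+@code
+char cmdline[256];
+cmdline[0] = '\0';                 // required: the call may not touch buf
+qurt_process_cmdline_get(cmdline, sizeof(cmdline));
+@endcode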
+
+@param[in] buf     Pointer to a character buffer that must be filled in.
+@param[in] buf_siz Size (in bytes) of the buffer pointed to by the buf argument.
+
+@return
+None.
+
+@dependencies
+None.
+*/
+void qurt_process_cmdline_get(char *buf, unsigned buf_siz);
+
+/**@ingroup func_qurt_process_get_thread_count
+Gets the number of threads present in the process indicated by the PID.
+
+@param[in] pid PID of the process for which the information is required.
+
+@return
+Positive value -- Number of threads in the process indicated by the PID. \n
+Negative error code -- Failure: \n
+#QURT_EFATAL -- Invalid PID \n
+-#QURT_ENOTALLOWED -- The current process does not have access to the target process indicated by the PID
+
+@dependencies
+None.
+*/
+int qurt_process_get_thread_count(unsigned int pid);
+
+/**@ingroup func_qurt_process_get_thread_ids
+Gets the thread IDs for a process indicated by the PID.
+
+@param[in] pid        PID of the process for which the information is required.
+@param[in] ptr        Pointer to a user-passed buffer that must be filled in with thread IDs.
+@param[in] thread_num Number of thread IDs requested.
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EFATAL -- Failed; ptr is NULL
+
+@dependencies
+None.
+ */
+int qurt_process_get_thread_ids(unsigned int pid, unsigned int *ptr, unsigned thread_num);
+/** @endcond */
+/** @cond internal_only*/
+/**@ingroup func_qurt_process_dump_get_mem_mappings_count
+Gets the number of mappings present in the process indicated by the PID.
+
+@param[in] pid PID of the process for which the information is required.
+
+@return
+Number of mappings for the process indicated by the PID.
+
+@dependencies
+None.
+*/
+int qurt_process_dump_get_mem_mappings_count(unsigned int pid);
+
+/**@ingroup func_qurt_process_dump_get_mappings
+Gets the mappings for a specified PID.
+
+@note1hang This API skips device type mappings or mappings created by setting the #QURT_PERM_NODUMP attribute.
+
+@param[in] pid   PID of the process for which the information is required.
+@param[in] ptr   Pointer to a buffer that must be filled in with mappings.
+@param[in] count Count of mappings requested.
+
+@return
+Number of mappings filled in the buffer passed by the user.
+
+@dependencies
+None.
+*/
+int qurt_process_dump_get_mappings(unsigned int pid, unsigned int *ptr, unsigned count);
+/** @endcond */
+/** @cond rest_reg_dist */
+/**@ingroup func_qurt_process_attr_get
+Gets the attributes with which the process was created.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in]     pid  PID of the process for which the information is required.
+@param[in,out] attr Pointer to the user-allocated attribute structure.
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EINVALID -- Invalid PID \n
+#QURT_EFATAL -- attr is NULL
+
+@dependencies
+None.
+*/
+int qurt_process_attr_get(unsigned int pid, qurt_process_attr_t *attr);
+
+/**@ingroup func_qurt_process_dump_register_cb
+Registers the process domain dump callback.
+
+@datatypes
+#qurt_cb_data_t \n
+#qurt_process_dump_cb_type_t
+
+@param[in] cb_data  Pointer to the callback information.
+@param[in] type     Callback type; these callbacks are called in the context of the user process domain: \n
+                    #QURT_PROCESS_DUMP_CB_PRESTM -- Before threads of the exiting process are frozen. \n
+                    #QURT_PROCESS_DUMP_CB_ERROR -- After threads are frozen and captured. \n
+                    #QURT_PROCESS_DUMP_CB_ROOT -- After threads are frozen and captured, and CB_ERROR type of callbacks
+                    are called.
+@param[in] priority Priority.
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EFATAL -- cb_data is NULL \n
+#QURT_EINVALID -- Invalid cb_type \n
+#QURT_EFAILED -- Invalid cb_data
+
+@dependencies
+None.
+*/
+int qurt_process_dump_register_cb(qurt_cb_data_t *cb_data, qurt_process_dump_cb_type_t type, unsigned short priority);
+
+/**@ingroup func_qurt_process_dump_deregister_cb
+Deregisters the process domain dump callback.
+
+@datatypes
+#qurt_cb_data_t \n
+#qurt_process_dump_cb_type_t
+
+@param[in] cb_data Pointer to the callback information to deregister.
+@param[in] type    Callback type.
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EFATAL -- cb_data is NULL \n
+#QURT_EINVALID -- Invalid cb_type \n
+#QURT_EFAILED -- Invalid cb_data
+
+@dependencies
+None.
+*/
+int qurt_process_dump_deregister_cb(qurt_cb_data_t *cb_data,qurt_process_dump_cb_type_t type);
+
+/** @endcond */
+/** @cond internal_only*/
+/**@ingroup func_qurt_process_set_rtld_debug
+Sets rtld_debug for a process.
+
+@param[in] pid     PID of the process for which rtld_debug must be set.
+@param[in] address rtld_debug address.
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EINVALID -- Invalid PID \n
+#QURT_EFATAL -- Invalid address
+
+@dependencies
+None.
+*/
+int qurt_process_set_rtld_debug(unsigned int pid,unsigned int address);
+
+/**@ingroup func_qurt_process_get_rtld_debug
+Gets rtld_debug for a process.
+
+@param[in]     pid     PID of the process for which rtld_debug must be returned.
+@param[in,out] address Pointer to the user-passed address in which the rtld_debug address must be returned.
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EINVALID -- Invalid PID \n
+#QURT_EFATAL -- Invalid address
+
+@dependencies
+None.
+*/
+int qurt_process_get_rtld_debug(unsigned int pid,unsigned int *address);
+/** @endcond */
+/**@ingroup func_qurt_process_exit
+Exits the current user process with an exit code.
+
+@param[in] exitcode Exit code.
+
+@return
+#QURT_EFATAL -- No client found with the specified PID value \n
+#QURT_EINVALID -- Invalid client \n
+#QURT_ENOTALLOWED -- User does not have permission to perform this operation \n
+#QURT_EOK -- Success
+
+@dependencies
+None.
+*/
+int qurt_process_exit(int exitcode);
+
+/**@ingroup func_qurt_process_kill
+Kills the process represented by the PID with the exit code.
+
+@param[in] pid      PID of the process to kill.
+@param[in] exitcode Exit code.
+
+@return
+#QURT_EFATAL -- No client found with the specified PID value \n
+#QURT_EINVALID -- Invalid client \n
+#QURT_ENOTALLOWED -- User does not have permission to perform this operation \n
+#QURT_EOK -- Success
+
+@dependencies
+None.
+*/
+int qurt_process_kill(int pid, int exitcode);
+
+
+/**@ingroup func_qurt_debugger_register_process
+Registers the process indicated by the PID with the debug monitor.
+
+@param[in] pid PID of the process.
+@param[in] adr Address.
+
+@return
+#QURT_EOK -- Success
+
+@dependencies
+None.
+*/
+int qurt_debugger_register_process(int pid, unsigned int adr);
+
+
+/**@ingroup func_qurt_debugger_deregister_process
+Deregisters the process indicated by the PID with the debug monitor.
+
+@param[in] pid PID of the process.
+
+@return
+#QURT_EOK -- Success
+
+@dependencies
+None.
+*/
+int qurt_debugger_deregister_process(int pid);
+
+/**@ingroup func_qurt_process_exec_callback
+Executes callbacks in the user process as indicated by the client_handle argument.
+
+@param[in] client_handle Client handle obtained from the current invocation function (Section 3.4.1).
+@param[in] callback_fn   Callback function to execute.
+@param[in] stack_base    Stack address to use.
+@param[in] stack_size    Stack size.
+
+@return
+#QURT_EOK -- Success
+
+@dependencies
+None.
+*/
+int qurt_process_exec_callback(int client_handle,
+                               unsigned callback_fn,
+                               unsigned stack_base,
+                               unsigned stack_size);
+
+/**@ingroup func_qurt_process_get_pid
+Gets the process ID of the process that the client_handle argument represents.
+
+@note1hang This API is not supported for unsigned PDs. For an unsigned PD, use qurt_process_get_id().
+
+@param[in] client_handle Client handle obtained from the current invocation function (Section 3.4.1).
+@param[in] pid           Pointer to the address to store the PID.
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EFATAL -- pid pointer passed as NULL
+
+@dependencies
+None.
+*/
+int qurt_process_get_pid(int client_handle, int * pid);
+
+/**@ingroup func_qurt_process_get_dm_status
+Gets the debugging session status on the process represented by the pid argument.
+
+@param[in]     pid    Process ID
+@param[in,out] status Address to store the status: \n
+                      #QURT_DEBUG_NOT_START \n
+                      #QURT_DEBUG_START
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EINVALID -- Error
+
+@dependencies
+None.
+*/
+int qurt_process_get_dm_status( unsigned int pid, unsigned int *status);
+
+
+/**@ingroup func_qurt_process_suspend_threads
+ Suspends user threads in a user process with its process identifier.
+ The target user process can be a signed user process or an unsigned user process.
+ The caller is a thread in the guest OS/root process.
+ After the user threads in the target user process are suspended, they cannot be scheduled to run by the kernel
+ until they resume later.
+
+ This function has one optional argument with one default option.
+ #QURT_PROCESS_SUSPEND_DEFAULT suspends user threads in the target user process.
+
+ This function call is synchronous; it returns after the relevant threads are
+ completely suspended.
+
+ If some user threads in the target user process are set as non-suspendable, this function call does
+ not suspend these threads.
+
+ If the target user process is already suspended, this function call returns success as the
+ confirmation of the user process suspension.
+
+ QuRT debugger monitor threads in the target user process are non-suspendable; this function call does
+ not suspend those threads.
+
+ If the target user process is a secure user process or a CPZ process, this function call returns an error
+ without suspending the target user process.
+
+ If a user thread in the target user process runs in the guest OS/root process via a QDI call, this function call
+ does not suspend the thread in the guest OS, but instead marks the thread as pending-suspend. The thread is suspended
+ when it exits the guest OS, before executing the first instruction in the user process.
+ In this case, the function returns success while the user thread can still be running in the guest OS; the thread is
+ suspended when exiting the guest OS.
+
+ @param[in] process_id Process identifier.
+ @param[in] option     Default option; #QURT_PROCESS_SUSPEND_DEFAULT suspends user threads in the target user process.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EINVALID -- Failure because of invalid process_id input \n
+ #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, on a secure process/CPZ process.
+
+ @dependencies
+ None.
+ */
+int qurt_process_suspend_threads (unsigned int process_id, unsigned int option);
+
+
+/**@ingroup func_qurt_process_resume_threads
+ Resumes a user process with its process identifier.
+ The target user process can be a signed user process or an unsigned user process.
+ The caller is a thread in the guest OS/root process.
+ After the user threads in the target user process resume, the kernel scheduler
+ can schedule the user threads to run based on their thread priorities.
+
+ This function has an optional argument, #QURT_PROCESS_RESUME_DEFAULT, which
+ resumes user threads in the target user process.
+
+ This is an asynchronous function; it returns after the kernel moves the user threads from
+ the suspended state to the runnable state. The threads are scheduled to run based on their thread priorities.
+
+ This function call does not resume threads in the target user process that have been set as non-resumable.
+
+ If the target user process has already resumed, this function call confirms that the user process resumed
+ by returning success.
+
+ If the target user process is a secure user process or a CPZ process, this function call returns an error without
+ performing the resume operation.
+
+ If user threads in the target user process run in the guest OS/root process via a QDI call, this function
+ call clears the suspend-pending mark on these threads, so that the threads are not suspended when they exit
+ the guest OS.
+
+ @param[in] process_id Process identifier.
+ @param[in] option     Default option; #QURT_PROCESS_RESUME_DEFAULT resumes user threads in the target user process.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EINVALID -- Failure because of invalid process_id input. \n
+ #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, on a secure process/CPZ process.
+
+ @dependencies
+ None.
+ */
+int qurt_process_resume_threads (unsigned int process_id, unsigned int option);
+
+/**@ingroup func_qurt_process_vtcm_window_set
+ Sets a VTCM access window for a process.
+ The caller thread must be in the SRM process.
+
+ This is a synchronous function; it ensures that all running threads of the process have the requested
+ window in effect. The requested view for all non-running threads takes effect when they are
+ scheduled.
+
+ @param[in] pid         Process identifier.
+ @param[in] enable      QURT_VTCM_WINDOW_ENABLE enforces the VTCM access window defined by the high and low offsets.
+                        QURT_VTCM_WINDOW_DISABLE ignores the high and low offsets and fully disables VTCM
+                        access for the process.
+ @param[in] high_offset Specifies the high window offset, in 4K increments, from the base address of the VTCM.
+                        QURT_VTCM_WINDOW_HI_OFFSET_DEFAULT restores the high offset to its reset value.
+ @param[in] low_offset  Specifies the low window offset, in 4K increments, from the base address of the VTCM.
+                        QURT_VTCM_WINDOW_LO_OFFSET_DEFAULT restores the low offset to its reset value.
+
+ @note1hang
+ When high_offset is set to QURT_VTCM_WINDOW_HI_OFFSET_DEFAULT and low_offset is set to
+ QURT_VTCM_WINDOW_LO_OFFSET_DEFAULT, the full VTCM range is accessible. Access to VTCM is then
+ controlled via the MMU mapping for the process.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EVAL -- Failure because of invalid inputs. \n
+ #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation. \n
+ #QURT_ENOTSUPPORTED -- Failure because the operation is not supported due to limitations in HW capabilities.
+
+ @dependencies
+ None.
+ */
+int qurt_process_vtcm_window_set(int pid, unsigned int enable, unsigned int high_offset, unsigned int low_offset);
+
+/**@ingroup func_qurt_process_vtcm_window_get
+ Gets the VTCM window for a process.
+ The caller thread must be in the SRM process.
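+
+ Illustrative sketch (pid is the target process identifier, assumed to have
+ been obtained elsewhere; the enable check precedes any use of the offsets):
+
+ @code
+ unsigned int en, hi, lo;
+ if (qurt_process_vtcm_window_get(pid, &en, &hi, &lo) == QURT_EOK) {
+     if (en != 0) {
+         // window spans [lo, hi] in 4K units from the VTCM base
+     }
+ }
+ @endcode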
+
+
+ @param[in]  pid         Process identifier.
+ @param[out] enable      Address to store the enable status.
+ @param[out] high_offset Address to return the high window offset, in 4K increments, from the base address of the VTCM.
+ @param[out] low_offset  Address to return the low window offset, in 4K increments, from the base address of the VTCM.
+
+ @note1hang
+ The user must first check the returned enable value before checking the high and low offsets.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EVAL -- Failure because of invalid inputs. \n
+ #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation. \n
+ #QURT_ENOTSUPPORTED -- Failure because the operation is not supported due to limitations in HW capabilities.
+
+ @dependencies
+ None.
+ */
+int qurt_process_vtcm_window_get(int pid, unsigned int *enable, unsigned int *high_offset, unsigned int *low_offset);
+
+/**@ingroup func_qurt_process_set_group_config
+ Enables thread groups in the process, with the ceiling priorities set up.
+
+ @param[in] process_id         Process identifier.
+ @param[in] group_bitmask      64-bit mask of active thread groups.
+ @param[in] ceiling_priorities Array of ceiling priorities, one per thread group.
+
+ @note1hang
+ This API can only be called by the root PD, and only once for each process; otherwise it is
+ rejected. Group 0 must be enabled in group_bitmask, otherwise QuRT returns an error. After this call, all
+ existing threads are moved to group 0, and any thread whose priority is higher than the ceiling
+ priority of group 0 is lowered to the ceiling value.
+ Example 1:
+   group_bitmask = 0xD7; //'b11010111
+   ceiling_priorities[] = {100, 128, 200, 0, 196, 0, 240, 20}; // 0 - don't care
+ Example 2:
+   group_bitmask = 0x5; //'b101
+   ceiling_priorities[] = {240, 0, 20}; // 0 - don't care
+
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EVAL -- Failure because of invalid inputs. \n
+ #QURT_ENOTALLOWED -- The group has been configured already.
+
+ @dependencies
+ None.
+ */
+int qurt_process_set_group_config(unsigned int process_id, unsigned long long group_bitmask,
+                                  unsigned char *ceiling_priorities);
+
+
+/**@ingroup func_qurt_process_stid_set
+ Sets the specified stid for a process or for a thread group within a process.
+
+ @param[in] pid      Process identifier.
+ @param[in] group_id Group identifier.
+ @param[in] stid     stid to set.
+
+ @note1hang
+ The user can pass the default group_id (QURT_THREAD_DEFAULT_GROUP_ID) if the stid needs to be set at the process level.
+ All threads within the process that have the default stid (QURT_STID_DEFAULT) inherit the stid set for the process.
+ When a non-default group_id is specified, the stid is set only for that thread group.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EFATAL -- Invalid PID \n
+ #QURT_EVAL -- Failure because of invalid inputs. \n
+ #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation.
+
+ @dependencies
+ None.
+ */
+int qurt_process_stid_set(unsigned int pid, unsigned int group_id , unsigned int stid);
+
+/**@ingroup func_qurt_process_stid_get
+ Gets the stid for a process or for a thread group within a process.
+
+ @param[in]  pid      Process identifier.
+ @param[in]  group_id Group identifier.
+ @param[out] stid     Pointer to a variable in which to return the stid.
+
+ @note1hang
+ The user can pass the default group_id (QURT_THREAD_DEFAULT_GROUP_ID) to return the process-level stid.
+ When a non-default group_id is specified, the stid is returned only for that thread group.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EFATAL -- Invalid PID \n
+ #QURT_EVAL -- Failure because of invalid inputs. \n
+ #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation.
+
+ @dependencies
+ None.
+ */
+int qurt_process_stid_get(unsigned int pid, unsigned int group_id , unsigned int *stid);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_profile.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_profile.h
new file mode 100755
index 0000000000000..2a50c461440f6
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_profile.h
@@ -0,0 +1,98 @@
+#ifndef QURT_PROFILE_H
+#define QURT_PROFILE_H
+/**
+  @file qurt_profile.h
+  QuRT profiling support.
+
+EXTERNAL FUNCTIONS
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2018, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+==============================================================================*/
+#include "qurt_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup profiling_macros
+@{ */
+#define QURT_PROFILE_DISABLE 0 /**< Disable profiling. */
+#define QURT_PROFILE_ENABLE  1 /**< Enable profiling. */
+
+typedef unsigned int qurt_profile_param_t;
+
+#define QURT_PROFILE_PARAM_THREAD_READY_TIME 0U /**< Profile thread ready time. */
+
+/** @} */ /* end_addtogroup profiling_macros */
+
+/** @addtogroup profiling_types
+  @{ */
+/** Profiling results. */
+typedef union
+{
+    /** Result associated with #QURT_PROFILE_PARAM_THREAD_READY_TIME. */
+    struct
+    {
+        unsigned int ticks; /**< Cumulative ticks the thread was ready. */
+    } thread_ready_time;
+
+} qurt_profile_result_t;
+/** @} */ /* end_addtogroup profiling_types */
+
+/**@ingroup func_qurt_profile_enable2
+ * Enables or disables profiling of a specific parameter on a specific thread (as applicable).
+ *
+ * @param[in] param     Profiling parameter.
+ * @param[in] thread_id ID of the thread (if applicable) for which the specified
+ *                      parameter must be profiled.
+ * @param[in] enable    #QURT_PROFILE_DISABLE -- disable \n #QURT_PROFILE_ENABLE --
+ *                      enable
+ *
+ * @return
+ * #QURT_EOK -- Success \n
+ * #QURT_EALREADY -- Measurement already in progress or already stopped \n
+ * #QURT_ENOTHREAD -- Thread does not exist \n
+ * #QURT_EINVALID -- Invalid profiling parameter \n
+ *
+ * @dependencies
+ * None.
+ */
+extern int qurt_profile_enable2 (
+    qurt_profile_param_t param,
+    qurt_thread_t        thread_id,
+    int                  enable
+);
+
+/**@ingroup func_qurt_profile_get
+ * Gets the value of the profiling parameter that was previously enabled.
+ *
+ * @param[in]  param     Profiling parameter.
+ * @param[in]  thread_id ID of the thread (if applicable) for which the specified
+ *                       profiling parameter must be retrieved.
+ * @param[out] result    Profiling result associated with the parameter for the specified
+ *                       thread (if applicable).
+ *
+ * @return
+ * #QURT_EOK -- Success \n
+ * #QURT_EFAILED -- Operation failed; profiling was not enabled \n
+ * #QURT_ENOTHREAD -- Thread does not exist \n
+ * #QURT_EINVALID -- Invalid profiling parameter \n
+ *
+ * @dependencies
+ * None.
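+ *
+ * A minimal sketch pairing this call with qurt_profile_enable2(); the thread
+ * ID is obtained with qurt_thread_get_id() from qurt_thread.h:
+ *
+ * @code
+ * qurt_profile_result_t res;
+ * qurt_thread_t tid = qurt_thread_get_id();   // profile the current thread
+ *
+ * qurt_profile_enable2(QURT_PROFILE_PARAM_THREAD_READY_TIME, tid, QURT_PROFILE_ENABLE);
+ * // ... workload under measurement ...
+ * qurt_profile_enable2(QURT_PROFILE_PARAM_THREAD_READY_TIME, tid, QURT_PROFILE_DISABLE);
+ *
+ * if (qurt_profile_get(QURT_PROFILE_PARAM_THREAD_READY_TIME, tid, &res) == QURT_EOK) {
+ *     unsigned int ready_ticks = res.thread_ready_time.ticks;
+ *     (void)ready_ticks;
+ * }
+ * @endcode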
+ */ +extern int qurt_profile_get ( + qurt_profile_param_t param, + qurt_thread_t thread_id, + qurt_profile_result_t * result +); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_ptrace.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_ptrace.h new file mode 100755 index 0000000000000..622304dd92865 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_ptrace.h @@ -0,0 +1,37 @@ +/*============================================================================= + + qurt_ptrace.h + +GENERAL DESCRIPTION + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2013 by Qualcomm Technologies, Inc. All Rights Reserved. +=============================================================================*/ +#ifndef __SYS_PTRACE_H__ +#define __SYS_PTRACE_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +enum __ptrace_request +{ + /** + Indicates that the process making this request is requesting to be traced. + */ + PTRACE_TRACEME = 0, + PTRACE_EXT_IS_DEBUG_PERMITTED = 500 +}; + +long ptrace(enum __ptrace_request request, unsigned int pid, void*addr, void *data); + +#ifdef __cplusplus +} +#endif + +#endif //__SYS_PTRACE_H__ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_qdi.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_qdi.h new file mode 100755 index 0000000000000..705408e5cfc6f --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_qdi.h @@ -0,0 +1,185 @@ +#ifndef QDI_H +#define QDI_H + +/** + @file qurt_qdi.h + @brief Prototypes of QuRT Driver Invocation API functions + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013, 2021, 2023 Qualcomm Technologies, Inc. + All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + + +#include "qurt_qdi_constants.h" +#include "qurt_qdi_imacros.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup func_qurt_qdi_open + Opens the specified driver for subsequent operations. + qurt_qdi_open() is the primary mechanism by which a driver user can + obtain a QDI handle. The user provides the name of the driver to the + qurt_qdi_open call, and gets back a handle referencing + the named driver. \n + @note1hang For reasons related to the Hexagon standard for varargs functions, the + qurt_qdi_open function prototype is not actually defined as a varargs. + + + @param[in] p Driver name. + @param[in] ... Up to nine additional device-specific arguments can be passed as parameters, + and should follow the POSIX open() convention. \n + - flags -- Optional second parameter (POSIX flags), the handle + access requested (read-only, write-only, or read-write, + for instance) and other flags such as whether the call + should create a new device or only open an existing + device. \n + - mode -- Optional third parameter (POSIX mode); permissions to + configure when a new device is created. @tablebulletend + + @return + Negative value -- Error. \n + Non-negative value -- Success, this result value serves as a handle to the + opened driver. + @dependencies + None. + */ +// int qurt_qdi_open(); +#define qurt_qdi_open(p,...) 
 \
+   qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC,QDI_OPEN,(p),##__VA_ARGS__)
+
+#define qurt_qdi_open_dt(p,q,...) \
+   qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC,QDI_OPEN_FROM_DT,(p),(q),##__VA_ARGS__)
+
+/**@ingroup func_qurt_qdi_handle_invoke
+  Performs a generic driver operation, which (depending on the specified operation) can be
+  either one of the predefined operations listed in @xhyperref{tbl:functionMapping,QDI function mapping}
+  or a driver-specific operation.
+  The user provides a QDI handle and an integer
+  method number, along with 0 to 8 optional 32-bit arguments.
+  The device driver invocation function is invoked with the
+  same method number and 0 to 8 optional arguments. The
+  return value from the invocation function is passed back to
+  the user as the return value of qurt_qdi_handle_invoke().
+
+  @note1hang For reasons related to the Hexagon standard for varargs functions, the
+  qurt_qdi_handle_invoke() function prototype is not actually defined as a
+  varargs function (and would break if it were defined this way).
+
+  @param[in] h Driver handle.
+  @param[in] m Integer number for the operation to perform.
+  @param[in] ... Up to eight optional arguments can be passed to the device driver as operation-specific parameters: \n
+  arg1 -- First parameter \n
+  arg2 -- Second parameter \n
+  arg3 -- Third parameter \n
+  arg4 -- Fourth parameter \n
+  arg5 -- Fifth parameter \n
+  arg6 -- Sixth parameter \n
+  arg7 -- Seventh parameter \n
+  arg8 -- Eighth parameter
+
+  @return
+  Integer value defined by the device driver. \n
+  -1 -- Error.
+
+  @dependencies
+  None.
+ */
+// int qurt_qdi_handle_invoke();
+#define qurt_qdi_handle_invoke(h,m,...) \
+   _QDMPASTE(_QDMHI,_QDMCNT(QDI_HANDLE_LOCAL_CLIENT,h,m,##__VA_ARGS__))(QDI_HANDLE_LOCAL_CLIENT,h,m,##__VA_ARGS__)
+#define _QDMHI3(a,b,c) qurt_qdi_qhi3(0,b,c)
+#define _QDMHI4(a,b,c,d) qurt_qdi_qhi4(0,b,c,(int)(d))
+#define _QDMHI5(a,b,c,d,e) qurt_qdi_qhi5(0,b,c,(int)(d),(int)(e))
+#define _QDMHI6(a,b,c,d,e,f) qurt_qdi_qhi6(0,b,c,(int)(d),(int)(e),(int)(f))
+#define _QDMHI7(a,b,c,d,e,f,g) qurt_qdi_qhi7(8,b,c,(int)(d),(int)(e),(int)(f),(int)(g))
+#define _QDMHI8(a,b,c,d,e,f,g,h) qurt_qdi_qhi8(8,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h))
+#define _QDMHI9(a,b,c,d,e,f,g,h,i) qurt_qdi_qhi9(16,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i))
+#define _QDMHI10(a,b,c,d,e,f,g,h,i,j) qurt_qdi_qhi10(16,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j))
+#define _QDMHI11(a,b,c,d,e,f,g,h,i,j,k) qurt_qdi_qhi11(24,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j),(int)(k))
+#define _QDMHI12(a,b,c,d,e,f,g,h,i,j,k,l) qurt_qdi_qhi12(24,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j),(int)(k),(int)(l))
+int qurt_qdi_qhi3(int,int,int);
+int qurt_qdi_qhi4(int,int,int,int);
+int qurt_qdi_qhi5(int,int,int,int,int);
+int qurt_qdi_qhi6(int,int,int,int,int,int);
+int qurt_qdi_qhi7(int,int,int,int,int,int,int);
+int qurt_qdi_qhi8(int,int,int,int,int,int,int,int);
+int qurt_qdi_qhi9(int,int,int,int,int,int,int,int,int);
+int qurt_qdi_qhi10(int,int,int,int,int,int,int,int,int,int);
+int qurt_qdi_qhi11(int,int,int,int,int,int,int,int,int,int,int);
+int qurt_qdi_qhi12(int,int,int,int,int,int,int,int,int,int,int,int);
+
+/**@ingroup func_qurt_qdi_write
+  Writes data to the specified driver.
+  A predefined invocation routine for drivers that
+  support a POSIX-like write functionality.
+  qurt_qdi_write(handle, buf, len) is equivalent to:
+  qurt_qdi_handle_invoke(handle, QDI_WRITE, handle, buf, len);
+
+  @param[in] handle Driver handle.
+  @param[in] buf    Pointer to the memory address where the data to write is stored.
+  @param[in] len    Number of bytes of data to write.
+
+  @return
+  Non-negative integer -- Number of bytes written. \n
+  Negative error code -- Write could not take place.
+
+  @dependencies
+  None.
+ */
+int qurt_qdi_write(int handle, const void *buf, unsigned len);
+
+/**@ingroup func_qurt_qdi_read
+  User-visible API to read data from a QDI handle.
+  A predefined invocation routine for drivers that
+  support a POSIX-like read functionality.
+  qurt_qdi_read(handle, buf, len) is equivalent to:
+  qurt_qdi_handle_invoke(handle, QDI_READ, handle, buf, len);
+
+  @param[in] handle Driver handle.
+  @param[in] buf    Pointer to the memory address where the data read is stored.
+  @param[in] len    Number of bytes of data to read.
+
+  @return
+  Non-negative integer -- Number of bytes read. \n
+  Negative error code -- Read could not take place.
+
+  @dependencies
+  None.
+ */
+int qurt_qdi_read(int handle, void *buf, unsigned len);
+
+/**@ingroup func_qurt_qdi_close
+  Closes the specified driver, releasing any resources associated with the open driver.
+  User-visible API to close a QDI handle.
+
+  This API should be called when the user is done using a
+  QDI-based handle. When this function is called, the driver can release
+  any resources held and perform other necessary cleanup
+  operations. qurt_qdi_close(handle) is equivalent to:
+  qurt_qdi_handle_invoke(handle, QDI_CLOSE, handle);
+
+  @param[in] handle Driver handle.
+
+  @return
+  0 -- Success.\n
+  Negative error code -- Failure.
+
+  @dependencies
+  None.
+ */
+int qurt_qdi_close(int handle);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_qdi_constants.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_qdi_constants.h
new file mode 100755
index 0000000000000..4866fada067f0
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_qdi_constants.h
@@ -0,0 +1,193 @@
+#ifndef QDI_CONSTANTS_H
+#define QDI_CONSTANTS_H
+
+/**
+  @file qurt_qdi_constants.h
+  @brief Predefined invocation methods for drivers.
+
+  EXTERNALIZED FUNCTIONS
+  None
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None
+
+  Copyright (c) 2013-2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+|| Method numbers used for QDI.
+||
+|| Intended grouping of method numbers for QDI
+|| including future usage:
+||
+|| Method 0 should always be unused and not responded to by
+|| any driver.
+|| Methods 1 and 2 are reserved for name registration and
+|| name lookup.
+|| Methods 3 through 31 are reserved for POSIX-type operations
+|| on open handles.
+|| Methods 32 through 127 are reserved for the QDI infrastructure
+|| and may be extended in the future to provide standard
+|| driver debug services, management services, and system
+|| notifications.
+|| Methods 128 through 255 are reserved for the use of automatically
+|| generated methods such as might be generated by an IDL (interface
+|| definition language).
The infrastructure may be extended to +|| perform services on these methods based on information provided +|| by the IDL, such as automatic buffer validation, etc. These +|| method numbers should not be used for any "ad hoc" methods. +|| Methods with number >= 256 are "private" method numbers that are +|| outside the scope of the QDI infrastructure. Drivers that want +|| to generate and consume their own "ad hoc" methods are free to +|| use these method numbers as they wish. The infrastructure does +|| not generate these method numbers or respond to them, but +|| passes them on unmolested. +|| +|| All driver implementations *should* return a value of +|| -1 when called with an unsupported method. The standard error +|| return value for POSIX APIs is -1, so we emulate that behavior +|| here. +*/ +/** @cond */ +#define QDI_UNUSED 0 +#define QDI_DEVNAME_REGISTER 1 +#define QDI_OPEN 2 +#define QDI_CLOSE 3 +#define QDI_READ 4 +#define QDI_WRITE 5 +#define QDI_IOCTL 6 +#define QDI_MMAP 7 +#define QDI_OS_FILEOPEN 8 +#define QDI_FLEN 9 +#define QDI_UNLINK 10 +#define QDI_FTELL 22 +#define QDI_SEEK 23 +#define QDI_FSTAT 24 + +#define QDI_FSNAME_REGISTER 150 +#define QDI_FS_OPEN 151 +#define QDI_MMAP2 153 +#define QDI_MPROTECT2 154 +#define QDI_MUNMAP2 155 + +#define QDI_CLIENT_HANDLE_OBJREF_GET 10 + +#define QDI_OS_PROCESS_LOAD 12 +#define QDI_OS_PROCESS_CHOOSE_ASID 13 + +#define QDI_OS_SET_GP 26 +#define QDI_CLIENT_HANDLE_CALLBACK 27 + +#define QDI_CLIENT_HANDLE_ISLAND_HANDLE_CREATE_FROM_OBJ_T 19 //reused +#define QDI_CLIENT_HANDLE_HANDLE_CREATE_FROM_OBJ_T 80 +#define QDI_CLIENT_HANDLE_HANDLE_RELEASE 81 +#define QDI_CLIENT_HANDLE_COPY_FROM_USER 82 +#define QDI_CLIENT_HANDLE_COPY_TO_USER 83 +#define QDI_CLIENT_HANDLE_SIGNAL_GROUP_CREATE 86 +#define QDI_CLIENT_HANDLE_SAFE_CACHE_OPS 87 + +#define QDI_CLIENT_HANDLE_BUFFER_LOCK 41 +#define QDI_CLIENT_HLOSPOOL_INFO_GET 90 +#define QDI_CLIENT_HLOSPOOL2_INFO_GET 96 + +#define QDI_CLIENT_PID 44 +#define QDI_CLIENT_ASID QDI_CLIENT_PID + +#define QDI_OS_CLIENT_INFO_GET 48 + +#define QDI_OS_MEM_LOOKUP_PHYSADDR 57 + +#define QDI_OS_THREAD_ITERATOR_CREATE 68 +#define QDI_OS_THREAD_ITERATOR_NEXT 69 + +#define QDI_OS_SYSENV 78 + +#define QDI_REGION_USERMALLOC_INIT 180 // This method is for generic handle + + +#define QDI_CLIENT_HANDLE_USER_MALLOC 84 +#define QDI_CLIENT_HANDLE_USER_FREE 85 + +#define QDI_SIGNAL_GROUP_SIGNAL_CREATE 96 +#define QDI_SIGNAL_GROUP_WAIT 98 +#define QDI_SIGNAL_GROUP_POLL 99 +#define QDI_SIGNAL_SET 96 +#define QDI_SIGNAL_CLEAR 97 +#define QDI_SIGNAL_WAIT 98 +#define QDI_SIGNAL_POLL 99 + +#define QDI_OS_WAIT_FOR_MAIN_REAPER 104 + +#define QDI_CLIENT_HANDLE_REFPROXY_INSTALL 105 +#define QDI_CLIENT_HANDLE_REFPROXY_ADD 106 +#define QDI_CLIENT_HANDLE_REFPROXY_REMOVE 107 + +#define QDI_CLIENT_HANDLE_DETACH 116 + +#define QDI_OS_RESERVED1 139 + +#define QDI_CLIENT_HANDLE_BUFFER_LOCK2 142 + +#define QDI_DT_REGISTER 158 +#define QDI_OPEN_DEVICE 159 +#define QDI_OPEN_FROM_DT 160 + +#define QDI_PRIVATE 256 /* Method numbers beginning at 256 + are private method numbers, which + are device-specific and available + for use by device implementors. */ +/* +|| Permission bitmasks for use with qurt_qdi_lock_buffer(). +|| +|| Make sure these match with permission values from qurt_perm_t. +*/ +/** @endcond */ + +/** @addtogroup driver_support_constants +@{ */ +#define QDI_PERM_W 2 /**< Write access. */ +#define QDI_PERM_R 1 /**< Read access. */ +#define QDI_PERM_RW (QDI_PERM_R | QDI_PERM_W) /**< Read/write access. 
*/ + +#define QDI_HANDLE_LOCAL_CLIENT 3 /**< Local client. */ +#define QDI_HANDLE_GENERIC 4 /**< Generic. */ + +#define QDI_REFCNT_BASE 0x510000 /**< */ +#define QDI_REFCNT_MAXED 0x51FFFD /**< */ +#define QDI_REFCNT_INIT 0x51FFFE /**< Driver object is temporary and is eventually deleted.*/ +#define QDI_REFCNT_PERM 0x51FFFF /**< Driver object is permanent and is never deleted. */ +/** @} */ /* end_addtogroup driver_support_constants */ + +/** @cond */ +/* +|| Flags used by process loaders. +*/ + +#define QDI_OS_PROCESS_FLAGS_ISLAND_RESIDENT 0x1 /* Set this flag to request the loaded process + to have island residency. */ +#define QDI_OS_PROCESS_FLAGS_ROOT_RESIDENT 0x2 /* Set this flag to request the loaded process + to have root residency, for example, DL Pager. */ +/* +|| Constants used for qurt_event register API, type field. +*/ + +#define QURT_PROCESS_EXIT 1 + +/* +|| Constants used by QDI extensions. +*/ + +#define QURT_QDI_SINGLETON_TYPE_TRUE 0 +#define QURT_QDI_SINGLETON_TYPE_FALSE 1 +#define QURT_QDI_SINGLETON_TYPE_PER_PROCESS 2 +/** @endcond */ +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QDI_CONSTANTS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_qdi_driver.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_qdi_driver.h new file mode 100755 index 0000000000000..e044e25f1bb72 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_qdi_driver.h @@ -0,0 +1,868 @@ +#ifndef QURT_QDI_DRIVER_H +#define QURT_QDI_DRIVER_H + +/** + @file qurt_qdi_driver.h + @brief Definitions, macros, and prototypes used when writing a + QDI driver. + + EXTERNALIZED FUNCTIONS + None + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None + + Copyright (c) 2018, 2019-2021, 2023 Qualcomm Technologies, Inc. + All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#include "stddef.h" +#include "qurt_qdi.h" +#include "qurt_types.h" +#include "qurt_callback.h" +#include "qurt_qdi_constants.h" +#include "qurt_qdi_imacros.h" +#include "qurt_mutex.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* +|| This gives the canonical form for the arguments to a QDI +|| driver invocation function. The arguments are as follows: +|| +|| int client_handle (R0) QDI handle that represents the client +|| that made this QDI request. If the +|| client is remote, this is a +|| variable handle; if the client is local +|| (same thread and process), this is +|| set to QDI_HANDLE_LOCAL_CLIENT. +|| +|| qurt_qdi_obj_t *obj (R1) Points at the qdi_object_t structure +|| on which this QDI request is being made. +|| The qdi_object_t structure is usually +|| the first element of a larger structure +|| that contains state associated with the +|| object; because it is usually the first +|| element, the object pointers can be freely +|| interchanged through casts. +|| +|| int method (R2) Integer QDI method that represents +|| the request type. +|| +|| qurt_qdi_arg_t arg1 (R3) First three general purpose arguments +|| qurt_qdi_arg_t arg2 (R4) to the invocation function are passed in +|| qurt_qdi_arg_t arg3 (R5) these slots. +|| +|| qurt_qdi_arg_t arg4 (SP+0) Arguments beyond the first three are +|| qurt_qdi_arg_t arg5 (SP+4) passed on the stack. 
+|| qurt_qdi_arg_t arg6 (SP+8) +|| qurt_qdi_arg_t arg7 (SP+12) +|| qurt_qdi_arg_t arg8 (SP+16) +|| qurt_qdi_arg_t arg9 (SP+20) +|| +|| The canonical form of the invocation function takes a +|| total of 12 arguments, but not all of them are used. In general, +|| the QDI infrastructure only passes those arguments provided by +|| the caller; if the invocation function accesses additional +|| arguments beyond those provided by the caller, the values are not +|| useful. +*/ +/** @cond */ +#define QDI_INVOKE_ARGS \ + int, struct qdiobj *, int, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t + +#define QDI_EXT_INVOKE_ARGS \ + int, qurt_qdi_man_obj_t*, int, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t + +#define BUFFER_LOCK 1 +#define BUFFER_UNLOCK 0 + +struct qdiobj; +/** @endcond */ +/** @addtogroup driver_support_types +@{ */ +typedef union { + void *ptr; /**< Pointer to the driver handle. */ + int num; /**< Method number. */ +} qurt_qdi_arg_t; +/** @} */ /* end_addtogroup driver_support_types */ +/** @cond */ +/** QuRT QDI driver version */ +typedef union { + int num; + struct { + short major; /** Driver major version number. */ + short minor; /** Driver minor version number. */ + }; +} qurt_qdi_version_t; + +typedef int (*qurt_qdi_pfn_invoke_t)(QDI_INVOKE_ARGS); +typedef void (*qurt_qdi_pfn_release_t)(struct qdiobj *); +/** @endcond */ +/** @addtogroup driver_support_types +@{ */ +typedef struct qdiobj { + qurt_qdi_pfn_invoke_t invoke; /**< Invocation function that implements the driver methods.*/ + int refcnt; /**< Reference count, an integer value maintained by the QDI infrastructure that tracks the number of + references to a driver instance. 
 */
+   qurt_qdi_pfn_release_t release; /**< Release function that performs the cleanup associated with deleting an instance
+                                        of the driver object.*/
+} qurt_qdi_obj_t;
+/** @} */ /* end_addtogroup driver_support_types */
+/** @cond */
+/** QuRT QDI managed object */
+typedef struct qurt_qdi_man_obj
+{
+   qurt_qdi_obj_t qdi_obj;
+   union
+   {
+      struct qurt_qdi_ext_driver * opener_obj;
+      struct qurt_qdi_ext_device * device_obj;
+   };
+}qurt_qdi_man_obj_t;
+
+typedef int (*qurt_qdi_ext_pfn_create_t)(int client_id, const char *name, qurt_qdi_version_t version, qurt_qdi_man_obj_t **qdi_obj);
+typedef int (*qurt_qdi_ext_pfn_create_device_t)(int client_id, const char *name, qurt_qdi_version_t version, struct qurt_qdi_ext_device * device, qurt_qdi_man_obj_t **qdi_obj);
+typedef int (*qurt_qdi_ext_pfn_invoke_t)(QDI_EXT_INVOKE_ARGS);
+typedef void (*qurt_qdi_ext_pfn_destroy_t)(qurt_qdi_man_obj_t *qdi_obj);
+typedef int (*qurt_qdi_ext_pfn_probe_t)(void *handle, struct qurt_qdi_ext_device **device);
+
+typedef struct qurt_qdi_ext_obj_info{
+   qurt_qdi_man_obj_t *obj;
+   int qdi_client_id;
+   struct qurt_qdi_ext_obj_info *next;
+}qurt_qdi_ext_obj_info_t;
+typedef struct qurt_qdi_ext_obj_info *qurt_qdi_ext_obj_info_ptr;
+
+/** QuRT QDI device */
+//temporarily add this back while there are still drivers that statically define this structure
+struct qurt_qdi_device {
+   qurt_qdi_obj_t opener_obj;
+   const char* name;
+   char island_resident;
+   unsigned char singleton;
+   qurt_qdi_ext_pfn_create_t create;
+   qurt_qdi_ext_pfn_invoke_t invoke;
+   qurt_qdi_ext_pfn_destroy_t destroy;
+   qurt_mutex_t qurt_qdi_ext_list_lock;
+   qurt_qdi_ext_obj_info_ptr qurt_qdi_ext_obj_info_head;
+};
+typedef struct qurt_qdi_device qurt_qdi_man_device;
+
+struct qurt_qdi_ext_driver {
+   qurt_qdi_obj_t opener_obj;
+   const char* name;
+   char island_resident;
+   unsigned char singleton;
+   qurt_qdi_ext_pfn_create_t create;
+   qurt_qdi_ext_pfn_invoke_t invoke;
+   qurt_qdi_ext_pfn_destroy_t destroy;
+   qurt_mutex_t qurt_qdi_ext_list_lock;
+   qurt_qdi_ext_obj_info_ptr qurt_qdi_ext_obj_info_head;
+   qurt_qdi_ext_pfn_create_device_t create_device;
+   qurt_qdi_version_t version;
+   qurt_qdi_ext_pfn_probe_t probe;
+   const char* compatible;
+   struct qurt_qdi_ext_device * device_list;
+   //qurt_qdi_ext_device_ptr device_list;
+};
+typedef struct qurt_qdi_ext_driver qurt_qdi_ext_driver_t;
+//above replaces qurt_qdi_man_device
+
+extern int qurt_qdi_obj_ref_inc(qurt_qdi_obj_t *);
+extern int qurt_qdi_obj_ref_dec(qurt_qdi_obj_t *);
+
+extern int qurt_qdi_ext_opener (QDI_INVOKE_ARGS);
+/** @endcond */
+/**@ingroup func_qurt_qdi_method_default
+  Processes a method that is unrecognized or unsupported in the driver invocation function.
+  All arguments passed to the current invocation function (Section @xref{sec:invocationFunction}) must be forwarded
+  to this function.
+
+  @note1hang Invocation functions must process all unrecognized or unsupported methods
+  by calling this function.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+extern int qurt_qdi_method_default(QDI_INVOKE_ARGS);
+
+/**@ingroup func_qurt_qdi_handle_create_from_obj_t
+  Allocates a new device handle for use with the specified driver object.
+
+  @param[in] client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+  @param[in] obj           Pointer to the driver object.
+
+  @return
+  Non-negative integer -- Success; this value is the new handle. \n
+  Negative value -- Error.
+
+  @dependencies
+  None.
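+
+  Typical use inside a driver open handler (a sketch only; my_dev_t, my_invoke,
+  and my_release are hypothetical driver-side names, not part of this header):
+  @code
+  my_dev_t *dev = malloc(sizeof(*dev));   // driver-specific object; first member
+                                          // is a qurt_qdi_obj_t named qdiobj
+  dev->qdiobj.invoke  = my_invoke;        // invocation function for this object
+  dev->qdiobj.refcnt  = QDI_REFCNT_INIT;  // temporary object, deleted on release
+  dev->qdiobj.release = my_release;       // cleanup callback
+  int h = qurt_qdi_handle_create_from_obj_t(client_handle, &dev->qdiobj);
+  @endcode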
+*/
+static __inline int qurt_qdi_handle_create_from_obj_t(int client_handle, qurt_qdi_obj_t *obj)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_HANDLE_CREATE_FROM_OBJ_T,
+                                 obj);
+}
+
+/**@ingroup func_qurt_qdi_island_handle_create_from_obj_t
+  Allocates a new island device handle for use with the specified driver object.
+
+  @param[in] client_handle Client handle obtained from the current invocation function (Section 3.4.1).
+  @param[in] obj           Pointer to the driver object.
+
+  @return
+  Non-negative integer value that is the new handle -- Success. \n
+  Negative return value -- Error.
+
+  @dependencies
+  None.
+*/
+static __inline int qurt_qdi_island_handle_create_from_obj_t(int client_handle, qurt_qdi_obj_t *obj)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_ISLAND_HANDLE_CREATE_FROM_OBJ_T,
+                                 obj);
+}
+
+/**@ingroup func_qurt_qdi_handle_release
+  Deallocates the specified device handle.
+
+  @param[in] client_handle     Obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+  @param[in] handle_to_release Handle to release.
+
+  @return
+  0 -- Success. \n
+  Negative value -- Error.
+
+  @dependencies
+  None.
+*/
+static __inline int qurt_qdi_handle_release(int client_handle, int handle_to_release)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_HANDLE_RELEASE,
+                                 handle_to_release);
+}
+
+static __inline qurt_qdi_obj_t *
+qurt_qdi_objref_get_from_handle(int client_handle, int object_handle)
+{
+   qurt_qdi_obj_t *ret;
+
+   ret = NULL;
+
+   qurt_qdi_handle_invoke(client_handle,
+                          QDI_CLIENT_HANDLE_OBJREF_GET,
+                          object_handle,
+                          &ret);
+
+   return ret;
+}
+
+/**@ingroup func_qurt_client_add_memory
+  Adds a physical address range to the HLOS physpool of the caller user PD.
+
+  @param[in] client_handle Obtained from the current invocation function (Section 3.4.1).
+  @param[in] phys_addr     Starting address of the physical address range.
+  @param[in] size          Size of the range, in bytes.
+
+  @return
+  #QURT_EOK -- Pages successfully added.
+
+  @dependencies
+  None.
+*/
+int qurt_client_add_memory(int client_handle, qurt_addr_t phys_addr, qurt_size_t size);
+
+/**@ingroup func_qurt_client_add_memory2
+  Adds a physical address range to the HLOS physpool of the caller user PD.
+
+  @param[in] user_client_handle Obtained from the current invocation function (Section 3.4.1).
+  @param[in] phys_addr          Starting 36-bit address of the physical address range.
+  @param[in] size               Size of the range, in bytes.
+
+  @return
+  #QURT_EOK -- Pages successfully added.
+
+  @dependencies
+  None.
+*/
+int qurt_client_add_memory2(int user_client_handle, qurt_paddr_64_t phys_addr, qurt_size_t size);
+
+static __inline qurt_qdi_obj_t *
+qurt_qdi_objref_get_from_pointer(qurt_qdi_obj_t *objptr)
+{
+   qurt_qdi_obj_t * ret = NULL;
+
+   if (qurt_qdi_obj_ref_inc(objptr) < 0) {
+      ret = NULL;
+   } else {
+      ret = objptr;
+   }
+
+   return ret;
+}
+
+static __inline void
+qurt_qdi_objref_release(qurt_qdi_obj_t *objptr)
+{
+   if (qurt_qdi_obj_ref_dec(objptr) == 1) {
+      (*objptr->release)(objptr);
+   }
+}
+
+/**@ingroup func_qurt_qdi_copy_from_user
+  Copies the contents of a user memory buffer into the current driver.
+
+  @note1hang User buffer addresses are valid only for the duration of the current driver
+  invocation.
+
+  @param[in] client_handle Obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+  @param[in] dest          Base address of the driver buffer.
+  @param[in] src           Base address of the user buffer.
+  @param[in] len           Number of bytes to copy.
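+
+  For example, inside an invocation function (a sketch only; my_request_t is a
+  hypothetical driver structure, and arg1/arg2 are assumed to carry a user
+  pointer and its length):
+  @code
+  my_request_t req;
+  if (arg2.num == (int)sizeof(req) &&
+      qurt_qdi_copy_from_user(client_handle, &req, arg1.ptr, sizeof(req)) >= 0) {
+      // req now holds a safe driver-side copy of the client data.
+  }
+  @endcode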
+
+  @return
+  Negative value -- Indicates a privilege or security violation; the copy operation
+  has crossed a privilege boundary.
+
+  @dependencies
+  None.
+*/
+static __inline int qurt_qdi_copy_from_user(int client_handle, void *dest, const void *src, unsigned len)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_COPY_FROM_USER,
+                                 dest, src, len);
+}
+
+/**@ingroup func_qurt_qdi_copy_string_from_user
+  Copies the contents of a user memory buffer into the current driver.
+
+  @note1hang User buffer addresses are valid only for the duration of the current driver
+  invocation.
+
+  @param client_handle Obtained from the current invocation function (Section 3.4.1).
+  @param dest          Base address of the driver buffer.
+  @param src           Base address of the user buffer.
+  @param len           Number of bytes to copy. NOTE: This is the destination buffer length.
+
+  @return
+  Negative error result -- Privilege or security violation; the copy operation
+  has crossed a privilege boundary.
+
+  @dependencies
+  None.
+*/
+int qurt_qdi_copy_string_from_user(int client_handle, char *dest, const char *src, unsigned len);
+
+/**@ingroup func_qurt_qdi_copy_to_user
+  Copies the contents of a driver memory buffer to user memory.
+
+  @note1hang User buffer addresses are valid only for the duration of the current driver
+  invocation.
+
+  @param[in] client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+  @param[in] dest          Base address of the user buffer.
+  @param[in] src           Base address of the driver buffer.
+  @param[in] len           Number of bytes to copy.
+
+  @return
+  Negative value -- Indicates a privilege or security violation; the copy operation has crossed a
+  privilege boundary.
+
+  @dependencies
+  None.
+*/
+static __inline int qurt_qdi_copy_to_user(int client_handle, void *dest, const void *src, unsigned len)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_COPY_TO_USER,
+                                 dest, src, len);
+}
+
+/**@ingroup func_qurt_qdi_safe_cache_ops
+  Performs cache operations on user memory.
+
+  @note1hang User buffer addresses are valid only for the duration of the current driver
+  invocation.
+
+  @param[in] client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+  @param[in] addr          Base address of the user memory.
+  @param[in] size          Size of the user memory.
+  @param[in] opcode        Cache operation (QURT_MEM_CACHE_FLUSH, QURT_MEM_CACHE_INVALIDATE, and so on).
+  @param[in] type          Cache type (QURT_MEM_ICACHE, QURT_MEM_DCACHE).
+
+  @return
+  Negative value -- Indicates a privilege or security violation; the operation has crossed a
+  privilege boundary.
+
+  @dependencies
+  None.
+*/
+static __inline int qurt_qdi_safe_cache_ops(int client_handle, qurt_addr_t addr, qurt_size_t size,
+                                            qurt_mem_cache_op_t opcode, qurt_mem_cache_type_t type)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_SAFE_CACHE_OPS,
+                                 addr, size, opcode, type);
+}
+
+
+/**@ingroup func_qurt_qdi_buffer_lock
+  Prepares for the direct manipulation of a potentially untrusted buffer provided by a QDI
+  client.
+
+  This function is used to permit a trusted driver to safely access memory that is
+  provided by a potentially untrusted client. A driver calls this function to obtain a safe buffer
+  pointer for accessing the memory.
+
+  This function performs the following security checks: \n
+  - Verifies that the entire buffer is accessible to the client.
\n + - Ensures that the pointer remains valid for the remainder of the QDI driver + operation. \n + + @note1hang User buffer addresses are valid only for the duration of the current driver + invocation. + + @param[in] client_handle Obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param[in] buf Pointer to the base address of the client buffer address. + @param[in] len Buffer length (in bytes). + @param[in] perms Bitmask value that specifies the read or write access to perform on the + client buffer: \n + - #QDI_PERM_R -- Read access \n + - #QDI_PERM_W -- Write access \n + - #QDI_PERM_RW -- Read/write access @tablebulletend + @param[out] obuf Pointer to the buffer address that the driver must use to access the buffer. + + @return + Negative value -- Error; the operation crosses a privilege boundary, indicating a privilege or security violation. \n + Nonzero value -- User passed a buffer that does not fulfill the requested read/write access permission. + In this case the QDI driver call must be terminated cleanly, with an appropriate error code + returned to the client. \n + Zero -- Success; when this occurs the QDI driver must use the pointer at *obuf to access memory, and not the + pointer passed in as buf -- even if the user process changes the mapping of memory at buf, + the mapping of memory at *obuf remains valid until the driver invocation completes. + + @dependencies + None. +*/ +static __inline int qurt_qdi_buffer_lock(int client_handle, void *buf, unsigned len, + unsigned perms, void **obuf) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_BUFFER_LOCK, + buf, len, perms, obuf); +} + +/**@ingroup func_qurt_qdi_buffer_lock2 + Prepares for the direct manipulation of a possibly-untrusted buffer provided by a QDI + client. + This API permits a trusted driver to safely access memory + provided by a possibly-untrusted client. A driver calls this function to obtain a safe buffer + pointer for accessing the memory. + This function performs the following security checks: \n + -- Entire buffer is accessible to the client. \n + -- Entire buffer is mapped with permissions passed in perms field \n + -- Entire buffer is physically contiguous \n + In addition to the security checks, the API also locks the client mapping such that the client + cannot remove the mapping while the physical memory is used by the trusted + driver. \n + + @note1 Drivers are responsible for calling qurt_qdi_buffer_unlock() at appropriate time. Not + pairing qurt_qdi_buffer_unlock() with this API leads to resource leakages and + process exit failures. Drivers can keep track of which buffers are locked for + a particular client. If the client exits abruptly, the buffers can be + unlocked on driver release invocation for the exiting client. + + @note2 This API is supported in limited capacity when called from Island mode. Safe buffer + unmapping or user buffer unlock is not supported in Island mode. + + @param client_handle Obtained from the current invocation function (Section 3.4.1). + @param buf Pointer to the base address of the client buffer address. + @param len Buffer length (in bytes). + @param perms Bitmask value that specifies the read or write access to perform on the + client buffer: \n + -- #QDI_PERM_R -- Read access \n + -- #QDI_PERM_W -- Write access \n + -- #QDI_PERM_RW -- Read/write access \n + @param obuf Optional parameter that returns a pointer to the buffer address that + the driver must use to access the buffer. 
If NULL is passed, the API
+                only performs security checks and does not create a mapping to access the user buffer in
+                a safe way.
+
+  @return
+  QURT_EINVALID -- Arguments passed to the API are invalid. User buffer pointer is NULL or length of the
+                   buffer is 0. \n
+  QURT_EPRIVILEGE -- One of the security checks on the user buffer failed. \n
+  QURT_EFAILED -- Mapping cannot be created for the trusted driver. \n
+  QURT_EOK -- Lock operation was successful. When this occurs, the QDI driver must use the
+              pointer at *obuf to perform its memory accesses, and not the
+              pointer passed in as buf.
+
+  @dependencies
+  None.
+*/
+static __inline int qurt_qdi_buffer_lock2(int client_handle, void *buf, unsigned len,
+                                          unsigned perms, void **obuf)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_BUFFER_LOCK2,
+                                 BUFFER_LOCK, buf, len, perms, obuf);
+}
+
+/**@ingroup func_qurt_qdi_buffer_unlock
+  This API is paired with qurt_qdi_buffer_lock2(). A temporary overlapping mapping
+  created for the driver is removed. Client mapping for the user buffer is
+  unlocked.
+
+  @note1 Drivers are responsible for pairing this with qurt_qdi_buffer_lock2(). Not
+         pairing qurt_qdi_buffer_lock2() with this API leads to resource leakages and
+         process exit failures. Drivers can keep track of which buffers are locked for
+         a particular client, and if the client exits abruptly, all the buffers can be
+         unlocked on driver release invocation for the exiting client.
+
+  @note2 This API is supported in limited capacity when called from Island mode. Actual
+         unmapping of driver accessible memory or unlocking of the buffer is not
+         supported in Island mode.
+
+  @param client_handle Obtained from the current invocation function (Section 3.4.1).
+  @param buf           Pointer to the base address of the client buffer address.
+  @param len           Buffer length (in bytes).
+  @param obuf          Safe buffer address that was returned in the obuf field after calling
+                       qurt_qdi_buffer_lock2().
+
+  @return
+  QURT_EINVALID -- Arguments passed to the API are invalid. User buffer pointer is NULL or length of the
+                   buffer is 0. \n
+  QURT_EOK -- Unlock operation was successful; the temporary safe mapping was removed and the
+              client mapping for the user buffer was unlocked. \n
+  Other results -- Safe buffer unmapping failed or unlocking of the user buffer failed. \n
+
+  @dependencies
+  None.
+*/
+static __inline int qurt_qdi_buffer_unlock(int client_handle, void *buf, unsigned len,
+                                           void *obuf)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_BUFFER_LOCK2,
+                                 BUFFER_UNLOCK, buf, len, obuf);
+}
+
+/**@ingroup func_qurt_qdi_user_malloc
+  Allocates a memory area in the QDI heap that is read/write accessible to both the driver and
+  the client. \n
+  @note1hang The QDI heap has a limited amount of memory available, and only the
+             device driver can free the allocated memory.
+
+  @param client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+  @param size          Size of the allocation, in bytes.
+
+  @return
+  Non-zero -- Success; this returned value points to the allocated memory area. \n
+  Zero -- Error.
+
+  @dependencies
+  None.
+*/
+void *qurt_qdi_user_malloc(int client_handle, unsigned size);
+
+/**@ingroup func_qurt_qdi_user_free
+  Deallocates a memory area in the QDI heap.
+
+  @param client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+  @param ptr           Pointer to the memory area to free.
+
+  @dependencies
+  None.
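+
+  Typical pairing (illustrative only):
+  @code
+  // Allocate a shared area visible to both driver and client, then release it.
+  char *area = (char *)qurt_qdi_user_malloc(client_handle, 256);
+  if (area != NULL) {
+      // ... exchange data with the client ...
+      qurt_qdi_user_free(client_handle, area);
+  }
+  @endcode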
+*/
+void qurt_qdi_user_free(int client_handle, void *ptr);
+
+/**@ingroup func_qurt_qdi_client_detach
+  Detaches a client (a process), indicating that the client does not
+  participate in the qurt_wait() mechanism. This behavior
+  is opt-in and irrevocable: once a client is detached, it
+  cannot be reattached.
+
+  @param client_handle Handle of the client to detach.
+
+  @return
+  Zero -- Success; detachable clients always return success. \n
+  Nonzero value -- client_handle did not refer to a
+                   detachable user client.
+
+  @dependencies
+  None.
+*/
+static __inline int qurt_qdi_client_detach(int client_handle)
+{
+   return qurt_qdi_handle_invoke(client_handle, QDI_CLIENT_HANDLE_DETACH);
+}
+
+/**@ingroup func_qurt_qdi_signal_group_create
+  Creates a new signal group for use in a device driver.
+  A QDI signal group contains up to 32 signals, which can be operated on either
+  individually (using the qurt_qdi_signal_* functions) or as a group (using the
+  qurt_qdi_signal_group_* functions). \n
+  @note1hang Driver implementation is responsible for using the proper signal group
+             handle in any given situation. \n
+  For more information on signals, see the Hexagon QuRT RTOS User Guide (80-VB419-78).
+
+  @param client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+  @param p_signal_group_handle_local Returns a handle intended for use by code that
+         resides in the same context and process as the created signal group
+         (for example, the device driver implementation that allocated the
+         signal group).
+  @param p_signal_group_handle_remote Returns a handle intended for use by code
+         that resides in a different context and process than the created signal group
+         (for example, the user-mode client of an OS driver).
+
+  @return
+  Zero -- Success. \n
+  Negative value -- The signal group could not be created.
+
+  @dependencies
+  None.
+*/
+static __inline int qurt_qdi_signal_group_create(int client_handle,
+                                                 int *p_signal_group_handle_local,
+                                                 int *p_signal_group_handle_remote)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_SIGNAL_GROUP_CREATE,
+                                 p_signal_group_handle_local,
+                                 p_signal_group_handle_remote);
+}
+
+/**@ingroup func_qurt_qdi_signal_group_wait
+  Suspends the current thread until any of the signals are set in the specified signal group.
+
+  If a signal is set in a signal group object, and a thread waits on the signal group object,
+  the thread is awakened. If the awakened thread has higher priority than the current
+  thread, a context switch can occur.
+
+  @param signal_group_handle Handle of the signal group.
+
+  @return
+  If the client is remote: \n
+  QURT_EOK -- Wait complete. \n
+  QURT_ECANCEL -- Wait cancelled. \n
+  If the client is local, returns a 32-bit word with the current signals.
+
+  @dependencies
+  None.
+*/
+static __inline int qurt_qdi_signal_group_wait(int signal_group_handle)
+{
+   return qurt_qdi_handle_invoke(signal_group_handle,
+                                 QDI_SIGNAL_GROUP_WAIT);
+}
+
+/**@ingroup func_qurt_qdi_signal_group_poll
+  Returns a value that indicates whether any of the signals are set in the specified signal group.
+
+  @param signal_group_handle Handle of the signal group.
+
+  @return
+  1 -- At least one of the signals is set in the signal group. \n
+  0 -- None of the signals are set.
+
+  @dependencies
+  None.
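+
+  End-to-end sketch (illustrative only; error handling elided):
+  @code
+  int grp_local, grp_remote, sig_local, sig_remote;
+  // Create a group plus one signal; the remote handles would be handed to the client.
+  qurt_qdi_signal_group_create(client_handle, &grp_local, &grp_remote);
+  qurt_qdi_signal_create(grp_local, &sig_local, &sig_remote);
+  qurt_qdi_signal_set(sig_local);             // driver side raises the signal
+  if (qurt_qdi_signal_group_poll(grp_local))  // nonzero: at least one signal set
+      qurt_qdi_signal_clear(sig_local);
+  @endcode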
+*/ +static __inline int qurt_qdi_signal_group_poll(int signal_group_handle) +{ + return qurt_qdi_handle_invoke(signal_group_handle, + QDI_SIGNAL_GROUP_POLL); +} + + +/**@ingroup func_qurt_qdi_signal_create + Creates a new signal in the specified signal group. + For more information on signals, see the Hexagon QuRT RTOS User Guide (80-VB419-78). + + @note1hang Driver implementation is responsible for using the proper signal handle in + any given situation. + + @param signal_group_handle Handle of an existing signal group. + @param p_signal_handle_local Returns a handle intended for use by code that resides in + the same context and process as the created signal (for example, + the device driver implementation that allocated the signal). + @param p_signal_handle_remote Returns a handle intended for use by code that resides in + a different context and process than the created signal (for + example, the user-mode client of an OS driver). + + @return + Nonzero value -- No more signals can be created in the specified + signal group. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_create(int signal_group_handle, + int *p_signal_handle_local, + int *p_signal_handle_remote) +{ + return qurt_qdi_handle_invoke(signal_group_handle, + QDI_SIGNAL_GROUP_SIGNAL_CREATE, + p_signal_handle_local, + p_signal_handle_remote); +} + +/**@ingroup func_qurt_qdi_signal_set + Sets the signal in the specified signal object. + + @param signal_handle Handle of the signal. + + @return + Always returns 0. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_set(int signal_handle) +{ + return qurt_qdi_handle_invoke(signal_handle, + QDI_SIGNAL_SET); +} + +/**@ingroup func_qurt_qdi_signal_clear + Clears the signal in the specified signal object. + + @param signal_handle Handle of the signal. + + @return + Always returns 0. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_clear(int signal_handle) +{ + return qurt_qdi_handle_invoke(signal_handle, + QDI_SIGNAL_CLEAR); +} + +/**@ingroup func_qurt_qdi_signal_wait + Suspends the current thread until the specified signal is set. + If a signal is set in a signal object, and a thread waits on the signal object, the + thread is awakened. If the awakened thread has higher priority than the current thread, a + context switch may occur. + + @param signal_handle Handle of the signal. + + @return + If client is remote: + QURT_EOK -- Wait complete. \n + QURT_ECANCEL -- Wait cancelled.\n + If client is local, return a 32-bit word with current signals. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_wait(int signal_handle) +{ + return qurt_qdi_handle_invoke(signal_handle, + QDI_SIGNAL_WAIT); +} + +/**@ingroup func_qurt_qdi_signal_poll + Returns a value that indicates if the specified signal is set. + + @param signal_handle Handle of the signal. + + @return + 1 -- Signal is set. \n + 0 -- Signal is not set. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_poll(int signal_handle) +{ + return qurt_qdi_handle_invoke(signal_handle, + QDI_SIGNAL_POLL); +} + +/**@ingroup func_qurt_qdi_devname_register + Registers a QDI device with the generic QDI object in the + current QDI context. + + This function registers an exact name or a directory prefix with a QDI opener object. + Future invocations of qurt_qdi_open() in the context of the caller invokes the + opener object if a match is detected. + + Directory prefix names are specified by ending the name with a forward slash character. 
+
+  Example of an exact name:
+  @code qurt_qdi_devname_register("/dev/foobar", foobar_opener);@endcode
+
+  Example of a directory prefix:
+  @code qurt_qdi_devname_register("/pipedev/", pipedev_opener);@endcode
+
+  Given the two registrations shown above, the only qurt_qdi_open() requests
+  directed to the foobar_opener object are those for the exact name
+  "/dev/foobar". Any request beginning with "/pipedev/" is directed to the
+  pipedev_opener object.
+
+  The pipedev invocation function presumably examines the name argument to
+  determine exactly how to handle the request. The name is passed to the invocation
+  function in the a1.ptr argument (Section @xref{sec:invocationFunction}).
+
+  @param name   Device name or device name prefix.
+  @param opener Pointer to the opener object for the device.
+
+  @return
+  0 -- Device was successfully registered. \n
+  Negative error code -- Device was not registered.
+
+  @dependencies
+  None.
+ */
+static __inline int qurt_qdi_devname_register(const char *name,
+                                              qurt_qdi_obj_t *opener)
+{
+   return qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC,
+                                 QDI_DEVNAME_REGISTER,
+                                 name,
+                                 opener);
+}
+
+// Macros for backward compatibility with deprecated APIs
+// (These will go away soon)
+
+#define qurt_qdi_register_devname(name, opener) \
+   qurt_qdi_devname_register((name), (void *)(opener))
+#define qurt_qdi_new_handle_from_obj_t(handle, obj) \
+   qurt_qdi_handle_create_from_obj_t((handle), (obj))
+#define qurt_qdi_release_handle(client_handle, handle) \
+   qurt_qdi_handle_release((client_handle), (handle))
+#define qurt_qdi_lock_buffer(handle, buf, len, perms, obuf) \
+   qurt_qdi_buffer_lock((handle), (buf), (len), (perms), (obuf))
+#define qurt_qdi_usermalloc(handle, size) \
+   qurt_qdi_user_malloc((handle), (size))
+#define qurt_qdi_userfree(handle, ptr) \
+   qurt_qdi_user_free((handle), (ptr))
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_qdi_ext.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_qdi_ext.h
new file mode 100755
index 0000000000000..383e1799a15d6
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_qdi_ext.h
@@ -0,0 +1,58 @@
+#ifndef QURT_QDI_EXT_H
+#define QURT_QDI_EXT_H
+
+/**
+  @file qurt_qdi_ext.h
+  @brief Definitions, macros, and prototypes used when writing a
+  QDI driver
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2018, 2019-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+#include "qurt_qdi_driver.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct qurt_qdi_ext_device {
+   qurt_qdi_ext_obj_info_ptr qurt_qdi_ext_obj_info_head;
+   struct qurt_qdi_ext_device * next;
+   char * instance;
+   fdt_node_handle context;
+};
+typedef struct qurt_qdi_ext_device *qurt_qdi_ext_device_ptr;
+
+/**@ingroup func_qurt_qdi_dt_register
+  Registers a QDI device with the generic QDI object in the current QDI context,
+  if and only if a compatible device node is found in the device tree. This
+  function serves as a device tree aware wrapper for qurt_qdi_devname_register().
+
+  @param name   Device name or device name prefix.
+  @param opener Pointer to QDI ext specialized opener object for the driver.
+
+  @return
+  0 -- Device was successfully registered.
\n + Negative error code -- Device was not registered. +*/ +static __inline int qurt_qdi_dt_register(const char *name, qurt_qdi_obj_t *opener) +{ + return qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC, QDI_DT_REGISTER, name, opener); +} + +static inline void qurt_qdi_ext_deviceobj_set_name (struct qurt_qdi_ext_device * device, char * name) +{ + device->instance = name; +} + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_qdi_imacros.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_qdi_imacros.h new file mode 100755 index 0000000000000..c0a8448ac87f8 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_qdi_imacros.h @@ -0,0 +1,34 @@ +#ifndef QURT_QDI_IMACROS_H +#define QURT_QDI_IMACROS_H + +/** + @file qurt_qdi_imacros.h + @brief Internal macros used for QDI. Mostly consists of tricky (and ugly) + preprocessor hacks that permit us to do varargs function invocations + where we pass optional arguments in registers and where we can do + type casting and checking automatically. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +#define _QDMPASTE(a,b) _QDMPASTE_(a,b) +#define _QDMPASTE_(a,b) a##b +#define _QDMCNT(...) _QDMCNT_(__VA_ARGS__,12,11,10,9,8,7,6,5,4,3,2,1,0) +#define _QDMCNT_(a,b,c,d,e,f,g,h,i,j,k,l,cnt,...) cnt + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_qdi_proxy.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_qdi_proxy.h new file mode 100755 index 0000000000000..f1d8992ea8811 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_qdi_proxy.h @@ -0,0 +1,55 @@ +/*============================================================================= + + qurt_qdi_proxy.h + +GENERAL DESCRIPTION + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved. 
+
+=============================================================================*/
+#ifndef _QURT_QDI_PROXY_H
+#define _QURT_QDI_PROXY_H
+
+#include "qurt_qdi_driver.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* APIs allowing operation on the proxy object directly */
+int qurt_qdi_proxy_ref_create(void);
+
+/* APIs for operating on a proxy given a known proxy handle
+ * 1) using the QDI handle of the object
+ * successful return: QURT_EOK, anything else -- failure
+ */
+int qurt_qdi_proxy_ref_add_by_handle(int proxy_handle, int qdi_handle);
+int qurt_qdi_proxy_ref_sub_by_handle(int proxy_handle, int qdi_handle);
+
+/* 2) using an object reference
+ * successful return: QURT_EOK, anything else -- failure
+ */
+int qurt_qdi_proxy_ref_add_by_object(int proxy_handle, qurt_qdi_obj_t *obj_ptr);
+int qurt_qdi_proxy_ref_sub_by_object(int proxy_handle, qurt_qdi_obj_t *obj_ptr);
+
+/* API for associating a proxy object with a particular client, given a client handle
+ * successful return: QURT_EOK, anything else -- failure
+ */
+int qurt_client_proxy_ref_install (int client_handle, int proxy_handle);
+
+/* APIs allowing operation on the proxy object from a user client
+ * successful return: QURT_EOK, anything else -- failure
+ */
+int qurt_client_proxy_ref_add(int qdi_handle);
+int qurt_client_proxy_ref_remove(int qdi_handle);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_QDI_PROXY_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_rmutex.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_rmutex.h
new file mode 100755
index 0000000000000..a013a0bbddb1d
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_rmutex.h
@@ -0,0 +1,200 @@
+#ifndef QURT_RMUTEX_H
+#define QURT_RMUTEX_H
+/**
+  @file qurt_rmutex.h
+  Prototypes of rmutex API.
+
+EXTERNAL FUNCTIONS
+   None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2013 - 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+#include
+#include
+
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup func_qurt_rmutex_init
+  Initializes a recursive mutex object.
+  The recursive mutex is initialized in the unlocked state.
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[out] lock Pointer to the recursive mutex object.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_rmutex_init(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_rmutex_destroy
+  Destroys the specified recursive mutex. \n
+  @note1hang Recursive mutexes must not be destroyed while they are still in use. If this
+             occurs, the behavior of QuRT is undefined.
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[in] lock Pointer to the recursive mutex object to destroy.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_rmutex_destroy(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_rmutex_lock
+  Locks the specified recursive mutex. \n
+
+  If a thread performs a lock operation on a mutex that is not in use, the thread
+  gains access to the shared resource that the mutex protects, and continues executing.
+
+  If a thread performs a lock operation on a mutex that is already in use by another
+  thread, the thread is suspended.
When the mutex becomes available again (because the
+  other thread has unlocked it), the thread is awakened and given access to the shared resource.
+
+  @note1hang A thread is not suspended if it locks a recursive mutex that it has already
+             locked. However, the mutex does not become available to other threads until the
+             thread performs a balanced number of unlocks on the mutex.
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[in] lock Pointer to the recursive mutex object to lock.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_rmutex_lock(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_rmutex_lock_timed
+  Locks the specified recursive mutex; the wait is terminated when the specified timeout expires.\n
+
+  If a thread performs a lock operation on a mutex that is not in use, the thread
+  gains access to the shared resource that the mutex is protecting, and continues executing.
+
+  If a thread performs a lock operation on a mutex that is already in use by another
+  thread, the thread is suspended. When the mutex becomes available again (because the
+  other thread has unlocked it), the thread is awakened and given access to the shared resource.
+
+  @note1hang A thread is not suspended if it locks a recursive mutex that it has already
+             locked. However, the mutex does not become available to other threads until the
+             thread performs a balanced number of unlocks on the mutex.
+             If the timeout expires, the wait is terminated and no access to the mutex is granted.
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[in] lock     Pointer to the recursive mutex object to lock.
+  @param[in] duration Interval (in microseconds); the duration value must be between #QURT_TIMER_MIN_DURATION and
+                      #QURT_TIMER_MAX_DURATION.
+
+  @return
+  #QURT_EOK -- Success \n
+  #QURT_ETIMEDOUT -- Timeout
+
+  @dependencies
+  None.
+
+ */
+int qurt_rmutex_lock_timed(qurt_mutex_t *lock, unsigned long long int duration);
+
+/**@ingroup func_qurt_rmutex_unlock
+  Unlocks the specified recursive mutex. \n
+  More than one thread can be suspended on a mutex. When the mutex is
+  unlocked, the thread waiting on the mutex awakens. If the awakened
+  thread has higher priority than the current thread, a context switch occurs.
+
+  @note1hang When a thread unlocks a recursive mutex, the mutex does not become available until
+             a balanced number of locks and unlocks has been performed on the mutex.
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[in] lock Pointer to the recursive mutex object to unlock.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_rmutex_unlock(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_rmutex_try_lock
+  Attempts to lock the specified recursive mutex.\n
+
+  If a thread performs a try_lock operation on a recursive mutex that is not in use, the
+  thread gains access to the shared resource that is protected by the mutex, and continues
+  executing.\n
+  If a thread performs a try_lock operation on a recursive mutex that another thread has
+  already locked, qurt_rmutex_try_lock immediately returns with a nonzero result
+  value.
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[in] lock Pointer to the recursive mutex object to lock.
+
+  @return
+  0 -- Success. \n
+  Nonzero -- Failure.
+
+ */
+int qurt_rmutex_try_lock(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_rmutex_try_lock_block_once
+  Attempts to lock a mutex object recursively. If the mutex is available,
+  it locks the mutex. If the mutex is held by the current thread,
+  it increases the internal counter and returns 0. If not, it returns a
+  nonzero value.
+
+  If the mutex is already locked by another thread, the caller thread is
+  suspended. When the mutex becomes available again (because the other
+  thread has unlocked it), the caller thread is awakened and tries to lock
+  the mutex; if that attempt fails, this function returns a nonzero value,
+  and if it succeeds, it returns zero.
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[in] lock Pointer to the qurt_mutex_t object.
+
+  @return
+  0 -- Success. \n
+  Nonzero -- Failure.
+
+  @dependencies
+  None.
+ */
+int qurt_rmutex_try_lock_block_once(qurt_mutex_t *lock);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_RMUTEX_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_rmutex2.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_rmutex2.h
new file mode 100755
index 0000000000000..a37e7e4458c4b
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_rmutex2.h
@@ -0,0 +1,183 @@
+#ifndef QURT_RMUTEX2_H
+#define QURT_RMUTEX2_H
+/**
+  @file qurt_rmutex2.h
+  @brief Prototypes of rmutex2 API
+
+EXTERNAL FUNCTIONS
+   None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2013, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup mutex_types
+@{ */
+/*=============================================================================
+                        TYPEDEFS
+=============================================================================*/
+
+/** QuRT rmutex2 type.
+    Mutex type used with rmutex2 APIs.
+ */
+typedef struct {
+   /** @cond */
+   unsigned int holder __attribute__((aligned(8))); /* UGP value of the mutex holder. */
+   unsigned short waiters;    /* Number of waiting threads. */
+   unsigned short refs;       /* Number of references to this mutex. */
+   unsigned int queue;        /* Kernel-maintained futex queue value. */
+   unsigned int excess_locks; /* Number of excess times the holder has locked the mutex. */
+   /** @endcond */
+} qurt_rmutex2_t;
+/** @} */ /* end_addtogroup mutex_types */
+/** @cond internal_only*/
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_rmutex2_init
+
+   @deprecated use #qurt_rmutex_init instead.
+
+   Initializes a recursive mutex object.
+
+   The recursive mutex is initially unlocked.
+
+   Objects of type rmutex2 solve a potential race condition between
+   unlock() and destroy() operations.
+
+   @datatypes
+   #qurt_rmutex2_t
+
+   @param[out] lock Pointer to the recursive mutex object.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+void qurt_rmutex2_init(qurt_rmutex2_t *lock);
+
+/**@ingroup func_qurt_rmutex2_destroy
+
+   @deprecated use #qurt_rmutex_destroy instead.
+
+   Destroys the specified recursive mutex. \n
+   @note1hang Recursive mutexes must not be destroyed while they are still in use. If this
+              occurs, the behavior of QuRT is undefined.
+   @note1cont In general, application code must destroy an rmutex2 object prior to
+              deallocating it; calling qurt_rmutex2_destroy() before deallocating it ensures
+              that all qurt_rmutex2_unlock() calls complete.
+
+   @datatypes
+   #qurt_rmutex2_t
+
+   @param[in] lock Pointer to the recursive mutex object to destroy.
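+
+   Lifecycle sketch (illustrative only):
+   @code
+   qurt_rmutex2_t m;
+   qurt_rmutex2_init(&m);
+   qurt_rmutex2_lock(&m);
+   qurt_rmutex2_lock(&m);    // recursive: the holder may lock again
+   qurt_rmutex2_unlock(&m);
+   qurt_rmutex2_unlock(&m);  // balanced unlocks release the mutex
+   qurt_rmutex2_destroy(&m); // destroy only after all unlocks complete
+   @endcode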
+ + @return + None. + + @dependencies + None. + + */ +void qurt_rmutex2_destroy(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_rmutex2_lock + + @deprecated use #qurt_rmutex_lock instead. + + Locks the specified recursive mutex. \n + + If a thread performs a lock operation on a recursive mutex that is not in use, the + thread gains access to the shared resource that the mutex protects, and continues + to execute. + + If a thread performs a lock operation on a recursive mutex that another thread is using, + the thread is suspended. When the mutex becomes available again + (because the other thread has unlocked it), the thread is awakened and given access to the + shared resource. + + @note1hang A thread is not suspended if it locks a recursive mutex that it has already + locked, but the mutex does not become available until the thread performs a + balanced number of unlocks on the mutex. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to lock. + + @return + None. + + @dependencies + None. + + */ +void qurt_rmutex2_lock(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_rmutex2_unlock + + @deprecated use #qurt_rmutex_unlock instead. + + Unlocks the specified recursive mutex. \n + More than one thread can be suspended on a recursive mutex. When the mutex is + unlocked, only the highest-priority thread waiting on the mutex awakens. If the + awakened thread has higher priority than the current thread, a context switch occurs. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to unlock. + + @return + None. + + @dependencies + None. + + */ +void qurt_rmutex2_unlock(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_rmutex2_try_lock + + @deprecated use #qurt_rmutex_try_lock instead. + + Attempts to lock the specified recursive mutex.\n + + Non-blocking version of qurt_rmutex2_lock(). When a call to qurt_rmutex2_lock() + succeeds immediately, this function behaves similarly, returning 0 for success. + When a call to qurt_rmutex2_lock() does not succeed immediately, this function has + no effect and returns nonzero for failure. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to lock. + + @return + 0 -- Success. \n + Nonzero -- Failure. + + */ +int qurt_rmutex2_try_lock(qurt_rmutex2_t *lock); +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_RMUTEX2_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_sclk.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_sclk.h new file mode 100755 index 0000000000000..a83cf5f1db889 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_sclk.h @@ -0,0 +1,145 @@ +#ifndef QURT_SCLK_H +#define QURT_SCLK_H +/** + @file qurt_sclk.h + @brief Header file describing the APIs supported by QuRT system SCLK + feature. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2018-2021, 2023 Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. 
+ +=============================================================================*/ + + + + +/*============================================================================= + + INCLUDE FILES + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + + +/** + Conversion from microseconds to sleep ticks. + */ +#define QURT_SYSCLOCK_TIMETICK_FROM_US(us) ((us) * 192ULL / 10UL) +#define qurt_sysclock_timetick_from_us(us) QURT_SYSCLOCK_TIMETICK_FROM_US(us) + + +/** + Conversion from timer ticks to microseconds at the nominal frequency. +*/ +#define QURT_SYSCLOCK_TIMETICK_TO_US(ticks) qurt_timer_timetick_to_us(ticks) + +/** + Maximum microseconds value for Qtimer is 1,042,499 hours. +*/ +#define QURT_SYSCLOCK_MAX_DURATION (1042499uLL * 3600uLL * 1000uLL * 1000uLL) +#define qurt_sysclock_max_duration() QURT_SYSCLOCK_MAX_DURATION +/** + Timer clock for Qtimer is 19.2 MHz. +*/ +#define QURT_SYSCLOCK_MAX_DURATION_TICKS (1042499uLL * 3600uLL * 19200000uLL) +#define qurt_sysclock_max_duration_ticks() QURT_SYSCLOCK_MAX_DURATION_TICKS +/** + Sleep timer error margin for Qtimer is 192 ticks ~10 us. +*/ +#define QURT_SYSCLOCK_ERROR_MARGIN 192U //QURT_TIMER_MIN_DURATION*timer_freq; +#define qurt_sysclock_error_margin() QURT_SYSCLOCK_ERROR_MARGIN + +/*============================================================================= + + DATA DECLARATIONS + +=============================================================================*/ + +/**@ingroup func_qurt_sysclock_get_hw_ticks + @xreflabel{sec:qurt_sysclock_get_hw_ticks} + Gets the hardware tick count.\n + Returns the current value of a 64-bit hardware counter. The value wraps around to zero + when it exceeds the maximum value. + + @note1hang This operation must be used with care because of the wrap-around behavior. + + @return + Integer -- Current value of 64-bit hardware counter. + + @dependencies + None. + */ +unsigned long long qurt_sysclock_get_hw_ticks (void); + + +/**@ingroup func_qurt_sysclock_get_hw_ticks_32 + @xreflabel{sec:qurt_sysclock_get_hw_ticks_32} + Gets the hardware tick count in 32 bits.\n + Returns the current value of a 32-bit hardware counter. The value wraps around to zero + when it exceeds the maximum value. + + @note1hang This operation is implemented as an inline C function, and should be called from a C/C++ program. + The returned 32 bits are the lower 32 bits of the Qtimer counter. + + @return + Integer -- Current value of the 32-bit timer counter. + + @dependencies + None. + */ +static inline unsigned long qurt_sysclock_get_hw_ticks_32 (void) +{ + //Beginning with v61 there is a HW register that can be read directly. + unsigned long count; + __asm__ __volatile__ (" %0 = c30 " : "=r"(count)); + return count; +} + + +/**@ingroup func_qurt_sysclock_get_hw_ticks_16 + @xreflabel{sec:qurt_sysclock_get_hw_ticks_16} + Gets the hardware tick count in 16 bits.\n + Returns the current value of a 16-bit timer counter. The value wraps around to zero + when it exceeds the maximum value. + + @note1hang This operation is implemented as an inline C function, and should be called from a C/C++ program. + The returned 16 bits are based on the value of the lower 32 bits in Qtimer + counter, right shifted by 16 bits. + + @return + Integer -- Current value of the 16-bit timer counter, calculated from the lower 32 bits in the + Qtimer counter, right shifted by 16 bits. + + @dependencies + None. 
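A short sketch of how the conversion macros above are typically combined with the tick counter; it assumes only the declarations in this header (`QURT_SYSCLOCK_TIMETICK_TO_US` forwards to `qurt_timer_timetick_to_us()`, declared near the end of the file):

```c
/* Sketch: timing a code section with the 19.2 MHz Qtimer tick counter. */
#include <qurt.h>

unsigned long long time_section_us(void (*work)(void))
{
    unsigned long long t0 = qurt_sysclock_get_hw_ticks();
    work();
    unsigned long long t1 = qurt_sysclock_get_hw_ticks();
    /* 19.2 ticks per microsecond; ignores 64-bit wrap-around, which the
     * note above cautions about. */
    return QURT_SYSCLOCK_TIMETICK_TO_US(t1 - t0);
}
```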
+ */
+
+
+static inline unsigned short qurt_sysclock_get_hw_ticks_16 (void)
+{
+    unsigned long ticks;
+
+    //Beginning with v61 there is a HW register that can be read directly.
+    __asm__ __volatile__ (" %0 = c30 " : "=r"(ticks));
+    __asm__ __volatile__ ( "%0 = lsr(%0, #16) \n" :"+r"(ticks));
+
+    return (unsigned short)ticks;
+}
+unsigned long long qurt_timer_timetick_to_us(unsigned long long ticks);
+#define qurt_sysclock_timetick_to_us(ticks) qurt_timer_timetick_to_us(ticks)
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif /* __cplusplus */
+
+#endif /* QURT_SCLK_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_secure_proc.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_secure_proc.h
new file mode 100755
index 0000000000000..f40c7deb9bca1
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_secure_proc.h
@@ -0,0 +1,53 @@
+#ifndef QURT_SECURE_PROC_H
+#define QURT_SECURE_PROC_H
+
+/**
+  @file qurt_secure_proc.h
+  @brief Definitions, macros, and prototypes used for handling a secure process.
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2015, 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup qurt_process_migrate_secure_process
+  Migrates the user process to a QuRT secure process.
+
+  @param secure_phy_address Physical starting address of the secure memory.
+  @param secure_memory_size Size of the secure memory.
+  @param entry              Entry function of the secure process.
+
+  @return
+  EOK -- Success. \n
+  Negative return value -- Error.
+
+  @dependencies
+  None.
+*/
+int qurt_process_migrate_secure_process(unsigned long long secure_phy_address, unsigned int secure_memory_size, void entry(unsigned));
+
+/**@ingroup qurt_process_get_migration_mem_size
+  Gets the size of all writable memory regions in a user PD, in preparation for
+  secure process migration.
+
+  @return
+  Size of all writable memory regions in a user PD.
+
+  @dependencies
+  None.
+*/
+int qurt_process_get_migration_mem_size(void);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_sem.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_sem.h
new file mode 100755
index 0000000000000..ee5ce4b2d94ab
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_sem.h
@@ -0,0 +1,252 @@
+#ifndef QURT_SEM_H
+#define QURT_SEM_H
+/**
+  @file qurt_sem.h
+  Prototypes of the semaphore API.
+
+EXTERNAL FUNCTIONS
+ None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None.
+
+Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+        TYPEDEFS
+=============================================================================*/
+/** @addtogroup semaphore_types
+@{ */
+
+/** QuRT semaphore type.
*/ +typedef union { + /** @cond */ + unsigned int raw[2] __attribute__((aligned(8))); + struct { + unsigned short val; /**< */ + unsigned short n_waiting; /**< */ + unsigned int reserved1; /**< */ + unsigned int queue; /**< */ + unsigned int reserved2; /**< */ + }X; /** @endcond */ +} qurt_sem_t; +/** @} */ /* end_addtogroup semaphore_types */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_sem_add + Releases access to a shared resource (the specified amount increments the semaphore count value).\n + When a thread performs an add operation on a semaphore, the specified value increments the semaphore count. + The result depends on the number of threads waiting + on the semaphore: \n + - When no threads are waiting, the current thread releases access to the shared resource + and continues executing. \n + - When one or more threads are waiting and the semaphore count value is nonzero, + the kernel repeatedly awakens the highest-priority waiting thread and decrements + the semaphore count value until either no waiting threads remain or the + semaphore count value is zero. If any of the awakened threads has higher priority + than the current thread, a context switch can occur. + + @datatypes + #qurt_sem_t + + @param[in] sem Pointer to the semaphore object to access. + @param[in] amt Amount to increment the semaphore count value. + + @return + Unused integer value. + + @dependencies + None. + + */ +int qurt_sem_add(qurt_sem_t *sem, unsigned int amt); + +/**@ingroup func_qurt_sem_up + Releases access to a shared resource. When a thread performs an up operation on a semaphore, + the semaphore count value increments. The result depends on the number of threads waiting + on the semaphore: \n + - When no threads are waiting, the current thread releases access to the shared resource + and continues executing.\n + - When one or more threads are waiting and the semaphore count value is nonzero, + the kernel awakens the highest-priority waiting thread and decrements the + semaphore count value. If the awakened thread has higher priority than the current + thread, a context switch can occur. + + @datatypes + #qurt_sem_t + + @param[in] sem Pointer to the semaphore object to access. + + @return + Unused integer value. + + @dependencies + None. + */ +static inline int qurt_sem_up(qurt_sem_t *sem) { return qurt_sem_add(sem,1); } + +/**@ingroup func_qurt_sem_down + Requests access to a shared resource. When a thread performs a down operation on a + semaphore, the result depends on the semaphore count value: \n + - When the count value is nonzero, it is decremented, and the thread gains access to the + shared resource and continues executing.\n + - When the count value is zero, it is not decremented, and the thread is suspended on the + semaphore. When the count value becomes nonzero (because another thread + released the semaphore) it is decremented, and the suspended thread is awakened + and gains access to the shared resource. + + @datatypes + #qurt_sem_t + + @param[in] sem Pointer to the semaphore object to access. + + @return + Unused integer value. + + @dependencies + None. 
+ */
+int qurt_sem_down(qurt_sem_t *sem);
+
+/**@ingroup func_qurt_sem_down_timed
+ When a thread performs a down operation on a semaphore, the result depends on the
+ semaphore count value: \n
+ - When the count value is nonzero, it is decremented, and the thread gains access to the
+ shared resource and continues executing.\n
+ - When the count value is zero, it is not decremented, and the thread is suspended on the
+ semaphore. When the count value becomes nonzero (because another thread
+ released the semaphore) it is decremented, and the suspended thread is awakened
+ and gains access to the shared resource. The wait is terminated when the specified
+ timeout expires; in that case, no access to the shared resource is granted.
+
+ @datatypes
+ #qurt_sem_t
+
+ @param[in] sem      Pointer to the semaphore object to access.
+ @param[in] duration Interval (in microseconds) to wait; the value must be between
+                     #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_ETIMEDOUT -- Timeout
+
+ @dependencies
+ None.
+ */
+int qurt_sem_down_timed(qurt_sem_t *sem, unsigned long long int duration);
+
+/**@ingroup func_qurt_sem_try_down
+ @xreflabel{hdr:qurt_sem_try_down}
+ Requests access to a shared resource (without suspending). When a thread performs a try down
+ operation on a semaphore, the result depends on the semaphore count value: \n
+ - The count value is decremented when it is nonzero. The down operation returns 0 as
+ the function result, and the thread gains access to the shared resource and is free to
+ continue executing.\n
+ - The count value is not decremented when it is zero. The down operation returns -1
+ as the function result, and the thread does not gain access to the shared resource
+ and should not continue executing.
+
+ @datatypes
+ #qurt_sem_t
+
+ @param[in] sem Pointer to the semaphore object to access.
+
+ @return
+ 0 -- Success. \n
+ -1 -- Failure.
+
+ @dependencies
+ None.
+
+ */
+int qurt_sem_try_down(qurt_sem_t *sem);
+
+/**@ingroup func_qurt_sem_init
+ Initializes a semaphore object.
+ The default initial value of the semaphore count is 1.
+
+ @param[out] sem Pointer to the initialized semaphore object.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+
+ */
+void qurt_sem_init(qurt_sem_t *sem);
+
+/**@ingroup func_qurt_sem_destroy
+ Destroys the specified semaphore.\n
+ @note1hang Semaphores must be destroyed when they are no longer in use. Failure to do
+ this causes resource leaks in the QuRT kernel.\n
+ @note1cont Semaphores must not be destroyed while they are still in use. If this occurs,
+ the behavior of QuRT is undefined.
+
+ @datatypes
+ #qurt_sem_t
+
+ @param[in] sem Pointer to the semaphore object to destroy.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+void qurt_sem_destroy(qurt_sem_t *sem);
+
+/**@ingroup func_qurt_sem_init_val
+ Initializes a semaphore object with the specified value.
+
+ @datatypes
+ #qurt_sem_t
+
+ @param[out] sem Pointer to the initialized semaphore object.
+ @param[in]  val Initial value of the semaphore count.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+
+ */
+void qurt_sem_init_val(qurt_sem_t *sem, unsigned short val);
+
+/**@ingroup func_qurt_sem_get_val
+ Gets the semaphore count value.\n
+ Returns the current count value of the specified semaphore.
+
+ @datatypes
+ #qurt_sem_t
+
+ @param[in] sem Pointer to the semaphore object to access.
+
+ @return
+ Integer semaphore count value.
+
+ @dependencies
+ None.
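A minimal sketch of the counting-semaphore pattern these APIs describe, using only functions documented above (the pool size and timeout are illustrative):

```c
/* Sketch: a counting semaphore guarding a pool of 4 buffers. */
#include <qurt.h>

static qurt_sem_t buf_sem;

void pool_init(void)
{
    qurt_sem_init_val(&buf_sem, 4);    /* 4 buffers initially available */
}

int pool_acquire(unsigned long long timeout_us)
{
    /* Block until a buffer is free or the timeout expires. */
    if (qurt_sem_down_timed(&buf_sem, timeout_us) != QURT_EOK)
        return -1;                     /* QURT_ETIMEDOUT: no buffer granted */
    return 0;
}

void pool_release(void)
{
    qurt_sem_up(&buf_sem);             /* wakes the highest-priority waiter */
}
```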
+ */
+static inline unsigned short qurt_sem_get_val(qurt_sem_t *sem ){return sem->X.val;}
+int qurt_sem_down_cancellable(qurt_sem_t *sem);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_SEM_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_shmem.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_shmem.h
new file mode 100755
index 0000000000000..980557323708a
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_shmem.h
@@ -0,0 +1,89 @@
+#ifndef QURT_SHMEM_H
+#define QURT_SHMEM_H
+
+/**
+  @file qurt_shmem.h
+
+  @brief
+  Prototypes of the QuRT inter-process shared memory APIs.
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef MODE_T
+#define MODE_T
+typedef unsigned int mode_t;
+#endif //MODE_T
+
+/**
+ * The shm_open() function establishes a connection between a shared memory object and a file descriptor.
+ * The file descriptor is used by other functions, such as mmap(), to refer to that shared memory object.
+ *
+ *
+ * @param name  Pointer to a string naming the shared memory object. The name must start with "/shm/".
+ * @param oflag File status flags and file access modes of the open file description. The following
+ *              flags are supported:
+ *              O_RDONLY: Open for read access only.
+ *              O_RDWR:   Open for read or write access.
+ *              O_CREAT:  If the shared memory object does not exist, create one.
+ * @param mode  Permission flags (currently ignored).
+ *
+ * @return File descriptor (positive number) if the operation is successful.
+ *         Negative error code if it failed.
+ *
+*/
+
+int shm_open(const char * name, int oflag, mode_t mode);
+
+/**
+ * The shm_mmap() function creates a shared memory mapping in the virtual address space of
+ * the calling process.
+ *
+ * @param addr   Starting address for the new mapping.
+ * @param len    Length of the shared memory region.
+ * @param prot   Desired memory protection of the mapping. Same as the one in mmap() of POSIX.
+ * @param flags  Determines whether updates to the mapping are visible to other processes. Same as
+ *               the one in mmap() of POSIX.
+ * @param fd     File descriptor of the shared memory object, as returned by shm_open().
+ * @param offset Unused.
+ *
+ * @return The starting address of the new mapping if the operation is successful.
+ *         Negative error code if it failed.
+ *
+*/
+
+void *shm_mmap(void *addr, unsigned int len, int prot, int flags, int fd, unsigned int offset);
+
+/**
+ * The shm_close() function removes a connection between a shared memory object and a file descriptor.
+ * If no file descriptor remains connected to the shared memory object, the shared memory object is
+ * deleted automatically. A shared memory object has the same virtual address in every process; this is
+ * a restriction of the single virtual address space.
+ *
+ *
+ * @param fd File descriptor of the shared memory object.
+ *
+ * @return 0 if the operation is successful.
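A sketch of the open/map/close flow described above. It assumes POSIX-style O_, PROT_, and MAP_ constants are available (for example from `<fcntl.h>` and `<sys/mman.h>`), since the comments above defer to POSIX mmap() semantics; the object name and page size are illustrative:

```c
/* Sketch: publishing one shared page between QuRT processes. */
#include <fcntl.h>
#include <sys/mman.h>
#include "qurt_shmem.h"

int publish_counter(unsigned int value)
{
    int fd = shm_open("/shm/counter", O_CREAT | O_RDWR, 0); /* name must start with "/shm/" */
    if (fd < 0)
        return fd;

    unsigned int *p = (unsigned int *)shm_mmap(0, 4096, PROT_READ | PROT_WRITE,
                                               MAP_SHARED, fd, 0);
    if (p == 0) {
        (void)shm_close(fd);
        return -1;
    }

    *p = value;            /* the same virtual address is seen by every process */
    return shm_close(fd);  /* object persists while other descriptors remain open */
}
```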
+ * negative error code if failed + * +*/ + + +int shm_close(int fd); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_signal.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_signal.h new file mode 100755 index 0000000000000..3a89c53394ad5 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_signal.h @@ -0,0 +1,518 @@ +#ifndef QURT_SIGNAL_H +#define QURT_SIGNAL_H + +/** + @file qurt_signal.h + @brief Prototypes of kernel signal API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup signals_types +@{ */ +#define QURT_SIGNAL_ATTR_WAIT_ANY 0x00000000 /**< Wait any. */ +#define QURT_SIGNAL_ATTR_WAIT_ALL 0x00000001 /**< Wait all. */ + +/*===================================================================== + Typedefs + ======================================================================*/ + + +/** QuRT signal type. + */ +typedef union { + /** @cond */ + unsigned long long int raw; + struct { + unsigned int signals; + unsigned int waiting; + unsigned int queue; + unsigned int attribute; + }X; + /** @endcond */ +} qurt_signal_t; + + +/** QuRT 64-bit signal type. + */ +typedef struct { + /** @cond */ + qurt_signal_t signal_sum; + unsigned long long signals; + unsigned long long waiting; + /** @endcond */ +} qurt_signal_64_t; +/** @} */ /* end_addtogroup signals_types */ +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_signal_init + Initializes a signal object. + Signal returns the initialized object. + The signal object is initially cleared. + + @note1hang Each signal-based object has one or more kernel resources associated with it; + to prevent resource leaks, call qurt_signal_destroy() + when this object is not used anymore + @datatypes + #qurt_signal_t + + @param[in] *signal Pointer to the initialized object. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_init(qurt_signal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_destroy + Destroys the specified signal object. + + @note1hang Signal objects must be destroyed when they are no longer in use. Failure + to do this causes resource leaks in the QuRT kernel.\n + @note1cont Signal objects must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + + @datatypes + #qurt_signal_t + + @param[in] *signal Pointer to the signal object to destroy. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_destroy(qurt_signal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_wait + @xreflabel{hdr:qurt_signal_wait} + Suspends the current thread until the specified signals are set. 
+
+ Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates
+ waiting on a signal, and 0 indicates not waiting on the signal.
+
+ If a thread is waiting on a signal object for any of the specified set of signals to be set,
+ and one or more of those signals is set in the signal object, the thread is awakened.
+
+ If a thread is waiting on a signal object for all of the specified set of signals to be set,
+ and all of those signals are set in the signal object, the thread is awakened.
+
+ The specified set of signals can be cleared when the signal is set.
+
+ @note1hang At most, one thread can wait on a signal object at any given time.
+
+ @datatypes
+ #qurt_signal_t
+
+ @param[in] signal    Pointer to the signal object to wait on.
+ @param[in] mask      Mask value identifying the individual signals in the signal object to
+                      wait on.
+ @param[in] attribute Indicates whether the thread waits for any of the signals to be set, or for all of
+                      them to be set. \n
+ @note1hang The wait-any and wait-all types are mutually exclusive.\n Values:\n
+            - #QURT_SIGNAL_ATTR_WAIT_ANY \n
+            - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend
+
+ @return
+ A 32-bit word with current signals.
+
+ @dependencies
+ None.
+*/
+/* ======================================================================*/
+unsigned int qurt_signal_wait(qurt_signal_t *signal, unsigned int mask,
+                              unsigned int attribute);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_wait_timed
+ @xreflabel{hdr:qurt_signal_wait}
+ Suspends the current thread until the specified signals are set or until the timeout expires.
+
+ Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates
+ waiting on a signal, and 0 indicates not waiting.
+
+ If a thread is waiting on a signal object for any of the specified set of signals to be set,
+ and one or more of those signals is set in the signal object, the thread is awakened.
+
+ If a thread is waiting on a signal object for all of the specified set of signals to be set,
+ and all of those signals are set in the signal object, the thread is awakened.
+
+ The specified set of signals can be cleared after the signal is set.
+
+ @note1hang At most, one thread can wait on a signal object at any given time.
+
+ @datatypes
+ #qurt_signal_t
+
+ @param[in]  signal    Pointer to the signal object to wait on.
+ @param[in]  mask      Mask value that identifies the individual signals in the signal object to wait on.
+ @param[in]  attribute Indicates whether the thread must wait until any of the signals are set, or until all of
+                       them are set. \n
+ @note1hang The wait-any and wait-all types are mutually exclusive.\n Values:\n
+            - #QURT_SIGNAL_ATTR_WAIT_ANY \n
+            - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend
+ @param[out] signals   Bitmask of signals that are set.
+ @param[in]  duration  Duration (microseconds) to wait. Must be in the range
+                       [#QURT_TIMER_MIN_DURATION ... #QURT_TIMER_MAX_DURATION].
+
+ @return
+ #QURT_EOK -- Success; one or more signals were set \n
+ #QURT_ETIMEDOUT -- Timed out \n
+ #QURT_EINVALID -- Duration out of range
+
+ @dependencies
+ Timed-waiting support in the kernel.
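A sketch of the timed wait just described, using only APIs from this header; the event bits and the 10 ms timeout are illustrative:

```c
/* Sketch: waiting up to 10 ms for either of two event bits. */
#include <qurt.h>

#define EVT_RX_DONE  (1u << 0)
#define EVT_SHUTDOWN (1u << 1)

int wait_for_rx(qurt_signal_t *sig)
{
    unsigned int got = 0;
    int rc = qurt_signal_wait_timed(sig, EVT_RX_DONE | EVT_SHUTDOWN,
                                    QURT_SIGNAL_ATTR_WAIT_ANY, &got, 10000ULL);
    if (rc != QURT_EOK)
        return rc;                       /* QURT_ETIMEDOUT or QURT_EINVALID */
    if (got & EVT_SHUTDOWN)
        return -1;                       /* shutdown requested */
    qurt_signal_clear(sig, EVT_RX_DONE); /* waits do not auto-clear signals */
    return 0;
}
```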
+*/
+/* ======================================================================*/
+int qurt_signal_wait_timed(qurt_signal_t *signal, unsigned int mask,
+                           unsigned int attribute, unsigned int *signals, unsigned long long int duration);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_wait_any
+ Suspends the current thread until any of the specified signals are set.
+
+ Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates
+ to wait on a signal, and 0 indicates not to wait on it.
+
+ If a thread is waiting on a signal object for any of the specified set of signals to be set,
+ and one or more of those signals is set in the signal object, the thread is awakened.
+
+ @note1hang At most, one thread can wait on a signal object at any given time.
+
+ @datatypes
+ #qurt_signal_t
+
+ @param[in] signal Pointer to the signal object to wait on.
+ @param[in] mask   Mask value identifying the individual signals in the signal object to
+                   wait on.
+
+ @return
+ 32-bit word with current signals.
+
+ @dependencies
+ None.
+*/
+/* ======================================================================*/
+static inline unsigned int qurt_signal_wait_any(qurt_signal_t *signal, unsigned int mask)
+{
+    return qurt_signal_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ANY);
+}
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_wait_all
+ Suspends the current thread until all of the specified signals are set.
+
+ Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates
+ to wait on a signal, and 0 indicates not to wait on it.
+
+ If a thread is waiting on a signal object for all of the specified set of signals to be set,
+ and all of those signals are set in the signal object, the thread is awakened.
+
+ @note1hang At most, one thread can wait on a signal object at any given time.
+
+ @datatypes
+ #qurt_signal_t
+
+ @param[in] signal Pointer to the signal object to wait on.
+ @param[in] mask   Mask value identifying the individual signals in the signal object to
+                   wait on.
+
+ @return
+ A 32-bit word with current signals.
+
+ @dependencies
+ None.
+*/
+/* ======================================================================*/
+static inline unsigned int qurt_signal_wait_all(qurt_signal_t *signal, unsigned int mask)
+{
+    return qurt_signal_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ALL);
+}
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_set
+ Sets signals in the specified signal object.
+
+ Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates
+ to set the signal, and 0 indicates not to set it.
+
+ @datatypes
+ #qurt_signal_t
+
+ @param[in] signal Pointer to the signal object to modify.
+ @param[in] mask   Mask value identifying the individual signals to set in the signal
+                   object.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+/* ======================================================================*/
+void qurt_signal_set(qurt_signal_t *signal, unsigned int mask);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_get
+ Gets a signal from a signal object.
+
+ Returns the current signal values of the specified signal object.
+
+ @datatypes
+ #qurt_signal_t
+
+ @param[in] *signal Pointer to the signal object to access.
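The set/wait pairing above is the usual producer/consumer handshake; a minimal sketch with illustrative bit names (the signal object is assumed to be initialized elsewhere with `qurt_signal_init()`, and there is a single consumer, per the one-waiter note):

```c
/* Sketch: producers set completion bits; one consumer collects them. */
#include <qurt.h>

#define WORK_A_DONE (1u << 0)
#define WORK_B_DONE (1u << 1)

static qurt_signal_t done_sig;   /* initialized once with qurt_signal_init() */

void producer_a(void) { qurt_signal_set(&done_sig, WORK_A_DONE); }
void producer_b(void) { qurt_signal_set(&done_sig, WORK_B_DONE); }

unsigned int consumer(void)
{
    /* Returns as soon as either bit is set; the caller clears the bits
     * explicitly, because waits do not auto-clear signals. */
    unsigned int got = qurt_signal_wait_any(&done_sig, WORK_A_DONE | WORK_B_DONE);
    qurt_signal_clear(&done_sig, got & (WORK_A_DONE | WORK_B_DONE));
    return got;
}
```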
+ + @return + A 32-bit word with current signals + + @dependencies + None. +*/ +/* ======================================================================*/ +unsigned int qurt_signal_get(qurt_signal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_clear + Clear signals in the specified signal object. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be cleared, and 0 indicates not to clear it. + + @note1hang Signals must be explicitly cleared by a thread when it is awakened -- the wait + operations do not automatically clear them. + + @datatypes + #qurt_signal_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to clear in the signal object. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_clear(qurt_signal_t *signal, unsigned int mask); + +/**@ingroup func_qurt_signal_wait_cancellable + @xreflabel{hdr:qurt_signal_wait_cancellable} + Suspends the current thread until either the specified signals are set or the wait operation is cancelled. + The operation is cancelled if the user process of the calling thread is killed, or if the calling thread + must finish its current QDI invocation and return to user space. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates + that a signal must be waited on, and 0 indicates not to wait on it. + + If a thread is waiting on a signal object for any of the specified set of signals to be set, and one or + more of those signals is set in the signal object, the thread is awakened. + + If a thread is waiting on a signal object for all of the specified set of signals to be set, and all of + those signals are set in the signal object, the thread is awakened. + + @note1hang At most, one thread can wait on a signal object at any given time. + + @note1cont When the operation is cancelled, the caller must assume that the signal is never set. + + @datatypes + #qurt_signal_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to + wait on. + @param[in] attribute Indicates whether the thread must wait until any of the signals are set, or until all of + them are set. Values:\n + - #QURT_SIGNAL_ATTR_WAIT_ANY \n + - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend + @param[out] return_mask Pointer to the 32-bit mask value that was originally passed to the function. + + + @return + #QURT_EOK -- Wait completed. \n + #QURT_ECANCEL -- Wait cancelled. + + + @dependencies + None. +*/ +/* ======================================================================*/ +int qurt_signal_wait_cancellable(qurt_signal_t *signal, unsigned int mask, + unsigned int attribute, + unsigned int *return_mask); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_64_init + Initializes a 64-bit signal object.\n + The signal argument returns the initialized object. + The signal object is initially cleared. + + @note1hang Each signal-based object has one or more kernel resources associated with it; + to prevent resource leaks, call qurt_signal_destroy() + when this object is not used anymore. + @datatypes + #qurt_signal_64_t + + @param[in] signal Pointer to the initialized object. 
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+/* ======================================================================*/
+void qurt_signal_64_init(qurt_signal_64_t *signal);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_64_destroy
+  Destroys the specified signal object.
+
+  @note1hang 64-bit signal objects must be destroyed when they are no longer in use. Failure
+  to do this causes resource leaks in the QuRT kernel.\n
+  @note1cont Signal objects must not be destroyed while they are still in use. If this
+  occurs, the behavior of QuRT is undefined.
+
+  @datatypes
+  #qurt_signal_64_t
+
+  @param[in] signal Pointer to the signal object to destroy.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+/* ======================================================================*/
+void qurt_signal_64_destroy(qurt_signal_64_t *signal);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_64_wait
+  Suspends the current thread until the specified signals are set.
+
+  Signals are represented as bits 0 through 63 in the 64-bit mask value. A mask bit value of 1 indicates
+  that a signal must be waited on, and 0 indicates not to wait on it.
+
+  If a thread is waiting on a signal object for all of the specified set of signals to be set,
+  and all of those signals are set in the signal object, the thread is awakened.
+
+  @note1hang At most, one thread can wait on a signal object at any given time.
+
+  @datatypes
+  #qurt_signal_64_t
+
+  @param[in] signal    Pointer to the signal object to wait on.
+  @param[in] mask      Mask value, which identifies the individual signals in the signal object to
+                       wait on.
+  @param[in] attribute Indicates whether the thread must wait until any of the signals are set, or until all of
+                       them are set. \n
+  @note1hang The wait-any and wait-all types are mutually exclusive.\n Values:\n
+             - #QURT_SIGNAL_ATTR_WAIT_ANY \n
+             - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend
+
+  @return
+  A 64-bit word with current signals.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+unsigned long long qurt_signal_64_wait(qurt_signal_64_t *signal, unsigned long long mask,
+                                       unsigned int attribute);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_64_set
+  Sets signals in the specified signal object.
+
+  Signals are represented as bits 0 through 63 in the 64-bit mask value. A mask bit value of 1 indicates
+  that a signal must be set, and 0 indicates not to set it.
+
+  @datatypes
+  #qurt_signal_64_t
+
+  @param[in] signal Pointer to the signal object to modify.
+  @param[in] mask   Mask value identifying the individual signals to set in the signal
+                    object.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+void qurt_signal_64_set(qurt_signal_64_t *signal, unsigned long long mask);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_64_get
+  Gets a signal from a signal object.
+
+  Returns the current signal values of the specified signal object.
+
+  @datatypes
+  #qurt_signal_64_t
+
+  @param[in] *signal Pointer to the signal object to access.
+
+  @return
+  A 64-bit double word with current signals.
+
+  @dependencies
+  None.
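The 64-bit variant is useful once more than 32 flags are needed; a minimal sketch using only the functions documented in this header (bit 40 is an arbitrary illustrative choice):

```c
/* Sketch: one producer and one consumer sharing a 64-bit signal object. */
#include <qurt.h>

#define EVT_CH40_DONE (1ull << 40)

static qurt_signal_64_t ch_sig;

void ch_init(void) { qurt_signal_64_init(&ch_sig); }
void ch_done(void) { qurt_signal_64_set(&ch_sig, EVT_CH40_DONE); }  /* producer */

void ch_wait(void)                                                  /* consumer */
{
    (void)qurt_signal_64_wait(&ch_sig, EVT_CH40_DONE, QURT_SIGNAL_ATTR_WAIT_ALL);
    qurt_signal_64_clear(&ch_sig, EVT_CH40_DONE);  /* waits do not auto-clear */
}
```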
+*/ +/* ======================================================================*/ +unsigned long long qurt_signal_64_get(qurt_signal_64_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_64_clear + Clears signals in the specified signal object. + + Signals are represented as bits 0 through 63 in the 64-bit mask value. A mask bit value of 1 + indicates that a signal must be cleared, and 0 indicates not to clear it. + + @note1hang Signals must be explicitly cleared by a thread when it is awakened -- the wait + operations do not automatically clear them. + + @datatypes + #qurt_signal_64_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to clear in the signal object. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_64_clear(qurt_signal_64_t *signal, unsigned long long mask); + +#ifdef __cplusplus +} +#endif + +#endif /* QURT_SIGNAL_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_signal2.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_signal2.h new file mode 100755 index 0000000000000..43975100cbf75 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_signal2.h @@ -0,0 +1,340 @@ +#ifndef QURT_SIGNAL2_H +#define QURT_SIGNAL2_H + +/** + @file qurt_signal2.h + @brief Prototypes of kernel signal2 API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +#define QURT_SIGNAL_ATTR_WAIT_ANY 0x00000000 +#define QURT_SIGNAL_ATTR_WAIT_ALL 0x00000001 + +/*===================================================================== + Typedefs + ======================================================================*/ + +/** @addtogroup signals2_types +@{ */ +/** qurt_signal2 type. + */ +typedef union { + /** @cond */ + struct{ + unsigned int cur_mask; /* Current set of signal bits that are set. */ + unsigned int sig_state; /* Current state. */ + /* Bit 0 -- in anysignal wait. */ + /* Bit 1 -- in allsignal wait. */ + /* Bit 2 -- in interrupt wait. */ + /* Bits 31-3 -- reference count field. */ + unsigned int queue; /* Kernel-maintained futex queue value. */ + unsigned int wait_mask; /* When sig_state indicates a waiter is present, this is the wait mask. */ + }; + unsigned long long int raw; + /** @endcond */ +} qurt_signal2_t; +/* @} */ /* end_addtogroup signals2_types */ + +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_init + + @deprecated use #qurt_signal_init instead. + + Initializes a signal2 object. + Signal returns the initialized object. + The signal object is initially cleared. + + Objects of type signal2 solve a potential race condition between + set() and destroy() operations. + + @datatypes + #qurt_signal2_t + + @param[in] *signal Pointer to the initialized object. + + @return + None. 
+
+  @dependencies
+  Each signal-based object has one or more kernel resources associated with it;
+  therefore, users must call qurt_signal2_destroy() when this object is no longer in use.
+ */
+/* ======================================================================*/
+void qurt_signal2_init(qurt_signal2_t *signal);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal2_destroy
+
+  @deprecated use #qurt_signal_destroy instead.
+
+  Destroys the specified signal object.
+
+  @note1cont Signal objects must not be destroyed while they are still in use. If this
+  occurs, the behavior of QuRT is undefined.
+  @note1cont Application code should destroy a signal2 object prior to deallocating it.
+  Calling qurt_signal2_destroy() before deallocating a
+  signal2 object ensures completion of all qurt_signal2_set() calls.
+
+  @datatypes
+  #qurt_signal2_t
+
+  @param[in] signal Pointer to the signal object to destroy.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+/* ======================================================================*/
+void qurt_signal2_destroy(qurt_signal2_t *signal);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal2_wait
+
+  @deprecated use #qurt_signal_wait instead.
+
+  Suspends the current thread until the specified signals are set.
+
+  Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 indicates
+  a signal to wait on.
+
+  If a thread calls this API with QURT_SIGNAL_ATTR_WAIT_ANY, the thread will be awakened when
+  any of the signals specified in the mask are set.
+
+  If a thread calls this API with QURT_SIGNAL_ATTR_WAIT_ALL, the thread will be awakened only
+  when all the signals specified in the mask are set.
+
+  @note1hang At most, one thread can wait on a signal object at any given time.
+
+  @datatypes
+  #qurt_signal2_t
+
+  @param[in] signal    Pointer to the signal object to wait on.
+  @param[in] mask      Mask value identifying the individual signals in the signal object to wait on.
+  @param[in] attribute Specifies whether the thread waits for any of the signals to be set, or for all of
+                       them to be set. Values:\n
+                       - QURT_SIGNAL_ATTR_WAIT_ANY \n
+                       - QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend
+  @return
+  A 32-bit word with current signals.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+unsigned int qurt_signal2_wait(qurt_signal2_t *signal, unsigned int mask,
+                               unsigned int attribute);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal2_wait_any
+
+  @deprecated use #qurt_signal_wait_any instead.
+
+  Suspends the current thread until any of the specified signals are set.
+
+  Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 indicates
+  a signal to wait on.
+
+  The thread will be awakened when any of the signals specified in the mask are set.
+
+  @note1hang At most, one thread can wait on a signal object at any given time.
+
+  @datatypes
+  #qurt_signal2_t
+
+  @param[in] signal Pointer to the signal object to wait on.
+  @param[in] mask   Mask value identifying the individual signals in the signal object to
+                    wait on.
+
+  @return
+  32-bit word with current signals.
+
+  @dependencies
+  None.
+*/ +/* ======================================================================*/ +static inline unsigned int qurt_signal2_wait_any(qurt_signal2_t *signal, unsigned int mask) +{ + return qurt_signal2_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ANY); +} + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_wait_all + + @deprecated use #qurt_signal_wait_all instead. + + Suspends the current thread until all of the specified signals are set. + + Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 indicates + a signal to wait on. + + The thread will be awakened only when all the signals specified in the mask are set. + + @note1hang At most one thread can wait on a signal object at any given time. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to + wait on. + + @return + 32-bit word with current signals. + + @dependencies + None. +*/ +/* ======================================================================*/ +static inline unsigned int qurt_signal2_wait_all(qurt_signal2_t *signal, unsigned int mask) +{ + return qurt_signal2_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ALL); +} + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_set + + @deprecated use #qurt_signal_set instead. + + Sets signals in the specified signal object. + + Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 indicates + that a signal must be set, and 0 indicates not to set the signal. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to set in the signal + object. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_signal2_set(qurt_signal2_t *signal, unsigned int mask); + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_get + + @deprecated use #qurt_signal_get instead. + + Gets a signal from a signal object. + + Returns the current signal values of the specified signal object. + + @datatypes + #qurt_signal2_t + + @param[in] *signal Pointer to the signal object to access. + + @return + 32-bit word with current signals. + + @dependencies + None. +*/ +/* ======================================================================*/ +unsigned int qurt_signal2_get(qurt_signal2_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_clear + + @deprecated use #qurt_signal_clear instead. + + Clear signals in the specified signal object. + + Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be cleared, and 0 indicates not to clear the signal. + + @note1hang Signals must be explicitly cleared by a thread when it is awakened -- the wait + operations do not automatically clear them. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to clear in the signal object. + + @return + None. + + @dependencies + None. 
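Since every `qurt_signal2_*` entry point here is deprecated in favor of a `qurt_signal_*` counterpart (per the `@deprecated` tags), migration is a mechanical rename; a minimal sketch:

```c
/* Sketch: moving from the deprecated signal2 API to the current one. */
#include <qurt.h>

void before(void)                 /* deprecated qurt_signal2_* API */
{
    qurt_signal2_t s2;
    qurt_signal2_init(&s2);
    qurt_signal2_set(&s2, 0x1u);
    (void)qurt_signal2_wait_any(&s2, 0x1u);  /* returns at once: bit is set */
    qurt_signal2_destroy(&s2);
}

void after(void)                  /* current API, same mask semantics */
{
    qurt_signal_t s;
    qurt_signal_init(&s);
    qurt_signal_set(&s, 0x1u);
    (void)qurt_signal_wait_any(&s, 0x1u);
    qurt_signal_destroy(&s);
}
```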
+ */
+/* ======================================================================*/
+void qurt_signal2_clear(qurt_signal2_t *signal, unsigned int mask);
+
+/**@ingroup func_qurt_signal2_wait_cancellable
+
+  @deprecated use #qurt_signal_wait_cancellable instead.
+
+  Suspends the current thread until either the specified signals are set or the wait operation is cancelled.
+  The operation is cancelled if the user process of the calling thread is killed, or if the calling thread
+  must finish its current QDI invocation and return to user space.
+
+  Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates
+  that a signal must be waited on, and 0 indicates not to wait on it.
+
+  If a thread is waiting on a signal object for any of the specified set of signals to be set, and one or
+  more of those signals is set in the signal object, the thread is awakened.
+
+  If a thread is waiting on a signal object for all of the specified set of signals to be set, and all of
+  those signals are set in the signal object, the thread is awakened.
+
+  @note1hang At most, one thread can wait on a signal object at any given time.
+
+  @note1cont When the operation is cancelled, the caller must assume that the signal is never set.
+
+  @datatypes
+  #qurt_signal2_t
+
+  @param[in]  signal       Pointer to the signal object to wait on.
+  @param[in]  mask         Mask value identifying the individual signals in the signal object to
+                           wait on.
+  @param[in]  attribute    Indicates whether the thread must wait until any of the signals are set, or until all of
+                           them are set. Values:\n
+                           - #QURT_SIGNAL_ATTR_WAIT_ANY \n
+                           - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend
+  @param[out] p_returnmask Pointer to the 32-bit mask value that was originally passed to the function.
+
+
+  @return
+  #QURT_EOK -- Wait completed. \n
+  #QURT_ECANCEL -- Wait cancelled.
+
+
+  @dependencies
+  None.
+*/
+int qurt_signal2_wait_cancellable(qurt_signal2_t *signal,
+                                  unsigned int mask,
+                                  unsigned int attribute,
+                                  unsigned int *p_returnmask);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_SIGNAL2_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_space.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_space.h
new file mode 100755
index 0000000000000..2c3f9e4496697
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_space.h
@@ -0,0 +1,230 @@
+#ifndef QURT_SPACE_H
+#define QURT_SPACE_H
+/**
+  @file qurt_space.h
+  @brief Prototypes of the QuRT process control APIs.
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2013, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#include
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** This flag is a request to the OS to suspend the process just before calling main().
+It is obsolete and has been replaced by QURT_PROCESS_SUSPEND_ON_STARTUP. */
+#define SPAWNN_FLAG_SUSPEND_ON_STARTUP QURT_PROCESS_SUSPEND_ON_STARTUP
+
+/**
+ * Creates and starts a process from an ELF file of the specified name. The slash symbols
+ * "/" and "\" are ignored; do not include the directory name in the input. This function
+ * accepts the SPAWN flags. Multiple SPAWN flags can be specified by OR'ing the flags.
+ *
+ * @param name ELF name of the executable.
Name shall not contain directories;
+ *             use "dsp2.elf" instead of "/prj/qct/.../dsp2.elf".
+ *
+ * @return
+ *     Process ID -- Success \n
+ *     Negative error code -- Failure \n
+ *     #QURT_EPRIVILEGE -- Caller does not have enough privilege for this operation \n
+ *     #QURT_EMEM -- Not enough memory to perform the operation \n
+ *     #QURT_EFAILED -- Operation failed \n
+ *     #QURT_ENOTALLOWED -- Operation not allowed \n
+ *     #QURT_ENOREGISTERED -- Not registered \n
+ *     #QURT_ENORESOURCE -- Resource exhaustion \n
+ *     #QURT_EINVALID -- Invalid argument value
+*/
+
+int qurt_spawn_flags(const char * name, int flags);
+
+/**
+  Creates and starts a process from an ELF file of the specified name. The slash symbols
+  "/" and "\" are ignored; do not include the directory name in the input.
+
+  @param name ELF name of the executable. Name shall not contain directories;
+              use "dsp2.elf" instead of "/prj/qct/.../dsp2.elf".
+
+  @return
+  Process ID -- Success. \n
+  Negative error code -- Failure.
+
+*/
+static inline int qurt_spawn(const char *name)
+{
+   return qurt_spawn_flags(name,0);
+}
+
+/**
+ * Returns the process ID of the current process.
+ *
+ * @return
+ * Process ID
+ *
+*/
+#define qurt_getpid qurt_process_get_id
+
+/**
+ * The qurt_wait() function waits for a status change in a child process. A parent
+ * process can use it to block until any child process terminates.
+ *
+ * This API returns an error if there are no user processes or if all user processes
+ * have been detached.
+ *
+ * @param status Pointer to the status variable. The variable receives the status value of the
+ *               child process; the value comes from the exit() system call made by the child process.
+ *
+ * @return
+ *     Process ID of the child process that changed status -- Success \n
+ *     Negative error code -- Failure
+ *
+*/
+
+int qurt_wait(int *status);
+
+
+/** @cond */
+/* APIs that allow registering callbacks on spawn of a user PD. */
+typedef void (*QURT_SPAWN_PFN)(int client_handle, void *data_ptr); //no return, since we won't be error checking it in spawn
+typedef int (*QURT_CB_PFN)(int client_handle, void *user_data, void *info);
+typedef union {
+    QURT_SPAWN_PFN spawn_pfn;
+    QURT_CB_PFN cb_pfn;
+} qurt_process_callback_pfn_t;
+/** @endcond */
+
+/** @cond internal_only */
+
+/**@ingroup func_qurt_event_register
+Sets the bits specified by mask in the signal passed by the caller. The signal is set
+when the client handle indicated by value goes away (at process exit). Multiple clients can
+register for the signal to be set.
+
+@datatypes
+
+@param[in]  type      QURT_PROCESS_EXIT is the only event that can be registered for.
+@param[in]  value     Indicates the client handle of the process for which the event is registered.
+@param[in]  psig      Pointer to the signal object to set when the event occurs.
+@param[in]  mask      Mask bits to set in the signal.
+@param[out] data      Pointer to the variable that receives the exit code of the exiting process.
+@param[in]  data_size Size of the data variable.
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EMEM -- Not enough memory to allocate resources \n
+#QURT_EVAL -- Invalid values passed to the API
+
+@dependencies
+None.
+*/
+int qurt_event_register(int type, int value, qurt_signal_t *psig, unsigned int mask, void *data, unsigned int data_size);
+
+/**@ingroup func_qurt_callback_register_onspawn
+Allows registering for a callback on spawn of any user process.
+
+@datatypes
+#QURT_SPAWN_PFN
+
+@param[in] pFn       Callback function to call when any user process is spawned.
+@param[in] user_data Pointer to the argument that the callback must be called with.
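A minimal sketch of the spawn-and-reap flow documented above, using only `qurt_spawn()` and `qurt_wait()`; "dsp2.elf" is the illustrative image name the docs themselves use:

```c
/* Sketch: spawning a child process image and reaping its exit status. */
#include <qurt.h>

int run_child(void)
{
    int pid = qurt_spawn("dsp2.elf");   /* no directory components allowed */
    if (pid < 0)
        return pid;                     /* e.g. QURT_EPRIVILEGE, QURT_EMEM */

    int status = 0;
    int exited = qurt_wait(&status);    /* blocks until a child terminates */
    return (exited == pid) ? status : -1;
}
```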
+
+
+@return A positive value is a handle to use when deregistering the callback.
+        Multiple clients can register a callback on spawn, and individual clients can later deregister.
+
+        On failure, QURT_EFATAL is returned.
+
+@dependencies
+None.
+*/
+int qurt_callback_register_onspawn(QURT_SPAWN_PFN pFn, void *user_data);
+
+/**@ingroup func_qurt_callback_deregister_onspawn
+Allows deregistering a callback on spawn.
+
+@param[in] callback_handle Handle returned by qurt_callback_register_onspawn.
+
+@return
+#QURT_EOK -- Deregistration was successful
+
+@dependencies
+None.
+*/
+int qurt_callback_deregister_onspawn(int callback_handle);
+
+/**@ingroup func_qurt_process_callback_register
+Allows registering for a callback during or after image loading.
+Generic callback types:
+   Function similarly to qurt_callback_register_onspawn(). The callback is called after the
+   process is loaded, before the process thread starts. The callback has no return value and
+   receives no info from the OS.
+   pFn  - QURT_SPAWN_PFN
+   type - QURT_PROCESS_CB_GENERIC
+   arg1 - not used
+   arg2 - not used
+   arg3 - not used
+Note callback types:
+   The callback is called during process loading: before segment loading (QURT_PROCESS_NOTE_CB_PRE_MAP),
+   or after segment loading (QURT_PROCESS_NOTE_CB_POST_MAP). The OS provides info to the callback; the
+   info argument is populated with a pointer to the mapped note corresponding to the callback.
+   The callback has a return value; the loader fails if the callback returns a value that is not QURT_EOK.
+   pFn  - QURT_CB_PFN
+   type - QURT_PROCESS_NOTE_CB_PRE_MAP or QURT_PROCESS_NOTE_CB_POST_MAP
+   arg1 - note type (ex: NOTE_TYPE_POOL_INFO, NOTE_TYPE_SEGMENT_INFO, NOTE_TYPE_ARB_INFO)
+   arg2 - note name
+   arg3 - not used
+
+@datatypes
+
+@param[in] pFn       Callback function to call.
+@param[in] type      Callback type.
+@param[in] user_data Pointer to the argument that the callback must be called with.
+@param[in] arg1      Argument interpreted by the OS based on callback type.
+@param[in] arg2      Argument interpreted by the OS based on callback type.
+@param[in] arg3      Argument interpreted by the OS based on callback type (currently not used).
+
+
+@return A positive value is a handle to use when deregistering the callback.
+        Multiple clients can register a callback, and individual clients can later deregister.
+
+        On failure, QURT_EFATAL is returned.
+
+@dependencies
+None.
+*/
+int qurt_process_callback_register(qurt_process_callback_pfn_t pFn,
+                                   qurt_process_cb_type_t type,
+                                   void *user_data,
+                                   qurt_process_callback_arg_t arg1,
+                                   qurt_process_callback_arg_t arg2,
+                                   qurt_process_callback_arg_t arg3);
+
+
+
+/**@ingroup func_qurt_process_callback_deregister
+Allows deregistering a callback for image loading.
+@param[in] callback_handle Handle returned by qurt_process_callback_register.
+
+@return
+#QURT_EOK -- Deregistration was successful
+
+@dependencies
+None.
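A sketch of registering and later removing an on-spawn hook with the pair of functions above; `on_spawn` and `my_ctx` are illustrative names:

```c
/* Sketch: installing a generic on-spawn callback. */
#include <qurt.h>

static void on_spawn(int client_handle, void *data_ptr)
{
    /* Called when a user process is spawned; no return value is checked. */
    (void)client_handle;
    (void)data_ptr;
}

static int cb_handle;

int install_spawn_hook(void *my_ctx)
{
    cb_handle = qurt_callback_register_onspawn(on_spawn, my_ctx);
    return (cb_handle > 0) ? 0 : cb_handle;   /* QURT_EFATAL on failure */
}

void remove_spawn_hook(void)
{
    (void)qurt_callback_deregister_onspawn(cb_handle);
}
```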
+*/ +int qurt_process_callback_deregister(int callback_handle); +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_SPACE_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_srm_consts.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_srm_consts.h new file mode 100755 index 0000000000000..48a8b6a38c402 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_srm_consts.h @@ -0,0 +1,32 @@ +#ifndef QURT_SRM_CONSTS_H +#define QURT_SRM_CONSTS_H +/** + @file qurt_srm_consts.h + @brief Type definitions for srm + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2020-2021, 2022 by Qualcomm Technologies, Inc. All Rights Reserved +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** @cond */ +#define QURT_SRM_WAKEUP_REQUEST 1U << 0 /**< Value = 1: Send wakeup request to the SRM server. */ +#define QURT_SRM_SET_HANDLE 1U << 1 /**< Value = 2: Set the client handle for a new SRM client. */ +#define QURT_SRM_ALLOC_KERNEL_PAGES 1U << 2 /**< Value = 4: Allocate pages from the kernel VA space. */ +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_SRM_CONSTS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_srm_driver.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_srm_driver.h new file mode 100755 index 0000000000000..5489e3dddbcca --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_srm_driver.h @@ -0,0 +1,140 @@ +#ifndef QURT_SRM_DRIVER_H +#define QURT_SRM_DRIVER_H +/** + @file qurt_srm_driver.h + @brief Definitions, macros, and prototypes used by SRM drivers. + + EXTERNAL FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2021-2023 by Qualcomm Technologies, Inc. All Rights Reserved. + + =============================================================================*/ +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* +|| Define qurt_srm_driver_t structure, which represents +|| the "registration" object for an SRM driver. +*/ +/** @cond internal_only */ +struct _qurt_srm_driver { + const char *name; + qurt_qdi_obj_t *obj; +}; + +typedef struct _qurt_srm_driver qurt_srm_driver_t; + +/* +|| qurt_srm_object_invoke() is an internal equivalent to qurt_qdi_handle_invoke(). +|| It behaves the same, but it takes a QDI object pointer instead of a handle. +*/ + +#define qurt_srm_object_invoke(o,m,...) 
\
+    _QDMPASTE(_QDMSOI,_QDMCNT(QDI_HANDLE_LOCAL_CLIENT,o,m,##__VA_ARGS__))(QDI_HANDLE_LOCAL_CLIENT,o,m,##__VA_ARGS__)
+#define _QDMSOI3(a,b,c) qurt_srm_oi3(a,b,c)
+#define _QDMSOI4(a,b,c,d) qurt_srm_oi4(a,b,c,(int)(d))
+#define _QDMSOI5(a,b,c,d,e) qurt_srm_oi5(a,b,c,(int)(d),(int)(e))
+#define _QDMSOI6(a,b,c,d,e,f) qurt_srm_oi6(a,b,c,(int)(d),(int)(e),(int)(f))
+#define _QDMSOI7(a,b,c,d,e,f,g) qurt_srm_oi7(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g))
+#define _QDMSOI8(a,b,c,d,e,f,g,h) qurt_srm_oi8(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h))
+#define _QDMSOI9(a,b,c,d,e,f,g,h,i) qurt_srm_oi9(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i))
+#define _QDMSOI10(a,b,c,d,e,f,g,h,i,j) qurt_srm_oi10(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j))
+#define _QDMSOI11(a,b,c,d,e,f,g,h,i,j,k) qurt_srm_oi11(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j),(int)(k))
+#define _QDMSOI12(a,b,c,d,e,f,g,h,i,j,k,l) qurt_srm_oi12(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j),(int)(k),(int)(l))
+
+int qurt_srm_oi3(int, qurt_qdi_obj_t *, int);
+int qurt_srm_oi4(int, qurt_qdi_obj_t *, int, int);
+int qurt_srm_oi5(int, qurt_qdi_obj_t *, int, int, int);
+int qurt_srm_oi6(int, qurt_qdi_obj_t *, int, int, int, int);
+int qurt_srm_oi7(int, qurt_qdi_obj_t *, int, int, int, int, int);
+int qurt_srm_oi8(int, qurt_qdi_obj_t *, int, int, int, int, int, int);
+int qurt_srm_oi9(int, qurt_qdi_obj_t *, int, int, int, int, int, int, int);
+int qurt_srm_oi10(int, qurt_qdi_obj_t *, int, int, int, int, int, int, int, int);
+int qurt_srm_oi11(int, qurt_qdi_obj_t *, int, int, int, int, int, int, int, int, int);
+int qurt_srm_oi12(int, qurt_qdi_obj_t *, int, int, int, int, int, int, int, int, int, int);
+
+#define QDI_SRM_INIT 192
+
+/*
+|| QURT_SRM_DECLARE_DRIVER() declares an SRM driver to the SRM infrastructure.
+||
+|| The three arguments are:
+||  unique_id -- Unique C identifier, unused but must be a unique global symbol.
+||  name      -- Name of the driver by which an SRM client attempts to open it.
+||  obj       -- Pointer to the singleton object of the driver, which handles things such as
+||               initialization and QDI_OPEN requests.
+*/
+
+#define QURT_SRM_DECLARE_DRIVER(unique_id, xname, xobj) \
+   __attribute__((section(".srm.rodata.user.main.DECL"))) const qurt_srm_driver_t unique_id = \
+   { .name = xname, .obj = xobj }
+
+
+/**@ingroup func_qurt_srm_mapping_create
+   Creates a memory mapping in the pagetable with the specified attributes.
+
+   @param[in] client_handle Client handle representing the process for which the
+                            mapping is created.
+   @param[in] pageno_virt   Pointer to the virtual page. NULL indicates that SRM
+                            assigns the virtual memory.
+   @param[in] pageno_phys   Physical page to use for the mapping.
+   @param[in] page_count    Number of 4K pages to map.
+   @param[in] cache_attr    Cache attributes for the mapping.
+   @param[in] perm          Permissions to use for the mapping.
+
+   @return A value greater than 0 indicates a handle that can be passed to
+           qdi_close() to remove the mapping. A negative value indicates
+           an error.
+
+   @dependencies
+   None.
+*/
+int qurt_srm_mapping_create(int client_handle,
+                            unsigned *pageno_virt,
+                            unsigned pageno_phys,
+                            unsigned page_count,
+                            qurt_mem_cache_mode_t cache_attr,
+                            qurt_perm_t perm);
+
+
+/**@ingroup func_qurt_srm_get_pid
+   Gets the PID for the client_handle that is passed.
+
+   @param[in] client_handle Client handle for which the PID is required.
+
+   @return PID of the client.
+           A PID value of -1 (as unsigned) is returned in case of error.
+
+   @dependencies
+   None.
+*/
+unsigned qurt_srm_get_pid(int client_handle);
+
+
+/**@ingroup func_qurt_srm_get_thread_id
+   Gets the thread id of the client requesting a service from SRM.
+
+   @param[in] None.
+
+   @return Thread id of the client thread.
+
+   @dependencies
+   None.
+*/
+qurt_thread_t qurt_srm_get_client_thread_id(void);
+
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_SRM_DRIVER_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_stid.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_stid.h
new file mode 100755
index 0000000000000..379f46aaa4b80
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_stid.h
@@ -0,0 +1,73 @@
+#ifndef QURT_STID_H
+#define QURT_STID_H
+/**
+  @file qurt_stid.h
+  Prototypes of the software thread identifier (stid) interface APIs.
+  An stid is an 8-bit identifier that can be assigned to a software thread.
+  The performance monitor logic uses the stid as a counting match criterion
+  for maskable events. The stid is also used by the hardware debugger
+  (ISDB) to match breakpoints.
+
+  EXTERNAL FUNCTIONS
+   None.
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+  Copyright (c) 2024 Qualcomm Technologies, Inc.
+  All rights reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_stid_alloc
+  Allocates a unique stid.
+
+  @param[in]  pid  Process identifier
+  @param[out] stid Pointer to a variable to return the stid
+
+  @return
+  QURT_EOK - Allocation success
+  QURT_ENORESOURCE - No stid available for allocation
+  QURT_EINVALID - Invalid input
+
+  @dependencies
+  None.
+ */
+int qurt_stid_alloc(unsigned int pid, unsigned int *stid);
+
+/**@ingroup func_qurt_stid_release
+  Releases the stid.
+
+
+  @param[in] pid  Process identifier
+  @param[in] stid STID to release
+
+  @note1hang
+  The user shall reset the released stid in the process or thread(s) to the
+  default value (QURT_STID_DEFAULT) before releasing that stid.
+
+  @return
+  QURT_EOK - Release success
+  QURT_ENOTALLOWED - Operation not allowed for a pid
+  QURT_EINVALID - Invalid stid
+
+  @dependencies
+  None.
+ */
+int qurt_stid_release(unsigned int pid, unsigned int stid);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_STID_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_thread.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_thread.h
new file mode 100755
index 0000000000000..499699e7c72e2
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_thread.h
@@ -0,0 +1,1260 @@
+#ifndef QURT_THREAD_H
+#define QURT_THREAD_H
+/**
+  @file qurt_thread.h
+  @brief Prototypes of Thread API
+
+  EXTERNAL FUNCTIONS
+   None.
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2018, 2020-2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+/* The following is for C code only */
+#ifndef __ASSEMBLER__
+#include
+#include "qurt_pmu.h"
+#include "qurt_api_version.h"
+#endif /* __ASSEMBLER__ */
+#include "qurt_consts.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+                        CONSTANTS AND MACROS
+=============================================================================*/
+
+
+/*
+   Bitmask configuration to select DSP hardware threads.
+   To select all the hardware threads, use #QURT_THREAD_CFG_BITMASK_ALL
+   and the following: \n
+   - For QDSP6 V2/V3, all six hardware threads are selected \n
+   - For QDSP6 V3L, all four hardware threads are selected \n
+   - For QDSP6 V4, all three hardware threads are selected
+ */
+
+#define QURT_THREAD_CFG_BITMASK_HT0     0x00000001   /**< HT0. */
+#define QURT_THREAD_CFG_BITMASK_HT1     0x00000002   /**< HT1. */
+#define QURT_THREAD_CFG_BITMASK_HT2     0x00000004   /**< HT2. */
+#define QURT_THREAD_CFG_BITMASK_HT3     0x00000008   /**< HT3. */
+#define QURT_THREAD_CFG_BITMASK_HT4     0x00000010   /**< HT4. */
+#define QURT_THREAD_CFG_BITMASK_HT5     0x00000020   /**< HT5. */
+/** @cond rest_reg_dist */
+/** @addtogroup thread_macros
+@{ */
+/** @xreflabel{sec:qurt_thread_cfg} */
+
+#define QURT_THREAD_CFG_BITMASK_ALL     0x000000ffU  /**< Select all the hardware threads. */
+/** @} */ /* end_addtogroup thread_macros */
+/** @endcond */
+
+#define QURT_THREAD_CFG_USE_RAM         0x00000000   /**< Use RAM. */
+#define QURT_THREAD_CFG_USE_TCM         0x00000100   /**< Use TCM. */
+/** @cond rest_reg_dist */
+/** @addtogroup thread_macros
+@{ */
+#define QURT_THREAD_BUS_PRIO_DISABLED 0 /**< Thread internal bus priority disabled. */
+#define QURT_THREAD_BUS_PRIO_ENABLED  1 /**< Thread internal bus priority enabled. */
+/** @} */ /* end_addtogroup thread_macros */
+/** @endcond */
+
+#define QURT_THREAD_AUTOSTACK_DISABLED 0 /**< Thread has the autostack v2 feature disabled. */
+#define QURT_THREAD_AUTOSTACK_ENABLED  1 /**< Thread has the autostack v2 feature enabled. */
+
+/*
+   Macros for QuRT thread attributes.
+ */
+#define QURT_HTHREAD_L1I_PREFETCH      0x1     /**< Enables hardware L1 instruction cache prefetching. */
+#define QURT_HTHREAD_L1D_PREFETCH      0x2     /**< Enables hardware L1 data cache prefetching. */
+#define QURT_HTHREAD_L2I_PREFETCH      0x4     /**< Enables hardware L2 instruction cache prefetching. */
+#define QURT_HTHREAD_L2D_PREFETCH      0x8     /**< Enables hardware L2 data cache prefetching. */
+#define QURT_HTHREAD_DCFETCH           0x10    /**< Enables DC fetch to the provided virtual address.
+                                                    DC fetch indicates to the hardware that a data memory access is likely.
+                                                    Instructions are dropped when there is high bus utilization. */
+/** @addtogroup thread_macros
+@{ */
+/** @xreflabel{hdr:partition_tcm} */
+/*
+   The value below is used to create legacy QuRT threads by default.
+   If a thread has this as the detach_state, the thread can be joined
+   on until it exits. When we are able to change the default behavior of all
+   QuRT threads to JOINABLE (POSIX default), we can remove this legacy
+   behavior.
+*/
+#define QURT_THREAD_ATTR_CREATE_LEGACY 0U /**< Create a legacy QuRT thread by default. If a thread has this as a detach state, the thread can be joined on until it exits. */
+#define QURT_THREAD_ATTR_CREATE_JOINABLE 1U /**< Create a joinable thread. */
+#define QURT_THREAD_ATTR_CREATE_DETACHED 2U /**< Create a detached thread.
*/ +/** @} */ /* end_addtogroup thread_macros */ + + +#define QURT_THREAD_ATTR_NAME_MAXLEN 16 /**< Maximum name length. */ +#define QURT_THREAD_ATTR_TCB_PARTITION_RAM 0 /**< Creates threads in RAM/DDR. */ +#define QURT_THREAD_ATTR_TCB_PARTITION_TCM 1 /**< Creates threads in TCM. */ +/** @cond rest_reg_dist */ +/** @addtogroup thread_macros +@{ */ +#define QURT_THREAD_ATTR_TCB_PARTITION_DEFAULT QURT_THREAD_ATTR_TCB_PARTITION_RAM /**< Backward compatibility. */ +#define QURT_THREAD_ATTR_PRIORITY_DEFAULT 254 /**< Priority.*/ +#define QURT_THREAD_ATTR_ASID_DEFAULT 0 /**< ASID. */ +#define QURT_THREAD_ATTR_AFFINITY_DEFAULT (-1) /**< Affinity. */ +#define QURT_THREAD_ATTR_BUS_PRIO_DEFAULT 255 /**< Bus priority. */ +#define QURT_THREAD_ATTR_AUTOSTACK_DEFAULT 0 /**< Default autostack v2 disabled thread. */ +#define QURT_THREAD_ATTR_TIMETEST_ID_DEFAULT (-2) /**< Timetest ID. */ +#define QURT_THREAD_ATTR_STID_DEFAULT QURT_STID_DEFAULT /**< STID. */ +#define QURT_THREAD_ATTR_STID_ENABLE 1 /**< Indicate to allocate STID during thread creation. */ + +#define QURT_PRIORITY_FLOOR_DEFAULT 255U /**< Default floor. */ +/** @} */ /* end_addtogroup thread_macros */ + +// Option for suspending thread +#define QURT_THREAD_SUSPEND_SYNCHRONOUS 0x0U // bit#0 +#define QURT_THREAD_SUSPEND_ASYNCHRONOUS 0x1U // bit#0 +#define QURT_THREAD_SUSPEND_KEEP_HMX 0x0U // bit#1 +#define QURT_THREAD_SUSPEND_DETACH_HMX 0x2U // bit#1 + +// Option for resuming thread +#define QURT_THREAD_RESUME_DEFAULT 0x0 + +// Thread property IDs +#define QURT_THREAD_PROPERTY_SUSPENDABLE 0x0U +#define QURT_THREAD_PROPERTY_RESUMABLE 0x1 + +// Thread group +#define QURT_THREAD_DEFAULT_GROUP_ID 0x0U +#define QURT_THREAD_GROUP_ID_MASK 0x3FU + +/** @endcond*/ + + +/* The followings are for C code only */ +#ifndef __ASSEMBLER__ +/*============================================================================= + TYPEDEFS +=============================================================================*/ +/** @addtogroup thread_types +@{ */ +/** @cond rest_reg_dist */ +typedef unsigned int qurt_cache_partition_t; /**< QuRT cache partition type. */ + +#define CCCC_PARTITION 0U /**< Use the CCCC page attribute bits to determine the main or auxiliary partition. */ +#define MAIN_PARTITION 1U /**< Use the main partition. */ +#define AUX_PARTITION 2U /**< Use the auxiliary partition. */ +#define MINIMUM_PARTITION 3U /**< Use the minimum. Allocates the least amount of cache (no-allocate policy possible) for this thread. */ +/** @endcond */ + +/** Thread ID type. */ +typedef unsigned int qurt_thread_t; + +/** @cond rest_reg_dist */ +/** Thread attributes. */ +typedef struct _qurt_thread_attr { + + char name[QURT_THREAD_ATTR_NAME_MAXLEN]; /**< Thread name. */ + unsigned char tcb_partition; /**< Indicates whether the thread TCB resides in RAM or + on chip memory (TCM). */ + unsigned char stid; /**< Software thread ID used to configure the stid register + for profiling purposes. */ + unsigned short priority; /**< Thread priority. */ + unsigned char autostack:1; /**< Autostack v2 enabled thread. */ + unsigned char group_id:6; /**< Group ID. */ + unsigned char reserved:1; /**< Reserved bits. */ + unsigned char bus_priority; /**< Internal bus priority. */ + unsigned short timetest_id; /**< Timetest ID. */ + unsigned int stack_size; /**< Thread stack size. */ + void *stack_addr; /**< Pointer to the stack address base. The range of the stack is + (stack_addr, stack_addr+stack_size-1). */ + unsigned short detach_state; /**< Detach state of the thread. 
 */
+
+} qurt_thread_attr_t;
+/** @endcond */
+
+/** @cond rest_reg_dist */
+/** Dynamic TLS attributes. */
+typedef struct qurt_tls_info {
+    unsigned int module_id;     /**< Module ID of the loaded dynamic linked library. */
+    unsigned int tls_start;     /**< Start address of the TLS data. */
+    unsigned int tls_data_end;  /**< End address of the TLS RW data. */
+    unsigned int tls_end;       /**< End address of the TLS data. */
+}qurt_tls_info;
+/** @endcond */
+
+/** @} */ /* end_addtogroup thread_types */
+
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+/**@ingroup func_qurt_thread_attr_init
+  Initializes the structure used to set the thread attributes when a thread is created.
+  After an attribute structure is initialized, explicitly set the individual attributes in the structure
+  using the thread attribute operations.
+
+  The initialize operation sets the following default attribute values: \n
+  - Name -- NULL string \n
+  - TCB partition -- QURT_THREAD_ATTR_TCB_PARTITION_DEFAULT \n
+  - Priority -- QURT_THREAD_ATTR_PRIORITY_DEFAULT \n
+  - Autostack -- QURT_THREAD_ATTR_AUTOSTACK_DEFAULT \n
+  - Bus priority -- QURT_THREAD_ATTR_BUS_PRIO_DEFAULT \n
+  - Timetest ID -- QURT_THREAD_ATTR_TIMETEST_ID_DEFAULT \n
+  - stack_size -- 0 \n
+  - stack_addr -- NULL \n
+  - detach state -- #QURT_THREAD_ATTR_CREATE_LEGACY \n
+  - STID -- #QURT_THREAD_ATTR_STID_DEFAULT
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in,out] attr Pointer to the thread attribute structure.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_init (qurt_thread_attr_t *attr)
+{
+
+    attr->name[0] = '\0';
+    attr->tcb_partition = QURT_THREAD_ATTR_TCB_PARTITION_DEFAULT;
+    attr->priority = QURT_THREAD_ATTR_PRIORITY_DEFAULT;
+    attr->autostack = QURT_THREAD_ATTR_AUTOSTACK_DEFAULT; /* Default attribute for autostack v2*/
+    attr->bus_priority = QURT_THREAD_ATTR_BUS_PRIO_DEFAULT;
+    attr->timetest_id = (unsigned short)QURT_THREAD_ATTR_TIMETEST_ID_DEFAULT;
+    attr->stack_size = 0;
+    attr->stack_addr = NULL;
+    attr->detach_state = QURT_THREAD_ATTR_CREATE_LEGACY;
+    attr->stid = QURT_THREAD_ATTR_STID_DEFAULT;
+    attr->group_id = QURT_THREAD_DEFAULT_GROUP_ID;
+}
+
+/**@ingroup func_qurt_thread_attr_set_name
+  Sets the thread name attribute.\n
+  This function specifies the name used by a thread.
+  Thread names identify a thread during debugging or profiling.
+  The maximum name length is 16 characters. \n
+  @note1hang Thread names differ from the kernel-generated thread identifiers used to
+             specify threads in the API thread operations.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in,out] attr Pointer to the thread attribute structure.
+  @param[in] name Pointer to the character string containing the thread name.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_set_name (qurt_thread_attr_t *attr, const char *name)
+{
+    strlcpy (attr->name, name, QURT_THREAD_ATTR_NAME_MAXLEN);
+    attr->name[QURT_THREAD_ATTR_NAME_MAXLEN - 1] = '\0';
+}
+
+
+/**@ingroup func_qurt_thread_attr_set_tcb_partition
+  Sets the thread TCB partition attribute.
+  Specifies the memory type where the TCB of a thread is allocated.
+  TCBs are allocated in RAM or TCM/LPM.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in,out] attr Pointer to the thread attribute structure.
+  @param[in] tcb_partition TCB partition.
 Values:\n
+             - 0 -- TCB resides in RAM \n
+             - 1 -- TCB resides in TCM/LCM @tablebulletend
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_set_tcb_partition (qurt_thread_attr_t *attr, unsigned char tcb_partition)
+{
+    attr->tcb_partition = tcb_partition;
+}
+
+/**@ingroup func_qurt_thread_attr_set_priority
+  Sets the thread priority to assign to a thread.
+  Thread priorities are specified as numeric values in the range 1 to 254, with 1 representing
+  the highest priority.
+  Priorities 0 and 255 are used internally by the kernel for special purposes.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in,out] attr Pointer to the thread attribute structure.
+  @param[in] priority Thread priority.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_set_priority (qurt_thread_attr_t *attr, unsigned short priority)
+{
+    attr->priority = priority;
+}
+
+/**@ingroup func_qurt_thread_attr_set_detachstate
+  Sets the thread detach state with which the thread is created.
+  The thread detach state is either joinable or detached; it is specified by the following values:
+  - #QURT_THREAD_ATTR_CREATE_JOINABLE \n
+  - #QURT_THREAD_ATTR_CREATE_DETACHED \n
+
+  When a detached thread is created (QURT_THREAD_ATTR_CREATE_DETACHED), its thread
+  ID and other resources are reclaimed as soon as the thread exits. When a joinable thread
+  is created (QURT_THREAD_ATTR_CREATE_JOINABLE), it is assumed that some
+  thread waits to join on it using a qurt_thread_join() call.
+  By default, the detach state is QURT_THREAD_ATTR_CREATE_LEGACY.
+  If the detach state is QURT_THREAD_ATTR_CREATE_LEGACY, another
+  thread can join before the thread exits, but the exiting thread does not wait for another thread to join.
+
+  @note1hang For a joinable thread (QURT_THREAD_ATTR_CREATE_JOINABLE), it is very
+             important that some thread joins on it after it terminates, otherwise
+             the resources of that thread are not reclaimed, causing memory leaks.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in,out] attr Pointer to the thread attribute structure.
+  @param[in] detachstate Thread detach state.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_set_detachstate (qurt_thread_attr_t *attr, unsigned short detachstate)
+{
+    if(detachstate == QURT_THREAD_ATTR_CREATE_JOINABLE || detachstate == QURT_THREAD_ATTR_CREATE_DETACHED){
+        attr->detach_state = detachstate;
+    }
+}
+
+
+/**@ingroup func_qurt_thread_attr_set_timetest_id
+  Sets the thread timetest attribute.\n
+  Specifies the timetest identifier used by a thread.
+
+  Timetest identifiers are used to identify a thread during debugging or profiling. \n
+  @note1hang Timetest identifiers differ from the kernel-generated thread identifiers used to
+             specify threads in the API thread operations.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in,out] attr Pointer to the thread attribute structure.
+  @param[in] timetest_id Timetest identifier value.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+static inline void qurt_thread_attr_set_timetest_id (qurt_thread_attr_t *attr, unsigned short timetest_id)
+{
+    attr->timetest_id = timetest_id;
+}
+
+/**@ingroup func_qurt_thread_attr_set_stack_size
+  @xreflabel{sec:set_stack_size}
+  Sets the thread stack size attribute.\n
+  Specifies the size of the memory area to use for the call stack of a thread.
+
+  The thread stack address (Section @xref{sec:set_stack_addr}) and stack size specify the memory area used as a
+  call stack for the thread.
The user is responsible for allocating the memory area used for + the stack. + + @datatypes + #qurt_thread_attr_t + + @param[in,out] attr Pointer to the thread attribute structure. + @param[in] stack_size Size (in bytes) of the thread stack. + + @return + None. + + @dependencies + None. +*/ + +static inline void qurt_thread_attr_set_stack_size (qurt_thread_attr_t *attr, unsigned int stack_size) +{ + attr->stack_size = stack_size; +} + +/**@ingroup func_qurt_thread_attr_set_stack_size2 + @xreflabel{sec:set_stack_size} + Sets the thread stack size attribute for island threads that require a higher guest OS stack size than the stack size + defined in the configuration XML.\n + Specifies the size of the memory area to use for a call stack of an island thread in User and Guest mode. + + The thread stack address (Section @xref{sec:set_stack_addr}) and stack size specify the memory area used as a + call stack for the thread. The user is responsible for allocating the memory area used for + the stack. + + @datatypes + #qurt_thread_attr_t + + @param[in,out] attr Pointer to the thread attribute structure. + @param[in] user_stack_size Size (in bytes) of the stack usage in User mode. + @param[in] root_stack_size Size (in bytes) of the stack usage in Guest mode. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_thread_attr_set_stack_size2 (qurt_thread_attr_t *attr, unsigned short user_stack_size, unsigned short root_stack_size) +{ + union qurt_thread_stack_info{ + unsigned int raw_size; + struct{ + unsigned short user_stack; + unsigned short root_stack; + }; + }user_root_stack_size; + user_root_stack_size.user_stack = user_stack_size; + user_root_stack_size.root_stack = root_stack_size; + + attr->stack_size = user_root_stack_size.raw_size; +} + +/**@ingroup func_qurt_thread_attr_set_stack_addr + @xreflabel{sec:set_stack_addr} + Sets the thread stack address attribute. \n + Specifies the base address of the memory area to use for a call stack of a thread. + + stack_addr must contain an address value that is 8-byte aligned. + + The thread stack address and stack size (Section @xref{sec:set_stack_size}) specify the memory area used as a + call stack for the thread. \n + @note1hang The user is responsible for allocating the memory area used for the thread + stack. The memory area must be large enough to contain the stack that the thread + creates. + + @datatypes + #qurt_thread_attr_t + + @param[in,out] attr Pointer to the thread attribute structure. + @param[in] stack_addr Pointer to the 8-byte aligned address of the thread stack. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_thread_attr_set_stack_addr (qurt_thread_attr_t *attr, void *stack_addr) +{ + attr->stack_addr = stack_addr; +} + +/**@ingroup func_qurt_thread_attr_set_bus_priority + Sets the internal bus priority state in the Hexagon core for this software thread attribute. + Memory requests generated by the thread with bus priority enabled are + given priority over requests generated by the thread with bus priority disabled. + The default value of bus priority is disabled. + + @note1hang Sets the internal bus priority for Hexagon processor version V60 or greater. + The priority is not propagated to the bus fabric. + + @datatypes + #qurt_thread_attr_t + + @param[in] attr Pointer to the thread attribute structure. + + @param[in] bus_priority Enabling flag. Values: \n + - #QURT_THREAD_BUS_PRIO_DISABLED \n + - #QURT_THREAD_BUS_PRIO_ENABLED @tablebulletend + + @return + None + + @dependencies + None. 
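+
+   A minimal usage sketch (editorial, not part of the original header):
+
+   @code
+   qurt_thread_attr_t attr;
+   qurt_thread_attr_init(&attr);
+   qurt_thread_attr_set_bus_priority(&attr, QURT_THREAD_BUS_PRIO_ENABLED);
+   @endcode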
+*/
+static inline void qurt_thread_attr_set_bus_priority ( qurt_thread_attr_t *attr, unsigned short bus_priority)
+{
+    attr->bus_priority = (unsigned char)bus_priority;
+}
+
+/**@ingroup func_qurt_thread_attr_set_autostack
+  Enables the autostack v2 feature in the thread attributes.
+
+  When autostack is enabled by the subsystem and an autostack-enabled
+  thread takes a framelimit exception, the kernel allocates more stack
+  for the thread and returns it to normal execution.
+
+  If autostack is not enabled by the subsystem, or it is not enabled
+  for the thread, the framelimit exception is fatal.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in] attr Pointer to the thread attribute structure.
+  @param[in] autostack Autostack enable or disable flag. Values: \n
+         - #QURT_THREAD_AUTOSTACK_DISABLED \n
+         - #QURT_THREAD_AUTOSTACK_ENABLED @tablebulletend
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_set_autostack ( qurt_thread_attr_t *attr, unsigned short autostack)
+{
+    attr->autostack = (unsigned char)autostack;
+}
+/**@ingroup qurt_thread_attr_enable_stid
+  Sets the STID in the thread attributes.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in] attr Pointer to the thread attribute structure.
+  @param[in] enable_stid STID to be set. Values: \n
+         - #QURT_THREAD_ATTR_STID_DEFAULT (0): Default STID. \n
+         - #QURT_THREAD_ATTR_STID_ENABLE (1): QuRT assigns an STID that is not already in use \n
+         - 2 through 255: User-provided STID. @tablebulletend
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_enable_stid ( qurt_thread_attr_t *attr, char enable_stid)
+{
+    if (enable_stid != '\0') {
+        attr->stid = enable_stid;
+    }
+    else
+    {
+        attr->stid = QURT_THREAD_ATTR_STID_DEFAULT;
+    }
+}
+
+/**@ingroup func_qurt_thread_attr_set_stid
+  Sets the stid thread attribute.
+  The default stid value is QURT_THREAD_ATTR_STID_DEFAULT.
+
+  @note1hang When a thread is created with a non-default stid,
+             the stid set in the thread attribute is assigned to the thread.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in] attr Pointer to the thread attribute structure.
+  @param[in] stid Stid to be set for a thread.
+
+  @return
+  None
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_set_stid( qurt_thread_attr_t *attr, unsigned int stid){
+    attr->stid = stid;
+}
+
+/**@ingroup func_qurt_thread_attr_set_group_id
+  Sets the group id in the thread attributes.
+  The primordial/first thread has group ID 0.
+  If a new thread is created without assigning group_id, it
+  inherits the group ID of its parent thread.
+
+  @note1hang
+  1) The group ID can only be set before creating a thread. It cannot be
+     changed after the thread is created.
+  2) If a non-activated group_id is passed, thread creation fails.
+  3) Only a thread with group ID #0 can set the group ID for its child threads.
+  4) If a thread with a non-zero group ID sets the group ID for its child threads,
+     QuRT ignores this parameter and the child threads inherit the parent
+     thread's group ID. But if the passed group ID is not activated, thread creation
+     still fails.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in] attr Pointer to the thread attribute structure.
+  @param[in] group_id Group identifier. The valid range is 0 through 63.
+
+  @return
+  None.
+
+  @dependencies
+  None.
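+
+  A minimal sketch (editorial; group ID 3 is an arbitrary example value that is
+  assumed to be activated, and the caller is assumed to have group ID 0):
+
+  @code
+  qurt_thread_attr_t attr;
+  qurt_thread_attr_init(&attr);
+  qurt_thread_attr_set_group_id(&attr, 3U);
+  @endcode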
+*/
+static inline void qurt_thread_attr_set_group_id(qurt_thread_attr_t *attr, unsigned int group_id)
+{
+    attr->group_id = group_id & QURT_THREAD_GROUP_ID_MASK;
+}
+
+/**@ingroup func_qurt_thread_set_autostack
+  Sets autostack enable in the TCB.
+
+  @param[in] Pointer to UGP
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+
+void qurt_thread_set_autostack(void *);
+
+
+/**@ingroup func_qurt_thread_get_name
+  Gets the thread name of the current thread.\n
+  Returns the thread name of the current thread.
+  Thread names are assigned to threads as thread attributes, see qurt_thread_attr_set_name(). Thread names
+  identify a thread during debugging or profiling.
+
+  @param[out] name Pointer to a character string, which specifies the address where the returned thread name is stored.
+  @param[in] max_len Maximum length of the character string that can be returned.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_thread_get_name (char *name, unsigned char max_len);
+
+/**@ingroup func_qurt_thread_create
+  @xreflabel{hdr:qurt_thread_create}
+  Creates a thread with the specified attributes, and makes it executable.
+
+  @datatypes
+  #qurt_thread_t \n
+  #qurt_thread_attr_t
+
+  @param[out] thread_id Returns a pointer to the thread identifier if the thread was
+                        successfully created.
+  @param[in] attr Pointer to the initialized thread attribute structure that specifies
+                  the attributes of the created thread.
+  @param[in] entrypoint C function pointer, which specifies the main function of a thread.
+  @param[in] arg Pointer to a thread-specific argument structure.
+
+
+  @return
+  #QURT_EOK -- Thread created. \n
+  #QURT_EFAILED -- Thread not created.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_create (qurt_thread_t *thread_id, qurt_thread_attr_t *attr, void (*entrypoint) (void *), void *arg);
+
+/**@ingroup func_qurt_thread_stop
+  Stops the current thread, frees the kernel TCB, and yields to the next highest ready thread.
+
+  @return
+  void
+
+  @dependencies
+  None.
+ */
+void qurt_thread_stop(void);
+
+/** @cond internal_only */
+/**@ingroup func_qurt_thread_resume
+  When a demand-loading paging solution is enabled, this function
+  resumes the execution of a thread that was suspended due to
+  a page miss.
+
+  @param[in] thread_id Thread identifier.
+
+  @return
+  #QURT_EOK -- Thread successfully resumed. \n
+  #QURT_EFATAL -- Resume operation failed.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_resume(unsigned int thread_id);
+/** @endcond */
+
+/**@ingroup func_qurt_thread_get_id
+  Gets the identifier of the current thread.\n
+  Returns the thread identifier for the current thread.
+
+  @return
+  Thread identifier -- Identifier of the current thread.
+
+  @dependencies
+  None.
+ */
+qurt_thread_t qurt_thread_get_id (void);
+
+
+/**@ingroup func_qurt_thread_get_l2cache_partition
+  Returns the current value of the L2 cache partition assigned to the caller thread.\n
+
+  @return
+  Value of the #qurt_cache_partition_t data type.
+
+  @dependencies
+  None.
+ */
+qurt_cache_partition_t qurt_thread_get_l2cache_partition (void);
+
+/**@ingroup func_qurt_thread_set_timetest_id
+  Sets the timetest identifier of the current thread.
+  Timetest identifiers are used to identify a thread during debugging or profiling.\n
+  @note1hang Timetest identifiers differ from the kernel-generated thread identifiers used to
+             specify threads in the API thread operations.
+
+  @param[in] tid Timetest identifier.
+
+  @return
+  None.
+
+  @dependencies
+  None.
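+
+  A short sketch (editorial; 0x1234 is an arbitrary example identifier):
+
+  @code
+  qurt_thread_set_timetest_id(0x1234);
+  // ... later, for example from a profiling hook:
+  unsigned short tt = qurt_thread_get_timetest_id();
+  @endcode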
+ */
+void qurt_thread_set_timetest_id (unsigned short tid);
+
+/**@ingroup func_qurt_thread_set_cache_partition
+  Sets the cache partition for the current thread. This function uses the qurt_cache_partition_t type
+  to select the cache partition of the current thread for the L1 Icache, L1 Dcache, and L2 cache.
+
+  @datatypes
+  #qurt_cache_partition_t
+
+  @param[in] l1_icache L1 I cache partition.
+  @param[in] l1_dcache L1 D cache partition.
+  @param[in] l2_cache L2 cache partition.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_thread_set_cache_partition(qurt_cache_partition_t l1_icache, qurt_cache_partition_t l1_dcache, qurt_cache_partition_t l2_cache);
+
+
+/**@ingroup func_qurt_thread_get_timetest_id
+  Gets the timetest identifier of the current thread.\n
+  Returns the timetest identifier of the current thread.\n
+  Timetest identifiers are used to identify a thread during debugging or profiling. \n
+  @note1hang Timetest identifiers differ from the kernel-generated thread identifiers used to
+             specify threads in the API thread operations.
+
+  @return
+  Integer -- Timetest identifier.
+
+  @dependencies
+  None.
+ */
+unsigned short qurt_thread_get_timetest_id (void);
+
+/**@ingroup func_qurt_thread_exit
+  @xreflabel{sec:qurt_thread_exit}
+  Stops the current thread, awakens threads joined to it, then destroys the stopped
+  thread.
+
+  Threads that are suspended on the current thread (by performing a thread join,
+  Section @xref{sec:thread_join}) are awakened and passed a user-defined status value
+  that indicates the status of the stopped thread.
+
+  @note1hang Exit must be called in the context of the thread to stop.
+
+  @param[in] status User-defined thread exit status value.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_thread_exit(int status);
+
+/**@ingroup func_qurt_thread_join
+  @xreflabel{sec:thread_join}
+  Waits for a specified thread to finish; the specified thread is another thread within
+  the same process.
+  The caller thread is suspended until the specified thread exits. When the specified thread
+  exits, the caller thread is awakened. \n
+  @note1hang If the specified thread has already exited, this function returns immediately
+             with the result value #QURT_ENOTHREAD. \n
+  @note1cont Two threads cannot call qurt_thread_join to wait for the same thread to finish.
+             If this occurs, QuRT generates an exception (see Section @xref{sec:exceptionHandling}).
+
+  @param[in] tid Thread identifier.
+  @param[out] status Destination variable for thread exit status. Returns an application-defined
+                     value that indicates the termination status of the specified thread.
+
+  @return
+  #QURT_ENOTHREAD -- Thread has already exited. \n
+  #QURT_EOK -- Thread successfully joined with valid status value.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_join(unsigned int tid, int *status);
+
+/**@ingroup qurt_thread_detach
+  @xreflabel{sec:thread_detach}
+  Detaches a joinable thread. The specified thread is another thread within the
+  same process. Create the thread as a joinable thread; only joinable threads
+  can be detached.
+  If a joinable thread is detached, it finishes execution and exits.
+
+  @param[in] tid Thread identifier.
+
+  @return
+  #QURT_ENOTHREAD -- Thread specified by TID does not exist. \n
+  #QURT_EOK -- Thread successfully detached.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_detach(unsigned int tid);
+
+
+/**@ingroup func_qurt_thread_get_priority
+  Gets the priority of the specified thread.
 \n
+  Returns the thread priority of the specified thread.\n
+  Thread priorities are specified as numeric values in a range as large as 1 through 254, with lower
+  values representing higher priorities. 1 represents the highest possible thread priority. \n
+  Priorities 0 and 255 are used internally by the kernel for special purposes.
+
+  @note1hang QuRT can be configured to have different priority ranges.
+
+  @datatypes
+  #qurt_thread_t
+
+  @param[in] threadid Thread identifier.
+
+  @return
+  -1 -- Invalid thread identifier. \n
+  1 through 254 -- Thread priority value.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_get_priority (qurt_thread_t threadid);
+
+/**@ingroup func_qurt_thread_set_priority
+  Sets the priority of the specified thread.\n
+  Thread priorities are specified as numeric values in a range as large as 1 through 254, with lower
+  values representing higher priorities. 1 represents the highest possible thread priority.
+  Priorities 0 and 255 are used internally by the kernel for special purposes.
+
+  @note1hang QuRT can be configured to have different priority ranges. For more
+             information, see Section @xref{sec:AppDev}.
+
+  @datatypes
+  #qurt_thread_t
+
+  @param[in] threadid Thread identifier.
+  @param[in] newprio New thread priority value.
+
+  @return
+  0 -- Priority successfully set. \n
+  -1 -- Invalid thread identifier. \n
+
+  @dependencies
+  None.
+ */
+int qurt_thread_set_priority (qurt_thread_t threadid, unsigned short newprio);
+
+
+
+/**@ingroup func_qurt_thread_attr_get
+  Gets the attributes of the specified thread.
+
+  @datatypes
+  #qurt_thread_t \n
+  #qurt_thread_attr_t
+
+  @param[in] thread_id Thread identifier.
+  @param[out] attr Pointer to the destination structure for thread attributes.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EINVALID -- Invalid argument.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_attr_get (qurt_thread_t thread_id, qurt_thread_attr_t *attr);
+
+
+
+/**@ingroup func_qurt_thread_get_tls_base
+  Gets the base address of the thread local storage (TLS) of a dynamically loaded module
+  for the current thread.
+
+  @datatypes
+  #qurt_tls_info
+
+  @param[in] info Pointer to the TLS information for a module.
+
+  @return
+  Pointer to the TLS object for the dynamically loaded module.\n
+  NULL -- TLS information is invalid.
+
+  @dependencies
+  None.
+ */
+void * qurt_thread_get_tls_base(qurt_tls_info* info);
+
+/**@ingroup func_qurt_thread_pktcount_get
+  Gets the PKTCOUNT of a specified thread.
+
+  @datatypes
+  #qurt_thread_t
+
+  @param[in] thread_id Thread identifier.
+
+  @return
+  PKTCOUNT
+
+  @dependencies
+  None.
+ */
+
+long long int qurt_thread_pktcount_get (qurt_thread_t thread_id);
+
+/**@ingroup func_qurt_thread_pktcount_set
+  Sets the PKTCOUNT for the current QuRT thread.
+
+  @return
+  Value to which pktcount is set.
+
+  @dependencies
+  None.
+ */
+
+long long int qurt_thread_pktcount_set (long long int);
+
+/**@ingroup func_qurt_thread_stid_get
+  Gets the STID for a specified thread.
+
+  @datatypes
+  #qurt_thread_t
+
+  @param[in] thread_id Thread identifier.
+
+  @return
+  STID
+
+  @dependencies
+  None.
+ */
+
+char qurt_thread_stid_get(qurt_thread_t thread_id);
+
+/**@ingroup func_qurt_thread_stid_get2
+  Returns the stid set for a thread.
+
+  @param[in] thread_id Thread identifier
+  @param[out] stid Pointer to a variable to return the stid
+
+  @return
+  QURT_EOK - Success
+  QURT_ENOTALLOWED - Operation not allowed for a thread
+  QURT_EINVALID - Invalid input
+
+  @dependencies
+  None.
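+
+  A short sketch (editorial, not part of the original header) reading the
+  caller's own stid:
+
+  @code
+  unsigned int stid = 0U;
+  if (qurt_thread_stid_get2((unsigned int)qurt_thread_get_id(), &stid) == QURT_EOK) {
+      // stid now holds the software thread ID of the calling thread
+  }
+  @endcode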
+ */
+int qurt_thread_stid_get2(unsigned int thread_id, unsigned int *stid);
+
+/**@ingroup func_qurt_thread_stid_set
+  Sets the STID for the current thread.
+
+  @datatypes
+  #qurt_thread_t
+
+  @param[in] stid STID value to set.
+
+  @return
+  #QURT_EOK -- STID set. \n
+  #QURT_EFAILED -- STID not set.
+
+  @dependencies
+  None.
+ */
+
+int qurt_thread_stid_set(char stid);
+
+/**@ingroup qurt_thread_stid_set2
+  Sets the stid for a specified thread.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in] thread_id Thread identifier.
+  @param[in] stid Stid to be set for a thread.
+
+  @return
+  #QURT_EOK -- Success \n
+  #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation. \n
+  #QURT_EVAL -- Failure because of invalid inputs.
+
+  @dependencies
+  None.
+*/
+int qurt_thread_stid_set2(unsigned int thread_id, unsigned int stid);
+
+/** @cond internal_only */
+/**@ingroup func_qurt_thread_get_running_ids
+  Returns the thread IDs of the running threads in the system; use only during fatal error handling.
+
+  @datatypes
+  #qurt_thread_t
+
+  @param[in,out] * Array of thread identifiers of size #QURT_MAX_HTHREAD_LIMIT + 1.
+
+  @return
+  #QURT_EINVALID -- Incorrect argument \n
+  #QURT_ENOTALLOWED -- API not called during error handling \n
+  #QURT_EOK -- Success, returns a NULL-terminated array of thread_id
+
+  @dependencies
+  None.
+ */
+int qurt_thread_get_running_ids(qurt_thread_t *);
+/** @endcond */
+
+
+/**@ingroup func_qurt_thread_get_thread_id
+  Gets the thread identifier of the thread with the matching name in the same process
+  as the caller.
+
+  @datatypes
+  #qurt_thread_t
+
+  @param[out] thread_id Pointer to the thread identifier.
+  @param[in] name Pointer to the name of the thread.
+
+  @return
+  #QURT_EINVALID -- No thread with a matching name in the process of the caller \n
+  #QURT_EOK -- Success
+
+  @dependencies
+  None.
+ */
+int qurt_thread_get_thread_id (qurt_thread_t *thread_id, char *name);
+
+/**@ingroup func_qurt_sleep
+  Suspends the current thread for the specified amount of time.
+
+  @note1hang Because QuRT timers are deferrable, this call is guaranteed to block
+             at least for the specified amount of time. If power collapse is
+             enabled, the maximum amount of time this call can block depends on
+             the earliest wakeup from power collapse past the specified duration.
+
+  @param[in] duration Duration (in microseconds) for which the thread is suspended.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_sleep (unsigned long long int duration);
+
+
+/**@ingroup func_qurt_system_set_priority_floor
+  Sets a priority floor to move threads with thread priority lower than the floor out of the running state.
+  Running threads with thread priority lower than the priority floor are moved into the kernel ready queue, and they
+  are not scheduled to run while their thread priority is lower than the floor.
+  Later, the caller should reset the priority floor back to the default value of QURT_PRIORITY_FLOOR_DEFAULT.
+  Threads in the kernel ready queue are scheduled to run when their thread priority is higher than the floor.
+
+  The priority floor is set and associated with the user process of the caller. When the caller gets into QuRTOS and
+  sets a new floor, the new floor is associated with its original user process, not the QuRTOS process.
+  The floor associated with the user process is reset when the user process exits or is killed, but not at the time
+  when the user thread of the caller exits.
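+
+  A typical pattern (editorial sketch; the floor value 100 is an arbitrary
+  example and assumes the caller's own thread priority permits it):
+
+  @code
+  // Hold threads whose priority is lower than the floor in the ready queue.
+  (void)qurt_system_set_priority_floor(100U);
+  // ... time-critical work ...
+  (void)qurt_system_set_priority_floor(QURT_PRIORITY_FLOOR_DEFAULT);
+  @endcode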
+
+  The priority floor cannot be set to a priority higher than the thread priority of the caller.
+
+  The priority floor cannot be set to a priority lower than the default #QURT_PRIORITY_FLOOR_DEFAULT system floor.
+
+  This function is not supported in Island mode.
+
+  After the system floor is set above QURT_PRIORITY_FLOOR_DEFAULT, power collapse is skipped, and the sleep task
+  is not scheduled to run.
+
+  @param[in] priority_floor Priority floor.
+
+  @return
+  #QURT_EOK -- Success \n
+  #QURT_ENOTALLOWED -- Floor setting is not allowed
+
+  @dependencies
+  None.
+ */
+int qurt_system_set_priority_floor (unsigned int priority_floor);
+
+
+/**@ingroup func_qurt_thread_suspend_thread
+  Suspends a QuRT thread using its thread identifier.
+  The target thread can be in a signed user process or an unsigned user process.
+  The caller thread can be a thread from the same user process as the target thread, or from its parent process.
+  After the target thread is suspended, the kernel does not schedule it to run until it is resumed later.
+
+  If the target thread is set as non-suspendable, this function call returns an error without suspending
+  the target thread.
+
+  If the target thread is already suspended, this function call returns success to confirm
+  that the target thread is suspended.
+
+  If the target thread is in a secure user process, or CPZ process, this function call returns an error without
+  suspending the target thread.
+
+  If the target thread is running in the guest OS/root process via a QDI call, this function call does not suspend
+  the target thread in the guest OS, but marks the target thread as suspend-pending. The target thread is
+  suspended when it exits the guest OS, before executing the first instruction in the user process.
+  In this case, the function returns success even with the #QURT_THREAD_SUSPEND_SYNCHRONOUS option, while the target
+  thread can run in the guest OS, and is suspended when exiting the guest OS.
+
+  QuRT debug monitor threads that are in a user process are non-suspendable. This function does not suspend
+  those threads.
+
+  @param[in] thread_id Thread identifier.
+  @param[in] option Optional argument, multiple options can be ORed. \n
+         #QURT_THREAD_SUSPEND_SYNCHRONOUS (default) -- set to a synchronous function call;
+         the function returns after the thread is completely suspended.\n
+         #QURT_THREAD_SUSPEND_ASYNCHRONOUS -- set to an asynchronous function call; the function returns
+         after the kernel acts to suspend the target thread. The target thread
+         might still be running before it is completely suspended. \n
+         #QURT_THREAD_SUSPEND_KEEP_HMX (default) -- keep the HMX attachment on the target thread
+         if it locks the HMX with qurt_hmx_lock(). In this case, the HMX cannot be re-used by other threads. \n
+         #QURT_THREAD_SUSPEND_DETACH_HMX -- detach HMX from the target thread if it locks the HMX with qurt_hmx_lock().
+         Later, when the target thread resumes, the HMX is re-attached to the thread. Note that this option is only
+         supported for a caller from the same user process as the target thread, not for a caller from the parent
+         process of the target thread, or other processes. With the HMX detach option, QuRT does not save the HMX
+         context. Thus, the HMX context state will be lost. It is the responsibility of the caller to ensure HMX operations
+         and its context state saving when calling qurt_thread_suspend_thread() with the HMX detach option.
+         If a thread from another process uses this detach option, QURT_EHMXNOTDETACHABLE will be returned; in this
+         case, if the caller is qualified to suspend the target thread, the target thread will be moved to the suspended
+         state without HMX detached.
+
+  @return
+  #QURT_EOK -- Success \n
+  #QURT_EINVALID -- Failure because of invalid thread_id input \n
+  #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, in a secure process/CPZ process. \n
+  #QURT_EHMXNOTDETACHABLE -- Failure because HMX is not detachable from the target thread.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_suspend_thread (unsigned int thread_id, unsigned int option);
+
+
+/**@ingroup func_qurt_thread_resume_thread
+  Resumes a QuRT thread using its thread identifier.
+  The target thread can be in a signed user process or an unsigned user process.
+  The caller thread can be a thread from the same user process as the target thread, or from its parent
+  process. After the target thread resumes, the kernel scheduler can schedule the thread to run based on
+  the thread priority.
+
+  This function takes an option argument; the only option defined as of now is
+  QURT_THREAD_RESUME_DEFAULT, which resumes the target thread in the default way.
+
+  By default, this is an asynchronous function. The function returns after the kernel moves the
+  target thread from the suspended state to the runnable state. The thread is scheduled to run based on its
+  thread priority.
+
+  If the target thread is set as non-resumable, this function call does not resume the target thread.
+
+  If the target thread has already resumed, this function confirms that the target thread is resumed
+  by returning success.
+
+  If the target thread is in a secure user process or CPZ process, this function call returns an error without
+  resuming the target thread.
+
+  If the target thread runs in the guest OS/root process via a QDI call, this function call clears the mark of
+  suspend-pending on the target thread, and the target thread is not suspended when it exits the
+  guest OS.
+
+  @param[in] thread_id Thread identifier.
+  @param[in] option Optional argument, #QURT_THREAD_RESUME_DEFAULT, which resumes the target thread.
+
+  @return
+  #QURT_EOK -- Success \n
+  #QURT_EINVALID -- Failure because of invalid thread_id input \n
+  #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, in a secure process/CPZ process. \n
+  #QURT_EHMXNOTAVAIL -- Failure because, when resuming an HMX thread, the HMX is not available/free for the thread to resume.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_resume_thread (unsigned int thread_id, unsigned int option);
+
+
+/**@ingroup func_qurt_thread_set_thread_property
+  Sets a QuRT thread property using its thread identifier.
+  The target thread can be in a signed user process or an unsigned user process.
+  The caller thread can be from the same user process as the target thread, or from its parent process.
+
+  If the target thread is in a secure user process, or CPZ process, this function call returns an error without
+  changing the property of the target thread.
+
+  @param[in] thread_id Thread identifier \n
+  @param[in] property_id Thread property identifier \n
+         #QURT_THREAD_PROPERTY_SUSPENDABLE -- thread is suspendable. Default is TRUE. \n
+         #QURT_THREAD_PROPERTY_RESUMABLE -- thread is resumable.
 Default is TRUE.
+  @param[in] value Property value: \n
+         TRUE(1) -- TRUE for the property \n
+         FALSE(0) -- FALSE for the property
+
+  @return
+  #QURT_EOK -- Success \n
+  #QURT_EINVALID -- Failure because of invalid thread_id input \n
+  #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, in a secure process/CPZ process.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_set_thread_property( unsigned int thread_id, unsigned int property_id, unsigned int value );
+
+/**@ingroup func_qurt_thread_get_group_id
+  Gets the group id of the thread specified by thread_id.\n
+
+  @param[in] thread_id Thread identifier
+  @param[out] group_id Pointer to the variable that receives the group identifier
+
+  @return
+  #QURT_EOK -- Success \n
+  #QURT_EINVALID -- Thread id is invalid, or the process has no groups enabled \n
+  #QURT_ENOTALLOWED -- Operation is not allowed \n
+
+  @dependencies
+  None.
+*/
+int qurt_thread_get_group_id(qurt_thread_t thread_id, unsigned int* group_id);
+
+#endif /* __ASSEMBLER__ */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_THREAD_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_thread_context.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_thread_context.h
new file mode 100755
index 0000000000000..bab09deec8889
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_thread_context.h
@@ -0,0 +1,234 @@
+#ifndef QURT_THREAD_CONTEXT_H
+#define QURT_THREAD_CONTEXT_H
+/**
+  @file qurt_thread_context.h
+  @brief Kernel thread context structure
+
+EXTERNAL FUNCTIONS
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2018-2022 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @cond internal_only */
+
+#define THREAD_ITERATOR_END ((qurt_thread_t)(-1)) /**< Thread iterator is complete. */
+
+
+/**@ingroup func_qurt_thread_iterator_create
+Enables the caller to enumerate threads in the system.
+
+@return
+Handle of the newly created iterator; this handle must be passed to
+subsequent operations on the iterator.
+
+@dependencies
+None.
+*/
+static inline int qurt_thread_iterator_create(void)
+{
+    return qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC, QDI_OS_THREAD_ITERATOR_CREATE);
+}
+
+/**@ingroup func_qurt_thread_iterator_next
+Iterates over the list of threads in the system.
+
+@datatypes
+#qurt_thread_t
+
+@param[in] iter Iterator handle returned by qurt_thread_iterator_create().
+
+@return
+#THREAD_ITERATOR_END -- iterator has reached the end of the thread list. \n
+Other values indicate a valid thread_id.
+
+@dependencies
+None.
+*/
+static inline qurt_thread_t qurt_thread_iterator_next(int iter)
+{
+    return (qurt_thread_t)qurt_qdi_handle_invoke(iter, QDI_OS_THREAD_ITERATOR_NEXT);
+}
+
+/**@ingroup func_qurt_thread_iterator_destroy
+Cleans up thread iterator resources.
+
+@param[in] iter Iterator handle returned by qurt_thread_iterator_create().
+
+@return
+#QURT_EOK -- Successful completion of operation \n
+#QURT_EFATAL -- Invalid handle passed
+
+@dependencies
+None.
+*/
+static inline int qurt_thread_iterator_destroy(int iter)
+{
+    return qurt_qdi_close(iter);
+}
+
+/**@ingroup func_qurt_thread_context_get_tname
+Gets the name of the thread from the specified thread ID.
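+
+A sketch (editorial, not part of the original header) that combines the
+iterator APIs above with this call to list all thread names; QURT_MAX_NAME_LEN
+is assumed to be provided by qurt_consts.h:
+
+@code
+int it = qurt_thread_iterator_create();
+qurt_thread_t tid;
+while ((tid = qurt_thread_iterator_next(it)) != THREAD_ITERATOR_END) {
+    char name[QURT_MAX_NAME_LEN];
+    if (qurt_thread_context_get_tname((unsigned int)tid, name, sizeof(name)) == QURT_EOK) {
+        /* use name */
+    }
+}
+(void)qurt_thread_iterator_destroy(it);
+@endcode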
+ +@param[in] thread_id Thread for which name is returned. +@param[in,out] name Pointer to the local buffer where name is copied back. +@param[in] max_len Size of the local buffer. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_tname(unsigned int thread_id, char *name, unsigned char max_len); + +/**@ingroup func_qurt_thread_context_get_prio +Gets the priority for the specified thread. + +@param[in] thread_id Thread for which priority is returned. +@param[in,out] prio Pointer to the local variable where priority is written. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_prio(unsigned int thread_id, unsigned char *prio); + +/**@ingroup func_qurt_thread_context_get_pcycles +Gets pcycles for the specified thread. + +@param[in] thread_id Thread for which processor cycles are returned. +@param[in,out] pcycles Pointer to the local variable where processor cycles are written. + +@return +#QURT_EOK -- Success \n +Failure otherwise. + +@dependencies +None. +*/ +int qurt_thread_context_get_pcycles(unsigned int thread_id, unsigned long long int *pcycles); + +/**@ingroup func_qurt_thread_context_get_stack_base +Gets the stack base address for the specified thread. + +@param[in] thread_id Thread for which stack base address is returned. +@param[in,out] sbase Pointer to the local variable where stack base address is written. + +@return +QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_stack_base(unsigned int thread_id, unsigned int *sbase); + +/**@ingroup func_qurt_thread_context_get_stack_size +Gets the stack size for the specified thread. + +@param[in] thread_id Thread for which stack size is returned. +@param[in,out] ssize Pointer to the local variable where stack size is written. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_stack_size(unsigned int thread_id, unsigned int *ssize); + +/**@ingroup func_qurt_thread_context_get_pid +Gets the process ID for the specified thread. + +@param[in] thread_id Thread for which process ID is returned. +@param[in,out] pid Pointer to the local variable where process id is written. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_pid(unsigned int thread_id, unsigned int *pid); + +/**@ingroup func_qurt_thread_context_get_pname +Gets the process name for the specified thread. + +@param[in] thread_id Represents the thread for which process name is returned. +@param[in, out] name Pointer to the local buffer where process name is copied back. +@param[in] len Length allocated to the local buffer. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_pname(unsigned int thread_id, char *name, unsigned int len); + +/** @addtogroup thread_types +@{ */ +/** Structure that defines how TCB is interpreted to crash dump tools.*/ +/* Keys are defined in consts.h */ +struct qurt_debug_thread_info { +/** @cond */ + char name[QURT_MAX_NAME_LEN]; /**< Name of the thread. */ + struct { + unsigned key; + unsigned val; + } os_info[40]; + unsigned gen_regs[32]; /**< General mode registers. */ + unsigned user_cregs[32]; /**< User mode registers. */ + unsigned guest_cregs[32]; /**< Guest mode registers. */ + unsigned monitor_cregs[64]; /**< Monitor mode registers. 
*/
+/** @endcond */
+}; /* should add up to 1K */
+/** @} */ /* end_addtogroup thread_types */
+
+
+/**@ingroup func_qurt_system_tcb_dump_get
+Gets the TCB dump for the specified thread; copies the debug thread information structure into the caller's buffer.
+
+@datatypes
+#qurt_thread_t
+
+@param[in] thread_id Thread on which the operation must be performed.
+@param[in, out] ptr Pointer to the local buffer where contents are written.
+@param[in] size Size of the debug thread information structure obtained by calling
+ qurt_system_tcb_dump_get_size().
+
+@return
+#QURT_EOK -- Success \n
+Failure otherwise
+
+@dependencies
+None.
+*/
+int qurt_system_tcb_dump_get(qurt_thread_t thread_id, void *ptr, size_t size);
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_THREAD_CONTEXT_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_timer.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_timer.h new file mode 100755 index 0000000000000..7bdfdb8f3c3df --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_timer.h @@ -0,0 +1,560 @@
+#ifndef QURT_TIMER_H
+#define QURT_TIMER_H
+/**
+ @file qurt_timer.h
+ @brief Prototypes of qurt_timer API
+
+EXTERNAL FUNCTIONS
+ None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None.
+
+Copyright (c) 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+
+#include "qurt_anysignal.h"
+#include "qurt_signal2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+ CONSTANTS AND MACROS
+=============================================================================*/
+/**@addtogroup timer_const_macros
+@{ */
+/**
+ Default values.
+*/
+/** @xreflabel{hdr:QURT_TIMER_ONESHOT}*/
+#define QURT_TIMER_DEFAULT_TYPE QURT_TIMER_ONESHOT /**< One shot.*/
+#define QURT_TIMER_DEFAULT_DURATION 1000uL /**< Default duration. */
+#define QURT_TIMER_DEFAULT_EXPIRY 0uL /**< Default expiration. */
+
+/**
+ Conversion from microseconds to timer ticks.
+ */
+#define QURT_TIMER_TIMETICK_FROM_US(us) QURT_SYSCLOCK_TIMETICK_FROM_US(us)
+
+/**
+ Conversion from timer ticks to microseconds at the nominal frequency.
+*/
+#define QURT_TIMER_TIMETICK_TO_US(ticks) qurt_timer_timetick_to_us(ticks)
+
+/** Minimum microseconds value is 100 microseconds (sleep timer).*/
+#define QURT_TIMER_MIN_DURATION 100uL
+
+/**
+ Maximum microseconds value for Qtimer is 1,042,499 hours.
+*/
+#define QURT_TIMER_MAX_DURATION QURT_SYSCLOCK_MAX_DURATION
+
+/**
+ Timer clock for Qtimer is 19.2 MHz.
+*/
+#define QURT_TIMER_MAX_DURATION_TICKS QURT_SYSCLOCK_MAX_DURATION_TICKS
+
+/**
+ Sleep timer error margin for Qtimer is 1,000 ticks ~52 us.
+*/
+#define QURT_TIMETICK_ERROR_MARGIN QURT_SYSCLOCK_ERROR_MARGIN
+
+/*
+ qurt_timer group defines.
+*/
+#define QURT_TIMER_MAX_GROUPS 5U /**< Maximum groups.*/
+#define QURT_TIMER_DEFAULT_GROUP 0U /**< Default group. */
+/** @} */ /* end_addtogroup timer_const_macros */
+
+/** @addtogroup timer_types
+@{ */
+/**
+ QuRT timer types.
+ */
+typedef enum
+{
+ QURT_TIMER_ONESHOT = 0, /**< One shot.*/
+ /** @xreflabel{hdr:QURT_TIMER_PERIODIC}*/
+ QURT_TIMER_PERIODIC /**< Periodic.
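+
+ A conversion sketch (hypothetical, not normative) for the tick macros above; the
+ QURT_SYSCLOCK_* backing macros are defined elsewhere in the SDK:
+ @code
+ unsigned long long ticks = QURT_TIMER_TIMETICK_FROM_US(1000uL); /* 1 ms in Qtimer ticks */
+ unsigned long long us    = QURT_TIMER_TIMETICK_TO_US(ticks);    /* back to microseconds */
+ @endcode
+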
*/ +} qurt_timer_type_t; + + +/*============================================================================= + TYPEDEFS +=============================================================================*/ + +/** QuRT timer type.*/ +typedef unsigned int qurt_timer_t; + +/** QuRT timer duration type. */ +typedef unsigned long long qurt_timer_duration_t; + +/** QuRT timer time type. */ +typedef unsigned long long qurt_timer_time_t; + +typedef void (*pfn_t)(void); +/** QuRT timer attribute type. */ +typedef struct +{ + /** @cond */ + unsigned int magic; /**< Magic number to verify the qmsgq_attr_t pointer. */ + + qurt_timer_duration_t duration; /**< Specifies the duration of the new timer. */ + + qurt_timer_time_t expiry; /**< Specifies the absolute expiry of the new timer. */ + + qurt_timer_duration_t remaining; /**< Specifies the remaining time of an active timer. */ + + qurt_timer_type_t type; /**< Specifies the timer type; only #QURT_TIMER_ONESHOT and + #QURT_TIMER_PERIODIC are supported. */ + + unsigned int group; /**< Group number of the timer; the criterion used to disable or enable the set + of timers. */ + pfn_t pFn; /**< Callback other than the signal set */ + /** @endcond */ +} +qurt_timer_attr_t; + +/** @} */ /* end_addtogroup timer_types */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_timer_stop + @xreflabel{sec:qurt_timer_stop} + Stops a running timer. + The timer must be a one-shot timer. + + @note1hang Restart stopped timers with the timer restart operation, + see Section @xref{sec:qurt_timer_restart}. + + @datatypes + #qurt_timer_t + + @param[in] timer Timer object. + + @return + #QURT_EOK -- Success. \n + #QURT_EINVALID -- Invalid timer ID or duration value. \n + #QURT_ENOTALLOWED -- Timer is not a one shot timer. \n + #QURT_EMEM -- Out of memory error. + + @dependencies + None. + */ +int qurt_timer_stop (qurt_timer_t timer); + +/**@ingroup func_qurt_timer_restart + @xreflabel{sec:qurt_timer_restart} + Restarts a stopped timer with the specified duration. The timer must be a one-shot timer. + Timers stop after they have expired or after they are explicitly stopped with qurt_timer_stop(). + A restarted timer expires after the specified duration, the starting time is when the function is called. + + @note1hang Timers stop after they have expired or after they are explicitly + stopped with the timer stop operation, see Section @xref{sec:qurt_timer_stop}. + + @datatypes + #qurt_timer_t \n + #qurt_timer_duration_t + + @param[in] timer Timer object. + @param[in] duration Timer duration (in microseconds) before the restarted timer + expires again. + The valid range is #QURT_TIMER_MIN_DURATION to + #QURT_TIMER_MAX_DURATION. + + @return + #QURT_EOK -- Success. \n + #QURT_EINVALID -- Invalid timer ID or duration value. \n + #QURT_ENOTALLOWED -- Timer is not a one-shot timer. \n + #QURT_EMEM -- Out-of-memory error. + + @dependencies + None. + */ +int qurt_timer_restart (qurt_timer_t timer, qurt_timer_duration_t duration); + + +/**@ingroup func_qurt_timer_create + Creates a timer.\n + Allocates and initializes a timer object, and starts the timer. + + @note1hang A timer event handler must be defined to wait on the specified signal + to handle the timer event. + + @datatypes + #qurt_timer_t \n + #qurt_timer_attr_t \n + #qurt_anysignal_t + + @param[out] timer Pointer to the created timer object. 
+ @param[in] attr Pointer to the timer attribute structure. + @param[in] signal Pointer to the signal object set when timer expires. + @param[in] mask Signal mask, which specifies the signal to set in the signal object when the + time expires. + + @return + #QURT_EOK -- Success. \n + #QURT_EMEM -- Not enough memory to create the timer. \n + #QURT_EINVALID -- One of the arguments in the attr field is invalid. \n + Other error code -- Operation failed. \n + + @dependencies + None. + */ +int qurt_timer_create (qurt_timer_t *timer, const qurt_timer_attr_t *attr, + const qurt_anysignal_t *signal, unsigned int mask); + +int qurt_timer_create_sig2 (qurt_timer_t *timer, const qurt_timer_attr_t *attr, + const qurt_signal2_t *signal, unsigned int mask); + +/**@ingroup func_qurt_timer_attr_init + Initializes the specified timer attribute structure with default attribute values: \n + - Timer duration -- #QURT_TIMER_DEFAULT_DURATION (Section @xref{dox:timers}) \n + - Timer type -- #QURT_TIMER_ONESHOT \n + - Timer group -- #QURT_TIMER_DEFAULT_GROUP + + @datatypes + #qurt_timer_attr_t + + @param[in,out] attr Pointer to the destination structure for the timer attributes. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_init(qurt_timer_attr_t *attr); + + +/*Tech Comm note: removed qurt_timer_attr_set_pfn from documentation 9/10/2020 +@ingroup func_qurt_timer_attr_set_pfn + + @datatypes + #qurt_timer_attr_t + + @param[in,out] attr Pointer to the destination structure for the timer attributes. + @param[in] pFn pFn. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_pfn(qurt_timer_attr_t *attr, pfn_t pFn); + + +/**@ingroup func_qurt_timer_attr_set_duration + Sets the timer duration in the specified timer attribute structure.\n + + The timer duration specifies the interval (in microseconds) between the creation of the + timer object and the generation of the corresponding timer event. + + The timer duration value must be between #QURT_TIMER_MIN_DURATION and + #QURT_TIMER_MAX_DURATION (Section @xref{dox:timers}). Otherwise, the set operation is ignored. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_duration_t + + @param[in,out] attr Pointer to the timer attribute structure. + @param[in] duration Timer duration (in microseconds). + Valid range is #QURT_TIMER_MIN_DURATION to + #QURT_TIMER_MAX_DURATION. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_duration(qurt_timer_attr_t *attr, qurt_timer_duration_t duration); + +/**@ingroup func_qurt_timer_attr_set_expiry + Sets the absolute expiry time in the specified timer attribute structure.\n + The timer expiry specifies the absolute time (in microseconds) of the generation of the + corresponding timer event.\n + Timer expiries are relative to when the system first began executing. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_time_t + + @param[in,out] attr Pointer to the timer attribute structure. + @param[in] time Timer expiry. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_expiry(qurt_timer_attr_t *attr, qurt_timer_time_t time); + +/**@ingroup func_qurt_timer_attr_get_duration + Gets the timer duration from the specified timer attribute structure. + The value returned is the duration that was originally set for the timer. + + @note1hang This function does not return the remaining time of an active timer; + use qurt_timer_attr_get_remaining() to get the remaining time. 
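+
+ A lifecycle sketch (hypothetical, not normative): a one-shot timer wired to an
+ anysignal, using the attribute and create calls in this header; the signal call
+ signatures are assumed from qurt_anysignal.h, and mask bit 0x1 is arbitrary:
+ @code
+ qurt_anysignal_t sig;
+ qurt_timer_t timer;
+ qurt_timer_attr_t attr;
+
+ qurt_anysignal_init(&sig);
+ qurt_timer_attr_init(&attr);
+ qurt_timer_attr_set_type(&attr, QURT_TIMER_ONESHOT);
+ qurt_timer_attr_set_duration(&attr, 5000uL);      /* fire once after 5 ms */
+ if (qurt_timer_create(&timer, &attr, &sig, 0x1) == QURT_EOK) {
+     (void)qurt_anysignal_wait(&sig, 0x1);         /* block until expiry */
+     qurt_timer_delete(&timer);
+ }
+ @endcode
+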
+ + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_duration_t + + @param[in] attr Pointer to the timer attributes object + @param[out] duration Pointer to the destination variable for timer duration. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_get_duration(qurt_timer_attr_t *attr, qurt_timer_duration_t *duration); + +/**@ingroup func_qurt_timer_attr_get_remaining + Gets the timer remaining duration from the specified timer attribute structure. \n + + The timer remaining duration indicates (in microseconds) how much time remains before + the generation of the next timer event on the corresponding timer. + In most cases this function assumes that the timer attribute structure was obtained by + calling qurt_timer_get_attr(). + + @note1hang This attribute is read-only and thus has no set operation defined for it. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_duration_t + + @param[in] attr Pointer to the timer attribute object. + @param[out] remaining Pointer to the destination variable for remaining time. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_get_remaining(qurt_timer_attr_t *attr, qurt_timer_duration_t *remaining); + +/**@ingroup func_qurt_timer_attr_set_type + Sets the timer type in the specified timer attribute structure. + + The timer type specifies the functional behavior of the timer: \n + - A one-shot timer (#QURT_TIMER_ONESHOT) waits for the specified timer duration + and then generates a single timer event. After this the timer is nonfunctional. \n + - A periodic timer (#QURT_TIMER_PERIODIC) repeatedly waits for the specified + timer duration and then generates a timer event. The result is a series of timer + events with interval equal to the timer duration. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_type_t + + @param[in,out] attr Pointer to the timer attribute structure. + @param[in] type Timer type. Values are: \n + - #QURT_TIMER_ONESHOT -- One-shot timer. \n + - #QURT_TIMER_PERIODIC -- Periodic timer. @tablebulletend + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_type(qurt_timer_attr_t *attr, qurt_timer_type_t type); + +/**@ingroup func_qurt_timer_attr_get_type + Gets the timer type from the specified timer attribute structure. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_type_t + + @param[in] attr Pointer to the timer attribute structure. + @param[out] type Pointer to the destination variable for the timer type. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_get_type(qurt_timer_attr_t *attr, qurt_timer_type_t *type); + +/**@ingroup func_qurt_timer_attr_set_group + Sets the timer group identifier in the specified timer attribute structure.\n + The timer group identifier specifies the group that the timer belongs to. Timer groups are + used to enable or disable one or more timers in a single operation. \n + The timer group identifier value must be between 0 and (#QURT_TIMER_MAX_GROUPS - 1). + See Section @xref{dox:timers}. + + @datatypes + #qurt_timer_attr_t + + @param[in,out] attr Pointer to the timer attribute object. + @param[in] group Timer group identifier; + Valid range is 0 to (#QURT_TIMER_MAX_GROUPS - 1). + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_group(qurt_timer_attr_t *attr, unsigned int group); + +/**@ingroup func_qurt_timer_attr_get_group + Gets the timer group identifier from the specified timer attribute structure. 
+ + @datatypes + #qurt_timer_attr_t + + @param[in] attr Pointer to the timer attribute structure. + @param[out] group Pointer to the destination variable for the timer group identifier. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_get_group(qurt_timer_attr_t *attr, unsigned int *group); + +/**@ingroup func_qurt_timer_get_attr + @xreflabel{hdr:qurt_timer_get_attr} + Gets the timer attributes of the specified timer when it was created. + + @datatypes + #qurt_timer_t \n + #qurt_timer_attr_t + + @param[in] timer Timer object. + @param[out] attr Pointer to the destination structure for timer attributes. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Argument passed is not a valid timer. + + @dependencies + None. + */ +int qurt_timer_get_attr(qurt_timer_t timer, qurt_timer_attr_t *attr); + +/**@ingroup func_qurt_timer_delete + Deletes the timer.\n + Destroys the specified timer and deallocates the timer object. + + @datatypes + #qurt_timer_t + + @param[in] timer Timer object. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Argument passed is not a valid timer. + + @dependencies + None. + */ +int qurt_timer_delete(qurt_timer_t timer); + +/**@ingroup func_qurt_timer_sleep + Suspends the current thread for the specified amount of time. + The sleep duration value must be between #QURT_TIMER_MIN_DURATION and + #QURT_TIMER_MAX_DURATION (Section @xref{dox:timers}). + + @datatypes + #qurt_timer_duration_t + + @param[in] duration Interval (in microseconds) between when the thread is suspended + and when it is re-awakened. + + @return + #QURT_EOK -- Success. \n + #QURT_EMEM -- Not enough memory to perform the operation. + + @dependencies + None. + */ + +int qurt_timer_sleep(qurt_timer_duration_t duration); + +/**@ingroup func_qurt_timer_group_disable + Disables all timers that are assigned to the specified timer group. + If a specified timer is already disabled, ignore it. + If a specified timer is expired, do not process it. + If the specified timer group is empty, do nothing. + + @note1hang When a timer is disabled its remaining time does not change, thus it + cannot generate a timer event. + + @param[in] group Timer group identifier. + + @return + #QURT_EOK -- Success. + + @dependencies + None. + */ +int qurt_timer_group_disable (unsigned int group); + +/**@ingroup func_qurt_timer_group_enable + Enables all timers that are assigned to the specified timer group. + If a specified timer is already enabled, ignore it. + If a specified timer is expired, process it. + If the specified timer group is empty, do nothing. + + @param[in] group Timer group identifier. + + @return + #QURT_EOK -- Success. + + @dependencies + None. + */ +int qurt_timer_group_enable (unsigned int group); + + +/** + Notifies the timer server recovery from power collapse. The server + must account for any missed interrupts during power collapse. + */ +void qurt_timer_recover_pc (void); + +/** + Determines whether the Qtimer is initialized. + + @return + 0 -- Not initialized. \n + Nonzero -- Initialized. + */ +static inline int qurt_timer_is_init (void) {return 1;} + +/**@ingroup func_qurt_timer_get_ticks + Gets current ticks. The ticks are accumulated since the RTOS + has started. Each tick is equal to a single timer clock + cycle, where the frequency is 32 KHz on RGPT or 19.2 MHz on Qtimer. + + @return + Ticks since system started. 
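+
+ A timing sketch (hypothetical, not normative): bracketing a qurt_timer_sleep() call
+ with the tick counter described above; assumes qurt_printf() is available:
+ @code
+ unsigned long long t0 = qurt_timer_get_ticks();
+ qurt_timer_sleep(10000uL);                        /* suspend for 10 ms */
+ unsigned long long elapsed = qurt_timer_get_ticks() - t0;
+ qurt_printf("slept for %llu ticks\n", elapsed);
+ @endcode
+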
+ */
+unsigned long long qurt_timer_get_ticks (void);
+
+#define qurt_timer_timetick_from_us(us) QURT_SYSCLOCK_TIMETICK_FROM_US(us)
+
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_TIMER_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_tlb.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_tlb.h new file mode 100755 index 0000000000000..b1b2d261d31c0 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_tlb.h @@ -0,0 +1,215 @@
+#ifndef QURT_TLB_H
+#define QURT_TLB_H
+
+/**
+ @file qurt_tlb.h
+ @brief Prototypes of TLB API
+ The TLB APIs allow explicit control of the portion of TLB between TLB_first_replaceable and TLB_LAST_REPLACEABLE.
+ Both are nonconfigurable for the time being. This portion of TLB is permanently assigned/locked unless manually removed
+ by qurt_tlb_remove. Implementation does not change depending on the configuration, such as whether CONFIG_STATIC is set or not.
+ In CONFIG_STATIC=y, TLB_LAST_REPLACEABLE is set to the last TLB index, which indicates that the entire TLB is permanently
+ assigned and is not backed up by page table (page table does not exist). TLB indices are maintained through a 64-bit bitmask.
+ A new entry is placed in the first available slot.
+
+EXTERNAL FUNCTIONS
+ None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None.
+
+Copyright (c) 2013, 2021, 2023
+All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+=============================================================================*/
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/*=============================================================================
+ FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_tlb_entry_create
+ Creates a new TLB entry with the specified mapping attributes in the TLB of the Hexagon processor. \n
+ @note1hang If the specified attributes are not valid (such as if the address is not aligned with the
+ size), the entry is not created and an error result is returned.\n
+ @note1cont To set the G bit in the new TLB entry, set the ASID argument to -1.
+
+ @datatypes
+ #qurt_addr_t \n
+ #qurt_paddr_t \n
+ #qurt_mem_cache_mode_t \n
+ #qurt_perm_t
+
+ @param[out] entry_id TLB entry identifier.
+ @param[in] vaddr Virtual memory address.
+ @param[in] paddr Physical memory address.
+ @param[in] size Size of memory region to map (in bytes).
+ @param[in] cache_attribs Cache mode (writeback, and so on).
+ @param[in] perms Access permissions.
+ @param[in] asid ASID (space ID).
+
+ @return
+ #QURT_EOK -- TLB entry successfully created.\n
+ #QURT_EFATAL -- Entry is not created; the TLB is full. \n
+ #QURT_ETLBCREATESIZE -- Entry is not created; the incorrect size was specified. \n
+ #QURT_ETLBCREATEUNALIGNED -- Entry is not created; an unaligned address was specified. \n
+ #QURT_EINVALID -- Invalid cache attributes / permissions provided.
+
+ */
+int qurt_tlb_entry_create (unsigned int *entry_id, qurt_addr_t vaddr, qurt_paddr_t paddr, qurt_size_t size, qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perms, int asid);
+
+/**@ingroup func_qurt_tlb_entry_create_64
+ Creates a new TLB entry with the specified mapping attributes in the TLB of the Hexagon processor.
\n + @note1hang If the specified attributes are not valid (the address is not aligned with the + size), the entry is not created, and an error result is returned.\n + @note1cont To set the G bit in the new TLB entry, set the asid argument to -1. + + @param[out] entry_id TLB entry identifier. + @param[in] vaddr Virtual memory address. + @param[in] paddr_64 64-bit physical memory address. + @param[in] size Size of memory region to map (in bytes). + @param[in] cache_attribs Cache mode (writeback, and so on). + @param[in] perms Access permissions. + @param[in] asid ASID (space ID). + + @return + #QURT_EOK -- TLB entry successfully created.\n + #QURT_EFATAL -- Entry was not created; the TLB is full. \n + #QURT_ETLBCREATESIZE -- Entry was not created; the incorrect size was specified. \n + #QURT_ETLBCREATEUNALIGNED -- Entry was not created; an unaligned address was specified. \n + #QURT_EINVALID -- Invalid cache attributes / permissions provided. + + */ +int qurt_tlb_entry_create_64 (unsigned int *entry_id, qurt_addr_t vaddr, qurt_paddr_64_t paddr_64, qurt_size_t size, qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perms, int asid); + +/**@ingroup func_qurt_tlb_entry_delete + Deletes the specified TLB entry from the TLB of the Hexagon processor. + If the specified entry does not exist, no deletion occurs and an error result is returned. + + @param[in] entry_id TLB entry identifier. + + @return + #QURT_EOK -- TLB entry successfully deleted. \n + #QURT_EFATAL -- TLB entry does not exist. + + @dependencies + None. + **/ +int qurt_tlb_entry_delete (unsigned int entry_id); + +/**@ingroup func_qurt_tlb_entry_query + Searches for the specified TLB entry in the TLB of the Hexagon processor. + If the TLB entry is found, its entry identifier is returned. + + @datatypes + #qurt_addr_t + + @param[out] entry_id TLB entry identifier. + @param[in] vaddr Virtual memory address. + @param[in] asid ASID (space ID). + + @return + #QURT_EOK -- TLB entry successfully returned. \n + #QURT_EFATAL -- TLB entry does not exist. + + @dependencies + None. + **/ +int qurt_tlb_entry_query (unsigned int *entry_id, qurt_addr_t vaddr, int asid); + +/**@ingroup func_qurt_tlb_entry_set + Sets the TLB entry by storing an entry at the specified location + in the TLB of the Hexagon processor. + + @param[in] entry_id TLB entry identifier. + @param[in] entry 64-bit TLB entry to store. + + @return + #QURT_EOK -- Entry successfully stored in the TLB. \n + #QURT_EFATAL -- Entry not set at the specified location. + + @dependencies + None. + **/ +int qurt_tlb_entry_set (unsigned int entry_id, unsigned long long int entry); + +/**@ingroup func_qurt_tlb_entry_get + Gets the TLB entry. \n + Returns the specified 64-bit TLB entry in the TLB of the Hexagon processor. + + @param[in] entry_id TLB entry identifier. + @param[out] entry 64-bit TLB entry. + + @return + #QURT_EOK -- TLB entry successfully returned. \n + #QURT_EFATAL -- TLB entry does not exist. + + @dependencies + None. + **/ +int qurt_tlb_entry_get (unsigned int entry_id, unsigned long long int *entry); + +/**@ingroup func_qurt_tlb_get_pager_physaddrs + Searches the TLB of the Hexagon processor, and returns all physical addresses that belong to the pager. + Each returned address indicates the starting address of an active page. + +The function return value indicates the number of addresses returned. + + @param[out] pager_phys_addrs Pointer to the return array of pager physical addresses. + + @return + Integer -- Number of addresses returned in array. + + @dependencies + None. 
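+
+ A usage sketch (hypothetical, not normative): creating, looking up, and deleting a
+ global (G-bit) entry with the TLB calls above; the addresses and size are placeholders,
+ and the cache/permission constants come from qurt_types.h:
+ @code
+ unsigned int id, found;
+ if (qurt_tlb_entry_create(&id, 0x20000000u, 0x20000000u, 0x1000u,
+                           QURT_MEM_CACHE_WRITEBACK,
+                           QURT_PERM_READ | QURT_PERM_WRITE, -1) == QURT_EOK) {
+     if (qurt_tlb_entry_query(&found, 0x20000000u, -1) == QURT_EOK) {
+         qurt_tlb_entry_delete(found);
+     }
+ }
+ @endcode
+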
+*/ + +unsigned int qurt_tlb_get_pager_physaddr(unsigned int** pager_phys_addrs); + +/**@ingroup func_qurt_tlb_get_pager_virtaddr + Searches the TLB of the Hexagon processor, and returns all virtual addresses that belong to the pager. + Each returned address indicates the starting address of an active page. + +The function return value indicates the number of addresses returned. + + @param[out] pager_virt_addrs Pointer to the return array of pager virtual addresses. + + @return + Integer -- Number of addresses returned in the array. + + @dependencies + None. +*/ + +unsigned int qurt_tlb_get_pager_virtaddr(unsigned int** pager_virt_addrs); + + +/**@ingroup func_qurt_tlb_entry_set2 + Sets the TLB entry by storing an entry at the specified location + in the TLB of the Hexagon processor. An additional option can be passed + to lock the TLB entry in the TLB of the Hexagon processor. + + @param[in] id TLB entry identifier. + @param[in] tlb 64-bit TLB entry to store. + @param[in] lock Nonzero value indicates that the TLB entry must be locked in the hardware TLB. + + @return + #QURT_EOK -- Entry successfully stored in the TLB. \n + #QURT_EFATAL -- Entry not set at the specified location. + + @dependencies + None. + **/ +int qurt_tlb_entry_set2(unsigned id, unsigned long long tlb, unsigned lock); + + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_TLB_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_tls.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_tls.h new file mode 100755 index 0000000000000..6ec3b39ff5cb0 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_tls.h @@ -0,0 +1,100 @@ +#ifndef QURT_TLS_H +#define QURT_TLS_H +/** + @file qurt_tls.h + @brief Prototypes of TLS APIs + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + + +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_tls_create_key + @xreflabel{sec:tls_create_key} + Creates a key for accessing a thread local storage data item.\n + Subsequent get and set operations use the key value. + + @note1hang The destructor function performs any clean-up operations needed by a thread + local storage item when its containing thread is deleted (Section @xref{sec:qurt_thread_exit}). + + @param[out] key Pointer to the newly created thread local storage key value. + @param[in] destructor Pointer to the key-specific destructor function. Passing NULL + specifies that no destructor function is defined for the key. + + @return + #QURT_EOK -- Key successfully created. \n + #QURT_ETLSAVAIL -- No free TLS key available. + + @dependencies + None. + */ +int qurt_tls_create_key (int *key, void (*destructor)(void *)); + +/**@ingroup func_qurt_tls_set_specific + Stores a data item to thread local storage along with the specified key. + + @param[in] key Thread local storage key value. + @param[in] value Pointer to user data value to store. + + @return + #QURT_EOK -- Data item successfully stored. \n + #QURT_EINVALID -- Invalid key. \n + #QURT_EFAILED -- Invoked from a non-thread context. 
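+
+ A lifecycle sketch (hypothetical, not normative) for the TLS calls documented above
+ and below; no destructor is registered:
+ @code
+ int key;
+ if (qurt_tls_create_key(&key, NULL) == QURT_EOK) {
+     static int value = 42;
+     qurt_tls_set_specific(key, &value);           /* store a per-thread pointer */
+     int *p = (int *)qurt_tls_get_specific(key);   /* read it back: p == &value */
+     qurt_tls_delete_key(key);
+ }
+ @endcode
+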
+ */ +int qurt_tls_set_specific (int key, const void *value); + +/**@ingroup func_qurt_tls_get_specific + Loads the data item from thread local storage. \n + Returns the data item that is stored in thread local storage with the specified key. + The data item is always a pointer to user data. + + @param[in] key Thread local storage key value. + + @return + Pointer -- Data item indexed by key in thread local storage. \n + 0 (NULL) -- Key out of range. + + @dependencies + None. + */ +void * __attribute__((section(".text.qurt_tls_get_specific "))) qurt_tls_get_specific (int key); + + +/**@ingroup func_qurt_tls_delete_key + Deletes the specified key from thread local storage. + + @note1hang Explicitly deleting a key does not execute any destructor function that is + associated with the key (Section @xref{sec:tls_create_key}). + + @param[in] key Thread local storage key value to delete. + + @return + #QURT_EOK -- Key successfully deleted. \n + #QURT_ETLSENTRY -- Key already free. + + @dependencies + None. + */ +int qurt_tls_delete_key (int key); + + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_TLS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_trace.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_trace.h new file mode 100755 index 0000000000000..541f8f1d34bf6 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_trace.h @@ -0,0 +1,317 @@ +#ifndef QURT_TRACE_H +#define QURT_TRACE_H +/** + @file qurt_trace.h + @brief Prototypes of system call tracing helpers API + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021-2023 by Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + GLOBAL VARIABLES +=============================================================================*/ +/** @cond internal_only */ +/** @addtogroup etm_macros +@{ */ +/* ETM trace types. */ +#define QURT_ETM_TYPE_PC_ADDR (1U<<0) /**< PC address.*/ +#define QURT_ETM_TYPE_MEMORY_ADDR (1U<<1) /**< Memory address. */ +#define QURT_ETM_TYPE_TESTBUS (1U<<2) /**< Test bus. */ +#define QURT_ETM_TYPE_CYCLE_ACCURATE (1U<<3) /**< Cycle accurate. */ +#define QURT_ETM_TYPE_CYCLE_COARSE (1U<<4) /**< Cycle coarse. */ +#define QURT_ETM_TYPE_PC_AND_MEMORY_ADDR (QURT_ETM_TYPE_PC_ADDR|QURT_ETM_TYPE_MEMORY_ADDR) /**< PC and memory address. */ +#define QURT_ETM_TYPE_PC_ADDR_AND_TESTBUS (QURT_ETM_TYPE_PC_ADDR|QURT_ETM_TYPE_TESTBUS) /**< PC address and test bus. */ +#define QURT_ETM_TYPE_MEMORY_ADDR_AND_TESTBUS (QURT_ETM_TYPE_MEMORY_ADDR|QURT_ETM_TYPE_TESTBUS) /**< Memory address and test bus.*/ +#define QURT_ETM_TYPE_PC_AND_MEMORY_ADDR_AND_TESTBUS (QURT_ETM_TYPE_PC_ADDR|QURT_ETM_TYPE_MEMORY_ADDR|QURT_ETM_TYPE_TESTBUS) /**< PC, memory address, and test bus. */ + +/* ETM routes. */ +#define QURT_ETM_ROUTE_TO_QDSS 0U /**< ETM route to QDSS. */ +#define QURT_ETM_ROUTE_TO_Q6ETB 1U /**< ETM route to Q6ETB. */ + +/* ETM filters. */ +#define QURT_ETM_TRACE_FILTER_ALL_DEFAULT 0U /*< Filter all as default. */ +#define QURT_ETM_TRACE_FILTER_HNUM0 (1U<<0) /*< Filter HNUM0. */ +#define QURT_ETM_TRACE_FILTER_HNUM1 (1U<<1) /*< Filter HNUM1. */ +#define QURT_ETM_TRACE_FILTER_HNUM2 (1U<<2) /*< Filter HNUM2. 
*/ +#define QURT_ETM_TRACE_FILTER_HNUM3 (1U<<3) /*< Filter HNUM3. */ +#define QURT_ETM_TRACE_FILTER_HNUM4 (1U<<4) /*< Filter HNUM4. */ +#define QURT_ETM_TRACE_FILTER_HNUM5 (1U<<5) /*< Filter HNUM5. */ +#define QURT_ETM_TRACE_FILTER_HNUM6 (1U<<6) /*< Filter HNUM6. */ +#define QURT_ETM_TRACE_FILTER_HNUM7 (1U<<7) /*< Filter HNUM7. */ +#define QURT_ETM_TRACE_FILTER_HNUM8 (1U<<8) /*< Filter HNUM8. */ +#define QURT_ETM_TRACE_FILTER_HNUM9 (1U<<9) /*< Filter HNUM9. */ +#define QURT_ETM_TRACE_FILTER_HNUM10 (1U<<10) /*< Filter HNUM10. */ +#define QURT_ETM_TRACE_FILTER_HNUM11 (1U<<11) /*< Filter HNUM11. */ +#define QURT_ETM_TRACE_FILTER_HNUM12 (1U<<12) /*< Filter HNUM12. */ +#define QURT_ETM_TRACE_FILTER_HNUM13 (1U<<13) /*< Filter HNUM13. */ +#define QURT_ETM_TRACE_FILTER_HNUM14 (1U<<14) /*< Filter HNUM14. */ +#define QURT_ETM_TRACE_FILTER_HNUM15 (1U<<15) /*< Filter HNUM15. */ +#define QURT_ETM_TRACE_FILTER_ALL QURT_ETM_TRACE_FILTER_ALL_DEFAULT + +#define QURT_ETM_TRACE_FILTER_CLUSTER0 (1<<16) /*< Filter trace cluster0 address. */ +#define QURT_ETM_TRACE_FILTER_CLUSTER1 (1<<17) /*< Filter trace cluster1 address. */ +#define QURT_ETM_TRACE_FILTER_PC_RANGE (1<<19) /*< Filter PC address range. */ + +/* ETM memory source - PC or data access */ +#define QURT_ETM_SOURCE_PC 0U /**< ETM memory source of SAC* is PC. */ +#define QURT_ETM_SOURCE_DATA 1U /**< ETM memory source of SAC* is data. */ + +/* Period between synchronization traces */ +#define QURT_ETM_ASYNC_PERIOD 0 /**< Async.*/ +#define QURT_ETM_ISYNC_PERIOD 1 /**< Isync.*/ +#define QURT_ETM_GSYNC_PERIOD 2 /**< Gsync. */ + +/* ETM enable flags */ +#define QURT_ETM_OFF 0U /**< ETM off. */ +#define QURT_ETM_ON 1U /**< ETM on. */ +/** @endcond */ +/** @} */ /* end_addtogroup etm_macros */ + +/** @addtogroup function_tracing_macro +@{ */ +/* ETM setup return values */ +#define QURT_ETM_SETUP_OK 0 /**< ETM setup OK. */ +#define QURT_ETM_SETUP_ERR 1 /**< ETM setup error. */ +/** @} */ /* end_addtogroup function_tracing_macro */ +/* ETM breakpoint types */ +#define QURT_ETM_READWRITE_BRKPT 0U /**< ETM read/write breakpoint. */ +#define QURT_ETM_READ_BRKPT 1U /**< ETM read breakpoint. */ +#define QURT_ETM_WRITE_BRKPT 2U /**< ETM write breakpoint. */ +#define QURT_ETM_BRKPT_INVALIDATE 3U /**< Invalidate breakpoint. */ +/** @addtogroup function_tracing_macro +@{ */ +/* ATB status flags */ +#define QURT_ATB_OFF 0 /**< ATB off. */ +#define QURT_ATB_ON 1 /**< ATB on. */ +/** @} */ /* end_addtogroup function_tracing_macro */ +/* DTM enable flags */ +#define QURT_DTM_OFF 0 /**< DTM off. */ +#define QURT_DTM_ON 1 /**< DTM on. */ + +/** @addtogroup function_tracing_datatypes +@{ */ +/**STM trace information. */ +typedef struct qurt_stm_trace_info { + /** @cond */ + unsigned int stm_port_addr[6]; /* STM port address to which trace data must be written.*/ + unsigned int thread_event_id; /* Event ID for context switches.*/ + unsigned int interrupt_event_id; /* Event ID for interrupts. */ + unsigned int marker; /* Marker value that must be written at the beginning of the trace. */ + /** @endcond */ +} qurt_stm_trace_info_t; +/** @} */ /* end_addtogroup function_tracing_datatypes */ +/*============================================================================= + GLOBAL FUNCTIONS +=============================================================================*/ + + +/**@ingroup func_qurt_trace_get_marker + Gets the kernel trace marker.\n + Returns the current value of the kernel trace marker. 
+ The marker consists of a hardware thread identifier and an index into the kernel trace + buffer. The trace buffer records kernel events. + + @note1hang Using this function with qurt_trace_changed() + determines whether certain kernel events occurred in a block of code. + + @return + Integer -- Kernel trace marker. + + @dependencies + None. +*/ +unsigned int qurt_trace_get_marker(void); + +/**@ingroup func_qurt_trace_changed + Determines whether specific kernel events have occurred. \n + Returns a value that indicates whether the specified kernel events are recorded in the + kernel trace buffer since the specified kernel trace marker was obtained. + + The prev_trace_marker parameter specifies a kernel trace marker that was obtained by calling + qurt_trace_get_marker(). + @cond rest_dist For more information on the mask value, see the description of the trace_mask element in + @xhyperref{80VB41992,80-VB419-92}. \n @endcond + + @note1hang Used with qurt_trace_get_marker(), this function determines whether + certain kernel events occurred in a block of code.\n + @note1cont This function cannot determine whether a specific kernel event type has + occurred unless that event type has been enabled in the trace_mask element + of the system configuration file. \n + @note1cont QuRT supports the recording of interrupt and context switch events only (such as + a trace_mask value of 0x3). + + @param[in] prev_trace_marker Previous kernel trace marker. + @param[in] trace_mask Mask value that indicates which kernel events to check for. + + @returns + 1 -- Kernel events of the specified type have occurred since the + specified trace marker was obtained.\n + 0 -- No kernel events of the specified type have occurred since the + specified trace marker was obtained. + + @dependencies + None. +*/ +int qurt_trace_changed(unsigned int prev_trace_marker, unsigned int trace_mask); + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/** @addtogroup function_tracing_macro +@{ */ +#ifndef QURT_DEBUG +#define QURT_TRACE(str, ...) __VA_ARGS__ + /**< Function tracing is implemented with the QURT_TRACE debug macro, which + optionally generates printf statements both before and after every function call that is + passed as a macro argument. + + For example, in the following macro calls in the source code: + @code + QURT_TRACE(myfunc, my_func(33)) + + @endcode + generates the following debug output: + @code + myfile:nnn: my_func >>> calling my_func(33) + myfile:nnn: my_func >>> returned my_func(33) + @endcode + The debug output includes the source file and line number of the function call, along with + the text of the call. Compile the client source file with -D __FILENAME__ + defined for its file name. + + The library function qurt_printf() generates the debug output. + The QURT_DEBUG symbol controls generation of the debug output. If this symbol is + not defined, function tracing is not generated.\n + @note1hang The debug macro is accessed through the QuRT API header file. + */ +#else +#define QURT_TRACE(str, ...) \ + do { \ + qurt_printf("%s:%d: %s: >>> calling %s\n",__FILENAME__,__LINE__,(str),#__VA_ARGS__); \ + __VA_ARGS__; \ + qurt_printf("%s:%d: %s: <<< %s returned\n",__FILENAME__,__LINE__,(str),#__VA_ARGS__); \ + } while (0); +#endif +/** @} */ /* end_addtogroup function_tracing_macro */ + +/**@ingroup func_qurt_etm_set_pc_range + Sets the PC address range for ETM filtering. 
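+
+ A usage sketch (hypothetical, not normative): detecting kernel events across a code
+ block with the marker pair described above; 0x3 is the interrupt-plus-context-switch
+ mask mentioned in the notes, and do_work() is a placeholder:
+ @code
+ unsigned int marker = qurt_trace_get_marker();
+ do_work();
+ if (qurt_trace_changed(marker, 0x3)) {
+     qurt_printf("interrupt or context switch occurred\n");
+ }
+ @endcode
+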
+
+ Depending on the Hexagon core design, a maximum of four PC ranges are supported.
+
+ @param[in] range_num 0 to 3.
+ @param[in] low_addr Lower boundary of PC address range.
+ @param[in] high_addr Higher boundary of PC address range.
+
+ @returns
+ #QURT_ETM_SETUP_OK -- Success. \n
+ #QURT_ETM_SETUP_ERR -- Failure.
+
+ @dependencies
+ None.
+*/
+unsigned int qurt_etm_set_pc_range(unsigned int range_num, unsigned int low_addr, unsigned int high_addr);
+
+/**@ingroup func_qurt_etm_set_range
+ Sets the address range for ETM filtering.
+ Allows the user to select the address source type: #QURT_ETM_SOURCE_PC or #QURT_ETM_SOURCE_DATA.
+
+ @param[in] addr_source_type Type of the address source:\n
+ - #QURT_ETM_SOURCE_PC \n
+ - #QURT_ETM_SOURCE_DATA @tablebulletend
+ @param[in] trig_block_num 0 to 3.
+ @param[in] pid PID of the process:
+ 1. Any valid PID number enables ASID-based trace filtering.
+ 2. QURT_ETM_NO_PID -- Disables ASID-based trace filtering.
+ @param[in] low_addr Lower boundary of PC address range.
+ @param[in] high_addr Higher boundary of PC address range.
+
+ @returns
+ #QURT_ETM_SETUP_OK -- Success. \n
+ #QURT_ETM_SETUP_ERR -- Failure.
+
+ @dependencies
+ None.
+*/
+unsigned int qurt_etm_set_range(unsigned int addr_source_type, unsigned int trig_block_num, unsigned int pid, unsigned int low_addr, unsigned int high_addr);
+
+/**@ingroup func_qurt_etm_set_atb
+ Sets the advanced trace bus (ATB) state to notify QuRT that the ATB is actively enabled or disabled.
+ QuRT performs the corresponding actions at low power management.
+
+ @param[in] flag Values: \n
+ #QURT_ATB_ON \n
+ #QURT_ATB_OFF
+
+ @returns
+ #QURT_ETM_SETUP_OK -- Success. \n
+ #QURT_ETM_SETUP_ERR -- Failure.
+
+ @dependencies
+ None.
+*/
+unsigned int qurt_etm_set_atb(unsigned int flag);
+
+/**@ingroup func_qurt_etm_set_sync_period
+ Sets the period for types of synchronization trace packets. \n
+ ASYNC defines the period between alignment synchronization packets.
+ Period is in terms of bytes in the packet stream. \n
+ ISYNC defines the period between instruction synchronization packets.
+ Period is per thread and is defined as the bytes sent out for that thread. \n
+ GSYNC defines the period, in thread cycles, between GSYNC packets.
+
+ @param[in] sync_type Type of synchronization packets: \n
+ #QURT_ETM_ASYNC_PERIOD \n
+ #QURT_ETM_ISYNC_PERIOD \n
+ #QURT_ETM_GSYNC_PERIOD
+ @param[in] period Period value.
+
+ @return
+ #QURT_ETM_SETUP_OK -- Success. \n
+ #QURT_ETM_SETUP_ERR -- Failure.
+
+ @dependencies
+ None.
+ */
+unsigned int qurt_etm_set_sync_period(unsigned int sync_type, unsigned int period);
+
+/**@ingroup func_qurt_stm_trace_set_config
+ Sets up an STM port for tracing events.
+
+ @datatypes
+ #qurt_stm_trace_info_t
+
+ @param[in] stm_config_info Pointer to the STM trace information used to set up the trace
+ in the kernel.
+ The structure must have the following:\n
+ - One port address per hardware thread \n
+ - Event ID for context switches \n
+ - Event ID for interrupt tracing \n
+ - Header or marker to identify the beginning of the trace. @tablebulletend
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EINVALID -- Failure; possibly because the passed port address is not in the page table.
+
+ @dependencies
+ None.
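+
+ A configuration sketch (hypothetical, not normative) for the STM structure described
+ above; the port addresses, event IDs, and marker value are placeholders:
+ @code
+ qurt_stm_trace_info_t stm = {
+     .stm_port_addr      = { 0xA0000000u, 0xA0000100u, 0xA0000200u,
+                             0xA0000300u, 0xA0000400u, 0xA0000500u },
+     .thread_event_id    = 1u,
+     .interrupt_event_id = 2u,
+     .marker             = 0x53544D31u,              /* "STM1" */
+ };
+ unsigned int rc = qurt_stm_trace_set_config(&stm);  /* QURT_EOK on success */
+ @endcode
+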
+ */ +unsigned int qurt_stm_trace_set_config(qurt_stm_trace_info_t *stm_config_info); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_TRACE_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_types.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_types.h new file mode 100755 index 0000000000000..bdb83a3fe2fb2 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_types.h @@ -0,0 +1,294 @@ +#ifndef QURT_TYPES_H +#define QURT_TYPES_H +/** + @file qurt_types.h + @brief Contains types common to all configurations + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + + +//#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +#define PGA_BITFIELD_MASK(hi,lo) (((~0u)>>(31U-((hi)-(lo))))<<(lo)) +#define PGA_BITFIELD_GET(x,hi,lo) (((x)&PGA_BITFIELD_MASK((hi),(lo)))>>(lo)) +#define PGA_BITFIELD_INS(hi,lo,v) (((v)<<(lo))&PGA_BITFIELD_MASK((hi),(lo))) +#define PGA_BITFIELD_SET(x,hi,lo,v) ((x)=((x)&~PGA_BITFIELD_MASK((hi),(lo)))|PGA_BITFIELD_INS((hi),(lo),(v))) +#define QURT_PGATTR_C_GET(pga) PGA_BITFIELD_GET((pga).pga_value, 3U, 0U) /* Bits 3-0: cache */ +#define QURT_PGATTR_A_GET(pga) PGA_BITFIELD_GET((pga).pga_value, 5U, 4U) /* Bits 5-4: bus attr */ +#define QURT_PGATTR_C_SET(pga,v) PGA_BITFIELD_SET((pga).pga_value, 3U, 0U, (v)) /* Bits 3-0: cache */ +#define QURT_PGATTR_A_SET(pga,v) PGA_BITFIELD_SET((pga).pga_value, 5U, 4U, (v)) /* Bits 5-4: bus attr */ +#define QURT_PGATTR_MKRAW(v) ((qurt_pgattr_t){.pga_value = (v)}) +#define QURT_PGATTR_MK(c,a) QURT_PGATTR_MKRAW(PGA_BITFIELD_INS(3U,0U,(c))|PGA_BITFIELD_INS(5U,4U,(a))) + +/*return types for qurt_island_get_status2*/ +#define QURT_ISLAND_MODE_NORMAL 0U /**< Normal operating mode */ +#define QURT_ISLAND_MODE_ISLAND 1U /**< Island mode */ +#define QURT_ISLAND_MODE_EXITING 2U /**< In transition from Island mode to Normal mode */ + +/*============================================================================= + FORWARD DECLARATIONS & TYPEDEFS +=============================================================================*/ +/** @addtogroup memory_management_types +@{ */ +typedef unsigned int qurt_addr_t; /**< QuRT address type.*/ +typedef unsigned int qurt_paddr_t; /**< QuRT physical memory address type. */ +/** @cond rest_reg_dist */ +typedef unsigned long long qurt_addr_64_t; /**< QuRT 64-bit memory address type. */ +typedef unsigned long long qurt_paddr_64_t; /**< QuRT 64-bit physical memory address type. */ +typedef unsigned int qurt_mem_region_t; /**< QuRT memory regions type. */ +typedef unsigned int qurt_mem_fs_region_t; /**< QuRT memory FS region type. */ +/**@endcond */ +typedef unsigned int qurt_mem_pool_t; /**< QuRT memory pool type.*/ +typedef unsigned int qurt_size_t; /**< QuRT size type. */ +/** @cond */ +typedef unsigned long long qurt_mmu_entry_t;/**< QuRT MMU entry type. 
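+
+ A usage sketch (hypothetical, not normative) for the page-attribute macros above:
+ @code
+ qurt_pgattr_t pga = QURT_PGATTR_MK(7u, 0u);  /* cache field = 7, bus attribute = 0 */
+ unsigned int c = QURT_PGATTR_C_GET(pga);     /* c == 7 */
+ QURT_PGATTR_A_SET(pga, 1u);                  /* rewrite bits 5-4 in place */
+ @endcode
+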
*/ +#define QURT_PHYSPOOL_NAME_LEN (32) +typedef char qurt_physpool_name_t[QURT_PHYSPOOL_NAME_LEN]; + + +/* + * Mapping type + * + * QMEM_MAPPING_VIRTUAL is the default mode, in which the system + * picks up the available range of the virtual address, and maps it to + * available contiguous physical addresses. Physical-to-virtual + * is not guaranteed to be 1:1; both virtual and physical memory is + * contiguous. + * + * In QMEM_MAPPING_IDEMPOTENT mode, the user provides the physical address; + * the kernel allocates 1:1 physical-to-virtual memory. Primary use of + * of this mapping is to allocate physical-to-virtual memory 1:1. + * + * In QMEM_MAPPING_PHYS_CONTIGUOUS mode, the virtual address might + * not be the same as the physical address. But the physical address of the + * memory region is guaranteed to be contiguous starting at the provided + * address, it is required to provide a fixed physical address. The primary + * use of this mapping is to allocate physical memory from a particular + * address, where 1:1 physical-to-virtual is not required. + * + * QMEM_MAPPING_NONE mode must be used to reserve a virtual memory + * area (VMA); no physical memory is reserved or mapped to this virtual + * space; all standard qmem_region APIs apply to a VMA, however physical + * address is always INVALID_ADDR. qmem_region_create() in this mode + * returns a handle to the VMA, both virt_addr and phys_addr must + * be set to INVALID_ADDR, kernel allocates any available virtual + * memory of the specified size. Obtain the starting virtual address + * of VMA through qmem_region_attr_getvirtaddr(). + * Primary purpose of this mapping mode is to provide a mechanism for + * delayed binding in QuRT, for example reserve virtual memory and map it at + * some later time to possibly discontiguous physical blocks. Thus, a + * single VMA can be partitioned among several physical-virtual mappings + * created via qmem_region_create() with QMEM_VIRTUAL_FIXED mapping mode. + * Each VMA keeps track of associated mapped regions. + * Deletion of VMA succeeds only if all associated "virtual_fixed" + * regions are freed prior to VMA deletion. + * + * Use QMEM_MAPPING_VIRTUAL_FIXED mode to create a region + * from virtual space that has been reserved via qmem_region_create() + * with QMEM_MAPPING_NONE mapping. A valid virt_add is required, if + * phys_addr is specified, the kernel attempts to map it accordingly, + * if no phys_addr is specified, kernel maps any available physical + * memory. All standard qmem_region APIs apply to such region. Remapping + * a virtual range without prior freeing of the region is not permitted. + * When such region is deleted its corresponding VMA remains intact. + * + * QMEM_MAPPING_PHYS_DISCONTIGUOUS mode can obtain contiguous + * virtual memory but physical memory can be discontiguous. This method + * tries to club small physical memory blocks to obtain requested + * memory and is useful in case where there is no contiguous full block + * of requested size. If client does not need contiguous physical memory, + * (for example, if client does not use physical addressing), this helps + * use smaller physical memory blocks rather than using contiguous memory. + * Note: When memory is allocated through this method, physical address is + * not returned to the caller using the qurt_mem_region_attr_get() API as there might + * not be a single physical address. + * + */ +/**@endcond */ +/** QuRT memory region mapping type. */ +typedef enum { + QURT_MEM_MAPPING_VIRTUAL=0, /**< Default mode. 
The region virtual address range maps to an + available contiguous area of physical memory. For the most + efficient use of virtual memory, the QuRT system + chooses the base address in physical memory. This works for most memory + use cases.*/ + QURT_MEM_MAPPING_PHYS_CONTIGUOUS = 1, /**< The region virtual address space must be mapped to a + contiguous area of physical memory. This is necessary when the + memory region is accessed by external devices that bypass Hexagon + virtual memory addressing. The base address in physical + memory must be explicitly specified.*/ + QURT_MEM_MAPPING_IDEMPOTENT=2, /**< Region virtual address space maps + to the identical area of physical memory. */ + QURT_MEM_MAPPING_VIRTUAL_FIXED=3, /**< Virtual address space of the region maps either to the + specified area of physical memory or (if no area is specified) + to available physical memory. Use this mapping to create + regions from virtual space that was reserved by calling + qurt_mem_region_create() with mapping. */ + QURT_MEM_MAPPING_NONE=4, /**< Reserves a virtual memory area (VMA). Remapping a virtual range is not + permitted without first deleting the memory region. When such a region is + deleted, its corresponding virtual memory addressing remains intact. */ + QURT_MEM_MAPPING_VIRTUAL_RANDOM=7, /**< System chooses a random virtual address and + maps it to available contiguous physical addresses.*/ + QURT_MEM_MAPPING_PHYS_DISCONTIGUOUS=8, /**< While virtual memory is contiguous, allocates in discontiguous physical + memory blocks. This helps when there are smaller contiguous blocks + than the requested size. + Physical address is not provided as part of the get_attr call */ + QURT_MEM_MAPPING_INVALID=10, /**< Reserved as an invalid mapping type. */ +} qurt_mem_mapping_t; + + +/** QuRT cache mode type. */ +typedef enum { + QURT_MEM_CACHE_WRITEBACK=7, /**< Write back. */ + QURT_MEM_CACHE_NONE_SHARED=6, /**< Normal uncached memory that can be shared with other subsystems.*/ + QURT_MEM_CACHE_WRITETHROUGH=5, /**< Write through. */ + QURT_MEM_CACHE_WRITEBACK_NONL2CACHEABLE=0, /**< Write back non-L2-cacheable.*/ + QURT_MEM_CACHE_WRITETHROUGH_NONL2CACHEABLE=1, /**< Write through non-L2-cacheable. */ + QURT_MEM_CACHE_WRITEBACK_L2CACHEABLE=QURT_MEM_CACHE_WRITEBACK, /**< Write back L2 cacheable. */ + QURT_MEM_CACHE_WRITETHROUGH_L2CACHEABLE=QURT_MEM_CACHE_WRITETHROUGH, /**< Write through L2 cacheable. */ + QURT_MEM_CACHE_DEVICE = 4, /**< Volatile memory-mapped device. Access to device memory cannot be cancelled by interrupts, re-ordered, or replayed.*/ + QURT_MEM_CACHE_NONE = 4, /**< Deprecated -- use #QURT_MEM_CACHE_DEVICE instead. */ + QURT_MEM_CACHE_DEVICE_SFC = 2, /**< Enables placing limitations on the number of outstanding transactions. */ + QURT_MEM_CACHE_INVALID=10, /**< Reserved as an invalid cache type. */ +} qurt_mem_cache_mode_t; + +/** Memory access permission. */ +#define QURT_PERM_NONE 0x0U /**< No permission. */ +#define QURT_PERM_READ 0x1U /**< Read permission. */ +#define QURT_PERM_WRITE 0x2U /**< Write permission. */ +#define QURT_PERM_EXECUTE 0x4U /**< Execution permission. */ +#define QURT_PERM_NODUMP 0x8U + /**< Skip dumping the mapping. During process domain dump, must skip + some mappings on host memory to avoid a race condition + where the memory is removed from the host and DSP process + crashed before the mapping is removed. */ +#define QURT_PERM_FULL QURT_PERM_READ | QURT_PERM_WRITE | QURT_PERM_EXECUTE /**< Read, write, and execute permission. 
*/ + +typedef unsigned char qurt_perm_t; + + +/** @cond rest_reg_dist*/ +/** QuRT cache type; specifies data cache or instruction cache. */ +typedef enum { + QURT_MEM_ICACHE, /**< Instruction cache.*/ + QURT_MEM_DCACHE /**< Data cache.*/ +} qurt_mem_cache_type_t; + +/** QuRT cache operation code type. */ +typedef enum { + QURT_MEM_CACHE_FLUSH, /**< Flush. */ + QURT_MEM_CACHE_INVALIDATE, /**< Invalidate */ + QURT_MEM_CACHE_FLUSH_INVALIDATE, /**< Flush invalidate. */ + QURT_MEM_CACHE_FLUSH_ALL, /**< Flush all. */ + QURT_MEM_CACHE_FLUSH_INVALIDATE_ALL, /**< Flush invalidate all. */ + QURT_MEM_CACHE_TABLE_FLUSH_INVALIDATE, /**< Table flush invalidate. */ + QURT_MEM_CACHE_FLUSH_INVALIDATE_L2, /**< L2 flush invalidate.*/ +} qurt_mem_cache_op_t; + +/** QuRT memory region type. */ +typedef enum { + QURT_MEM_REGION_LOCAL=0, /**< Local. */ + QURT_MEM_REGION_SHARED=1, /**< Shared.*/ + QURT_MEM_REGION_USER_ACCESS=2, /**< User access. */ + QURT_MEM_REGION_FS=4, /**< FS. */ + QURT_MEM_REGION_INVALID=10, /**< Reserved as an invalid region type. */ +} qurt_mem_region_type_t; + +/* Cache and bus attributes are combined into a value of this type for convenience, + and macros for combining and extracting fields are defined here. */ +/** @cond */ +struct qurt_pgattr { + unsigned pga_value; /**< PGA value.*/ +}; +typedef struct qurt_pgattr qurt_pgattr_t; +/** @endcond */ +/** QuRT memory region attributes type.*/ +/* QMEM_MAPPING_IDEMPOTENT and QMEM_MAPPING_PHYS_CONTIGUOUS mode can specify physaddr. + virtaddr cannot be specified for a memory region, it can only be queried by the + qmem_attr_getvirtaddr() function. + */ +typedef struct { + /** @cond */ + qurt_mem_mapping_t mapping_type; + unsigned char perms; + unsigned short owner; + qurt_pgattr_t pga; + unsigned ppn; //physical page number (physical>>12) + qurt_addr_t virtaddr; + qurt_mem_region_type_t type; + qurt_size_t size; + /** @endcond */ +} qurt_mem_region_attr_t; + + +/** QuRT user physical memory pool type. */ +typedef struct { + /** @cond */ + char name[32]; + struct ranges{ + unsigned int start; + unsigned int size; + } ranges[MAX_POOL_RANGES]; + /** @endcond */ +} qurt_mem_pool_attr_t; + +/** QuRT memory pool status type.*/ +typedef struct _qurt_mem_pool_status { + + qurt_size_t contig_size; /**< Largest contiguous free memory in bytes. */ + qurt_size_t free_size; /**< Total free memory in bytes. */ + qurt_size_t total_size; /**< Total declared memory in bytes. */ + +} qurt_mem_pool_status_t; + +typedef enum { + HEXAGON_L1_I_CACHE = 0, /**< Hexagon L1 instruction cache. */ + HEXAGON_L1_D_CACHE = 1, /**< Hexagon L1 data cache. */ + HEXAGON_L2_CACHE = 2 /**< Hexagon L2 cache. */ +} qurt_cache_type_t; + +typedef enum { + FULL_SIZE = 0, /**< Fully shared cache, without partitioning. */ + HALF_SIZE = 1, /**< 1/2 for main, 1/2 for auxiliary. */ + THREE_QUARTER_SIZE = 2, /**< 3/4 for main, 1/4 for auxiliary. */ + SEVEN_EIGHTHS_SIZE = 3 /**< 7/8 for main, 1/8 for auxiliary; for L2 cache only. */ +} qurt_cache_partition_size_t; + +typedef enum { + QURT_PROCESS_CB_GENERIC, /**< generic unconditional cb called after image loading. */ + QURT_PROCESS_NOTE_CB_PRE_MAP, /**< note cb called before segment loading. */ + QURT_PROCESS_NOTE_CB_POST_MAP /**< note cb called after segment loading. 
*/ +} qurt_process_cb_type_t; + +typedef union { + void *ptr; + int num; +} qurt_process_callback_arg_t; + + +/**@endcond*/ + +/** @} */ /* end_addtogroup memory_management_types */ +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_TYPES_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_user_dma.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_user_dma.h new file mode 100755 index 0000000000000..e05a6429fd703 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_user_dma.h @@ -0,0 +1,44 @@ +#ifndef QURT_USER_DMA_H +#define QURT_USER_DMA_H + +/** + @file qurt_user_dma.h + @brief Definitions, macros, and prototypes used for handling user DMA. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup qurt_user_dma_dmsyncht + Sends the DMSyncht command to the user DMA engine. + + Call this function to ensure all posted DMA memory operations are + complete. + + This stalls the current thread until the instruction + is complete and returns. + + @return + QURT_EOK - On dmsyncht completion \n + QURT_ENOTSUPPORTED - User DMA not supported + + @dependencies + None. +*/ +int qurt_user_dma_dmsyncht(void); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_vtlb.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_vtlb.h new file mode 100755 index 0000000000000..e064042e447ac --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_vtlb.h @@ -0,0 +1,76 @@ +/*============================================================================= + + qurt_vtlb.h + +GENERAL DESCRIPTION + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2019, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. +=============================================================================*/ +#ifndef QURT_VTLB_H +#define QURT_VTLB_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* +|| Names starting with "qurt_i_vtlb" are the internal low-level functions. +|| These should be considered subject to change. 
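+||
+|| A usage sketch (hypothetical, not normative): reading the VTLB statistics triple per
+|| the layout documented at the declaration below, assuming the usual QuRT convention
+|| that 0 (QURT_EOK) indicates success and that qurt_printf() is available:
+||
+||   unsigned stats[3];
+||   if (qurt_i_vtlb_statistics(stats) == 0) {
+||       qurt_printf("vtlb: %u total, %u free, max tree size %u\n",
+||                   stats[0], stats[1], stats[2]);
+||   }
+||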
+*/ + +int qurt_i_vtlb_entry_create(unsigned *pIndex, + unsigned tlb_lo, + unsigned tlb_hi, + unsigned extension); + +int qurt_i_vtlb_entry_create_with_pid(unsigned *pIndex, + unsigned tlb_lo, + unsigned tlb_hi, + unsigned extension, + unsigned target_pid); + +int qurt_i_vtlb_entry_delete(unsigned index); + +int qurt_i_vtlb_entry_read(unsigned index, unsigned *tlbinfo); + +int qurt_i_vtlb_entry_write(unsigned index, unsigned tlb_lo, unsigned tlb_hi, unsigned extension); + +int qurt_i_vtlb_entry_write_with_pid(unsigned index, unsigned tlb_lo, unsigned tlb_hi, unsigned extension, unsigned target_pid); + +int qurt_i_vtlb_entry_probe(const void *vaddr, unsigned *tlbinfo, unsigned *pIndex); + +int qurt_i_vtlb_entry_probe_with_pid(const void *vaddr, unsigned *tlbinfo, unsigned *pIndex, unsigned target_pid); + + +int qurt_i_vtlb_statistics(unsigned *stats); // Returns stats[0] -- total number of VTLB entries + // stats[1] -- number of available VTLB entries + // stats[2] -- max size of VTLB tree since boot + +//can return index to an entry that was specialed, change it to take addresses instead of pages +int qurt_i_vtlb_set_special(int index, unsigned pageno, unsigned asid, unsigned size); + +int qurt_i_vtlb_queue_ppage(unsigned pageno, unsigned vtlb_index); + +#define QURT_VTLB_EXT_DEFAULT 0U +#define QURT_VTLB_EXT_LOCKED 1U +#define QURT_VTLB_EXT_EXCLUDE_DUMP 2U /* Temporary ability to skip certain mappings in pd dump */ +#define QURT_VTLB_EXT_FREELIST 0x800000u + +#define QURT_VTLB_ERR_OVERLAP -64 +#define QURT_VTLB_ERR_TREE_NO_SPACE -65 +#define QURT_VTLB_ERR_INVALID_SIZE -68 +#define QURT_VTLB_ERR_INVALID_EXT -69 +#define QURT_VTLB_ERR_DEL_PGT_LOCKED -70 +#define QURT_VTLB_ERR_PGT_LOCK_CNT -71 + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif // QURT_VTLB_H diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/libposix.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/libposix.a new file mode 100755 index 0000000000000..6d29c02c51601 Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/libposix.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/libqurt.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/libqurt.a new file mode 100755 index 0000000000000..8d97bbd7c3b58 Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/libqurt.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/libqurtcfs.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/libqurtcfs.a new file mode 100755 index 0000000000000..eac612a670347 Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/libqurtcfs.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/libtimer_island.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/libtimer_island.a new file mode 100755 index 0000000000000..7e5653a98850c Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/libtimer_island.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/libtimer_main.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/libtimer_main.a new file mode 100755 index 0000000000000..f01114822787c Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/libtimer_main.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/pic/libposix.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/pic/libposix.a new file mode 100755 
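A minimal usage sketch for the two headers above; illustrative only. The hypothetical helper below assumes that 0 (QURT_EOK, declared elsewhere in the SDK) is the success code for both calls, and note the header's own warning that the qurt_i_vtlb_* functions are internal and subject to change.

    #include <stdio.h>
    #include "qurt_user_dma.h"
    #include "qurt_vtlb.h"

    static void drain_dma_and_dump_vtlb(void)   /* hypothetical helper */
    {
        /* Stall this thread until all posted user-DMA operations complete. */
        if (qurt_user_dma_dmsyncht() != 0) {    /* 0 == QURT_EOK assumed */
            printf("user DMA not supported on this target\n");
            return;
        }
        /* stats[0] = total VTLB entries, stats[1] = available entries,
           stats[2] = max size of the VTLB tree since boot. */
        unsigned stats[3] = {0, 0, 0};
        if (qurt_i_vtlb_statistics(stats) == 0) {   /* success code assumed */
            printf("VTLB: %u total, %u available, max tree size %u\n",
                   stats[0], stats[1], stats[2]);
        }
    }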
index 0000000000000..e8007300d0e4a Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/pic/libposix.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/pic/libqurt.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/pic/libqurt.a new file mode 100755 index 0000000000000..c5977b8c3cc5e Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/pic/libqurt.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/pic/libqurtcfs.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/pic/libqurtcfs.a new file mode 100755 index 0000000000000..eac612a670347 Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/pic/libqurtcfs.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/pic/libtimer.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/pic/libtimer.a new file mode 100755 index 0000000000000..a8bd4da88cace Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/pic/libtimer.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/bits/confname.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/bits/confname.h new file mode 100755 index 0000000000000..d9ca3135501e3 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/bits/confname.h @@ -0,0 +1,528 @@ +#ifndef CONFNAME_H +#define CONFNAME_H +/** + @file confname.h + @brief Named literals for 'name' argument of sysconf, pathconf + +EXTERNAL FUNCTIONS + None + +INITIALIZATION AND SEQUENCING REQUIREMENTS + DONT include this header directly. Instead include unistd.h. For now since + toolchain doesnt provide a hook by including bits/confname.h, we stick this + header in QuRT's sys/types.h + +Copyright (c) 2018, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +==============================================================================*/ +/* Values for the NAME argument to `pathconf' and `fpathconf'. */ +enum +{ + _PC_LINK_MAX, +#define _PC_LINK_MAX _PC_LINK_MAX + _PC_MAX_CANON, +#define _PC_MAX_CANON _PC_MAX_CANON + _PC_MAX_INPUT, +#define _PC_MAX_INPUT _PC_MAX_INPUT + _PC_NAME_MAX, +#define _PC_NAME_MAX _PC_NAME_MAX + _PC_PATH_MAX, +#define _PC_PATH_MAX _PC_PATH_MAX + _PC_PIPE_BUF, +#define _PC_PIPE_BUF _PC_PIPE_BUF + _PC_CHOWN_RESTRICTED, +#define _PC_CHOWN_RESTRICTED _PC_CHOWN_RESTRICTED + _PC_NO_TRUNC, +#define _PC_NO_TRUNC _PC_NO_TRUNC + _PC_VDISABLE, +#define _PC_VDISABLE _PC_VDISABLE + _PC_SYNC_IO, +#define _PC_SYNC_IO _PC_SYNC_IO + _PC_ASYNC_IO, +#define _PC_ASYNC_IO _PC_ASYNC_IO + _PC_PRIO_IO, +#define _PC_PRIO_IO _PC_PRIO_IO + _PC_SOCK_MAXBUF, +#define _PC_SOCK_MAXBUF _PC_SOCK_MAXBUF + _PC_FILESIZEBITS, +#define _PC_FILESIZEBITS _PC_FILESIZEBITS + _PC_REC_INCR_XFER_SIZE, +#define _PC_REC_INCR_XFER_SIZE _PC_REC_INCR_XFER_SIZE + _PC_REC_MAX_XFER_SIZE, +#define _PC_REC_MAX_XFER_SIZE _PC_REC_MAX_XFER_SIZE + _PC_REC_MIN_XFER_SIZE, +#define _PC_REC_MIN_XFER_SIZE _PC_REC_MIN_XFER_SIZE + _PC_REC_XFER_ALIGN, +#define _PC_REC_XFER_ALIGN _PC_REC_XFER_ALIGN + _PC_ALLOC_SIZE_MIN, +#define _PC_ALLOC_SIZE_MIN _PC_ALLOC_SIZE_MIN + _PC_SYMLINK_MAX, +#define _PC_SYMLINK_MAX _PC_SYMLINK_MAX + _PC_2_SYMLINKS +#define _PC_2_SYMLINKS _PC_2_SYMLINKS +}; + +/* Values for the argument to `sysconf'. 
*/ +enum +{ + _SC_ARG_MAX, +#define _SC_ARG_MAX _SC_ARG_MAX + _SC_CHILD_MAX, +#define _SC_CHILD_MAX _SC_CHILD_MAX + _SC_CLK_TCK, +#define _SC_CLK_TCK _SC_CLK_TCK + _SC_NGROUPS_MAX, +#define _SC_NGROUPS_MAX _SC_NGROUPS_MAX + _SC_OPEN_MAX, +#define _SC_OPEN_MAX _SC_OPEN_MAX + _SC_STREAM_MAX, +#define _SC_STREAM_MAX _SC_STREAM_MAX + _SC_TZNAME_MAX, +#define _SC_TZNAME_MAX _SC_TZNAME_MAX + _SC_JOB_CONTROL, +#define _SC_JOB_CONTROL _SC_JOB_CONTROL + _SC_SAVED_IDS, +#define _SC_SAVED_IDS _SC_SAVED_IDS + _SC_REALTIME_SIGNALS, +#define _SC_REALTIME_SIGNALS _SC_REALTIME_SIGNALS + _SC_PRIORITY_SCHEDULING, +#define _SC_PRIORITY_SCHEDULING _SC_PRIORITY_SCHEDULING + _SC_TIMERS, +#define _SC_TIMERS _SC_TIMERS + _SC_ASYNCHRONOUS_IO, +#define _SC_ASYNCHRONOUS_IO _SC_ASYNCHRONOUS_IO + _SC_PRIORITIZED_IO, +#define _SC_PRIORITIZED_IO _SC_PRIORITIZED_IO + _SC_SYNCHRONIZED_IO, +#define _SC_SYNCHRONIZED_IO _SC_SYNCHRONIZED_IO + _SC_FSYNC, +#define _SC_FSYNC _SC_FSYNC + _SC_MAPPED_FILES, +#define _SC_MAPPED_FILES _SC_MAPPED_FILES + _SC_MEMLOCK, +#define _SC_MEMLOCK _SC_MEMLOCK + _SC_MEMLOCK_RANGE, +#define _SC_MEMLOCK_RANGE _SC_MEMLOCK_RANGE + _SC_MEMORY_PROTECTION, +#define _SC_MEMORY_PROTECTION _SC_MEMORY_PROTECTION + _SC_MESSAGE_PASSING, +#define _SC_MESSAGE_PASSING _SC_MESSAGE_PASSING + _SC_SEMAPHORES, +#define _SC_SEMAPHORES _SC_SEMAPHORES + _SC_SHARED_MEMORY_OBJECTS, +#define _SC_SHARED_MEMORY_OBJECTS _SC_SHARED_MEMORY_OBJECTS + _SC_AIO_LISTIO_MAX, +#define _SC_AIO_LISTIO_MAX _SC_AIO_LISTIO_MAX + _SC_AIO_MAX, +#define _SC_AIO_MAX _SC_AIO_MAX + _SC_AIO_PRIO_DELTA_MAX, +#define _SC_AIO_PRIO_DELTA_MAX _SC_AIO_PRIO_DELTA_MAX + _SC_DELAYTIMER_MAX, +#define _SC_DELAYTIMER_MAX _SC_DELAYTIMER_MAX + _SC_MQ_OPEN_MAX, +#define _SC_MQ_OPEN_MAX _SC_MQ_OPEN_MAX + _SC_MQ_PRIO_MAX, +#define _SC_MQ_PRIO_MAX _SC_MQ_PRIO_MAX + _SC_VERSION, +#define _SC_VERSION _SC_VERSION + _SC_PAGESIZE, +#define _SC_PAGESIZE _SC_PAGESIZE +#define _SC_PAGE_SIZE _SC_PAGESIZE + _SC_RTSIG_MAX, +#define _SC_RTSIG_MAX _SC_RTSIG_MAX + _SC_SEM_NSEMS_MAX, +#define _SC_SEM_NSEMS_MAX _SC_SEM_NSEMS_MAX + _SC_SEM_VALUE_MAX, +#define _SC_SEM_VALUE_MAX _SC_SEM_VALUE_MAX + _SC_SIGQUEUE_MAX, +#define _SC_SIGQUEUE_MAX _SC_SIGQUEUE_MAX + _SC_TIMER_MAX, +#define _SC_TIMER_MAX _SC_TIMER_MAX + + /* Values for the argument to `sysconf' + corresponding to _POSIX2_* symbols. 
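+     (Illustration, not in the original header: all of these enumerators are
+     passed directly to sysconf(), e.g. "long lm = sysconf(_SC_LINE_MAX);",
+     which returns -1 when the queried option is unsupported.)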
*/ + _SC_BC_BASE_MAX, +#define _SC_BC_BASE_MAX _SC_BC_BASE_MAX + _SC_BC_DIM_MAX, +#define _SC_BC_DIM_MAX _SC_BC_DIM_MAX + _SC_BC_SCALE_MAX, +#define _SC_BC_SCALE_MAX _SC_BC_SCALE_MAX + _SC_BC_STRING_MAX, +#define _SC_BC_STRING_MAX _SC_BC_STRING_MAX + _SC_COLL_WEIGHTS_MAX, +#define _SC_COLL_WEIGHTS_MAX _SC_COLL_WEIGHTS_MAX + _SC_EQUIV_CLASS_MAX, +#define _SC_EQUIV_CLASS_MAX _SC_EQUIV_CLASS_MAX + _SC_EXPR_NEST_MAX, +#define _SC_EXPR_NEST_MAX _SC_EXPR_NEST_MAX + _SC_LINE_MAX, +#define _SC_LINE_MAX _SC_LINE_MAX + _SC_RE_DUP_MAX, +#define _SC_RE_DUP_MAX _SC_RE_DUP_MAX + _SC_CHARCLASS_NAME_MAX, +#define _SC_CHARCLASS_NAME_MAX _SC_CHARCLASS_NAME_MAX + + _SC_2_VERSION, +#define _SC_2_VERSION _SC_2_VERSION + _SC_2_C_BIND, +#define _SC_2_C_BIND _SC_2_C_BIND + _SC_2_C_DEV, +#define _SC_2_C_DEV _SC_2_C_DEV + _SC_2_FORT_DEV, +#define _SC_2_FORT_DEV _SC_2_FORT_DEV + _SC_2_FORT_RUN, +#define _SC_2_FORT_RUN _SC_2_FORT_RUN + _SC_2_SW_DEV, +#define _SC_2_SW_DEV _SC_2_SW_DEV + _SC_2_LOCALEDEF, +#define _SC_2_LOCALEDEF _SC_2_LOCALEDEF + + _SC_PII, +#define _SC_PII _SC_PII + _SC_PII_XTI, +#define _SC_PII_XTI _SC_PII_XTI + _SC_PII_SOCKET, +#define _SC_PII_SOCKET _SC_PII_SOCKET + _SC_PII_INTERNET, +#define _SC_PII_INTERNET _SC_PII_INTERNET + _SC_PII_OSI, +#define _SC_PII_OSI _SC_PII_OSI + _SC_POLL, +#define _SC_POLL _SC_POLL + _SC_SELECT, +#define _SC_SELECT _SC_SELECT + _SC_UIO_MAXIOV, +#define _SC_UIO_MAXIOV _SC_UIO_MAXIOV + _SC_IOV_MAX = _SC_UIO_MAXIOV, +#define _SC_IOV_MAX _SC_IOV_MAX + _SC_PII_INTERNET_STREAM, +#define _SC_PII_INTERNET_STREAM _SC_PII_INTERNET_STREAM + _SC_PII_INTERNET_DGRAM, +#define _SC_PII_INTERNET_DGRAM _SC_PII_INTERNET_DGRAM + _SC_PII_OSI_COTS, +#define _SC_PII_OSI_COTS _SC_PII_OSI_COTS + _SC_PII_OSI_CLTS, +#define _SC_PII_OSI_CLTS _SC_PII_OSI_CLTS + _SC_PII_OSI_M, +#define _SC_PII_OSI_M _SC_PII_OSI_M + _SC_T_IOV_MAX, +#define _SC_T_IOV_MAX _SC_T_IOV_MAX + + /* Values according to POSIX 1003.1c (POSIX threads). 
*/ + _SC_THREADS, +#define _SC_THREADS _SC_THREADS + _SC_THREAD_SAFE_FUNCTIONS, +#define _SC_THREAD_SAFE_FUNCTIONS _SC_THREAD_SAFE_FUNCTIONS + _SC_GETGR_R_SIZE_MAX, +#define _SC_GETGR_R_SIZE_MAX _SC_GETGR_R_SIZE_MAX + _SC_GETPW_R_SIZE_MAX, +#define _SC_GETPW_R_SIZE_MAX _SC_GETPW_R_SIZE_MAX + _SC_LOGIN_NAME_MAX, +#define _SC_LOGIN_NAME_MAX _SC_LOGIN_NAME_MAX + _SC_TTY_NAME_MAX, +#define _SC_TTY_NAME_MAX _SC_TTY_NAME_MAX + _SC_THREAD_DESTRUCTOR_ITERATIONS, +#define _SC_THREAD_DESTRUCTOR_ITERATIONS _SC_THREAD_DESTRUCTOR_ITERATIONS + _SC_THREAD_KEYS_MAX, +#define _SC_THREAD_KEYS_MAX _SC_THREAD_KEYS_MAX + _SC_THREAD_STACK_MIN, +#define _SC_THREAD_STACK_MIN _SC_THREAD_STACK_MIN + _SC_THREAD_THREADS_MAX, +#define _SC_THREAD_THREADS_MAX _SC_THREAD_THREADS_MAX + _SC_THREAD_ATTR_STACKADDR, +#define _SC_THREAD_ATTR_STACKADDR _SC_THREAD_ATTR_STACKADDR + _SC_THREAD_ATTR_STACKSIZE, +#define _SC_THREAD_ATTR_STACKSIZE _SC_THREAD_ATTR_STACKSIZE + _SC_THREAD_PRIORITY_SCHEDULING, +#define _SC_THREAD_PRIORITY_SCHEDULING _SC_THREAD_PRIORITY_SCHEDULING + _SC_THREAD_PRIO_INHERIT, +#define _SC_THREAD_PRIO_INHERIT _SC_THREAD_PRIO_INHERIT + _SC_THREAD_PRIO_PROTECT, +#define _SC_THREAD_PRIO_PROTECT _SC_THREAD_PRIO_PROTECT + _SC_THREAD_PROCESS_SHARED, +#define _SC_THREAD_PROCESS_SHARED _SC_THREAD_PROCESS_SHARED + + _SC_NPROCESSORS_CONF, +#define _SC_NPROCESSORS_CONF _SC_NPROCESSORS_CONF + _SC_NPROCESSORS_ONLN, +#define _SC_NPROCESSORS_ONLN _SC_NPROCESSORS_ONLN + _SC_PHYS_PAGES, +#define _SC_PHYS_PAGES _SC_PHYS_PAGES + _SC_AVPHYS_PAGES, +#define _SC_AVPHYS_PAGES _SC_AVPHYS_PAGES + _SC_ATEXIT_MAX, +#define _SC_ATEXIT_MAX _SC_ATEXIT_MAX + _SC_PASS_MAX, +#define _SC_PASS_MAX _SC_PASS_MAX + + _SC_XOPEN_VERSION, +#define _SC_XOPEN_VERSION _SC_XOPEN_VERSION + _SC_XOPEN_XCU_VERSION, +#define _SC_XOPEN_XCU_VERSION _SC_XOPEN_XCU_VERSION + _SC_XOPEN_UNIX, +#define _SC_XOPEN_UNIX _SC_XOPEN_UNIX + _SC_XOPEN_CRYPT, +#define _SC_XOPEN_CRYPT _SC_XOPEN_CRYPT + _SC_XOPEN_ENH_I18N, +#define _SC_XOPEN_ENH_I18N _SC_XOPEN_ENH_I18N + _SC_XOPEN_SHM, +#define _SC_XOPEN_SHM _SC_XOPEN_SHM + + _SC_2_CHAR_TERM, +#define _SC_2_CHAR_TERM _SC_2_CHAR_TERM + _SC_2_C_VERSION, +#define _SC_2_C_VERSION _SC_2_C_VERSION + _SC_2_UPE, +#define _SC_2_UPE _SC_2_UPE + + _SC_XOPEN_XPG2, +#define _SC_XOPEN_XPG2 _SC_XOPEN_XPG2 + _SC_XOPEN_XPG3, +#define _SC_XOPEN_XPG3 _SC_XOPEN_XPG3 + _SC_XOPEN_XPG4, +#define _SC_XOPEN_XPG4 _SC_XOPEN_XPG4 + + _SC_CHAR_BIT, +#define _SC_CHAR_BIT _SC_CHAR_BIT + _SC_CHAR_MAX, +#define _SC_CHAR_MAX _SC_CHAR_MAX + _SC_CHAR_MIN, +#define _SC_CHAR_MIN _SC_CHAR_MIN + _SC_INT_MAX, +#define _SC_INT_MAX _SC_INT_MAX + _SC_INT_MIN, +#define _SC_INT_MIN _SC_INT_MIN + _SC_LONG_BIT, +#define _SC_LONG_BIT _SC_LONG_BIT + _SC_WORD_BIT, +#define _SC_WORD_BIT _SC_WORD_BIT + _SC_MB_LEN_MAX, +#define _SC_MB_LEN_MAX _SC_MB_LEN_MAX + _SC_NZERO, +#define _SC_NZERO _SC_NZERO + _SC_SSIZE_MAX, +#define _SC_SSIZE_MAX _SC_SSIZE_MAX + _SC_SCHAR_MAX, +#define _SC_SCHAR_MAX _SC_SCHAR_MAX + _SC_SCHAR_MIN, +#define _SC_SCHAR_MIN _SC_SCHAR_MIN + _SC_SHRT_MAX, +#define _SC_SHRT_MAX _SC_SHRT_MAX + _SC_SHRT_MIN, +#define _SC_SHRT_MIN _SC_SHRT_MIN + _SC_UCHAR_MAX, +#define _SC_UCHAR_MAX _SC_UCHAR_MAX + _SC_UINT_MAX, +#define _SC_UINT_MAX _SC_UINT_MAX + _SC_ULONG_MAX, +#define _SC_ULONG_MAX _SC_ULONG_MAX + _SC_USHRT_MAX, +#define _SC_USHRT_MAX _SC_USHRT_MAX + + _SC_NL_ARGMAX, +#define _SC_NL_ARGMAX _SC_NL_ARGMAX + _SC_NL_LANGMAX, +#define _SC_NL_LANGMAX _SC_NL_LANGMAX + _SC_NL_MSGMAX, +#define _SC_NL_MSGMAX _SC_NL_MSGMAX + _SC_NL_NMAX, +#define _SC_NL_NMAX _SC_NL_NMAX + 
_SC_NL_SETMAX, +#define _SC_NL_SETMAX _SC_NL_SETMAX + _SC_NL_TEXTMAX, +#define _SC_NL_TEXTMAX _SC_NL_TEXTMAX + + _SC_XBS5_ILP32_OFF32, +#define _SC_XBS5_ILP32_OFF32 _SC_XBS5_ILP32_OFF32 + _SC_XBS5_ILP32_OFFBIG, +#define _SC_XBS5_ILP32_OFFBIG _SC_XBS5_ILP32_OFFBIG + _SC_XBS5_LP64_OFF64, +#define _SC_XBS5_LP64_OFF64 _SC_XBS5_LP64_OFF64 + _SC_XBS5_LPBIG_OFFBIG, +#define _SC_XBS5_LPBIG_OFFBIG _SC_XBS5_LPBIG_OFFBIG + + _SC_XOPEN_LEGACY, +#define _SC_XOPEN_LEGACY _SC_XOPEN_LEGACY + _SC_XOPEN_REALTIME, +#define _SC_XOPEN_REALTIME _SC_XOPEN_REALTIME + _SC_XOPEN_REALTIME_THREADS, +#define _SC_XOPEN_REALTIME_THREADS _SC_XOPEN_REALTIME_THREADS + + _SC_ADVISORY_INFO, +#define _SC_ADVISORY_INFO _SC_ADVISORY_INFO + _SC_BARRIERS, +#define _SC_BARRIERS _SC_BARRIERS + _SC_BASE, +#define _SC_BASE _SC_BASE + _SC_C_LANG_SUPPORT, +#define _SC_C_LANG_SUPPORT _SC_C_LANG_SUPPORT + _SC_C_LANG_SUPPORT_R, +#define _SC_C_LANG_SUPPORT_R _SC_C_LANG_SUPPORT_R + _SC_CLOCK_SELECTION, +#define _SC_CLOCK_SELECTION _SC_CLOCK_SELECTION + _SC_CPUTIME, +#define _SC_CPUTIME _SC_CPUTIME + _SC_THREAD_CPUTIME, +#define _SC_THREAD_CPUTIME _SC_THREAD_CPUTIME + _SC_DEVICE_IO, +#define _SC_DEVICE_IO _SC_DEVICE_IO + _SC_DEVICE_SPECIFIC, +#define _SC_DEVICE_SPECIFIC _SC_DEVICE_SPECIFIC + _SC_DEVICE_SPECIFIC_R, +#define _SC_DEVICE_SPECIFIC_R _SC_DEVICE_SPECIFIC_R + _SC_FD_MGMT, +#define _SC_FD_MGMT _SC_FD_MGMT + _SC_FIFO, +#define _SC_FIFO _SC_FIFO + _SC_PIPE, +#define _SC_PIPE _SC_PIPE + _SC_FILE_ATTRIBUTES, +#define _SC_FILE_ATTRIBUTES _SC_FILE_ATTRIBUTES + _SC_FILE_LOCKING, +#define _SC_FILE_LOCKING _SC_FILE_LOCKING + _SC_FILE_SYSTEM, +#define _SC_FILE_SYSTEM _SC_FILE_SYSTEM + _SC_MONOTONIC_CLOCK, +#define _SC_MONOTONIC_CLOCK _SC_MONOTONIC_CLOCK + _SC_MULTI_PROCESS, +#define _SC_MULTI_PROCESS _SC_MULTI_PROCESS + _SC_SINGLE_PROCESS, +#define _SC_SINGLE_PROCESS _SC_SINGLE_PROCESS + _SC_NETWORKING, +#define _SC_NETWORKING _SC_NETWORKING + _SC_READER_WRITER_LOCKS, +#define _SC_READER_WRITER_LOCKS _SC_READER_WRITER_LOCKS + _SC_SPIN_LOCKS, +#define _SC_SPIN_LOCKS _SC_SPIN_LOCKS + _SC_REGEXP, +#define _SC_REGEXP _SC_REGEXP + _SC_REGEX_VERSION, +#define _SC_REGEX_VERSION _SC_REGEX_VERSION + _SC_SHELL, +#define _SC_SHELL _SC_SHELL + _SC_SIGNALS, +#define _SC_SIGNALS _SC_SIGNALS + _SC_SPAWN, +#define _SC_SPAWN _SC_SPAWN + _SC_SPORADIC_SERVER, +#define _SC_SPORADIC_SERVER _SC_SPORADIC_SERVER + _SC_THREAD_SPORADIC_SERVER, +#define _SC_THREAD_SPORADIC_SERVER _SC_THREAD_SPORADIC_SERVER + _SC_SYSTEM_DATABASE, +#define _SC_SYSTEM_DATABASE _SC_SYSTEM_DATABASE + _SC_SYSTEM_DATABASE_R, +#define _SC_SYSTEM_DATABASE_R _SC_SYSTEM_DATABASE_R + _SC_TIMEOUTS, +#define _SC_TIMEOUTS _SC_TIMEOUTS + _SC_TYPED_MEMORY_OBJECTS, +#define _SC_TYPED_MEMORY_OBJECTS _SC_TYPED_MEMORY_OBJECTS + _SC_USER_GROUPS, +#define _SC_USER_GROUPS _SC_USER_GROUPS + _SC_USER_GROUPS_R, +#define _SC_USER_GROUPS_R _SC_USER_GROUPS_R + _SC_2_PBS, +#define _SC_2_PBS _SC_2_PBS + _SC_2_PBS_ACCOUNTING, +#define _SC_2_PBS_ACCOUNTING _SC_2_PBS_ACCOUNTING + _SC_2_PBS_LOCATE, +#define _SC_2_PBS_LOCATE _SC_2_PBS_LOCATE + _SC_2_PBS_MESSAGE, +#define _SC_2_PBS_MESSAGE _SC_2_PBS_MESSAGE + _SC_2_PBS_TRACK, +#define _SC_2_PBS_TRACK _SC_2_PBS_TRACK + _SC_SYMLOOP_MAX, +#define _SC_SYMLOOP_MAX _SC_SYMLOOP_MAX + _SC_STREAMS, +#define _SC_STREAMS _SC_STREAMS + _SC_2_PBS_CHECKPOINT, +#define _SC_2_PBS_CHECKPOINT _SC_2_PBS_CHECKPOINT + + _SC_V6_ILP32_OFF32, +#define _SC_V6_ILP32_OFF32 _SC_V6_ILP32_OFF32 + _SC_V6_ILP32_OFFBIG, +#define _SC_V6_ILP32_OFFBIG _SC_V6_ILP32_OFFBIG + _SC_V6_LP64_OFF64, +#define 
_SC_V6_LP64_OFF64 _SC_V6_LP64_OFF64 + _SC_V6_LPBIG_OFFBIG, +#define _SC_V6_LPBIG_OFFBIG _SC_V6_LPBIG_OFFBIG + + _SC_HOST_NAME_MAX, +#define _SC_HOST_NAME_MAX _SC_HOST_NAME_MAX + _SC_TRACE, +#define _SC_TRACE _SC_TRACE + _SC_TRACE_EVENT_FILTER, +#define _SC_TRACE_EVENT_FILTER _SC_TRACE_EVENT_FILTER + _SC_TRACE_INHERIT, +#define _SC_TRACE_INHERIT _SC_TRACE_INHERIT + _SC_TRACE_LOG, +#define _SC_TRACE_LOG _SC_TRACE_LOG + + _SC_LEVEL1_ICACHE_SIZE, +#define _SC_LEVEL1_ICACHE_SIZE _SC_LEVEL1_ICACHE_SIZE + _SC_LEVEL1_ICACHE_ASSOC, +#define _SC_LEVEL1_ICACHE_ASSOC _SC_LEVEL1_ICACHE_ASSOC + _SC_LEVEL1_ICACHE_LINESIZE, +#define _SC_LEVEL1_ICACHE_LINESIZE _SC_LEVEL1_ICACHE_LINESIZE + _SC_LEVEL1_DCACHE_SIZE, +#define _SC_LEVEL1_DCACHE_SIZE _SC_LEVEL1_DCACHE_SIZE + _SC_LEVEL1_DCACHE_ASSOC, +#define _SC_LEVEL1_DCACHE_ASSOC _SC_LEVEL1_DCACHE_ASSOC + _SC_LEVEL1_DCACHE_LINESIZE, +#define _SC_LEVEL1_DCACHE_LINESIZE _SC_LEVEL1_DCACHE_LINESIZE + _SC_LEVEL2_CACHE_SIZE, +#define _SC_LEVEL2_CACHE_SIZE _SC_LEVEL2_CACHE_SIZE + _SC_LEVEL2_CACHE_ASSOC, +#define _SC_LEVEL2_CACHE_ASSOC _SC_LEVEL2_CACHE_ASSOC + _SC_LEVEL2_CACHE_LINESIZE, +#define _SC_LEVEL2_CACHE_LINESIZE _SC_LEVEL2_CACHE_LINESIZE + _SC_LEVEL3_CACHE_SIZE, +#define _SC_LEVEL3_CACHE_SIZE _SC_LEVEL3_CACHE_SIZE + _SC_LEVEL3_CACHE_ASSOC, +#define _SC_LEVEL3_CACHE_ASSOC _SC_LEVEL3_CACHE_ASSOC + _SC_LEVEL3_CACHE_LINESIZE, +#define _SC_LEVEL3_CACHE_LINESIZE _SC_LEVEL3_CACHE_LINESIZE + _SC_LEVEL4_CACHE_SIZE, +#define _SC_LEVEL4_CACHE_SIZE _SC_LEVEL4_CACHE_SIZE + _SC_LEVEL4_CACHE_ASSOC, +#define _SC_LEVEL4_CACHE_ASSOC _SC_LEVEL4_CACHE_ASSOC + _SC_LEVEL4_CACHE_LINESIZE, +#define _SC_LEVEL4_CACHE_LINESIZE _SC_LEVEL4_CACHE_LINESIZE + /* Leave room here, maybe we need a few more cache levels some day. */ + + _SC_IPV6 = _SC_LEVEL1_ICACHE_SIZE + 50, +#define _SC_IPV6 _SC_IPV6 + _SC_RAW_SOCKETS, +#define _SC_RAW_SOCKETS _SC_RAW_SOCKETS + + _SC_V7_ILP32_OFF32, +#define _SC_V7_ILP32_OFF32 _SC_V7_ILP32_OFF32 + _SC_V7_ILP32_OFFBIG, +#define _SC_V7_ILP32_OFFBIG _SC_V7_ILP32_OFFBIG + _SC_V7_LP64_OFF64, +#define _SC_V7_LP64_OFF64 _SC_V7_LP64_OFF64 + _SC_V7_LPBIG_OFFBIG, +#define _SC_V7_LPBIG_OFFBIG _SC_V7_LPBIG_OFFBIG + + _SC_SS_REPL_MAX, +#define _SC_SS_REPL_MAX _SC_SS_REPL_MAX + + _SC_TRACE_EVENT_NAME_MAX, +#define _SC_TRACE_EVENT_NAME_MAX _SC_TRACE_EVENT_NAME_MAX + _SC_TRACE_NAME_MAX, +#define _SC_TRACE_NAME_MAX _SC_TRACE_NAME_MAX + _SC_TRACE_SYS_MAX, +#define _SC_TRACE_SYS_MAX _SC_TRACE_SYS_MAX + _SC_TRACE_USER_EVENT_MAX, +#define _SC_TRACE_USER_EVENT_MAX _SC_TRACE_USER_EVENT_MAX + + _SC_XOPEN_STREAMS, +#define _SC_XOPEN_STREAMS _SC_XOPEN_STREAMS + + _SC_THREAD_ROBUST_PRIO_INHERIT, +#define _SC_THREAD_ROBUST_PRIO_INHERIT _SC_THREAD_ROBUST_PRIO_INHERIT + _SC_THREAD_ROBUST_PRIO_PROTECT +#define _SC_THREAD_ROBUST_PRIO_PROTECT _SC_THREAD_ROBUST_PRIO_PROTECT + +}; +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/bits/posix1_lim.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/bits/posix1_lim.h new file mode 100755 index 0000000000000..0739958c5a6c4 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/bits/posix1_lim.h @@ -0,0 +1,34 @@ +#ifndef POSIX1_LIM_H +#define POSIX1_LIM_H +/** + @file posix1_lim.h + @brief POSIX Minimum values + +EXTERNAL FUNCTIONS + None + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None + +TODO + This header should be ideally relocated under api/posix/bits (something that + doesnt exist today) and be included from api/posix/bits/limits.h which inturn + should be 
included from toolchain's limits.h + +Copyright (c) 2018, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +==============================================================================*/ + +#ifndef _POSIX_PATH_MAX +/** @brief Maximum number of bytes in a pathname, including the terminating + nul character */ +#define _POSIX_PATH_MAX 256 +#endif + +#ifndef _POSIX_SEM_NSEMS_MAX +/** @brief Maximum number of semaphores that a process may have */ +#define _POSIX_SEM_NSEMS_MAX 16 +#endif +#endif + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/common/time.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/common/time.h new file mode 100755 index 0000000000000..76b0d39ab7039 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/common/time.h @@ -0,0 +1 @@ +#include \ No newline at end of file diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/fcntl.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/fcntl.h new file mode 100755 index 0000000000000..c80ec98a449b6 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/fcntl.h @@ -0,0 +1,51 @@ +#ifndef _FCNTL_H +#define _FCNTL_H + +/*========================================================================== + * FILE: fcntl.h + * + * SERVICES: POSIX fcntl.h + * + * DESCRIPTION: The header is needed by the open() and fcntl() + * system calls, which have a variety of parameters and + * flags. They are described here. + * + * The formats of the calls to each of these are: + * + * open(path, oflag [,mode]) open a file + * fcntl(fd, cmd [,arg]) get or set file attributes + * + * Copyright (c) 2013,2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential. + + *==========================================================================*/ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* Oflag values for open(). POSIX Table 6-4. */ +#define POSIX_O_CREAT 0x100 /* creat file if it doesn't exist */ +#define POSIX_O_EXCL 0x200 /* exclusive use flag */ +#define POSIX_O_NOCTTY 0x400 /* do not assign a controlling terminal */ +#define POSIX_O_TRUNC 0x1000 /* truncate flag */ + +/* File status flags for open() and fcntl(). POSIX Table 6-5. */ +#define POSIX_O_APPEND 0x2000 /* set append mode */ +#define POSIX_O_NONBLOCK 0x4000 /* no delay */ + +/* File access modes for open() and fcntl(). POSIX Table 6-6. */ +#define POSIX_O_RDONLY 0 /* open(name, POSIX_O_RDONLY) opens read only */ +#define POSIX_O_WRONLY 1 /* open(name, POSIX_O_WRONLY) opens write only */ +#define POSIX_O_RDWR 2 /* open(name, POSIX_O_RDWR) opens read/write */ + +/* Mask for use with file access modes. POSIX Table 6-7. 
*/ +#define POSIX_O_ACCMODE 0x3 /* mask for file access modes */ + +#ifdef __cplusplus +} +#endif + +#endif /* _FCNTL_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/hooks/unistd.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/hooks/unistd.h new file mode 100755 index 0000000000000..1c618bfe36b4f --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/hooks/unistd.h @@ -0,0 +1,115 @@ +#ifndef UNISTD_H +#define UNISTD_H +/** + @file posix/hooks/unistd.h + @brief POSIX related declarations in that are missing in toolchain + header + +EXTERNAL FUNCTIONS + None + +INITIALIZATION AND SEQUENCING REQUIREMENTS + DONT include this header directly! Instead include unistd.h. + +Copyright (c) 2018, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +==============================================================================*/ +#include /* For various POSIX ID types from toolchain headers */ + +#ifdef __cplusplus +extern "C" { +#endif +extern long pathconf (char const * path, int name); + +/* Process*/ + +/** The getppid() function shall return the parent process ID of the calling process. + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] the parent process ID + */ +pid_t getppid(void); + +/** The getpgid() function shall return the process group ID of the process whose process ID is equal to pid + * Please refer to POSIX standard for details. + * @param thread [in] process ID + * @param value_ptr [out] process group ID + */ +pid_t getpgid(pid_t pid); + +/** The getpgrp() function shall return the process group ID of the calling process + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] process group ID of the calling process + */ +pid_t getpgrp(void); + +/**The getuid() function shall return the real user ID of the calling process. + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] the real user ID of the calling process. + */ +uid_t getuid(void); + +/** The geteuid() function shall return the effective user ID of the calling process + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] effective user ID of the calling process + */ +uid_t geteuid(void); + +/** The getegid() function shall return the effective group ID of the calling process. + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] effective group ID of the calling process. + */ +gid_t getegid(void); + +/** The getgid() function shall return the real group ID of the calling process + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] real group ID of the calling process. + */ + gid_t getgid(void); + +/** seteuid set effective user ID + * Please refer to POSIX standard for details. + * @param thread [in] effective user ID + * @param value_ptr [out] Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error. + */ +int seteuid(uid_t uid); + +/** setpgrp - set the process group ID + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error. 
+ */ +pid_t setpgrp(void); + +/** setuid - set user ID + * Please refer to POSIX standard for details. + * @param thread [in] user ID + * @param value_ptr [out] Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error. + */ +int setuid(uid_t uid); + +/** setpgid - set process group ID for job control + * Please refer to POSIX standard for details. + * @param thread [in] PID of process, PGID to be set + * @param value_ptr [out] Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error. + */ +int setpgid(pid_t pid, pid_t pgid); + +/** setsid - create session and set process group ID + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error. + */ +pid_t setsid(void); + +#ifdef __cplusplus +} +#endif +#endif + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/mqueue.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/mqueue.h new file mode 100755 index 0000000000000..74dcc2fa202c6 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/mqueue.h @@ -0,0 +1,203 @@ +#ifndef _POSIX_MQUEUE_H_ +#define _POSIX_MQUEUE_H_ + +/*========================================================================== + * FILE: mqueue.h + * + * SERVICES: POSIX Message Queue API interface + * + * DESCRIPTION: POSIX Message Queue API interface based upon POSIX 1003.1-2004 + * + * Copyright (c) 2013, 2016, 2023 Qualcomm Technologies, Inc. + * All Rights Reserved. + * Confidential and Proprietary - Qualcomm Technlogies, Inc. + *==========================================================================*/ + +#include /*ssize_t */ +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define MQ_PRIO_MAX 255 /* max priority */ +#define MQ_PRIO_DEFAULT 0 /* default priority */ + +typedef int mqd_t; + +struct mq_attr +{ + long mq_flags; /* message queue flags */ + long mq_maxmsg; /* maximum number of messages */ + long mq_msgsize; /* maximum message size */ + long mq_curmsgs; /* number of messages currently queued */ +}; + +typedef struct mq_attr mqueue_attr; + +/** \details + * This provides POSIX Message Queue API. + * + * mq_notify is not supported. + * + * Since this implementation of POSIX kernel API is a subset of PSE51, + * it only supports Message sending and receiving within one process. + * Message sending and receiving among processes are not supported. + */ + +/** \defgroup mqueue POSIX Message Queue API */ +/** \ingroup mqueue */ +/** @{ */ + +/** Open a message queue. + * Please refer to POSIX standard for details. + */ +mqd_t mq_open(const char *name, int oflag, /* mode_t mode, struct mq_attr *attr */...); + +/** Close a message queue. + * Please refer to POSIX standard for details. + */ +int mq_close(mqd_t mq_desc); + +/** Remove a message queue. + * Please refer to POSIX standard for details. + */ +int mq_unlink(const char *name); + +/** Send a message to a message queue. + * Please refer to POSIX standard for details. + * + * If the queue is full, instead of blocking the sender, this function + * will return -1 with errno EAGAIN, in this implementation. This behavior + * may change in the future. + */ +int mq_send(mqd_t mqdes, const char *msg_ptr, size_t msg_len, unsigned int msg_prio); + +/** Send a message to a message queue with timeout. 
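+ *
+ * (Illustrative sketch, not part of the original header, assuming mqdes was
+ * obtained earlier from mq_open():
+ *   char buf[] = "ping";
+ *   if (mq_send(mqdes, buf, sizeof buf, MQ_PRIO_DEFAULT) == -1) {
+ *       // with a full queue this implementation sets errno to EAGAIN
+ *   }
+ * )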
+ * Please refer to POSIX standard for details. + * @param abs_timeout [in] Only abs_timeout={0,0} is supported in this + * implementation. This behavior may change in the future. + */ +int mq_timedsend(mqd_t mqdes, const char *msg_ptr, size_t msg_len, unsigned int msg_prio, const struct timespec *abs_timeout); + +/** Receive a message from a message queue. + * Please refer to POSIX standard for details. + */ +ssize_t mq_receive(mqd_t mqdes, char *msg_ptr, size_t msg_len, unsigned int *msg_prio); + +/** Receive a message from a message queue with timeout. + * Please refer to POSIX standard for details. + * @param abs_timeout [in] Only abs_timeout={0,0} is supported in this + * implementation. This behavior may change in the future. + */ +ssize_t mq_timedreceive(mqd_t mqdes, char *restrict msg_ptr, size_t msg_len, unsigned int *restrict msg_prio, const struct timespec *restrict abs_timeout); + +/** Get message queue attributes. + * Please refer to POSIX standard for details. + */ +int mq_getattr(mqd_t mqdes, struct mq_attr *mqstat); + +/** Set message queue attributes. + * Please refer to POSIX standard for details. + */ +int mq_setattr(mqd_t mqdes, const struct mq_attr *restrict mqstat, struct mq_attr *restrict omqstat); + +/** @} */ + +#define NBBY 8U /* number of bits in a byte */ + +/* + * Select uses bit masks of file descriptors in longs. These macros + * manipulate such bit fields (the filesystem macros use chars). + * FD_SETSIZE may be defined by the user, but the default here should + * be enough for most uses. + */ +#ifndef FD_SETSIZE +#define FD_SETSIZE 256U +#endif + +typedef unsigned long fd_mask; +#define NFDBITS (sizeof(fd_mask) * (unsigned int)NBBY) /* bits per mask */ + +#ifndef howmany +#define howmany(x, y) (((x) + ((y) - 1U)) / (y)) +#endif + +//equivalent of fd_set fpr WINNT env +typedef struct fd_set +{ + fd_mask fds_bits[howmany(FD_SETSIZE, NFDBITS)]; +} fd_set; + +/** \addtogroup mqueue */ +/** @{ */ + +/** Sets the bit for the file descriptor fd in the file descriptor set fdset. + */ +#define FD_SET(n, p) ((p)->fds_bits[((unsigned int) (n)) / NFDBITS] |= (1UL << (((unsigned int) (n)) % NFDBITS))) + +/** Clears the bit for the file descriptor fd in the file descriptor set fdset. + */ +#define FD_CLR(n, p) ((p)->fds_bits[((unsigned int) (n)) / NFDBITS] &= ~(1UL << (((unsigned int) (n)) % NFDBITS))) + +/** Returns a non-zero value if the bit for the file descriptor fd is set in the file descriptor set pointed to by fdset, and 0 otherwise. + */ +#define FD_ISSET(n, p) ((unsigned long)(p)->fds_bits[((unsigned int) (n)) / NFDBITS] & (unsigned long)((unsigned)1U << (((unsigned int) (n)) % NFDBITS))) + +/** Copies the file descriptor set. + */ +#define FD_COPY(f, t) (void)(memcpy)((t), (f), sizeof(*(f))) + +/** Initializes the file descriptor set fdset to have zero bits for all file descriptors. + */ +#define FD_ZERO(p) (void)memset((p), 0, sizeof(*(p))) + +/** Error check the file descriptor set. + */ +#define FD_BAD(fd) ((fd) < 0 /*|| fd >= fd_arraylen || fd_array[fd].obj == 0*/) + +/*! Wait for both message queues and signals. In this implementation, only + * message queue file descriptors are supported. + * @param nfds [in] This is an integer one more than the maximum of any file + * descriptor in any of the sets. In other words, while you are busy + * adding file descriptors to your sets, you must calculate the maximum + * integer value of all of them, then increment this value by one, and + * then pass this as nfds to select(). 
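+ * For illustration (not in the original header), with two message-queue
+ * descriptors mq1 and mq2 and the required zero timeout:
+ *   fd_set rfds;
+ *   FD_ZERO(&rfds);
+ *   FD_SET(mq1, &rfds);
+ *   FD_SET(mq2, &rfds);
+ *   int n = pselect(((mq1 > mq2) ? mq1 : mq2) + 1,
+ *                   &rfds, NULL, NULL, &zero_ts, NULL);
+ *   // zero_ts is an assumed struct timespec initialized to {0,0}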
+ * @param readfds [in] the file descriptor set on all message queues. + * @param writefds [in] ignored in this implementation. + * @param errorfds [in] ignored in this implementation. + * @param timeout [in] Only timeout={0,0} is supported in this + * implementation. This behavior may change in the future. + */ +int pselect(int nfds, fd_set *restrict readfds, + fd_set *restrict writefds, fd_set *restrict errorfds, + const struct timespec *restrict timeout, + const sigset_t *restrict sigmask); + +/*! Wait for multiple message queues. In this implementation, only + * message queue file descriptors are supported. + * @param nfds [in] This is an integer one more than the maximum of any file + * descriptor in any of the sets. In other words, while you are busy + * adding file descriptors to your sets, you must calculate the maximum + * integer value of all of them, then increment this value by one, and + * then pass this as nfds to select(). + * @param readfds [in] the file descriptor set on all message queues. + * @param writefds [in] ignored in this implementation. + * @param errorfds [in] ignored in this implementation. + * @param timeout [in] Only timeout={0,0} is supported in this + * implementation. This behavior may change in the future. + */ +int select(int nfds, fd_set *restrict readfds, + fd_set *restrict writefds, fd_set *restrict errorfds, + struct timeval *restrict timeout); + +/** @} */ + +/* this function is needed for test framework which needs to clean up memory when teardown */ +void _mq_teardown(void); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/pthread.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/pthread.h new file mode 100755 index 0000000000000..f64242e8dc683 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/pthread.h @@ -0,0 +1,287 @@ +#ifndef QURT_PTHREAD_H +#define QURT_PTHREAD_H + +/*========================================================================== + * FILE: pthread.h + * + * SERVICES: POSIX pthread API interface + * + * DESCRIPTION: POSIX pthread API interface based upon POSIX 1003.1-2004 + * + * Copyright (c) 2013,2016,2023 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential. + *========================================================================== + * + * EDIT HISTORY FOR MODULE + * + * This section contains comments describing changes made to the module. + * Notice that changes are listed in reverse chronological order. + * + * + * + * when who what, where, why + * -------- --- ------------------------------------------------------- + * 10/13/08 cz Initial version. + *==========================================================================*/ + +#include +#include "sys/sched.h" /* For struct sched_param */ +#include "sys/errno.h" /* error values */ +#include +#include +#include +#include +#include +#include "pthread_types.h" +#ifdef __cplusplus +extern "C" { +#endif + +/* the range of the set supported by the kernel data type used to represent CPU sets. */ +#define CONFIG_NR_CPUS QURT_THREAD_CFG_BITMASK_ALL + +#define UNIMPLEMENTED(FUNC, RETURNTYPE, ARGS) static inline RETURNTYPE FUNC ARGS { qurt_printf("Unimplemented: %s... 
exiting\n", __FUNCTION__); exit(1); } + +/** @brief Magic (non-portable) value for a stack's address to enable usage + of auto-stack feature (if available) */ +#define PTHREAD_AUTO_STACK_MAGIC_ADDR_NP ((void *)0xFFF) + +/** \details + * This provides POSIX thread API. + * + */ + +/** \defgroup pthread POSIX pthread API */ +/** \ingroup pthread */ +/** @{ */ + +/** Compare Two Threads. + * Please refer to POSIX standard for details. + */ +static inline int pthread_equal(pthread_t t1, pthread_t t2) +{ + return (t1 == t2) ? 1 : 0; +} + +/** Create Thread. + * Please refer to POSIX standard for details. + */ +int pthread_create(pthread_t * tid, const pthread_attr_t * attr, void *(*start)(void *), void *arg); + +/** Terminate Calling Thread. + * Please refer to POSIX standard for details. + */ +void pthread_exit(void *value_ptr); + +/** Wait for thread termination. + * Please refer to POSIX standard for details. + * @param thread [in] the thread to be joined + * @param value_ptr [out] pointer to the exit status + */ +int pthread_join(pthread_t thread, void **value_ptr); + +/** Detach a joinable thread. + * Please refer to POSIX standard for details. + * @param id [in] ID of the thread to be detached. + */ +int pthread_detach(pthread_t id); + +/** Dynamic package initialisation + * Please refer to POSIX standard for details. + */ +int pthread_once(pthread_once_t *once_control, void (*init_routine)(void)); + +pthread_t pthread_self(void); +int pthread_cancel(pthread_t thread); +static inline void pthread_yield(void) +{ + return; +} + +int pthread_kill(pthread_t thread, int sig); + +/** + * @brief Return name of thread + * @warning Do not call this in the error handling path as it may cause deadlock + * due to underlying OS calls + * @param thread [in] thread Thread whose name is to be retrieved + * @param name [out] name Buffer used to return thread name + * @param len [in] len Number of bytes available in name + * @return 0 on success, ESRCH, ERANGE on failure + */ +extern int pthread_getname_np (pthread_t thread, char * name, size_t len); + +int pthread_getschedparam(pthread_t thread, int *restrict policy, struct sched_param *restrict param); +int pthread_setschedparam(pthread_t thread, int policy, const struct sched_param *param); +int pthread_setschedprio(pthread_t thread, int prio); +int pthread_setcancelstate(int state, int *oldstate); +int pthread_setcanceltype(int type, int *oldtype); + +/* Attribute functions */ +int pthread_attr_init(pthread_attr_t *attr); +int pthread_attr_destroy(pthread_attr_t *attr); +int pthread_attr_setschedparam(pthread_attr_t *restrict attr, const sched_param *restrict param); +int pthread_attr_getschedparam(const pthread_attr_t *restrict attr, sched_param *restrict param); +int pthread_attr_setstacksize(pthread_attr_t *attr, size_t stacksize); +int pthread_attr_getstacksize(const pthread_attr_t *attr, size_t *stacksize); +int pthread_attr_setstackaddr(pthread_attr_t *attr, void * stackaddr); +int pthread_attr_getstackaddr(const pthread_attr_t *attr, void ** stackaddr); +int pthread_attr_getdetachstate(const pthread_attr_t *attr, int *detachstate); +int pthread_attr_setdetachstate(pthread_attr_t *attr, int detachstate); +int pthread_attr_setstack(pthread_attr_t *attr, void *stackaddr, size_t stacksize); +int pthread_attr_getstack(const pthread_attr_t *attr, void **stackaddr, size_t *stacksize); +int pthread_attr_setscope(pthread_attr_t *attr, int scope); +int pthread_attr_getscope(const pthread_attr_t *attr, int *scope); +int
pthread_attr_setinheritsched(pthread_attr_t *attr, int inheritsched); +int pthread_attr_getinheritsched(const pthread_attr_t *attr, int *inheritsched); +int pthread_attr_getguardsize(const pthread_attr_t * attr, size_t * guardsize); +int pthread_attr_setautostack(pthread_attr_t *attr); +int pthread_attr_setbuspriority(pthread_attr_t *attr, unsigned short bus_priority); + +/* Qualcomm additions to pthread get/set attribute functions */ +int pthread_attr_setthreadname(pthread_attr_t *attr, const char * name); +int pthread_attr_getthreadname(const pthread_attr_t *attr, char * name, int size); +int pthread_attr_settimetestid(pthread_attr_t *attr, unsigned int tid); +int pthread_attr_gettimetestid(const pthread_attr_t *attr, unsigned int* tid); + +/* Mutexes */ +int pthread_mutex_init(pthread_mutex_t *mutex, pthread_mutexattr_t *attr); +int pthread_mutex_lock(pthread_mutex_t *mutex); +int pthread_mutex_unlock(pthread_mutex_t *mutex); +int pthread_mutex_trylock(pthread_mutex_t *mutex); +int pthread_mutex_destroy(pthread_mutex_t *mutex); +int pthread_mutex_getprioceiling(const pthread_mutex_t *restrict mutex, int *restrict prioceiling); +int pthread_mutex_setprioceiling(pthread_mutex_t *restrict mutex, int prioceiling, int *restrict old_ceiling); + +/* For Mutex with type PTHREAD_MUTEX_NORMAL, Priority Inheritance is not + * supported even PTHREAD_PRIO_INHERIT is defined since QURT does not support + * this kind of Mutex */ +int pthread_mutexattr_init(pthread_mutexattr_t *attr); +int pthread_mutexattr_destroy(pthread_mutexattr_t *attr); +int pthread_mutexattr_gettype(const pthread_mutexattr_t *restrict, int *restrict); +int pthread_mutexattr_settype(pthread_mutexattr_t *attr, int type); +int pthread_mutexattr_getprotocol(const pthread_mutexattr_t *restrict, int *restrict); +int pthread_mutexattr_setprotocol(pthread_mutexattr_t *attr, int protocol); +int pthread_mutexattr_getpshared(const pthread_mutexattr_t *restrict, int *restrict); +int pthread_mutexattr_setpshared(pthread_mutexattr_t *, int); +int pthread_mutexattr_getprioceiling(const pthread_mutexattr_t *restrict attr, int *restrict prioceiling); +int pthread_mutexattr_setprioceiling(pthread_mutexattr_t *attr, int prioceiling); + +/* Spinlocks */ +int pthread_spin_init(pthread_spinlock_t *lock, int pshared); +int pthread_spin_destroy(pthread_spinlock_t *lock); +int pthread_spin_lock(pthread_spinlock_t *lock); +int pthread_spin_trylock(pthread_spinlock_t *lock); +int pthread_spin_unlock(pthread_spinlock_t *lock); + +/* Condition variables */ +int pthread_condattr_init(pthread_condattr_t *attr); +int pthread_condattr_destroy(pthread_condattr_t *attr); +int pthread_condattr_setpshared(pthread_condattr_t *attr, int pshared); +int pthread_condattr_getpshared(const pthread_condattr_t *restrict attr, int *restrict pshared); +int pthread_condattr_setclock(pthread_condattr_t *attr, clockid_t clock); +int pthread_condattr_getclock(const pthread_condattr_t *restrict attr, clockid_t *restrict clock); +int pthread_cond_init(pthread_cond_t *cond, pthread_condattr_t *attr); +int pthread_cond_destroy(pthread_cond_t *cond); +int pthread_cond_signal(pthread_cond_t *cond); +int pthread_cond_broadcast(pthread_cond_t *cond); +int pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex); +int pthread_cond_timedwait(pthread_cond_t * cond, pthread_mutex_t * mutex, const struct timespec *time); + +/* Barriers */ +int pthread_barrier_init(pthread_barrier_t *restrict barrier, const pthread_barrierattr_t *restrict attr, unsigned count); +int 
pthread_barrier_destroy(pthread_barrier_t *barrier); +int pthread_barrier_wait(pthread_barrier_t *barrier); +int pthread_barrierattr_init(pthread_barrierattr_t *attr); +int pthread_barrierattr_destroy(pthread_barrierattr_t *attr); +int pthread_barrierattr_getpshared(const pthread_barrierattr_t *restrict attr, int *restrict pshared); + + +/*Read-Write locks*/ +int pthread_rwlock_init(pthread_rwlock_t *, const pthread_rwlockattr_t *); +int pthread_rwlock_destroy(pthread_rwlock_t *); +int pthread_rwlockattr_init(pthread_rwlockattr_t *); +int pthread_rwlockattr_destroy(pthread_rwlockattr_t *); +int pthread_rwlockattr_getpshared(const pthread_rwlockattr_t *, int *); +int pthread_rwlockattr_setpshared(pthread_rwlockattr_t *, int); +int pthread_rwlock_rdlock(pthread_rwlock_t *); +int pthread_rwlock_tryrdlock(pthread_rwlock_t *); +int pthread_rwlock_wrlock(pthread_rwlock_t *); +int pthread_rwlock_trywrlock(pthread_rwlock_t *); +int pthread_rwlock_unlock(pthread_rwlock_t *); + + +/** please refer to POSIX standard document + */ +int pthread_barrierattr_setpshared(pthread_barrierattr_t *attr, int pshared); + +/** set CPU affinity attribute in thread attributes object. + + * @param attr [in] pthread attributes + * @param cpusetsize [in] The argument cpusetsize is the length (in bytes) + of the buffer pointed to by cpuset. Typically, + this argument would be specified as + sizeof(cpu_set_t). + * @param cpuset [in] This data set is a bitset where each bit represents + a CPU (hw thread). How the system's CPUs are mapped + to bits in the bitset is system dependent. + For QURT kernel, Bit 0 is corresponding to hw + thread 0, and so on. If the corresponding bit is + set to 1, then the software thread is eligible to + run this hw thread. 0x3f means it can run any hw + threads 0x0 also means it can run on any hw threads. + @return On success, this function returns 0; on error, it returns a + non-zero error number. + EINVAL - cpuset specified a CPU that was outside the set supported + by the kernel. (The kernel configuration option + CONFIG_NR_CPUS defines the range of the set supported by + the kernel data type used to represent CPU sets.) + * @note This function is non-standard GNU extensions; hence the suffix "_np" + (non-portable) in the names. + */ +int pthread_attr_setaffinity_np(pthread_attr_t *attr, size_t cpusetsize, const cpu_set_t *cpuset); + +/** get CPU affinity attribute in thread attributes object. + * @param attr [in] pthread attributes + * @param cpusetsize [in] The argument cpusetsize is the length (in bytes) + of the buffer pointed to by cpuset. Typically, + this argument would be specified as + sizeof(cpu_set_t). + * @param cpuset [out] This data set is a bitset where each bit represents + a CPU (hw thread). How the system's CPUs are mapped + to bits in the bitset is system dependent. + For QURT kernel, Bit 0 is corresponding to hw + thread 0, and so on. If the corresponding bit is + set to 1, then the software thread is eligible to + run this hw thread. 0x3f means it can run any hw + threads 0x0 also means it can run on any hw threads. + @return On success, this function returns 0; on error, it returns a + non-zero error number. + EINVAL - cpusetsize is smaller than the size of the affinity mask + used by the kernel. + * @note This function is non-standard GNU extensions; hence the suffix "_np" + (non-portable) in the names. 
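+ *
+ * Illustration (not in the original header; relies on cpu_set_t being the
+ * plain bitmask typedef from pthread_types.h):
+ *   pthread_attr_t attr;
+ *   cpu_set_t mask = 0x3;                    // eligible on hw threads 0 and 1
+ *   pthread_attr_init(&attr);
+ *   pthread_attr_setaffinity_np(&attr, sizeof(mask), &mask);
+ *   pthread_attr_getaffinity_np(&attr, sizeof(mask), &mask);  // read it back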
+ */ +int pthread_attr_getaffinity_np(pthread_attr_t *attr, size_t cpusetsize, cpu_set_t *cpuset); + +/* TLS */ +int pthread_key_create(pthread_key_t *key, void (*destructor)(void*)); +int pthread_key_delete(pthread_key_t key); +int pthread_setspecific(pthread_key_t key, const void *value); +void *pthread_getspecific(pthread_key_t key); +int pthread_getattr_np(pthread_t thread, pthread_attr_t * restrict attr); + +/** @} */ + +/* Non-pthread callers use this function to create a pthread TCB without creating an actual thread */ +int pthread_fake(pthread_t * restrict thread, const pthread_attr_t * restrict attr); +int pthread_fake_destroy(pthread_t thread); + +//amitkulk: move these to unistd.h after we move that header within qurt +int posix_memalign(void **memptr, size_t alignment, size_t size); +void exit(int status); +#ifdef __cplusplus +} +#endif + +#endif /* QURT_PTHREAD_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/pthread_types.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/pthread_types.h new file mode 100755 index 0000000000000..51c3b9dbca243 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/pthread_types.h @@ -0,0 +1,193 @@ +#ifndef _PTHREAD_TYPES_H_ +#define _PTHREAD_TYPES_H_ + +/*========================================================================== + * FILE: pthread_types.h + * + * SERVICES: types used in the POSIX API interface + * + * DESCRIPTION: POSIX API interface based upon POSIX 1003.1-2004 + * + * Copyright (c) 2016, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential. + + *==========================================================================*/ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __GNUC__ +#define restrict __restrict__ +#else +#define restrict +#endif + +#define _SSIZE_T + +#ifndef TRUE +#define TRUE 1 +#endif + +#ifndef FALSE +#define FALSE 0 +#endif + +#define PTHREAD_MAX_THREADS 512U + +#define PTHREAD_NAME_LEN 16 +#define PTHREAD_MIN_STACKSIZE 512 //4096 +#define PTHREAD_MAX_STACKSIZE 1048576 +#define PTHREAD_DEFAULT_STACKSIZE 16384 + +#define PTHREAD_STACK_MIN (4096U*2U) +#define PTHREAD_MIN_PRIORITY 0U +#define PTHREAD_MAX_PRIORITY 255U +#define PTHREAD_DEFAULT_PRIORITY 1 + +/*Mutex initialization status*/ +#define PTHREAD_MUTEX_ATTR_UNINITIALIZED 0 +#define PTHREAD_MUTEX_ATTR_INITIALIZED 1 + +/*Conditional attributes initialization status*/ +#define PTHREAD_COND_ATTR_UNINITIALIZED 0 +#define PTHREAD_COND_ATTR_INITIALIZED 1 + +#define PTHREAD_DEFAULT_NAME "Anonymous" + +#define PTHREAD_MUTEX_INITIALIZER ((pthread_mutex_t) 0xFFFFFFFFU) + +#define PTHREAD_COND_INITIALIZER ((pthread_cond_t) 0xFFFFFFFFU) + +/* mutex and cond_var shared */ +#define PTHREAD_PROCESS_PRIVATE 0 +#define PTHREAD_PROCESS_SHARED 1 + +/* mutex type */ +#define PTHREAD_MUTEX_ERRORCHECK 0 +#define PTHREAD_MUTEX_NORMAL 1 +#define PTHREAD_MUTEX_RECURSIVE 2 +#define PTHREAD_MUTEX_DEFAULT 3 + +/* mutex protocol */ +#define PTHREAD_PRIO_NONE 0 +#define PTHREAD_PRIO_INHERIT 1 +#define PTHREAD_PRIO_PROTECT 2 + +#define PTHREAD_SPINLOCK_UNLOCKED 0 +#define PTHREAD_SPINLOCK_LOCKED 1 + +#define PTHREAD_ONCE_INIT (0) + +#define PTHREAD_MUTEX_OPAQUE //ToDo: amitkulk: debug + +typedef signed int ssize_t; + +/*detachstate of a pthread*/ +#define PTHREAD_CREATE_JOINABLE 1 +#define PTHREAD_CREATE_DETACHED 0 + +/*contention scope*/ +#define PTHREAD_SCOPE_PROCESS 1 +#define PTHREAD_SCOPE_SYSTEM 0 + +/*scheduler*/ +#define PTHREAD_INHERIT_SCHED 1
+#define PTHREAD_EXPLICIT_SCHED 0 + +/* + * Types and structure definitions + * + */ +typedef unsigned int cpu_set_t; + +typedef unsigned int pthread_t; + +typedef struct pthread_attr_t +{ + void *stackaddr; + int internal_stack; /* this flag==1 means the stack needs to be freed by posix */ + size_t stacksize; + int priority; + unsigned short timetest_id; + /* This flag indicate if thread will be autostack thread*/ + unsigned short autostack:1; + /* This flag is to indicate thread's bus_priority high/low + bus_priority = 0 -- Bus_priority is low + bus_priority = 1 -- Bus_priority is high + bus_priority = 3 -- Bus_priority is default (takes the default set for the process) + */ + unsigned short bus_priority:2; + unsigned short reserved:13; + cpu_set_t cpumask; + char name[PTHREAD_NAME_LEN]; + /* This flag indicates whether pthread lib should create thread contexts for other OSALs */ + /* This is used internally by POSIX and not available for general usage */ + int ext_context; + int detachstate; +} pthread_attr_t; + +//mutex attr +typedef struct pthread_mutexattr_t pthread_mutexattr_t; +struct pthread_mutexattr_t +{ + int is_initialized; + int type; + int pshared; + int protocol; +}; + +typedef unsigned int pthread_mutex_t; + +typedef unsigned int pthread_spinlock_t; + +typedef struct pthread_condattr_t +{ + int is_initialized; + int pshared; + clockid_t clock_id; +} pthread_condattr_t; + +typedef unsigned int pthread_cond_t; + +typedef struct pthread_barrierattr_t +{ + int is_initialized; + int pshared; +} pthread_barrierattr_t; + +typedef unsigned int pthread_barrier_t; + +typedef int pthread_key_t; + +typedef int pthread_once_t; + + +/*Read-Write locks*/ +#define PTW32_RWLOCK_MAGIC 0xfacade2 +#define PTHREAD_RWLOCK_INITIALIZER ((pthread_rwlock_t)(size_t) -1) + +struct pthread_rwlockattr_t_ +{ + int pshared; +}; + +struct pthread_rwlock_t_ +{ + pthread_mutex_t mtxExclusiveAccess; + pthread_mutex_t mtxSharedAccessCompleted; + pthread_cond_t cndSharedAccessCompleted; + int nSharedAccessCount; + int nExclusiveAccessCount; + int nCompletedSharedAccessCount; + int nMagic; +}; + +typedef struct pthread_rwlock_t_ * pthread_rwlock_t; +typedef struct pthread_rwlockattr_t_ * pthread_rwlockattr_t; +#ifdef __cplusplus +} +#endif + +#endif /* _PTHERAD_TYPES_H_ */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/sched.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/sched.h new file mode 100755 index 0000000000000..faf3365be9f82 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/sched.h @@ -0,0 +1,21 @@ +/*============================================================================= + + sched.h + +GENERAL DESCRIPTION + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2013,2016 by Qualcomm Technologies, Inc. All Rights Reserved. 
+=============================================================================*/ +#ifndef __SCHED_H__ +#define __SCHED_H__ + +#include "sys/sched.h" + +#endif //__SCHED_H__ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/semaphore.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/semaphore.h new file mode 100755 index 0000000000000..d9145b295ae62 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/semaphore.h @@ -0,0 +1,114 @@ +#ifndef SEMAPHORE_H +#define SEMAPHORE_H + +/*========================================================================== + * FILE: semaphore.h + * + * SERVICES: POSIX semaphore API interface + * + * DESCRIPTION: POSIX semaphore API interface based upon POSIX 1003.1-2004 + * + * Copyright (c) 2013, 2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential. + + *==========================================================================*/ +#include // Get all C sys types - includes POSIX specific +#include "sys/errno.h" // error values + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + TYPEDEFS +=============================================================================*/ +/** User facing semaphore container with opaque pointer to implementation */ +typedef struct +{ + unsigned int *opaque; +} sem_t; +#define _SEM_T + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/* constant definitions */ +#define SEM_FAILED ((sem_t*) 0) + +/* @todo siqbal Should we put such configuration items in a common place + instead of this user-facing header? */ +#define SEM_VALUE_MAX ((unsigned int) 30) // If need be increase this + +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/** \details + * POSIX standard comes with two kinds of semaphores: named and unnamed + * semaphores. + * + * This implementation of POSIX kernel API provide unnamed & named semaphore. + * + * + * sem_timedwait() is not provided. + */ + +/** \defgroup semaphore POSIX Semaphore API */ + +/** \ingroup semaphore */ +/** @{ */ + +/** Initialize an unnamed semaphore. + * Please refer to POSIX standard for details. + * @param pshared [in] This implementation does not support non-zero value, + * i.e., semaphore cannot be shared between processes in this implementation. + */ +int sem_init(sem_t *sem, int pshared, unsigned int value); + +/** Lock a semaphore. + * Please refer to POSIX standard for details. + */ +int sem_wait(sem_t *sem); + +/** Lock a semaphore. + * Please refer to POSIX standard for details. + */ +int sem_trywait(sem_t *sem); + +/** Unlock a semaphore. + * Please refer to POSIX standard for details. + */ +int sem_post(sem_t *sem); + +/** Get the value of a semaphore. + * Please refer to POSIX standard for details. + */ +int sem_getvalue(sem_t *sem, int *value); + +/** Destroy an unnamed semaphore. + * Please refer to POSIX standard for details. + */ +int sem_destroy(sem_t *sem); + +/** creates and initializes a named semaphore. + * Please refer to POSIX standard for details. + */ +sem_t * sem_open(const char* name , int oflag , ...); + +/** closes a semaphore. + * Please refer to POSIX standard for details. 
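+ *
+ * (Illustrative unnamed-semaphore round trip, not in the original header:
+ *   sem_t s;
+ *   sem_init(&s, 0, 1);      // pshared must be 0 in this implementation
+ *   sem_wait(&s);
+ *   // ... critical section ...
+ *   sem_post(&s);
+ *   sem_destroy(&s);
+ * )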
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/signal.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/signal.h
new file mode 100755
index 0000000000000..35cb1f1a9a319
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/signal.h
@@ -0,0 +1,201 @@
+#ifndef _SIGNAL_H_
+#define _SIGNAL_H_
+
+/*==========================================================================
+ * FILE: signal.h
+ *
+ * SERVICES: POSIX Signal API interface
+ *
+ * DESCRIPTION: POSIX Signal API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013, 2016, 2023 Qualcomm Technologies, Inc.
+ * All Rights Reserved.
+ * Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+ *==========================================================================*/
+
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* POSIX signal bits */
+
+#define POSIX_MSG   7   /* POSIX msg type used in Qube API */
+#define POSIX_NOTIF 8   /* POSIX msg type used in Qube API */
+#define SIGKILL     9   /* kill (cannot be caught or ignored) */
+
+#define SIGRTMIN    10
+#define SIGRTMAX    32
+
+/* Notification Types. */
+/* No asynchronous notification is delivered when the event of interest occurs. */
+#define SIGEV_NONE   0
+/* The signal specified in sigev_signo shall be generated for the process when
+   the event of interest occurs. */
+#define SIGEV_SIGNAL 1
+/* A notification function is called to perform notification. */
+#define SIGEV_THREAD 2
+#define SA_SIGINFO   1
+
+/*
+ * Flags for sigprocmask:
+ */
+#define SIG_BLOCK   1 /* block specified signal set */
+#define SIG_UNBLOCK 2 /* unblock specified signal set */
+#define SIG_SETMASK 3 /* set specified signal set */
+
+typedef unsigned long int sigset_t;
+
+union sigval
+{
+    int   sival_int; /* Integer signal value. */
+    void *sival_ptr; /* Pointer signal value. */
+};
+
+typedef struct sigevent sigevent;
+struct sigevent
+{
+    int sigev_notify;                            /* Notification type. */
+    int sigev_signo;                             /* Signal number. */
+    union sigval sigev_value;                    /* Signal value. */
+    void (*sigev_notify_function)(union sigval); /* Notification function. */
+    pthread_attr_t *sigev_notify_attributes;
+};
+
+typedef struct siginfo_t siginfo_t;
+struct siginfo_t
+{
+    int si_signo;
+    int si_code;
+    union sigval si_value;
+/*  int si_errno;
+    pid_t si_pid;
+    uid_t si_uid;
+    void *si_addr;
+    int si_status;
+    long si_band;*/
+};
+struct sigaction
+{
+    void (*sa_handler)(int);
+    sigset_t sa_mask;
+    int sa_flags;
+    void (*sa_sigaction)(int, siginfo_t *, void *);
+};
+
+/* Signal functions */
+
+/** \details
+ * This provides the POSIX Signal API. Please note that this
+ * implementation does not fully comply with the POSIX standard.
+ *
+ * In the POSIX standard, a signal can be used as an 'interrupt', which means
+ * an incoming signal will interrupt a running thread. After the
+ * registered signal handler is executed, the thread will resume.
+ * This behavior cannot be implemented without modifying the L4 or QuRT kernel.
+ * On the other hand, applications need to be carefully written to avoid
+ * problems caused by 'interrupting' signals.
+ *
+ * Therefore, in this implementation of POSIX signals, a thread will
+ * only receive signals when it explicitly waits for them, i.e., when
+ * the thread calls either sigwait() or sigsuspend().
+ *
+ * Therefore, pthread_sigmask(), which sets or gets the signal mask of a
+ * thread, is not supported, since the signal mask is set by sigwait() and
+ * sigsuspend().
+ *
+ * Since this implementation of the POSIX kernel API is a subset of PSE51,
+ * only threads can send and receive signals. The functions related to
+ * signal operations with processes, such as kill(), sigqueue(),
+ * sigprocmask(), are not provided.
+ *
+ * Queued signals are not supported.
+ *
+ * Applications will use signals from SIGRTMIN to SIGRTMAX.
+ *
+ * SIGEV_SIGNAL and SIGEV_THREAD are supported. SIGEV_NONE is not
+ * supported.
+ *
+ */
+
+/** \defgroup signal POSIX Signal API */
+/** \ingroup signal */
+/** @{ */
+
+/** Wait for signals. This implementation does not support queued signals.
+ *
+ * Please refer to POSIX standard for details.
+ */
+int sigwait(const sigset_t *restrict set, int *restrict sig);
+
+/** Examine and Change Signal Action.
+ * Please refer to POSIX standard for details.
+ *
+ * @param act [in] A pointer to the sigaction structure that describes the
+ * action to be taken for the signal. Can be NULL.
+ * The following flags for the sa_flags field in struct sigaction are not
+ * supported: SA_NOCLDSTOP, SA_ONSTACK, SA_RESETHAND, SA_RESTART,
+ * SA_NOCLDWAIT and SA_NODEFER. Only flag SA_SIGINFO is supported.
+ *
+ * @note Define sigaction as macro to avoid a warning when included from
+ * C++ code - it's causing a "sigaction(...) hides constructor for
+ * 'struct sigaction'" warning.
+ */
+/*lint -esym(123,sigaction) Suppress "macro used with no arguments" */
+#define sigaction(sig,act,oact) _sigaction((sig),(act),(oact))
+
+/** Wait for signals.
+ * Please refer to POSIX standard for details.
+ */
+int sigsuspend(const sigset_t *sigmask);
+
+/** Add Signal to Signal Set.
+ * Please refer to POSIX standard for details.
+ */
+int sigaddset(sigset_t *set, int signo);
+
+/** Delete Signal from Signal Set.
+ * Please refer to POSIX standard for details.
+ */
+int sigdelset(sigset_t *set, int signo);
+
+/** Initialize and Empty Signal Set.
+ * Please refer to POSIX standard for details.
+ */
+int sigemptyset(sigset_t *set);
+
+/** Initialize and Fill Signal Set.
+ * Please refer to POSIX standard for details.
+ */
+int sigfillset(sigset_t *set);
+
+/** Test for Signal in Signal Set.
+ * Please refer to POSIX standard for details.
+ */
+int sigismember(const sigset_t *set, int signo);
+
+/** @} */
+
+/* this is not a public api function */
+int _sigaction(int sig, const struct sigaction *act, struct sigaction *oact);
+
+/* have to move #include here to solve circular include problems between time.h and signal.h */
+#include <time.h>
+
+/** Wait for the time interval specified in the timespec structure referenced
+ * by timeout. This implementation does not support queued signals.
+ * For struct siginfo_t, si_code and si_value are ignored in this implementation.
+ *
+ * Please refer to POSIX standard for details.
+ */
+int sigtimedwait(const sigset_t *restrict set, siginfo_t *restrict info,
+                 const struct timespec *restrict timeout);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SIGNAL_H_ */
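Given the wait-based model described in the header comment (signals are delivered only inside sigwait()/sigsuspend()), a consumer thread would look roughly like the sketch below. The function name is hypothetical; only APIs declared above are used.

```c
/* Sketch: a thread that explicitly waits for an application signal.
 * SIGRTMIN..SIGRTMAX is the application signal range defined above. */
#include <signal.h>

int wait_for_event(void)
{
    sigset_t set;
    int sig = 0;

    sigemptyset(&set);
    sigaddset(&set, SIGRTMIN);     /* the one event we care about */

    if (sigwait(&set, &sig) != 0)  /* blocks here; no async interruption */
        return -1;
    return sig;                    /* == SIGRTMIN on success */
}
```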
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/sys/errno.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/sys/errno.h
new file mode 100755
index 0000000000000..b9edf57bab6c3
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/sys/errno.h
@@ -0,0 +1,20 @@
+#ifndef _SYS_ERRNO_H_
+#define _SYS_ERRNO_H_
+
+/*==========================================================================
+ * FILE: errno.h
+ *
+ * SERVICES: POSIX errno header file
+ *
+ * DESCRIPTION: POSIX errno based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013, 2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+
+ *==========================================================================*/
+
+#include
+#ifndef EOK
+#define EOK 0
+#endif
+
+#endif /* _SYS_ERRNO_H_ */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/sys/sched.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/sys/sched.h
new file mode 100755
index 0000000000000..2acc34d821725
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/sys/sched.h
@@ -0,0 +1,67 @@
+#ifndef _POSIX_SCHED_H_
+#define _POSIX_SCHED_H_
+
+/*==========================================================================
+ * FILE: sched.h
+ *
+ * SERVICES: POSIX Thread sched API interface
+ *
+ * DESCRIPTION: POSIX Thread sched API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013, 2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+
+
+ *==========================================================================*/
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define SCHED_FIFO     0 /* First in, first out (FIFO) scheduling policy. */
+#define SCHED_RR       1 /* Round robin scheduling policy. */
+#define SCHED_SPORADIC 2 /* Sporadic server scheduling policy. */
+#define SCHED_OTHER    3 /* Another scheduling policy. */
+
+typedef struct sched_param sched_param;
+struct sched_param
+{
+    void *unimplemented;
+    int sched_priority;
+};
+
+/** \details
+ * This provides the POSIX sched API.
+ */
+
+/** \defgroup sched POSIX sched API */
+/** \ingroup sched */
+/** @{ */
+
+/** Relinquish the CPU.
+ * Please refer to POSIX standard for details.
+ */
+static inline int sched_yield(void)
+{
+    return 0;
+}
+
+/** Get the maximum priority.
+ * Please refer to POSIX standard for details.
+ * @param policy [in] SCHED_FIFO is the only valid input for this implementation.
+ */
+int sched_get_priority_max(int policy);
+
+/** Get the minimum priority.
+ * Please refer to POSIX standard for details.
+ * @param policy [in] SCHED_FIFO is the only valid input for this implementation.
+ */
+int sched_get_priority_min(int policy);
+
+/** @} */
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _POSIX_SCHED_H_ */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/sys/types.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/sys/types.h
new file mode 100755
index 0000000000000..700026f9f9e4e
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/sys/types.h
@@ -0,0 +1,35 @@
+#ifndef _SYS_TYPES_H_
+#define _SYS_TYPES_H_
+
+/*==========================================================================
+ * FILE: types.h
+ *
+ * SERVICES: types used in the POSIX API interface
+ *
+ * DESCRIPTION: POSIX API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013, 2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+
+ *==========================================================================*/
+
+#if !defined( _PID_T ) || !defined( __pid_t_defined )
+/* POSIX defines pid_t as signed 32-bit type. Hexagon toolchain's header
+   defines it as unsigned 32-bit type citing conflict with QuRT POSIX
+   compatibility later. If any such conflicts exist, we should fix them.
+   pid_t is being defined *BEFORE* inclusion of generic/sys/types.h
+   *INTENTIONALLY* to fix this */
+typedef int pid_t;
+#define _PID_T
+#define __pid_t_defined
+#endif
+#include
+#include
+#include
+#include
+
+#ifndef __DEFINED_off_t
+typedef long off_t;
+#define __DEFINED_off_t
+#endif
+
+#endif /* _SYS_TYPES_H_ */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/time.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/time.h
new file mode 100755
index 0000000000000..13aeb1ea9920d
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/time.h
@@ -0,0 +1,142 @@
+#ifndef _POSIX_TIME_H_
+#define _POSIX_TIME_H_
+
+/*==========================================================================
+ * FILE: time.h
+ *
+ * SERVICES: POSIX Timer API interface
+ *
+ * DESCRIPTION: POSIX Timer API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013,2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+ *==========================================================================*/
+
+
+#include
+
+typedef int clockid_t; /* ignored */
+#define _CLOCKID_T
+#define _PROVIDE_POSIX_TIME_DECLS 1
+#include
+/* @todo anandj sys/time.h has definition for struct timeval but is not
+   included by generic/time.h */
+#include <sys/time.h>
+
+#define CLOCK_FREQ_NOT_DEFINED -1
+/* Frequency of Sclk used */
+#define TIME_CONV_SCLK_FREQ 19200000
+
+#define RES_CONV_FACTOR1 1
+#define RES_CONV_FACTOR2 1000000000
+
+#if !defined(CLOCK_REALTIME)
+# define CLOCK_REALTIME 0
+#endif
+
+#if !defined(CLOCK_MONOTONIC)
+# define CLOCK_MONOTONIC 1
+#endif
+
+#if !defined(CLOCK_THREAD_CPUTIME_ID)
+# define CLOCK_THREAD_CPUTIME_ID 2
+#endif
+
+#if !defined(CLOCK_PROCESS_CPUTIME_ID)
+# define CLOCK_PROCESS_CPUTIME_ID 3
+#endif
+
+#if !defined(CLOCK_MONOTONIC_RAW)
+# define CLOCK_MONOTONIC_RAW 4
+#endif
+
+#if !defined(CLOCK_REALTIME_COARSE)
+# define CLOCK_REALTIME_COARSE 5
+#endif
+
+#if !defined(CLOCK_MONOTONIC_COARSE)
+# define CLOCK_MONOTONIC_COARSE 6
+#endif
+
+#if !defined(CLOCK_BOOTTIME)
+# define CLOCK_BOOTTIME 7
+#endif
+
+struct itimerspec
+{
+    struct timespec it_interval; /* Timer period. */
+    struct timespec it_value;    /* Timer expiration. */
+};
+
+/* have to move #include here to solve circular include problems between time.h and signal.h */
+#include <signal.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Timer functions */
+
+/** \details
+ * POSIX timers can be either of two types: a one-shot type or a periodic
+ * type.
+ *
+ * A one-shot timer is armed with an expiration time relative to either
+ * the current time or an absolute time. The timer expires once and
+ * is then disarmed.
+ *
+ * A periodic timer is armed with an initial expiration time and a repetition
+ * interval. Every time the interval timer
+ * expires, the timer is reloaded with the repetition interval. The timer
+ * is then rearmed.
+ */
+
+/** \defgroup timer POSIX Timer API */
+
+/** \ingroup timer */
+/** @{ */
+
+/** Create a POSIX timer.
+ * Please refer to POSIX standard for details.
+ * @param clockid [in] ignored in this implementation
+ * @param evp [in] if non-NULL, points to a sigevent structure. This
+ * structure, allocated by the application, defines the asynchronous
+ * notification to occur when the timer expires. If the evp argument is
+ * NULL, the effect is as if the evp argument pointed to a sigevent
+ * structure with the sigev_notify member having the value SIGEV_SIGNAL,
+ * the sigev_signo having a default signal number (SIGALRM), and the
+ * sigev_value member having the value of the timer ID.
+ */
+int timer_create(clockid_t clockid, struct sigevent *restrict evp,
+                 timer_t *restrict timerid);
+
+/** Delete a POSIX timer.
+ * Please refer to POSIX standard for details.
+ */
+int timer_delete(timer_t timerid);
+
+/** Get the time remaining on a POSIX timer.
+ * Please refer to POSIX standard for details.
+ */
+int timer_gettime(timer_t timerid, struct itimerspec *value);
+
+
+/** Set the time remaining on a POSIX timer.
+ * Please refer to POSIX standard for details.
+ * @param flags [in] ignored in this implementation
+ */
+int timer_settime(timer_t timerid, int flags,
+                  const struct itimerspec *restrict value,
+                  struct itimerspec *restrict ovalue);
+/** Obtain the ID of a process CPU-time clock.
+ * @param pid [in] Process ID
+ * @param clock_id [out] Clock ID
+ * @return Error values as per POSIX standard
+ */
+int clock_getcpuclockid (pid_t pid, clockid_t * clock_id);
+/** @} */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _POSIX_TIME_H_ */
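Putting struct sigevent, struct itimerspec, and the timer calls together, arming a periodic timer looks roughly as follows. This is a sketch under the constraints documented above (the clockid and flags arguments are ignored by this implementation); `start_tick` is an illustrative name, and the usual `tv_sec` field of struct timespec from the included time declarations is assumed.

```c
/* Sketch: create and arm a periodic timer that raises SIGRTMIN,
 * per the one-shot/periodic description above. */
#include <signal.h>
#include <time.h>

int start_tick(timer_t *out)
{
    struct sigevent   ev = {0};
    struct itimerspec ts = {0};

    ev.sigev_notify = SIGEV_SIGNAL;
    ev.sigev_signo  = SIGRTMIN;

    if (timer_create(CLOCK_MONOTONIC, &ev, out) != 0)   /* clockid is ignored here */
        return -1;

    ts.it_value.tv_sec    = 1;   /* first expiration after 1 s */
    ts.it_interval.tv_sec = 1;   /* reload every 1 s => periodic */
    return timer_settime(*out, 0, &ts, (struct itimerspec *)0);
}
```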
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qube/qube.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qube/qube.h
new file mode 100755
index 0000000000000..1e31e2deedb38
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qube/qube.h
@@ -0,0 +1,51 @@
+#ifndef QUBE_H
+#define QUBE_H
+/*=============================================================================
+
+                 qube.h -- H E A D E R  F I L E
+
+GENERAL DESCRIPTION
+   Prototypes of qpd API
+
+EXTERNAL FUNCTIONS
+   None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+   Copyright (c) 2013 by Qualcomm Technologies, Inc. All Rights Reserved.
+
+=============================================================================*/
+
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include
+
+/* Define error codes as the QuRT error codes prefixed with QURT_ */
+#ifndef EOK
+#define EOK       QURT_EOK
+#endif /* EOK */
+#ifndef EVAL
+#define EVAL      QURT_EVAL
+#endif /* EVAL */
+#ifndef EMEM
+#define EMEM      QURT_EMEM
+#endif /* EMEM */
+#ifndef EINVALID
+#define EINVALID  QURT_EINVALID
+#endif /* EINVALID */
+
+
+/*=============================================================================
+                        FUNCTION DECLARATIONS
+=============================================================================*/
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QUBE_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/atomic_ops.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/atomic_ops.h
new file mode 100755
index 0000000000000..0a9a9f8ba7db5
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/atomic_ops.h
@@ -0,0 +1,197 @@
+#ifndef ATOMIC_OPS_H
+#define ATOMIC_OPS_H
+/**
+  @file atomic_ops.h
+
+  @brief Backwards-compatible type definitions.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+=============================================================================*/
+
+
+/*
+ * Australian Public Licence B (OZPLB)
+ *
+ * Version 1-0
+ *
+ * Copyright (c) 2007, Open Kernel Labs, Inc.
+ *
+ * All rights reserved.
+ *
+ * Developed by: Embedded, Real-time and Operating Systems Program (ERTOS)
+ *               National ICT Australia
+ *               http://www.ertos.nicta.com.au
+ *
+ * Permission is granted by National ICT Australia, free of charge, to
+ * any person obtaining a copy of this software and any associated
+ * documentation files (the "Software") to deal with the Software without
+ * restriction, including (without limitation) the rights to use, copy,
+ * modify, adapt, merge, publish, distribute, communicate to the public,
+ * sublicense, and/or sell, lend or rent out copies of the Software, and
+ * to permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimers.
+ *
+ *     * Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimers in the documentation and/or other materials provided
+ *       with the distribution.
+ *
+ *     * Neither the name of National ICT Australia, nor the names of its
+ *       contributors, may be used to endorse or promote products derived
+ *       from this Software without specific prior written permission.
+ *
+ * EXCEPT AS EXPRESSLY STATED IN THIS LICENCE AND TO THE FULL EXTENT
+ * PERMITTED BY APPLICABLE LAW, THE SOFTWARE IS PROVIDED "AS-IS", AND
+ * NATIONAL ICT AUSTRALIA AND ITS CONTRIBUTORS MAKE NO REPRESENTATIONS,
+ * WARRANTIES OR CONDITIONS OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+ * BUT NOT LIMITED TO ANY REPRESENTATIONS, WARRANTIES OR CONDITIONS
+ * REGARDING THE CONTENTS OR ACCURACY OF THE SOFTWARE, OR OF TITLE,
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT,
+ * THE ABSENCE OF LATENT OR OTHER DEFECTS, OR THE PRESENCE OR ABSENCE OF
+ * ERRORS, WHETHER OR NOT DISCOVERABLE.
+ * + * TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL + * NATIONAL ICT AUSTRALIA OR ITS CONTRIBUTORS BE LIABLE ON ANY LEGAL + * THEORY (INCLUDING, WITHOUT LIMITATION, IN AN ACTION OF CONTRACT, + * NEGLIGENCE OR OTHERWISE) FOR ANY CLAIM, LOSS, DAMAGES OR OTHER + * LIABILITY, INCLUDING (WITHOUT LIMITATION) LOSS OF PRODUCTION OR + * OPERATION TIME, LOSS, DAMAGE OR CORRUPTION OF DATA OR RECORDS; OR LOSS + * OF ANTICIPATED SAVINGS, OPPORTUNITY, REVENUE, PROFIT OR GOODWILL, OR + * OTHER ECONOMIC LOSS; OR ANY SPECIAL, INCIDENTAL, INDIRECT, + * CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES, ARISING OUT OF OR IN + * CONNECTION WITH THIS LICENCE, THE SOFTWARE OR THE USE OF OR OTHER + * DEALINGS WITH THE SOFTWARE, EVEN IF NATIONAL ICT AUSTRALIA OR ITS + * CONTRIBUTORS HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH CLAIM, LOSS, + * DAMAGES OR OTHER LIABILITY. + * + * If applicable legislation implies representations, warranties, or + * conditions, or imposes obligations or liability on National ICT + * Australia or one of its contributors in respect of the Software that + * cannot be wholly or partly excluded, restricted or modified, the + * liability of National ICT Australia or the contributor is limited, to + * the full extent permitted by the applicable legislation, at its + * option, to: + * a. in the case of goods, any one or more of the following: + * i. the replacement of the goods or the supply of equivalent goods; + * ii. the repair of the goods; + * iii. the payment of the cost of replacing the goods or of acquiring + * equivalent goods; + * iv. the payment of the cost of having the goods repaired; or + * b. in the case of services: + * i. the supplying of the services again; or + * ii. the payment of the cost of having the services supplied again. + * + * The construction, validity and performance of this licence is governed + * by the laws in force in New South Wales, Australia. + */ + +/* + * Author: Malcolm Purvis + * Author: Carlos Dyonisio + */ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef unsigned int atomic_plain_word_t; + +/*-------------------------------------------------------------------------*/ + /* Atomic Ops API. */ + +/* + * IMPORTANT! + * If you plan to change the structure atomic_word_t, please add the new + * elements after value. For more information, read the comment in + * arch/arm/libs/atomic_ops/v5/src/arm_atomic_ops.spp:66 + */ + +typedef struct { + volatile atomic_plain_word_t value; +} atomic_word_t; + +#define ATOMIC_INIT(i) { (i) } + +static inline void +atomic_init(atomic_word_t *a, atomic_plain_word_t v) +{ + a->value = v; +} + +#if defined(ARCH_ARM) && defined(ARCH_VER) && (ARCH_VER < 6) && \ + (!defined(__ATOMIC_OPS_IN_KERNEL__) || defined(MACHINE_SMP)) + +/* + * If it is ARMv4/v5, the function declarations may change + * and are defined in the arch specific header file, + * as some of then cannot be declared static because of + * the assembler implementation. + */ + +#else + +/* Arithmetic operations. */ + +void atomic_sub(atomic_word_t *target, atomic_plain_word_t v); + +/* Architecture independent definitions. 
*/ + +static inline atomic_plain_word_t atomic_read(atomic_word_t *target) +{ + return target->value; +} + +typedef unsigned long long atomic64_plain_word_t; + +typedef struct { + volatile atomic64_plain_word_t value; +} atomic64_word_t; + +static inline void +atomic64_init(atomic64_word_t *a, atomic64_plain_word_t v) +{ + a->value = v; +} + +/********************* + Support 64-bit + *********************/ + +atomic64_plain_word_t atomic64_set(atomic64_word_t* target, + atomic64_plain_word_t value); + +void atomic64_xor(atomic64_word_t* target, + atomic64_plain_word_t mask); + +/*---------------------------------------------------------------------------*/ + +/* Architecture independent definitions. */ + +static inline atomic64_plain_word_t atomic64_read(atomic64_word_t *target) +{ + return target->value; +} + +#endif + + +/* Architecture dependent definitions. */ +#include + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* ATOMIC_OPS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/atomic_ops_plat.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/atomic_ops_plat.h new file mode 100755 index 0000000000000..b54b3ff83d978 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/atomic_ops_plat.h @@ -0,0 +1,86 @@ +#ifndef ATOMIC_OPS_PLAT_H +#define ATOMIC_OPS_PLAT_H +/** + @file atomic_ops_plat.h + + @brief Prototypes of atomic operations API backwards compatible. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2013, 2023 Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. +=============================================================================*/ + + +#include + +#ifdef __cplusplus +extern "C" { +#endif +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +#define atomic_set(a,b) qurt_atomic_set((unsigned int *)(a),(unsigned int)(b)) +#define atomic_and(a,b) qurt_atomic_and((unsigned int *)(a),(unsigned int)(b)) +#define atomic_and_return(a,b) qurt_atomic_and_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_or(a,b) qurt_atomic_or((unsigned int *)(a),(unsigned int)(b)) +#define atomic_or_return(a,b) qurt_atomic_or_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_xor(a,b) qurt_atomic_xor((unsigned int *)(a),(unsigned int)(b)) +#define atomic_xor_return(a,b) qurt_atomic_xor_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_set_bit(a,b) qurt_atomic_set_bit((unsigned int *)(a),(unsigned int)(b)) +#define atomic_clear_bit(a,b) qurt_atomic_clear_bit((unsigned int *)(a),(unsigned int)(b)) +#define atomic_change_bit(a,b) qurt_atomic_change_bit((unsigned int *)(a),(unsigned int)(b)) +#define atomic_add(a,b) qurt_atomic_add((unsigned int *)(a),(unsigned int)(b)) +#define atomic_add_return(a,b) qurt_atomic_add_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_add_unless(a,b,c) qurt_atomic_add_unless((unsigned int *)(a),(unsigned int)(b),(unsigned int)(c)) +#define atomic_sub(a,b) qurt_atomic_sub((unsigned int *)(a),(unsigned int)(b)) +#define atomic_sub_return(a,b) qurt_atomic_sub_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_inc(a) qurt_atomic_inc((unsigned int *)(a)) +#define atomic_inc_return(a) qurt_atomic_inc_return((unsigned int *)(a)) +#define atomic_dec(a) qurt_atomic_dec((unsigned 
int *)(a)) +#define atomic_dec_return(a) qurt_atomic_dec_return((unsigned int *)(a)) +#define atomic_compare_and_set(a,b,c) qurt_atomic_compare_and_set((unsigned int *)(a),(unsigned int)(b),(unsigned int)(c)) +#define atomic_barrier qurt_atomic_barrier +#define atomic_barrier_write qurt_atomic_barrier_write +#define atomic_barrier_write_smp qurt_atomic_barrier_write_smp +#define atomic_barrier_read_smp qurt_atomic_barrier_read_smp +#define atomic_barrier_smp qurt_atomic_barrier_smp + +/*============================ + * 64 bits support + *============================ */ +#define atomic64_set(a,b) qurt_atomic64_set((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_and(a,b) qurt_atomic64_and((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_and_return(a,b) qurt_atomic64_and_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_or(a,b) qurt_atomic64_or((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_or_return(a,b) qurt_atomic64_or_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_xor(a,b) qurt_atomic64_xor((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_xor_return(a,b) qurt_atomic64_xor_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_set_bit(a,b) qurt_atomic64_set_bit((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_clear_bit(a,b) qurt_atomic64_clear_bit((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_change_bit(a,b) qurt_atomic64_change_bit((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_add(a,b) qurt_atomic64_add((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_add_return(a,b) qurt_atomic64_add_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_sub(a,b) qurt_atomic64_sub((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_sub_return(a,b) qurt_atomic64_sub_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_inc(a) qurt_atomic64_inc((unsigned long long *)(a)) +#define atomic64_inc_return(a) qurt_atomic64_inc_return((unsigned long long *)(a)) +#define atomic64_dec(a) qurt_atomic64_dec((unsigned long long *)(a)) +#define atomic64_dec_return(a) qurt_atomic64_dec_return((unsigned long long *)(a)) +#define atomic64_compare_and_set(a,b,c) qurt_atomic64_compare_and_set((unsigned long long *)(a),(unsigned long long )(b),(unsigned long long )(c)) +#define atomic64_barrier qurt_atomic64_barrier +#define atomic64_barrier_write qurt_atomic64_barrier_write +#define atomic64_barrier_write_smp qurt_atomic64_barrier_write_smp +#define atomic64_barrier_read_smp qurt_atomic64_barrier_read_smp +#define atomic64_barrier_smp qurt_atomic64_barrier_smp + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* ATOMIC_OPS_PLAT_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt.h new file mode 100755 index 0000000000000..4d25c9b2b6243 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt.h @@ -0,0 +1,111 @@ +#ifndef QURT_H +#define QURT_H + +/** + @file qurt.h + @brief Contains kernel header files that provide kernel OS API functions, constants, and + definitions + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013,2021,2023 by Qualcomm Technologies, Inc. All Rights Reserved. 
+ Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ +/*====================================================================== + * + * EDIT HISTORY FOR FILE + * + * This section contains comments describing changes made to the + * module. Notice that changes are listed in reverse chronological + * order. + * + * + * + * + * when who what, where, why + * ---------- --- ------------------------------------------------ + * 2011-02-25 op Add Header file + 2012-12-16 cm (Tech Pubs) Edited/added Doxygen comments and markup. + ======================================================================*/ + + +#ifdef __cplusplus +extern "C" { +#endif + +#include "qurt_consts.h" +#include "qurt_api_version.h" +#include "qurt_alloc.h" +#include "qurt_futex.h" +#include "qurt_mutex.h" +#include "qurt_pipe.h" +#include "qurt_printf.h" +#include "qurt_assert.h" +#include "qurt_thread.h" +#include "qurt_trace.h" +#include "qurt_cycles.h" +#include "qurt_profile.h" +#include "qurt_sem.h" +#include "qurt_cond.h" +#include "qurt_barrier.h" +#include "qurt_fastint.h" +#include "qurt_allsignal.h" +#include "qurt_anysignal.h" +#include "qurt_signal.h" +#include "qurt_rmutex.h" +#include "qurt_pimutex.h" +#include "qurt_signal2.h" +#include "qurt_rmutex2.h" +#include "qurt_pimutex2.h" +#include "qurt_int.h" +#include "qurt_lifo.h" +#include "qurt_power.h" +#include "qurt_event.h" +#include "qurt_pmu.h" +#include "qurt_stid.h" +//#include "qurt_version.h" +#include "qurt_tlb.h" +#include "qurt_vtlb.h" +#include "qurt_memory.h" +#include "qurt_qdi.h" +#include "qurt_sclk.h" +#include "qurt_space.h" +#include "qurt_process.h" +#include "qurt_timer.h" +#include "qurt_tls.h" +#include "qurt_thread_context.h" +#include "qurt_hvx.h" +#include "qurt_hmx.h" +#include "qurt_mailbox.h" +#include "qurt_island.h" +#include "qurt_qdi_proxy.h" +#include "qurt_l2cfg.h" +#include "qurt_mmap.h" +#include "qurt_isr.h" +#include "qurt_busywait.h" +#include "qurt_ecc.h" +#include "qurt_callback.h" +#include "qurt_error.h" +#include "qurt_except.h" +#include "qurt_mq.h" +#include "qurt_user_dma.h" +#include "qurt_fs_hub.h" +#include "qurt_os_services.h" + +#ifndef MAIN_ONLY +#define INCLUDE_ISLAND_CONTENTS +#endif +#ifndef ISLAND_ONLY +#define INCLUDE_MAIN_CONTENTS +#endif + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_alloc.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_alloc.h new file mode 100755 index 0000000000000..da37a4c0a714e --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_alloc.h @@ -0,0 +1,145 @@ +#ifndef QURT_ALLOC_H +#define QURT_ALLOC_H + +/** + @file qurt_alloc.h + @brief Prototypes of kernel memory allocation API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021, 2023 Qualcomm Technologies, Inc. + All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +/*======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup func_qurt_malloc + Dynamically allocates the specified array on the QuRT system heap. + The return value is the address of the allocated memory area. 
+ + @note1hang The allocated memory area is automatically initialized to zero. + + @param[in] size Size (in bytes) of the memory area. + + @return + Nonzero -- Pointer to the allocated memory area. \n + 0 -- Not enough memory in heap to allocate memory area. + + @dependencies + None. + + */ +/* ======================================================================*/ +void *qurt_malloc( unsigned int size); + +/*======================================================================*/ +/**@ingroup func_qurt_calloc + Dynamically allocates the specified array on the QuRT system heap. + The return value is the address of the allocated array. + + @note1hang The allocated memory area is automatically initialized to zero. + + @param[in] elsize Size (in bytes) of each array element. + @param[in] num Number of array elements. + + @return + Nonzero -- Pointer to allocated array.\n + Zero -- Not enough memory in heap to allocate array. + + @dependencies + None. + + */ + /* ======================================================================*/ +void *qurt_calloc(unsigned int elsize, unsigned int num); + +/*======================================================================*/ +/**@ingroup func_qurt_realloc + Reallocates memory on the heap. \n + Changes the size of a memory area that is already allocated on the QuRT system heap. + The reallocate memory operation is functionally similar to realloc. It accepts a pointer + to an existing memory area on the heap, and resizes the memory area to the specified size + while preserving the original contents of the memory area. + + @note1hang This function might change the address of the memory area. + If the value of ptr is NULL, this function is equivalent to + qurt_malloc(). + If the value of new_size is 0, it is equivalent to qurt_free(). + If the memory area is expanded, the added memory is not initialized. + + @param[in] *ptr Pointer to the address of the memory area. + @param[in] newsize Size (in bytes) of the reallocated memory area. + + @return + Nonzero -- Pointer to reallocated memory area. \n + 0 -- Not enough memory in heap to reallocate the memory area. + + @dependencies + None. + + */ + /* ======================================================================*/ +void *qurt_realloc(void *ptr, int newsize); + +/*======================================================================*/ +/**@ingroup func_qurt_free + Frees allocated memory from the heap.\n + Deallocates the specified memory from the QuRT system heap. + + @param[in] *ptr Pointer to the address of the memory to deallocate. + + @return + None. + + @dependencies + The memory item that the ptr value specifies must have been previously + allocated using one of the qurt_calloc(), + qurt_malloc(), or qurt_realloc() memory allocation functions. + Otherwise the behavior of QuRT is undefined. + + */ + /* ======================================================================*/ +void qurt_free( void *ptr); + + +void *qurt_memalign(unsigned int alignment, unsigned int size); + +/* +|| Macro to define a static heap for a QuRT program. +|| +|| Usage: +|| Declare at the top-level of any C source file that +|| is part of the build (and is guaranteed +|| to actually be pulled into the build). Place +|| it in the same function with main(): +|| +|| QURT_DECLARE_STATIC_HEAP(512000); +|| +|| The only argument is the size in bytes, and it is +|| rounded up to the nearest 64 bytes (size of an +|| L2 cache block). 
+|| +*/ + +#define QURT_DECLARE_STATIC_HEAP(sz) \ + static struct qurt_static_heap { \ + char space[(sz)] __attribute__((aligned(64))); \ + } static_heap[1]; \ + void * const override_heap_Base = &static_heap[0]; \ + void * const override_heap_Limit = &static_heap[1] + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ALLOC_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_allsignal.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_allsignal.h new file mode 100755 index 0000000000000..5dc89e495130d --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_allsignal.h @@ -0,0 +1,176 @@ + +#ifndef QURT_ALLSIGNAL_H +#define QURT_ALLSIGNAL_H + +/** + @file qurt_allsignal.h + @brief Prototypes of kernel signal API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup all_signal_types +@{ */ +/*===================================================================== + Typedefs + ======================================================================*/ + +/** +qurt_signal_t supersedes qurt_allsignal_t. This type definition was added for backwards compatibility. */ +typedef union { + /** @cond */ + unsigned long long int raw; + struct { + unsigned int waiting; /**< */ + unsigned int signals_in; /**< */ + unsigned int queue; /**< */ + unsigned int reserved; /**< */ + }X; + /** @endcond */ +} qurt_allsignal_t; +/** @} */ /* end_addtogroup all_signal_types */ + +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_init + Initializes an all-signal object.\n + The all-signal object is initially cleared. + + @datatypes + #qurt_allsignal_t + + @param[out] signal Pointer to the all-signal object to initialize. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_allsignal_init(qurt_allsignal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_destroy + Destroys the specified all-signal object.\n + @note1hang All-signal objects must be destroyed when they are no longer in use. + Failure to do this causes resource leaks in the QuRT kernel. \n + @note1cont All-signal objects must not be destroyed while they are still in use. + If this occurs, the behavior of QuRT is undefined. + + @datatypes + #qurt_allsignal_t + + @param[in] signal Pointer to the all-signal object to destroy. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_allsignal_destroy(qurt_allsignal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_get + Gets signal values from the all-signal object. + + Returns the current signal values of the specified all-signal object. + + @datatypes + #qurt_allsignal_t + + @param[in] signal Pointer to the all-signal object to access. 
+ + @return + Bitmask with current signal values. + + @dependencies + None. +*/ +/* ======================================================================*/ +static inline unsigned int qurt_allsignal_get(qurt_allsignal_t *signal) +{ return signal->X.signals_in; } + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_wait + Waits on the all-signal object.\n + Suspends the current thread until all of the specified signals are set. + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be waited on, and 0 that it is not to be waited on. + + If a signal is set in an all-signal object, and a thread is waiting on the all-signal object for + that signal, the thread is awakened. If the awakened thread has higher priority than + the current thread, a context switch can occur. + + Unlike any-signals, all-signals do not need to explicitly clear any set signals in an all-signal + object before waiting on them again -- clearing is done automatically by the wait + operation. + + @note1hang At most, one thread can wait on an all-signal object at any given time. + Because signal clearing is done by the wait operation, no clear operation is + defined for all-signals. + + @datatypes + #qurt_allsignal_t + + @param[in] signal Pointer to the all-signal object to wait on. + @param[in] mask Signal mask value, which identifies the individual signals in the all-signal object + to wait on. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_allsignal_wait(qurt_allsignal_t *signal, unsigned int mask); + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_set + Set signals in the specified all-signal object. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit + value of 1 indicates that a signal must be set, and 0 indicates not to set the signal. + + @datatypes + #qurt_allsignal_t + + @param[in] signal Pointer to the all-signal object to modify. + @param[in] mask Signal mask value identifying the individual signals to + set in the all-signal object. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_allsignal_set(qurt_allsignal_t *signal, unsigned int mask); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ALLSIGNAL_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_anysignal.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_anysignal.h new file mode 100755 index 0000000000000..9619e2de562b4 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_anysignal.h @@ -0,0 +1,225 @@ +#ifndef QURT_ANYSIGNAL_H +#define QURT_ANYSIGNAL_H +/** + @file qurt_anysignal.h + Prototypes of kernel signal API functions. + + EXTERNALIZED FUNCTIONS + None + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None + +Copyright (c) 2021 Qualcomm Technologies, Inc. +All rights reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. 
+ ======================================================================*/ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*===================================================================== +Typedefs +======================================================================*/ + +/**@ingroup anysignals_types + qurt_signal_t supersedes qurt_anysignal_t. This type definition was added for backwards compatibility. */ +typedef qurt_signal_t qurt_anysignal_t; + +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_anysignal_init + Initializes an any-signal object.\n + The any-signal object is initially cleared. + + @datatypes + #qurt_anysignal_t + + @param[out] signal Pointer to the initialized any-signal object. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +static inline void qurt_anysignal_init(qurt_anysignal_t *signal) +{ + qurt_signal_init(signal); +} + +/*======================================================================*/ +/**@ingroup func_qurt_anysignal_destroy + Destroys the specified any-signal object. + + @note1hang Any-signal objects must be destroyed when they are no longer in use. Failure + to do this causes resource leaks in the QuRT kernel.\n + @note1cont Any-signal objects must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + + @datatypes + #qurt_anysignal_t + + @param[in] signal Pointer to the any-signal object to destroy. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +static inline void qurt_anysignal_destroy(qurt_anysignal_t *signal) +{ + qurt_signal_destroy(signal); +} + +/*======================================================================*/ +/**@ingroup func_qurt_anysignal_wait + Wait on the any-signal object. \n + Suspends the current thread until any one of the specified signals is set. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be waited on, and 0 indicates not to wait on the signal. + If a signal is set in an any-signal object, and a thread is waiting on the any-signal object for + that signal, the thread is awakened. If the awakened thread has higher priority than + the current thread, a context switch can occur. + + @note1hang At most, one thread can wait on an any-signal object at any given time. + + @datatypes + #qurt_anysignal_t + + @param[in] signal Pointer to the any-signal object to wait on. + @param[in] mask Signal mask value, which specifies the individual signals in the any-signal + object to wait on. + + @return + Bitmask of current signal values. + + @dependencies + None. + */ +/* ======================================================================*/ +static inline unsigned int qurt_anysignal_wait(qurt_anysignal_t *signal, unsigned int mask) +{ + return qurt_signal_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ANY); +} + +/*======================================================================*/ +/**@ingroup func_qurt_anysignal_set + Sets signals in the specified any-signal object. \n + Signals are represented as bits 0 through 31 in the 32-bit mask value. 
A mask bit value of 1
+ indicates that a signal must be set, and 0 indicates not to set the signal.
+
+ @datatypes
+ #qurt_anysignal_t
+
+ @param[in] signal Pointer to the any-signal object to modify.
+ @param[in] mask   Signal mask value identifying the individual signals to
+                   set in the any-signal object.
+
+ @return
+ Bitmask of old signal values (before set).
+
+ @dependencies
+ None.
+ */
+/* ======================================================================*/
+unsigned int qurt_anysignal_set(qurt_anysignal_t *signal, unsigned int mask);
+
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_anysignal_get
+   Gets signal values from the any-signal object.\n
+   Returns the current signal values of the specified any-signal object.
+
+ @datatypes
+ #qurt_anysignal_t
+
+ @param[in] signal Pointer to the any-signal object to access.
+
+ @return
+ A bitmask with the current signal values of the specified any-signal object.
+
+ @dependencies
+ None.
+ */
+/* ======================================================================*/
+static inline unsigned int qurt_anysignal_get(qurt_anysignal_t *signal)
+{
+    return qurt_signal_get(signal);
+}
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_anysignal_clear
+   @xreflabel{sec:anysignal_clear}
+   Clears signals in the specified any-signal object.\n
+   Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1
+   indicates that a signal must be cleared, and 0 indicates not to clear the signal.
+
+ @datatypes
+ #qurt_anysignal_t
+
+ @param[in] signal Pointer to the any-signal object to modify.
+ @param[in] mask   Signal mask value identifying the individual signals to
+                   clear in the any-signal object.
+
+ @return
+ Bitmask -- Old signal values (before clear).
+
+ @dependencies
+ None.
+ */
+/* ======================================================================*/
+unsigned int qurt_anysignal_clear(qurt_anysignal_t *signal, unsigned int mask);
+
+/*======================================================================*/
+/**@ingroup func_qurt_anysignal_wait_timed
+   Waits on the any-signal object. \n
+   Suspends the current thread until any of the specified signals is set or the timeout expires.
+
+   Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1
+   indicates that a signal must be waited on, and 0 indicates not to wait on the signal.
+   If a signal is set in an any-signal object, and a thread is waiting on the any-signal object for
+   that signal, the thread is awakened. If the awakened thread has higher priority than
+   the current thread, a context switch can occur.
+
+   @note1hang At most, one thread can wait on an any-signal object at any given time.
+
+ @datatypes
+ #qurt_anysignal_t
+
+ @param[in]  signal   Pointer to the any-signal object to wait on.
+ @param[in]  mask     Signal mask value, which specifies the individual signals in the any-signal
+                      object to wait on.
+ @param[out] signals  Bitmask of current signal values.
+ @param[in]  duration Timeout interval (in microseconds); the value must be between
+                      #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_ETIMEDOUT -- Timeout. \n
+ #QURT_EINVALID -- Duration out of range.
+
+ @dependencies
+ None.
+ */
+/* ======================================================================*/
+
+int qurt_anysignal_wait_timed(qurt_anysignal_t *signal, unsigned int mask, unsigned int *signals, unsigned long long int duration);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_ANYSIGNAL_H */
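As a usage sketch of the any-signal API just defined: one thread blocks on a mask of event bits with qurt_anysignal_wait(), and, because any-signals are not cleared automatically by the wait, clears what it consumed. The bit names and the quoted include name are illustrative.

```c
/* Sketch: any-signal event flags shared between two threads. */
#include "qurt_anysignal.h"   /* assumed header name */

#define EVT_RX (1u << 0)      /* illustrative event bits */
#define EVT_TX (1u << 1)

static qurt_anysignal_t events;

void events_init(void) { qurt_anysignal_init(&events); }

void rx_done(void)     { (void)qurt_anysignal_set(&events, EVT_RX); }

unsigned int wait_any_event(void)
{
    /* Returns when EVT_RX or EVT_TX (or both) is set. */
    unsigned int sigs = qurt_anysignal_wait(&events, EVT_RX | EVT_TX);

    /* Any-signals stay set until cleared explicitly (see clear() above). */
    (void)qurt_anysignal_clear(&events, sigs & (EVT_RX | EVT_TX));
    return sigs;
}
```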
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_api_version.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_api_version.h
new file mode 100755
index 0000000000000..dfe53ae755054
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_api_version.h
@@ -0,0 +1,77 @@
+#ifndef QURT_API_VERSION_H
+#define QURT_API_VERSION_H
+/*==============================================================================
+
+qurt_api_version.h
+
+GENERAL DESCRIPTION
+  API version file
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) Qualcomm Technologies, Inc.
+All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+==============================================================================*/
+
+/*==============================================================================
+  CONSTANTS AND DEFINITIONS
+==============================================================================*/
+/**
+ * Each field of the QURT_API_VERSION definitions is an 8-bit unsigned integer.
+ * A main release updates the first three fields: Major, Minor, and Release.
+ *   - QURT_API_VERSION = Major, Minor, Release.
+ * Patch releases are supported by adding the extra Patch field.
+ *   - QURT_API_VERSION = Major, Minor, Release, Patch.
+ */
+// Major version is incremented for incompatible API changes.
+#define QURT_API_VER_MAJOR 1
+
+// Minor version is incremented for backward-compatible enhancements in the API
+// set.
+#define QURT_API_VER_MINOR 4
+
+// Release version is incremented for each release within a `MAJOR.MINOR`
+// release.
+#define QURT_API_VER_RELEASE 1
+
+// Patch version is incremented when new API content is introduced on an older
+// LTS release.
+#define QURT_API_VER_PATCH 0
+
+/* Encode a version as a single comparable integer. */
+#define QURT_API_VERSION_ENCODE(major, minor, release, patch) \
+    ((((major) & 0xFF) << 24) | (((minor) & 0xFF) << 16) |    \
+     (((release) & 0xFF) << 8) | ((patch) & 0xFF))
+
+/* The current QuRT API version, encoded. */
+#define QURT_API_VERSION \
+    QURT_API_VERSION_ENCODE(QURT_API_VER_MAJOR, QURT_API_VER_MINOR, \
+                            QURT_API_VER_RELEASE, QURT_API_VER_PATCH)
+
+/** Usage:
+ *
+ * #if QURT_API_VERSION >= QURT_API_VERSION_ENCODE(1,4,0,0)
+ *    qurt_func_2(a,b,c);
+ * #else
+ *    qurt_func(a);
+ * #endif
+ *
+ */
+/*
+  Gets the QuRT API version.
+
+  @return
+  QuRT API version.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_api_version(void);
+
+#endif /* QURT_API_VERSION_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_assert.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_assert.h
new file mode 100755
index 0000000000000..13cc2afd2e973
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_assert.h
@@ -0,0 +1,51 @@
+#ifndef QURT_ASSERT_H
+#define QURT_ASSERT_H
+/**
+  @file qurt_assert.h
+  @brief Prototypes of the qurt_assert API
+
+  EXTERNAL FUNCTIONS
+   None.
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+  Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+ Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/**@ingroup func_qurt_assert_error + Writes diagnostic information to the debug buffer, and raises an error to the QuRT kernel. + + @datatypes + None. + + @param[in] filename Pointer to the file name string. + @param[in] lineno Line number. + + @return + None. + + @dependencies + None. + */ +void qurt_assert_error(const char *filename, int lineno) __attribute__((noreturn)); + +#define qurt_assert(cond) ((cond)?(void)0:qurt_assert_error(__QURTFILENAME__,__LINE__)) + +/** @} */ /* end_ingroup func_qurt_assert */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ASSERT_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_atomic_ops.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_atomic_ops.h new file mode 100755 index 0000000000000..d9b2cff7d737c --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_atomic_ops.h @@ -0,0 +1,1298 @@ +#ifndef QURT_ATOMIC_OPS_H +#define QURT_ATOMIC_OPS_H +/** + @file qurt_atomic_ops.h + @brief Prototypes of kernel atomic operations API. + + EXTERNAL FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021, 2022 by Qualcomm Technologies, Inc. All Rights Reserved +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +/* + * Australian Public Licence B (OZPLB) + * + * Version 1-0 + * + * Copyright (c) 2007, Open Kernel Labs, Inc. + * + * All rights reserved. + * + * Developed by: Embedded, Real-time and Operating Systems Program (ERTOS) + * National ICT Australia + * http://www.ertos.nicta.com.au + * + * Permission is granted by National ICT Australia, free of charge, to + * any person obtaining a copy of this software and any associated + * documentation files (the "Software") to deal with the Software without + * restriction, including (without limitation) the rights to use, copy, + * modify, adapt, merge, publish, distribute, communicate to the public, + * sublicense, and/or sell, lend or rent out copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject + * to the following conditions: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimers. + * + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimers in the documentation and/or other materials provided + * with the distribution. + * + * * Neither the name of National ICT Australia, nor the names of its + * contributors, may be used to endorse or promote products derived + * from this Software without specific prior written permission. 
+ * + * EXCEPT AS EXPRESSLY STATED IN THIS LICENCE AND TO THE FULL EXTENT + * PERMITTED BY APPLICABLE LAW, THE SOFTWARE IS PROVIDED "AS-IS", AND + * NATIONAL ICT AUSTRALIA AND ITS CONTRIBUTORS MAKE NO REPRESENTATIONS, + * WARRANTIES OR CONDITIONS OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO ANY REPRESENTATIONS, WARRANTIES OR CONDITIONS + * REGARDING THE CONTENTS OR ACCURACY OF THE SOFTWARE, OR OF TITLE, + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, + * THE ABSENCE OF LATENT OR OTHER DEFECTS, OR THE PRESENCE OR ABSENCE OF + * ERRORS, WHETHER OR NOT DISCOVERABLE. + * + * TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL + * NATIONAL ICT AUSTRALIA OR ITS CONTRIBUTORS BE LIABLE ON ANY LEGAL + * THEORY (INCLUDING, WITHOUT LIMITATION, IN AN ACTION OF CONTRACT, + * NEGLIGENCE OR OTHERWISE) FOR ANY CLAIM, LOSS, DAMAGES OR OTHER + * LIABILITY, INCLUDING (WITHOUT LIMITATION) LOSS OF PRODUCTION OR + * OPERATION TIME, LOSS, DAMAGE OR CORRUPTION OF DATA OR RECORDS; OR LOSS + * OF ANTICIPATED SAVINGS, OPPORTUNITY, REVENUE, PROFIT OR GOODWILL, OR + * OTHER ECONOMIC LOSS; OR ANY SPECIAL, INCIDENTAL, INDIRECT, + * CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES, ARISING OUT OF OR IN + * CONNECTION WITH THIS LICENCE, THE SOFTWARE OR THE USE OF OR OTHER + * DEALINGS WITH THE SOFTWARE, EVEN IF NATIONAL ICT AUSTRALIA OR ITS + * CONTRIBUTORS HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH CLAIM, LOSS, + * DAMAGES OR OTHER LIABILITY. + * + * If applicable legislation implies representations, warranties, or + * conditions, or imposes obligations or liability on National ICT + * Australia or one of its contributors in respect of the Software that + * cannot be wholly or partly excluded, restricted or modified, the + * liability of National ICT Australia or the contributor is limited, to + * the full extent permitted by the applicable legislation, at its + * option, to: + * a. in the case of goods, any one or more of the following: + * i. the replacement of the goods or the supply of equivalent goods; + * ii. the repair of the goods; + * iii. the payment of the cost of replacing the goods or of acquiring + * equivalent goods; + * iv. the payment of the cost of having the goods repaired; or + * b. in the case of services: + * i. the supplying of the services again; or + * ii. the payment of the cost of having the services supplied again. + * + * The construction, validity and performance of this licence is governed + * by the laws in force in New South Wales, Australia. + */ + +/* + * Author: Malcolm Purvis + * + * This file is only included by the main atomic_ops.h, so all of that + * file's definitions are available. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ + +///* Sanity check to ensure the smp flag is set in machines.py */ +//#if defined(__ATOMIC_OPS_IN_KERNEL__) && !defined(MACHINE_SMP) && CONFIG_NUM_UNITS > 1 +//#error CONFIG_NUM_UNITS > 1 but smp not defined in machines.py. +//#endif +#define QURT_INLINE __attribute__((always_inline)) + +/*============================================================================= + FUNCTIONS +=============================================================================*/ +/**@ingroup func_qurt_atomic_set + Sets the atomic variable with the specified value. 
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] value Value to set.
+
+   @return
+   Value successfully set.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_set(unsigned int* target, unsigned int value)
+{
+    unsigned long tmp;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       memw_locked(%2, p0) = %3\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (tmp),"+m" (*target)
+        : "r" (target), "r" (value)
+        : "p0");
+    return value;
+}
+
+/**@ingroup func_qurt_atomic_and
+   Bitwise AND operation of the atomic variable with mask.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] mask Mask for bitwise AND.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic_and(unsigned int* target, unsigned int mask)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = and(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target),"r" (mask)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic_and_return
+   Bitwise AND operation of the atomic variable with mask.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] mask Mask for bitwise AND.
+
+   @return
+   AND result of the atomic variable with mask.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_and_return(unsigned int* target, unsigned int mask)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = and(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (mask)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic_or
+   Bitwise OR operation of the atomic variable with mask.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] mask Mask for bitwise OR.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic_or(unsigned int* target, unsigned int mask)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = or(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (mask)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic_or_return
+   Bitwise OR operation of the atomic variable with mask.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] mask Mask for bitwise OR.
+
+   @return
+   OR result of the atomic variable with mask.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_or_return(unsigned int* target, unsigned int mask)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = or(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (mask)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic_xor
+   Bitwise XOR operation of the atomic variable with mask.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] mask Mask for bitwise XOR.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic_xor(unsigned int* target, unsigned int mask)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = xor(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (mask)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic_xor_return
+   Bitwise XOR operation of the atomic variable with mask.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] mask Mask for bitwise XOR.
+
+   @return
+   XOR result of the atomic variable with mask.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_xor_return(unsigned int* target, unsigned int mask)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = xor(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (mask)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic_set_bit
+   Sets a bit in the atomic variable at a specified position.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] bit Bit position to set.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic_set_bit(unsigned int *target, unsigned int bit)
+{
+    unsigned int result;
+    unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U);
+    unsigned int sbit = bit % ((unsigned int)sizeof(unsigned int) * 8U);
+    unsigned int *wtarget = (unsigned int *)&target[aword];
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = setbit(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*wtarget)
+        : "r" (wtarget), "r" (sbit)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic_clear_bit
+   Clears a bit in the atomic variable at a specified position.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] bit Bit position to clear.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic_clear_bit(unsigned int *target, unsigned int bit)
+{
+    unsigned int result;
+    unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U);
+    unsigned int sbit = bit % ((unsigned int)sizeof(unsigned int) * 8U);
+    unsigned int *wtarget = (unsigned int *)&target[aword];
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = clrbit(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*wtarget)
+        : "r" (wtarget), "r" (sbit)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic_change_bit
+   Toggles a bit in an atomic variable at a specified position.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] bit Bit position to toggle.
+
+   @return
+   None.
+
+   @dependencies
+   None.
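+
+   Illustrative usage (a sketch, not part of the API; "flags" is a
+   hypothetical variable):
+
+       static unsigned int flags = 0;
+       qurt_atomic_change_bit(&flags, 3U);   // atomically toggles bit 3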
+*/
+static inline QURT_INLINE void
+qurt_atomic_change_bit(unsigned int *target, unsigned int bit)
+{
+    unsigned int result;
+    unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U);
+    unsigned int sbit = bit & 0x1fU;
+    unsigned int *wtarget = (unsigned int *)&target[aword];
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = togglebit(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*wtarget)
+        : "r" (wtarget),"r" (sbit)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic_add
+   Adds an integer to an atomic variable.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] v Integer value to add.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic_add(unsigned int *target, unsigned int v)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = add(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (v)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic_add_return
+   Adds an integer to an atomic variable.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] v Integer value to add.
+
+   @return
+   Result of arithmetic sum.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_add_return(unsigned int *target, unsigned int v)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = add(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (v)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic_add_unless
+   Adds the delta value to an atomic variable unless the current value in the target
+   matches the unless variable.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] delta  Value to add to the current value.
+   @param[in] unless Perform the addition only when the current value is not
+                     equal to this unless value.
+   @return
+   TRUE -- 1 -- Addition was performed. \n
+   FALSE -- 0 -- Addition was not performed.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_add_unless(unsigned int* target,
+                       unsigned int delta,
+                       unsigned int unless)
+{
+    unsigned int current_val;
+    unsigned int new_val;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%3)\n"
+        "       p0 = cmp.eq(%0, %5)\n"
+        "       if p0 jump 2f\n"
+        "       %1 = add(%0, %4)\n"
+        "       memw_locked(%3, p0) = %1\n"
+        "       if !p0 jump 1b\n"
+        "2:\n"
+        : "=&r" (current_val),"=&r" (new_val),"+m" (*target)
+        : "r" (target), "r" (delta), "r" (unless)
+        : "p0");
+
+    return (unsigned int)(current_val != unless);
+}
+
+/**@ingroup func_qurt_atomic_sub
+   Subtracts an integer from an atomic variable.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] v Integer value to subtract.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/ +static inline QURT_INLINE void +qurt_atomic_sub(unsigned int *target, unsigned int v) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = sub(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); +} + +/**@ingroup func_qurt_atomic_sub_return + Subtracts an integer from an atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v Integer value to subtract. + + @return + Result of arithmetic subtraction. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_sub_return(unsigned int *target, unsigned int v) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = sub(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_inc + Increments an atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_inc(unsigned int *target) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, #1)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target) + : "p0"); +} + +/**@ingroup func_qurt_atomic_inc_return + Increments an atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + Incremented value. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_inc_return(unsigned int *target) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, #1)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_dec + Decrements an atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_dec(unsigned int *target) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, #-1)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target) + : "p0"); +} + +/**@ingroup func_qurt_atomic_dec_return + Decrements an atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + Decremented value. + + @dependencies + None. 
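+
+   Illustrative reference-count release (a sketch, not part of the API;
+   "refs" and "release_object" are hypothetical):
+
+       if (qurt_atomic_dec_return(&refs) == 0U) {
+           release_object();   // last reference dropped
+       }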
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_dec_return(unsigned int *target)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = add(%0, #-1)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic_compare_and_set
+   Compares the current value of the atomic variable with the
+   specified value and sets it to a new value when the compare is successful.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] old_val Old value to compare.
+   @param[in] new_val New value to set.
+
+   @return
+   FALSE -- Specified value is not equal to the current value. \n
+   TRUE -- Specified value is equal to the current value.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_compare_and_set(unsigned int* target,
+                            unsigned int old_val,
+                            unsigned int new_val)
+{
+    unsigned int current_val;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       p0 = cmp.eq(%0, %3)\n"
+        "       if !p0 jump 2f\n"
+        "       memw_locked(%2, p0) = %4\n"
+        "       if !p0 jump 1b\n"
+        "2:\n"
+        : "=&r" (current_val),"+m" (*target)
+        : "r" (target), "r" (old_val), "r" (new_val)
+        : "p0");
+
+    return (unsigned int)(current_val == old_val);
+}
+
+/**@ingroup func_qurt_atomic_barrier
+   Allows the compiler to enforce an ordering constraint on memory operations issued
+   before and after the function.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic_barrier(void)
+{
+    __asm__ __volatile__ (
+        ""
+        :
+        :
+        :
+        "memory");
+}
+
+
+/**@ingroup func_qurt_atomic64_set
+   Sets the 64-bit atomic variable with the specified value.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] value 64-bit value to set.
+
+   @return
+   Successfully set value.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_set(unsigned long long* target, unsigned long long value)
+{
+    unsigned long long tmp;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       memd_locked(%2, p0) = %3\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (tmp),"+m" (*target)
+        : "r" (target), "r" (value)
+        : "p0");
+    return value;
+}
+
+/**@ingroup func_qurt_atomic64_and_return
+   Bitwise AND operation of a 64-bit atomic variable with mask.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] mask 64-bit mask for bitwise AND.
+
+   @return
+   AND result of the 64-bit atomic variable with mask.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_and_return(unsigned long long* target, unsigned long long mask)
+{
+    unsigned long long result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       %0 = and(%0, %3)\n"
+        "       memd_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (mask)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic64_or
+   Bitwise OR operation of a 64-bit atomic variable with mask.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] mask 64-bit mask for bitwise OR.
+
+   @return
+   None.
+
+   @dependencies
+   None.
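+
+   Illustrative usage (a sketch, not part of the API; "event_mask" and the
+   chosen bit are hypothetical):
+
+       static unsigned long long event_mask = 0ULL;
+       qurt_atomic64_or(&event_mask, 1ULL << 40);   // set bit 40 atomically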
+*/
+static inline QURT_INLINE void
+qurt_atomic64_or(unsigned long long* target, unsigned long long mask)
+{
+    unsigned long long result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       %0 = or(%0, %3)\n"
+        "       memd_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (mask)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic64_or_return
+   Bitwise OR operation of a 64-bit atomic variable with mask.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] mask 64-bit mask for bitwise OR.
+
+   @return
+   OR result of the atomic variable with mask.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_or_return(unsigned long long* target, unsigned long long mask)
+{
+    unsigned long long result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       %0 = or(%0, %3)\n"
+        "       memd_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (mask)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic64_xor_return
+   Bitwise XOR operation of a 64-bit atomic variable with mask.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] mask 64-bit mask for bitwise XOR.
+
+   @return
+   XOR result of the atomic variable with mask.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_xor_return(unsigned long long* target, unsigned long long mask)
+{
+    unsigned long long result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       %0 = xor(%0, %3)\n"
+        "       memd_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (mask)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic64_set_bit
+   Sets a bit in a 64-bit atomic variable at a specified position.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] bit Bit position to set.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic64_set_bit(unsigned long long *target, unsigned int bit)
+{
+    unsigned int result;
+    unsigned int *wtarget;
+    unsigned int *pwtarget = (unsigned int *)target;
+    unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U);
+    unsigned int sbit = bit & 0x1FU;
+    wtarget = (unsigned int *)&pwtarget[aword];
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = setbit(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*wtarget)
+        : "r" (wtarget), "r" (sbit)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic64_clear_bit
+   Clears a bit in a 64-bit atomic variable at a specified position.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] bit Bit position to clear.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic64_clear_bit(unsigned long long *target, unsigned int bit)
+{
+    unsigned int result;
+    unsigned int *wtarget;
+    unsigned int *pwtarget = (unsigned int *)target;
+    unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U);
+    unsigned int sbit = bit & 0x1FU;
+    wtarget = (unsigned int *)&pwtarget[aword];
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = clrbit(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*wtarget)
+        : "r" (wtarget), "r" (sbit)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic64_change_bit
+   Toggles a bit in a 64-bit atomic variable at a specified position.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] bit Bit position to toggle.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic64_change_bit(unsigned long long *target, unsigned int bit)
+{
+    unsigned int result;
+    unsigned int *wtarget;
+    unsigned int *pwtarget = (unsigned int *)target;
+    unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U);
+    unsigned int sbit = bit & 0x1FU;
+    wtarget = (unsigned int *)&pwtarget[aword];
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = togglebit(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*wtarget)
+        : "r" (wtarget),"r" (sbit)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic64_add
+   Adds a 64-bit integer to a 64-bit atomic variable.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] v 64-bit integer value to add.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic64_add(unsigned long long *target, unsigned long long v)
+{
+    unsigned long long result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       %0 = add(%0, %3)\n"
+        "       memd_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (v)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic64_add_return
+   Adds a 64-bit integer to a 64-bit atomic variable.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] v 64-bit integer value to add.
+
+   @return
+   Result of arithmetic sum.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_add_return(unsigned long long *target, unsigned long long v)
+{
+    unsigned long long result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       %0 = add(%0, %3)\n"
+        "       memd_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (v)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic64_sub_return
+   Subtracts a 64-bit integer from a 64-bit atomic variable.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] v 64-bit integer value to subtract.
+
+   @return
+   Result of arithmetic subtraction.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_sub_return(unsigned long long *target, unsigned long long v)
+{
+    unsigned long long result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       %0 = sub(%0, %3)\n"
+        "       memd_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (v)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic64_inc
+   Increments a 64-bit atomic variable by one.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic64_inc(unsigned long long *target)
+{
+    unsigned long long result;
+    unsigned long long inc = 1;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       %0 = add(%0, %3)\n"
+        "       memd_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target),"r" (inc)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic64_inc_return
+   Increments a 64-bit atomic variable by one.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+
+   @return
+   Incremented value.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_inc_return(unsigned long long *target)
+{
+    unsigned long long result;
+    unsigned long long inc = 1;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       %0 = add(%0, %3)\n"
+        "       memd_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target),"r" (inc)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic64_dec_return
+   Decrements a 64-bit atomic variable by one.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+
+   @return
+   Decremented value.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_dec_return(unsigned long long *target)
+{
+    unsigned long long result;
+    long long minus1 = 0xFFFFFFFFFFFFFFFFLL;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       %0 = add(%0, %3)\n"
+        "       memd_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target),"r" (minus1)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic64_compare_and_set
+   Compares the current value of a 64-bit atomic variable with
+   the specified value and sets it to a new value when the compare is successful.
+
+   @note1hang The function keeps retrying until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] old_val 64-bit old value to compare.
+   @param[in] new_val 64-bit new value to set.
+
+   @return
+   FALSE -- Specified value is not equal to the current value. \n
+   TRUE -- Specified value is equal to the current value.
+
+   @dependencies
+   None.
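+
+   Illustrative retry loop that atomically records a running maximum
+   (a sketch, not part of the API; "peak" and "sample" are hypothetical):
+
+       unsigned long long old;
+       do {
+           old = peak;                  // re-read on every attempt
+           if (sample <= old) break;    // nothing to update
+       } while (!qurt_atomic64_compare_and_set(&peak, old, sample));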
+*/
+static inline QURT_INLINE int
+qurt_atomic64_compare_and_set(unsigned long long *target,
+                              unsigned long long old_val,
+                              unsigned long long new_val)
+{
+    unsigned long long current_val;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       p0 = cmp.eq(%0, %3)\n"
+        "       if !p0 jump 2f\n"
+        "       memd_locked(%2, p0) = %4\n"
+        "       if !p0 jump 1b\n"
+        "2:\n"
+        : "=&r" (current_val),"+m" (*target)
+        : "r" (target), "r" (old_val), "r" (new_val)
+        : "p0");
+
+    return (int)(current_val == old_val);
+}
+
+/**@ingroup func_qurt_atomic64_barrier
+   Allows the compiler to enforce an ordering constraint on memory operations issued
+   before and after the function.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic64_barrier(void)
+{
+    /** @cond */
+    __asm__ __volatile__ (
+        ""
+        :
+        :
+        :
+        "memory");
+    /** @endcond */
+}
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_ATOMIC_OPS_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_barrier.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_barrier.h
new file mode 100755
index 0000000000000..7c6f787d43bc2
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_barrier.h
@@ -0,0 +1,140 @@
+#ifndef QURT_BARRIER_H
+#define QURT_BARRIER_H
+
+/**
+  @file qurt_barrier.h
+  @brief Prototypes of kernel barrier API functions.
+
+  EXTERNALIZED FUNCTIONS
+  None.
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+  Copyright (c) 2021 Qualcomm Technologies, Inc. All rights reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup barrier_types
+@{ */
+/*=====================================================================
+ Constants and macros
+======================================================================*/
+#define QURT_BARRIER_SERIAL_THREAD 1 /**< Serial thread. */
+#define QURT_BARRIER_OTHER         0 /**< Other. */
+
+#ifndef ASM
+#include
+
+/*=====================================================================
+Typedefs
+======================================================================*/
+
+/** QuRT barrier type.
+ */
+typedef union {
+    /** @cond */
+    struct {
+        unsigned short threads_left;
+        unsigned short count;
+        unsigned int threads_total;
+        unsigned int queue;
+        unsigned int reserved;
+    };
+    unsigned long long int raw;
+    /** @endcond */
+} qurt_barrier_t;
+
+/** @} */ /* end_addtogroup barrier_types */
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+/*======================================================================*/
+/**@ingroup func_qurt_barrier_init
+   Initializes a barrier object.
+
+   @datatypes
+   #qurt_barrier_t
+
+   @param[out] barrier      Pointer to the barrier object to initialize.
+   @param[in] threads_total Total number of threads to synchronize on the barrier.
+
+
+   @return
+   Unused integer value.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+int qurt_barrier_init(qurt_barrier_t *barrier, unsigned int threads_total);
+
+/*======================================================================*/
+/**@ingroup func_qurt_barrier_destroy
+   Destroys the specified barrier.
+
+   @note1hang Barriers must be destroyed when they are no longer in use. Failure
+              to do this causes resource leaks in the QuRT kernel.\n
+   @note1cont Barriers must not be destroyed while they are still in use. If this
+              occurs, the behavior of QuRT is undefined.
+
+   @datatypes
+   #qurt_barrier_t
+
+   @param[in] barrier Pointer to the barrier object to destroy.
+
+   @return
+   Unused integer value.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+int qurt_barrier_destroy(qurt_barrier_t *barrier);
+
+/*======================================================================*/
+/**@ingroup func_qurt_barrier_wait
+   Waits on the barrier.\n
+   Suspends the current thread on the specified barrier. \n
+   The function return value indicates whether the thread was the last one to
+   synchronize on the barrier.
+   When a thread waits on a barrier, it is suspended on the barrier: \n
+   - If the total number of threads waiting on the barrier is less than the assigned value
+     of the barrier, no other action occurs. \n
+   - If the total number of threads waiting on the barrier equals the assigned value of the
+     barrier, all threads currently waiting on the barrier are awakened, allowing them to
+     execute past the barrier.
+
+   @note1hang After its waiting threads are awakened, a barrier is automatically reset
+              and can be used again in the program without the need for re-initialization.
+
+   @datatypes
+   #qurt_barrier_t
+
+   @param[in] barrier Pointer to the barrier object to wait on.
+
+   @return
+   #QURT_BARRIER_OTHER -- Current thread awakened from the barrier. \n
+   #QURT_BARRIER_SERIAL_THREAD -- Current thread is the last caller of the barrier.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+int qurt_barrier_wait(qurt_barrier_t *barrier);
+
+
+#endif
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_BARRIER_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_busywait.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_busywait.h
new file mode 100755
index 0000000000000..a4dab80a2520a
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_busywait.h
@@ -0,0 +1,62 @@
+#ifndef QURT_BUSYWAIT_H
+#define QURT_BUSYWAIT_H
+
+/**
+  @file qurt_busywait.h
+  @brief Implementation of the busywait() function for
+         hardware-based blocking waits that use the QTIMER as a reference.
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ============================================================================*/
+/*=============================================================================
+ *
+ *                       EDIT HISTORY FOR FILE
+ *
+ *   This section contains comments describing changes made to the
+ *   module. Changes are listed in reverse chronological
+ *   order.
+ *
+ *
+ * when       who     what, where, why
+ * ---------- ---     -------------------------------------------------------
+ * 2018-03-20 pg      Add Header file
+ ============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+                            FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_busywait
+   Pauses the execution of a thread for a specified time.\n
+   Use it for small microsecond delays.
+
+   @note1hang The function does not return to the caller until
+              the time duration has expired.
+
+   @param[in] pause_time_us Time to pause in microseconds.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+void qurt_busywait (unsigned int pause_time_us);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_BUSYWAIT_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_callback.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_callback.h
new file mode 100755
index 0000000000000..dc9b896c63454
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_callback.h
@@ -0,0 +1,235 @@
+#ifndef QURT_CALLBACK_H
+#define QURT_CALLBACK_H
+
+/**
+  @file qurt_callback.h
+  Definitions, macros, and prototypes for the QuRT callback framework.
+
+  The QDI framework allows the development of root process drivers and services that
+  a user process client can interact with in a secure manner. The QDI framework does
+  this by elevating the privilege of the user process thread, temporarily allowing
+  the thread to execute in root context and letting it fall back to user context once
+  the QDI invocation is finished.
+
+  The QuRT callback framework provides a safe mechanism for root process drivers
+  to execute callback functions in a user process. The framework hosts
+  dedicated worker threads in corresponding processes that handle the execution
+  of the callback function. This ensures that the callbacks occur in the context of
+  the appropriate process thread, as a result maintaining privilege boundaries.
+
+  Prerequisites for use of this framework are:
+  1. The driver is a QDI driver and the client communicates with drivers using QDI
+     invocations.
+  2. An appropriate callback configuration is specified in cust_config.xml for
+     the user process that intends to use this framework.
+
+  qurt_cb_data_t is the public data structure that allows a client to store all
+  the required information about the callback, including the callback function
+  and the arguments to pass to this function when it executes.
+  The client uses the QDI interface to register this structure with the root driver.
+
+  The callback framework provides the following APIs that a root driver can use to
+  invoke a callback. These functions are described in the qurt_qdi_driver.h header file.
+
+  qurt_qdi_cb_invoke_async() triggers an asynchronous callback wherein the
+  invoking thread does not wait for the callback to finish executing.
+
+  qurt_qdi_cb_invoke_sync() triggers a synchronous callback. Upon invocation
+  the invoking thread is suspended until the callback function finishes execution.
+
+  qurt_qdi_cb_invoke_sync_with_data() invokes a synchronous callback similar to
+  qurt_qdi_cb_invoke_sync(). It allows the user to pass large data along with
+  the callback invocation to be utilized during the callback execution.
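+
+  A minimal registration sketch (illustrative only; "my_cb" and the QDI
+  invocation that transports the structure to the driver are hypothetical):
+
+      qurt_cb_data_t cb;
+      qurt_cb_data_init(&cb);
+      qurt_cb_data_set_cbfunc(&cb, (void *)my_cb);
+      qurt_cb_data_set_cbarg(&cb, 42U);
+      // ... hand &cb to the root driver through a QDI invocation; the
+      // driver later runs it with qurt_qdi_cb_invoke_async()/_sync().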
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+#include "qurt_qdi.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef int qurt_cb_result_t;
+
+/* Callback framework error codes.
+   The callback framework returns a nonzero value if a callback invocation is
+   unsuccessful. The following macros describe the cause of failure in more detail.
+*/
+#define QURT_CB_ERROR            -1 /* Callback registration failed.\n*/
+#define QURT_CB_OK                0 /* Success.\n*/
+#define QURT_CB_MALLOC_FAILED    -2 /* QuRTOS malloc failure.\n*/
+#define QURT_CB_WAIT_CANCEL      -3 /* Process exit cancelled wait operation.\n*/
+#define QURT_CB_CONFIG_NOT_FOUND -4 /* Callback configuration for process was not found.\n*/
+#define QURT_CB_QUEUE_FULL       -5 /* Callback queue is serving at maximum capacity.*/
+/** @addtogroup cb_types
+@{ */
+/** Callback registration data structure.
+    This data structure is used by a client attempting to register a callback with a QDI driver.
+    It holds the address of the callback function and the argument supplied to the callback
+    function when it executes.
+*/
+typedef struct {
+    /** @cond */
+    void* cb_func;   /*< Pointer to the callback function. */
+    unsigned cb_arg; /*< Not interpreted by the framework. */
+    /** @endcond */
+} qurt_cb_data_t;
+
+/** @cond */
+/* Defines used as default if cust_config does not specify them. */
+#define CALLBACK_WORKER_STACK_SIZE 0x2000
+/** @endcond */
+/** @} */ /* end_addtogroup cb_types */
+/**@ingroup func_qurt_cb_data_init
+   Initializes the callback data structure.
+   An entity registering a callback with the root process driver must call this function
+   to initialize the callback registration data structure to its default value.
+
+   @datatypes
+   #qurt_cb_data_t
+
+   @param[in] cb_data Pointer to the callback data structure.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+static inline void qurt_cb_data_init (qurt_cb_data_t* cb_data){
+    cb_data->cb_func = NULL;
+    cb_data->cb_arg = 0;
+}
+
+/**@ingroup func_qurt_cb_data_set_cbfunc
+   Sets up the callback function in the callback registration data structure.
+
+   @datatypes
+   #qurt_cb_data_t
+
+   @param[in] cb_data Pointer to the callback data structure.
+   @param[in] cb_func Pointer to the callback function.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+static inline void qurt_cb_data_set_cbfunc (qurt_cb_data_t* cb_data, void* cb_func){
+    cb_data->cb_func = cb_func;
+}
+
+/**@ingroup func_qurt_cb_data_set_cbarg
+   Sets up the callback argument.
+   This function sets up the argument passed to the callback function when it executes.
+
+   @datatypes
+   #qurt_cb_data_t
+
+   @param[in] cb_data Pointer to the callback data structure.
+   @param[in] cb_arg  Argument for the callback function.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+static inline void qurt_cb_data_set_cbarg (qurt_cb_data_t* cb_data, unsigned cb_arg){
+    cb_data->cb_arg = cb_arg;
+}
+
+/** @cond */
+/**@ingroup driver_support_functions
+   Invokes an asynchronous callback for a specified process.
+   A driver that resides in the root process calls this API to launch a callback in
+   a process described by the client_handle.
+   After the callback is invoked, the framework queues the callback as per its
+   priority and subsequently executes it.
+   The caller of this function is not suspended during the callback execution period.
+   The API returns immediately with a success/failure error code.
+
+   @note1hang This function is only accessible to drivers in the root process.
+              User process invocations shall fail with a negative error code return value.
+
+   @param client_handle Obtained from the current invocation function (Section 4.3.1).
+   @param cb_data       Pointer to the callback data structure (refer to qurt_callback.h).
+   @param prio          Priority at which the callback should execute.
+                        This parameter is optional. If -1 is passed, the callback framework
+                        executes the callback at the priority of the API caller.
+   @return
+   QURT_EOK -- Callback was successfully communicated to the framework.
+   Negative error code -- Callback cannot be communicated to the framework.
+ */
+qurt_cb_result_t qurt_qdi_cb_invoke_async(int client_handle,
+                                          qurt_cb_data_t* cb_data,
+                                          int prio);
+
+
+/**@ingroup driver_support_functions
+   Invokes a synchronous callback for a specified process.
+   A driver that resides in a root process calls this API to launch a synchronous callback in
+   a process described by the client_handle.
+   After the callback is invoked, the framework queues the callback as per its
+   priority and subsequently executes it.
+   The caller of this function is suspended during the callback execution period.
+   If the process in which to execute the callback exits or terminates, the caller is
+   woken up with error code #QURT_CB_WAIT_CANCEL (refer to qurt_callback.h).
+
+   @note1hang This function is only accessible to drivers in the root process.
+              User process invocations shall fail with a negative error code return value.
+
+   @param client_handle Obtained from the current invocation function (Section 4.3.1).
+   @param cb_data       Pointer to the callback data structure (refer to qurt_callback.h).
+   @param prio          Priority at which the callback should execute.
+                        This parameter is optional. If -1 is passed, the callback framework
+                        executes the callback at the priority of the API caller.
+   @return
+   QURT_EOK -- Callback was successfully communicated to the framework.
+   Negative error code -- Callback cannot be communicated to the framework.
+ */
+qurt_cb_result_t qurt_qdi_cb_invoke_sync(int client_handle,
+                                         qurt_cb_data_t* cb_data,
+                                         int prio);
+
+/**@ingroup driver_support_functions
+   Invokes a synchronous callback for a specified process, passing driver data to the user PD.
+   This function is similar to qurt_qdi_cb_invoke_sync() and allows the driver to pass arbitrary data to
+   the user process as part of the callback invocation.
+
+   @param client_handle Obtained from the current invocation function (Section 4.3.1).
+   @param cb_data       Pointer to the callback data structure (refer to qurt_callback.h).
+   @param prio          Priority at which the callback should execute.
+                        This parameter is optional. If -1 is passed, the callback framework
+                        executes the callback at the priority of the API caller.
+   @param data          Driver arbitrary data to pass to the user process. Memory pointed to by data
+                        must be accessible to the user PD. The root driver can allocate such memory by
+                        using qurt_mem_mmap().
+   @param data_len      Driver arbitrary data length.
+
+   @return
+   QURT_EOK -- Callback was successfully communicated to the framework.
+   Negative error code -- Callback cannot be communicated to the framework.
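+
+   Illustrative call from a root driver (a sketch; "cb", "buf", and "len" are
+   hypothetical, and client_handle comes from the current QDI invocation):
+
+       int rc = qurt_qdi_cb_invoke_sync_with_data(client_handle, &cb, -1,
+                                                  buf, len);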
+ */
+qurt_cb_result_t qurt_qdi_cb_invoke_sync_with_data( int client_handle,
+                                                    qurt_cb_data_t* cb_data,
+                                                    int prio,
+                                                    void *data,
+                                                    unsigned data_len
+                                                  );
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_clade.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_clade.h
new file mode 100755
index 0000000000000..d7442cf98dd94
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_clade.h
@@ -0,0 +1,62 @@
+#ifndef QURT_CLADE_H
+#define QURT_CLADE_H
+/**
+  @file qurt_clade.h
+  @brief Prototypes of the Cache Line Accelerated Decompression Engine (CLADE) API.
+         CLADE is a cache line level memory compression system that is used to
+         decrease DRAM usage.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+  Copyright (c) 2019-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_clade2_get
+   Reads the value of the clade2 register.
+
+   @param[in]  offset  Offset from the clade2 cfg base.
+   @param[out] *value  Pointer to the register value read from the offset.
+
+   @return
+   #QURT_EOK -- Successfully read the value from the register at the offset. \n
+   #QURT_EINVALID -- The offset passed is incorrect.
+
+   @dependencies
+   None.
+ */
+int qurt_clade2_get(unsigned short offset, unsigned int *value);
+
+/**@ingroup func_qurt_clade2_set
+   Sets the PMU register; only the PMU_SEL register can be set.
+
+   @param[in] offset  Offset from the QURTK_clade2_cfg_base.
+   @param[in] value   Value to set at the offset.
+
+   @return
+   #QURT_EOK -- Successfully set the value at the offset. \n
+   #QURT_ENOTALLOWED -- Set operation performed at an offset other than CLADE2_PMU_SELECTION_REG.
+
+   @dependencies
+   None.
+ */
+int qurt_clade2_set(unsigned short offset, unsigned int value);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_CLADE_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_cond.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_cond.h
new file mode 100755
index 0000000000000..6e65ed82a8393
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_cond.h
@@ -0,0 +1,219 @@
+#ifndef QURT_COND_H
+#define QURT_COND_H
+/**
+  @file qurt_cond.h
+  @brief Prototypes of kernel condition variable object API functions.
+
+  EXTERNALIZED FUNCTIONS
+  None
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2021 Qualcomm Technologies, Inc.
+  All rights reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup condition_variables_types
+@{ */
+/*=====================================================================
+  Typedefs
+ ======================================================================*/
+
+/** QuRT condition variable type.
+ */
+typedef union {
+    /** @cond */
+    unsigned long long raw;
+    struct {
+        unsigned int count;
+        unsigned int n_waiting;
+        unsigned int queue;
+        unsigned int reserved;
+    } X;
+    /** @endcond */
+} qurt_cond_t;
+
+/** @} */ /* end_addtogroup condition_variables_types */
+
+/*=====================================================================
+  Functions
+======================================================================*/
+
+/*======================================================================*/
+/**@ingroup func_qurt_cond_init
+   Initializes a condition variable object.
+
+   @datatypes
+   #qurt_cond_t
+
+   @param[out] cond Pointer to the condition variable object to initialize.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+/* ======================================================================*/
+void qurt_cond_init(qurt_cond_t *cond);
+
+/*======================================================================*/
+/**@ingroup func_qurt_cond_destroy
+   Destroys the specified condition variable.
+
+   @note1hang Conditions must be destroyed when they are no longer in use. Failure to do
+              this causes resource leaks in the QuRT kernel.\n
+   @note1cont Conditions must not be destroyed while they are still in use. If this occurs,
+              the behavior of QuRT is undefined.
+
+   @datatypes
+   #qurt_cond_t
+
+   @param[in] cond Pointer to the condition variable object to destroy.
+
+   @return
+   None.
+
+ */
+/* ======================================================================*/
+void qurt_cond_destroy(qurt_cond_t *cond);
+
+/*======================================================================*/
+/**@ingroup func_qurt_cond_signal
+   Signals a waiting thread that the specified condition is true. \n
+
+   When a thread wishes to signal that a condition is true on a shared data item, it must
+   perform the following procedure: \n
+   -# Lock the mutex that controls access to the data item. \n
+   -# Perform the signal condition operation. \n
+   -# Unlock the mutex.
+
+   @note1hang Failure to properly lock and unlock a mutex of a condition variable can cause
+              the threads to never be suspended (or suspended but never awakened).
+
+   @note1cont Use condition variables only with regular mutexes -- attempting to use
+              recursive mutexes or priority inheritance mutexes results in undefined behavior.
+
+   @datatypes
+   #qurt_cond_t
+
+   @param[in] cond Pointer to the condition variable object to signal.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+/* ======================================================================*/
+void qurt_cond_signal(qurt_cond_t *cond);
+
+/*======================================================================*/
+/**@ingroup func_qurt_cond_broadcast
+   Signals multiple waiting threads that the specified condition is true.\n
+   When a thread wishes to broadcast that a condition is true on a shared data item, it must
+   perform the following procedure: \n
+   -# Lock the mutex that controls access to the data item. \n
+   -# Perform the broadcast condition operation. \n
+   -# Unlock the mutex.\n
+
+   @note1hang Failure to properly lock and unlock the mutex of a condition variable can cause
+              the threads to never be suspended (or suspended but never awakened).
+
+   @note1cont Use condition variables only with regular mutexes -- attempting to use
+              recursive mutexes or priority inheritance mutexes results in undefined behavior.
+
+   @datatypes
+   #qurt_cond_t
+
+   @param[in] cond Pointer to the condition variable object to signal.
+
+   @return
+   None.
+
+   @dependencies
+   None.
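+
+   Illustrative pairing with qurt_cond_wait() (a sketch; "lock", "cv", and
+   "ready" are hypothetical, and the usual qurt_mutex_lock()/qurt_mutex_unlock()
+   API from qurt_mutex.h is assumed):
+
+       // signaling thread
+       qurt_mutex_lock(&lock);
+       ready = 1;
+       qurt_cond_broadcast(&cv);
+       qurt_mutex_unlock(&lock);
+
+       // waiting thread re-checks its predicate in a loop
+       qurt_mutex_lock(&lock);
+       while (ready == 0)
+           qurt_cond_wait(&cv, &lock);
+       qurt_mutex_unlock(&lock);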
+ */
+/* ======================================================================*/
+void qurt_cond_broadcast(qurt_cond_t *cond);
+
+/*======================================================================*/
+/**@ingroup func_qurt_cond_wait
+   Suspends the current thread until the specified condition is true.
+   When a thread wishes to wait for a specific condition on a shared data item, it must
+   perform the following procedure: \n
+   -# Lock the mutex that controls access to the data item. \n
+   -# If the condition is not satisfied, perform the wait condition operation on the
+      condition variable (suspends the thread and unlocks the mutex).
+
+   @note1hang Failure to properly lock and unlock the mutex of a condition variable can cause
+              the threads to never be suspended (or suspended but never awakened).
+
+   @note1cont Use condition variables only with regular mutexes -- attempting to use
+              recursive mutexes or priority inheritance mutexes results in undefined behavior.
+
+   @datatypes
+   #qurt_cond_t \n
+   #qurt_mutex_t
+
+   @param[in] cond  Pointer to the condition variable object to wait on.
+   @param[in] mutex Pointer to the mutex associated with the condition variable to wait on.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+/* ======================================================================*/
+void qurt_cond_wait(qurt_cond_t *cond, qurt_mutex_t *mutex);
+
+/*======================================================================*/
+/**@ingroup func_qurt_cond_wait2
+   Suspends the current thread until the specified condition is true.
+   When a thread wishes to wait for a specific condition on a shared data item, it must
+   perform the following procedure: \n
+   -# Lock the mutex that controls access to the data item. \n
+   -# If the condition is not satisfied, perform the wait condition operation on the
+      condition variable, which suspends the thread and unlocks the mutex.
+
+   @note1hang Failure to properly lock and unlock the mutex of a condition variable can cause
+              the threads to never be suspended (or suspended but never awakened).
+
+   @note1cont Use condition variables only with regular mutexes -- attempting to use
+              recursive mutexes or priority inheritance mutexes results in undefined behavior.
+
+   @note1cont This is the same API as qurt_cond_wait(); use this version
+              when using mutexes of type #qurt_rmutex2_t.
+
+   @datatypes
+   #qurt_cond_t \n
+   #qurt_rmutex2_t
+
+   @param[in] cond  Pointer to the condition variable object to wait on.
+   @param[in] mutex Pointer to the mutex associated with the condition variable to wait on.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+/* ======================================================================*/
+void qurt_cond_wait2(qurt_cond_t *cond, qurt_rmutex2_t *mutex);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_COND_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_consts.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_consts.h
new file mode 100755
index 0000000000000..b1e35998e73b6
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_consts.h
@@ -0,0 +1,315 @@
+#ifndef QURT_CONSTS_H
+#define QURT_CONSTS_H
+
+/**
+  @file qurt_consts.h
+  @brief QuRT constants and definitions
+
+  EXTERNAL FUNCTIONS
+  None.
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None
+
+  Copyright (c) 2013-2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=====================================================================
+ Constants and macros
+ ======================================================================*/
+
+/* Definitions of system events. System events suspend
+   a thread and put it into suspending_list.
+   The system event number is saved in the CONTEXT::error::cause field
+   of the suspended thread. An event handler thread, such as a
+   page fault handler or system error handler, can wake up the suspended
+   thread.
+ */
+#define QURT_EVENT_PAGEFAULT    0x1 /* Page fault event. */
+#define QURT_EVENT_SYSTEM_ERR   0x2 /* System error event. */
+#define QURT_EVENT_SUSPEND      0x3 /* Suspend event. */
+#define QURT_EVENT_PROCESS_EXIT 0x4 /* Process termination event. */
+
+#define QURT_SYSENV_MAX_THREADS_TYPE         1  /* Maximum threads object. */
+#define QURT_SYSENV_PROCNAME_TYPE            2  /* Process name object. */
+#define QURT_SYSENV_MAX_PI_PRIO_TYPE         3  /* Maximum pi priority object. */
+#define QURT_SYSENV_ARCH_REV_TYPE            4  /* Architecture version object. */
+#define QURT_SYSENV_APP_HEAP_TYPE            5  /* Application heap object. */
+#define QURT_SYSENV_REGION_ATTR_DEFAULT      7  /* Default region attributes. */
+#define QURT_SYSENV_STACK_PROFILE_COUNT_TYPE 8  /* Stack profile count type. */
+#define QURT_SYSENV_ISLAND_CONFIG_TYPE       9  /* Island configuration check. */
+#define QURT_SYSENV_HTHREADS_TYPE            10 /* Active threads object. */
+#define QURT_SYSENV_CONFIG_IMAGE_START_LO    11 /* Config image start address for DTB parsing. */
+#define QURT_SYSENV_CONFIG_IMAGE_START_HI    12 /* Config image start address for DTB parsing. */
+#define QURT_SYSENV_CHIPPARAMS_LO            13 /* ChipParams for DTB parsing. */
+#define QURT_SYSENV_CHIPPARAMS_HI            14 /* ChipParams for DTB parsing. */
+#define QURT_SYSENV_PLATPARAMS               15 /* PlatformParams for DTB parsing. */
+#define QURT_SYSENV_CONFIG_IMAGE_SIZE        16 /* Config image size for DTB parsing. */
+#define QURT_SYSENV_L2_CACHE_LINE_SIZE       17 /* L2 cache line size. */
+
+/* Get Q6 registers. */
+#define QURT_GET_SSR     1
+#define QURT_GET_CCR     2
+#define QURT_GET_CFGBASE 3
+#define QURT_GET_SYSCFG  4
+#define QURT_GET_REV     5
+
+
+/** @cond rest_reg_dist */
+/** @addtogroup performance_monitor_macros
+@{ */
+
+/* PMU */
+#define QURT_PMUCNT0   0 /**< */
+#define QURT_PMUCNT1   1 /**< */
+#define QURT_PMUCNT2   2 /**< */
+#define QURT_PMUCNT3   3 /**< */
+#define QURT_PMUCFG    4 /**< */
+#define QURT_PMUEVTCFG 5 /**< */
+
+/* New since V55. */
+#define QURT_PMUCNT4    6  /**< */
+#define QURT_PMUCNT5    7  /**< */
+#define QURT_PMUCNT6    8  /**< */
+#define QURT_PMUCNT7    9  /**< */
+#define QURT_PMUEVTCFG1 10 /**< */
+
+/* New since V61. */
+#define QURT_PMUSTID0 11 /**< */
+#define QURT_PMUSTID1 12 /**< */
+
+#define QURT_PMUCNTSTID0 13 /**< */
+#define QURT_PMUCNTSTID1 14 /**< */
+#define QURT_PMUCNTSTID2 15 /**< */
+#define QURT_PMUCNTSTID3 16 /**< */
+#define QURT_PMUCNTSTID4 17 /**< */
+#define QURT_PMUCNTSTID5 18 /**< */
+#define QURT_PMUCNTSTID6 19 /**< */
+#define QURT_PMUCNTSTID7 20 /**< */
+
+/** @} */ /* end_addtogroup performance_monitor_macros */
+/** @endcond */
+
+/*
+ Power collapse operations.
+*/
+#define QURT_POWER_SHUTDOWN                 0 /**< */
+#define QURT_TCXO_SHUTDOWN                  1 /**< */
+#define QURT_POWER_CMD_PREPARE              0 /**< */
+#define QURT_POWER_CMD_PERFORM              1 /**< */
+#define QURT_POWER_CMD_EXIT                 2 /**< */
+#define QURT_POWER_CMD_FAIL_EXIT            3 /**< */
+#define QURT_POWER_CMD_PERFORM_L2_RETENTION 4 /**< */
+#define QURT_POWER_CMD_PERFORM_SAVE_TCM     5 /**< */
+#define QURT_POWER_CMD_DEEP_SLEEP           6 /**< */
+
+
+/** 
@addtogroup thread_macros +@{ */ +#define QURT_MAX_HTHREAD_LIMIT 8U /**< Limit on the maximum number of hardware threads supported by QuRT for any + Hexagon version. Use this definition to define arrays, and so on, in + target independent code. */ +/** @} */ /* end_addtogroup thread_macros */ + +/** @cond internal_only */ +/** @addtogroup power_management_macros +@{ */ +/** + L2 cache retention mode +*/ +#define QURT_POWER_SHUTDOWN_TYPE_L2NORET QURT_POWER_CMD_PERFORM /**< */ +#define QURT_POWER_SHUTDOWN_TYPE_L2RET QURT_POWER_CMD_PERFORM_L2_RETENTION /**< */ +#define QURT_POWER_SHUTDOWN_TYPE_SAVETCM QURT_POWER_CMD_PERFORM_SAVE_TCM /**< */ +/** @} */ /* end_addtogroup power_management_macros */ +/** @endcond */ + +/* + QURT_system_state + Use for debugging the shutdown/startup process. + + State transition for cold boot: + QURT_BOOT_SETUP_ISDB --> QURT_CBOOT_BSP_INIT --> + QURT_CBOOT_END_CLEAN_INIT --> QURT_CBOOT_END_OS_INIT --> + QURT_CBOOT_KERNEL_INIT_DONE --> QURT_CBOOT_PLAT_CONFIG_DONE --> + QURT_CBOOT_ROOT_TASK_STARTED + + State transition for power collapse: + QURT_PREPARE_SINGLE_MODE --> QURT_PERFORM_IPEND --> + QURT_PERFORM_SAVE_TLB --> QURT_PERFORM_SWITCH_PC --> + cache flush states (dependent on L2 retention config) + + State transition for warm boot: + QURT_BOOT_SETUP_ISDB --> QURT_WBOOT_INIT_TLB --> + QURT_WBOOT_SET_1TO1_MAP --> QURT_WBOOT_REMOVE_1TO1_MAP --> + QURT_CBOOT_END_CLEAN_INIT --> QURT_CBOOT_END_OS_INIT +*/ +#define QURT_PREPARE_SINGLE_MODE 1 /**< */ +#define QURT_PREPARE_END 2 /**< */ +#define QURT_PERFORM_IPEND 3 /**< */ +#define QURT_PERFORM_SAVE_ISDP 4 /**< */ +#define QURT_PERFORM_SAVE_PMU 5 /**< */ +#define QURT_PERFORM_SAVE_TLB 6 /**< */ +#define QURT_PERFORM_SWITCH_PC 7 /**< */ +#define QURT_PERFORM_EXIT 8 /**< */ +#define QURT_FLUSH_L1CACHE 9 /**< */ +#define QURT_FLUSH_L2CACHE 0xA /**< */ +#define QURT_FLUSH_CACHE_DONE 0xB /**< */ +#define QURT_SWITCH_PC_DONE 0xC /**< */ +#define QURT_BOOT_SETUP_ISDB 0xD /**< */ +#define QURT_WBOOT_INIT_TLB 0xE /**< */ +#define QURT_WBOOT_SET_1TO1_MAP 0xF /**< */ +#define QURT_WBOOT_CFG_ADV_SYSCFG 0x10 /**< */ +#define QURT_WBOOT_REMOVE_1TO1_MAP 0x11 /**< */ +#define QURT_CBOOT_BSP_INIT 0x12 /**< */ +#define QURT_CBOOT_END_CLEAN_L1CACHE 0x13 /**< */ +#define QURT_CBOOT_END_CLEAN_INIT 0x14 /**< */ +#define QURT_CBOOT_END_OS_INIT 0x15 /**< */ +#define QURT_CBOOT_TLB_DUMP_LOAD 0x16 /**< */ +#define QURT_CBOOT_TLB_STATIC_LOAD 0x17 /**< */ +#define QURT_CBOOT_KERNEL_INIT_DONE 0x18 /**< */ +#define QURT_CBOOT_PLAT_CONFIG_DONE 0x19 /**< */ +#define QURT_CBOOT_ROOT_TASK_STARTED 0x1A /**< */ +#define QURT_IMPRECISE_EXCEPTION 0x1B /**< */ +#define QURT_WBOOT_DEBUG_L2_START 0x1C /**< */ +#define QURT_WBOOT_DEBUG_L2_END 0x1D /**< */ +#define QURT_NMI_SAVE_L2VIC_COMPLETE 0x1E /**< */ +#define QURT_NMI_HANDLER_COMPLETE 0x1F /**< */ +#define QURT_NMI_AFTER_SAVE_GLOBAL 0x20 /**< */ +#define QURT_WBOOT_START 0x21 /**< */ +#define QURT_ENTER_ISLAND 0x22 /**< */ +#define QURT_EXIT_ISLAND 0x23 /**< */ +#define QURT_LOAD_NOTIFIER_TCB 0x24 /**< */ +#define QURT_ABNORMAL_RESET 0x25 /**< */ +/* + Thread attributes +*/ + +#define QURT_THREAD_ATTR_GP 0x00000002 /*< */ +#define QURT_THREAD_ATTR_UGP 0x00000003 /*< User general pointer (UGP)*/ +#define QURT_THREAD_ATTR_PREFETCH 0x00000004 /*< */ +#define QURT_THREAD_ATTR_TID 0x00000005 /*< */ +#define QURT_THREAD_ATTR_CACHE_PART 0x00000007 /*< */ +#define QURT_THREAD_ATTR_COPROCESSOR 0x00000008 /*< */ +#define QURT_THREAD_ATTR_GET_L2CACHE_PART 0x00000009 /*< */ +#define QURT_THREAD_ATTR_SET_FRML 
0x0000000A /*< */ +#define QURT_THREAD_ATTR_STID_GET 0x0000000B /*< */ +#define QURT_THREAD_ATTR_STID_SET 0x0000000C /*< */ +#define QURT_THREAD_ATTR_AUTOSTACK 0x0000000D /*< */ +#define QURT_THREAD_ATTR_SYSTEM_THREAD 0x0000000E /*< */ +#define QURT_THREAD_ATTR_STID_SET2 0x0000000F /*< */ +#define QURT_THREAD_ATTR_STID_SET2_ACKNOWLEDGE 0x00000010 /*< */ +#define QURT_THREAD_ATTR_STID_GET2 0x00000011 /*< */ + +/** Cache operations*/ +#define QURT_DCCLEAN 0U /* Clean Dcache. */ +#define QURT_DCINV 1U /* Invalidate Dcache. */ +#define QURT_DCCLEANINV 2U /* Clean and invalidate Dcache. */ +#define QURT_ICINV 3U /* Invalidate Icache. */ +#define QURT_DUMP_DCTAGS 4U /* For testing purpose. */ +#define QURT_FLUSH_ALL 5U /* Flush entire L1 and L2 cache. */ +#define QURT_TABLE_FLUSH 6U /* Flush based on table of physical pages */ +#define QURT_CLEAN_INVALIDATE_ALL 7U /* Flush and invalidate entire L1 and L2 cache. */ +#define QURT_L2CACHE_LOCK_LINES 8U /* l2 cache lock lines */ +#define QURT_L2CACHE_UNLOCK_LINES 9U /* l2 cache unlock lines */ +#define QURT_CLEAN 10U /* Flush L1 and L2 cache */ +#define QURT_CLEAN_INVALIDATE 11U /* Flush and invalidate L1 and L2 cache. */ +#define QURT_CLEAN_INVALIDATE_L2 12U /* Flush and invalidate entire L2 cache. */ + +/**@ingroup chapter_prefined_symbols */ +/**@xreflabel{hdr:QURT_API_VERSION}*/ + + +/* Process state. */ +#define QURT_UPDATE_PROCESS_STATE 0 /**< */ +#define QURT_MP_INIT 1 /*< */ +#define QURT_MP_RUNNING 2 /*< */ +#define QURT_MP_STOPPED 3 /*< */ + +/* QuRT reset reason. */ +#define QURT_NORMAL_BOOT 0 /* Normal boot. */ +#define QURT_WARM_BOOT 1 /* Power collapse warm boot. */ +#define QURT_WARM_BOOT_L2_RETENTION 2 /* Power collapse with L2 retention warm boot. */ +#define QURT_WARM_BOOT_SAVE_TCM 3 /* Power collapse with saving TCM. */ +#define QURT_QUICK_BOOT 4 /* Deep sleep. */ + +/* QuRT Wait for Idle command */ +#define QURT_WAIT_FOR_IDLE_DISABLE 0 /*< */ +#define QURT_WAIT_FOR_IDLE_ENABLE 1 /*< */ +#define QURT_WAIT_FOR_IDLE 2 /*< */ +#define QURT_WAIT_FOR_IDLE_CANCEL 3 /*< */ + +/*QuRT island exit stages */ +#define QURT_ISLAND_EXIT_STAGE1 1 /*< */ +#define QURT_ISLAND_EXIT_STAGE2 2 /*< */ + +#define QURT_MAX_NAME_LEN 64 /*< */ + +#define MAX_POOL_RANGES 16 /*< */ + +/* key definitions for debug thread info */ +//#define MAX_TCB_KEY 40 //whatever is a good number or makes debug thread structure be 1K +#define KEY_SCHDULER_STATE 1 /*< */ +#define KEY_PRIORITY 2 /*< */ +#define KEY_PRIORITY_ORIG 3 /*< */ +#define KEY_STACK_BOTTOM 4 // Currently not populated +#define KEY_STACK_TOP 5 // Currently not populated +#define KEY_HVX_STATE 6 /*< */ +#define KEY_FUTEX_OBJECT 7 /*< */ +#define KEY_THREAD_ID 8 /*< */ +#define KEY_PROFILE_CYCLE_LO 9 // Currently not populated +#define KEY_PROFILE_CYCLE_HI 10 // Currently not populated +#define KEY_ERROR_ADDRESS 11 // This holds the BADVA +#define KEY_ERROR_CAUSE 12 // This is the same as QURT_error_info.cause +#define KEY_ERROR_CAUSE2 13 // This is the same as QURT_error_info.cause2 +#define KEY_ERROR_SSR 14 /*< Holds the SSR value */ +#define QURT_RESERVED -1 + +/* VTLB method IDs. 
*/ +#define QURT_VTLB_ENTRY_CREATE 0U +#define QURT_VTLB_ENTRY_DELETE 1U +#define QURT_VTLB_ENTRY_READ 2U +#define QURT_VTLB_ENTRY_WRITE 3U +#define QURT_VTLB_ENTRY_PROBE 4U +#define QURT_VTLB_ENTRY_SPLIT 5U +#define QURT_VTLB_ENTRY_MERGE 6U +#define QURT_VTLB_ENTRY_STATISTICS 7U +#define QURT_VTLB_ENTRY_SET_SPECIAL 8U +#define QURT_VTLB_QUEUE_PPAGE 9U +#define QURT_VTLB_RECLAIM_STACK_PAGES 10U +#define QURT_VTLB_ASID_SET_STATE_FAST 11U +#define QURT_VTLB_ASID_SET_STATE 12U +#define QURT_VTLB_ENTRY_SET_EXTENSION 13U +#define QURT_VTLB_ENTRY_CLEAR_EXTENSION 14U + +/* VTCM window access control HWIO programming. */ +#define QURT_VTCM_WINDOW_ENABLE 1U +#define QURT_VTCM_WINDOW_DISABLE 0U +#define QURT_VTCM_WINDOW_HI_OFFSET_DEFAULT 0xFFFU +#define QURT_VTCM_WINDOW_LO_OFFSET_DEFAULT 0U + +/** @cond */ +/* ETM source - PC or data access */ +#define QURT_ETM_SOURCE_PC 0U /**< Memory source of SAC* is PC. */ +#define QURT_ETM_SOURCE_DATA 1U /**< Memory source of SAC* is data. */ + +/* ETM PID status flags */ +#define QURT_ETM_NO_PID 0xFFFFFFFF /**< No PID is selected. */ +/** @endcond */ + +/* execution context */ +#define QURT_CTX_USER 1 +#define QURT_CTX_GUEST 2 + +/* Profiling STID */ +#define QURT_STID_DEFAULT 0U + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_CONSTS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_cycles.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_cycles.h new file mode 100755 index 0000000000000..b599493f5d563 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_cycles.h @@ -0,0 +1,301 @@ + +#ifndef QURT_CYCLES_H +#define QURT_CYCLES_H 1 +/** + @file qurt_cycles.h + Prototypes of kernel pcycle API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + + /*===================================================================== + Functions + ======================================================================*/ + +/*======================================================================*/ + +/**@ingroup func_qurt_profile_reset_idle_pcycles + @xreflabel{hdr:qurt_profile_reset_idle_pcycles} + Sets the per-hardware-thread idle cycle counts to zero. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_profile_reset_idle_pcycles (void); + +/*======================================================================*/ +/**@ingroup func_qurt_profile_get_thread_pcycles + @xreflabel{hdr:qurt_profile_get_thread_pcycles} + Gets the count of the running processor cycles for the current thread.\n + Returns the current running processor cycle count for the current QuRT thread. + + @note1hang Profiling shall be enabled first to start the cycle counting. + The cycles are accumulated once the profiling is enabled and + resets on #qurt_profile_reset_threadid_pcycles + + @return + Integer -- Running processor cycle count for current thread. + + @dependencies + None. 
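+
+ @note1cont Illustrative sketch (not part of the API) of bracketing a workload with
+ this counter; do_work() is a hypothetical function:
+ @code
+ unsigned long long t0, t1;
+
+ qurt_profile_enable(1);                  // start cycle accounting
+ t0 = qurt_profile_get_thread_pcycles();
+ do_work();                               // hypothetical workload
+ t1 = qurt_profile_get_thread_pcycles();
+ qurt_profile_enable(0);                  // stop cycle accounting
+ // t1 - t0 is the number of cycles this thread ran during do_work()
+ @endcode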
+*/
+/* ======================================================================*/
+unsigned long long int qurt_profile_get_thread_pcycles(void);
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_get_core_pcycles
+ @xreflabel{hdr:qurt_get_core_pcycles}
+ Gets the count of core processor cycles executed.\n
+ Returns the current number of running processor cycles executed since the Hexagon
+ processor was last reset.
+
+ This value is based on the hardware core clock, which varies in speed according to the
+ processor clock frequency.
+
+ @note1hang Because the hardware core clock stops running when the processor shuts
+ down (due to all of the hardware threads being idle), treat the cycle values returned
+ by this operation as relative rather than absolute.
+
+ @note1cont Thread cycle counts are valid only in the V4 Hexagon processor version.
+
+ @return
+ Integer -- Current count of core processor cycles.
+
+ @dependencies
+ None.
+*/
+/* ======================================================================*/
+unsigned long long int qurt_get_core_pcycles(void);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_get_idle_pcycles
+
+ @deprecated Use #qurt_profile_get_idle_pcycles2 instead.
+
+ Gets the current idle processor cycle counts for a maximum of 6 hardware threads. Use
+ #qurt_profile_get_idle_pcycles2 to read pcycles without a limitation on the maximum
+ number of hardware threads.
+
+ This operation accepts a pointer to a user-defined array, and writes to the array the current
+ idle cycle count for each hardware thread.
+
+ Each count value represents the number of processor cycles that have elapsed on the
+ corresponding hardware thread while that thread has been in Wait mode.\n
+
+
+ @note1hang This operation does not return the idle cycles that occur when the Hexagon
+ processor shuts down (due to all of the hardware threads being idle).
+ Idle cycle counts accumulate regardless of whether profiling is enabled,
+ and reset on #qurt_profile_reset_idle_pcycles.
+
+ @param[out] pcycles User array where the function stores the current idle cycle count values.
+ Array size must be at least the number of hardware threads intended.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+/* ======================================================================*/
+void qurt_profile_get_idle_pcycles (unsigned long long *pcycles);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_get_idle_pcycles2
+ Gets the current idle processor cycle counts for the maximum available hardware threads.
+
+ This operation accepts a pointer to a user-defined array with a length in bytes, and writes
+ to the array the current idle cycle count for each hardware thread.
+
+ Each count value represents the number of processor cycles that have elapsed on the
+ corresponding hardware thread while that thread has been in Wait mode.\n
+
+ @note1hang This operation does not return the idle cycles that occur when the Hexagon
+ processor shuts down (due to all of the hardware threads being idle).
+ Idle cycle counts accumulate regardless of the profiling enable status, and
+ reset on #qurt_profile_reset_idle_pcycles.
+
+ @param[out] pcycles User array where the function stores the current idle cycle count values.
+ Array size should be equivalent to the number of hardware threads intended.
+ Call #qurt_sysenv_get_max_hw_threads to determine the array size required.
+
+ @param[in] length_in_bytes Length of the pcycles array in bytes. If the array is smaller
+ than required for the maximum available hardware threads,
+ an error code is returned.
+
+ @return
+ #QURT_EOK -- Successful operation; all data was stored to the destination array. \n
+ #QURT_EFAILED -- Operation failed due to a too-small #pcycles array.
+
+ @dependencies
+ None.
+*/
+/* ======================================================================*/
+int qurt_profile_get_idle_pcycles2 (unsigned long long *pcycles, unsigned int length_in_bytes);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_get_threadid_pcycles
+
+ @deprecated Use #qurt_profile_get_threadid_pcycles2 instead.
+
+ Gets the current per-hardware-thread running cycle counts for the specified QuRT
+ thread for a maximum of 6 hardware threads.
+
+ Each count value represents the number of processor cycles that have elapsed on the
+ corresponding hardware thread while that thread has been scheduled for the specified
+ QuRT thread.
+
+ @note1hang Profiling must be enabled first to start the cycle counting.
+ The cycles accumulate once profiling is enabled, and reset on
+ #qurt_profile_reset_threadid_pcycles.
+
+ @param[in] thread_id Valid thread identifier.
+ @param[out] pcycles Pointer to a user array where the function stores the current running
+ cycle count values. Array size must be at least the number of
+ hardware threads intended.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+/* ======================================================================*/
+void qurt_profile_get_threadid_pcycles (int thread_id, unsigned long long *pcycles);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_get_threadid_pcycles2
+
+ Gets the current per-hardware-thread running cycle counts for the specified QuRT
+ thread for the maximum available hardware threads.
+
+ Each count value represents the number of processor cycles that have elapsed on the
+ corresponding hardware thread while that thread has been scheduled for the specified
+ QuRT thread.
+
+ @note1hang Profiling must be enabled first to start the cycle counting.
+ The cycles accumulate once profiling is enabled, and reset on
+ #qurt_profile_reset_threadid_pcycles.
+
+ @param[in] thread_id Thread identifier.
+ @param[out] pcycles Pointer to a user array where the function stores the current running
+ cycle count values. Array size should be equivalent to the number of
+ hardware threads intended.
+ Call #qurt_sysenv_get_max_hw_threads to determine the array size required.
+ @param[in] length_in_bytes Length of the pcycles array in bytes. If the array is smaller
+ than required for the maximum available hardware threads, an
+ error code is returned.
+
+ @return
+ #QURT_EOK -- Successful operation; all data was stored to the destination array. \n
+ #QURT_EFAILED -- Operation failed due to a too-small #pcycles array. \n
+ #QURT_ENOTHREAD -- Operation failed due to an invalid #thread_id.
+
+ @dependencies
+ None.
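+
+ @note1cont Illustrative call sequence (a sketch, with most error handling elided);
+ tid is a hypothetical thread identifier:
+ @code
+ qurt_sysenv_max_hthreads_t mhwt;
+ unsigned long long pcycles[QURT_MAX_HTHREAD_LIMIT];
+
+ qurt_sysenv_get_max_hw_threads(&mhwt);   // declared in qurt_event.h
+ if (qurt_profile_get_threadid_pcycles2(tid, pcycles,
+         mhwt.max_hthreads * sizeof(pcycles[0])) != QURT_EOK) {
+     // array too small (QURT_EFAILED) or invalid tid (QURT_ENOTHREAD)
+ }
+ @endcode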
+*/
+/* ======================================================================*/
+int qurt_profile_get_threadid_pcycles2 (int thread_id, unsigned long long *pcycles, unsigned int length_in_bytes);
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_reset_threadid_pcycles
+ @xreflabel{hdr:qurt_profile_reset_threadid_pcycles}
+ Sets the per-hardware-thread running cycle counts to zero for the specified QuRT thread.
+
+ @param[in] thread_id Thread identifier.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+/* ======================================================================*/
+void qurt_profile_reset_threadid_pcycles (int thread_id);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_enable
+ @xreflabel{hdr:qurt_profile_enable}
+ Enables profiling.\n
+ Enables or disables cycle counting of the running and idle processor cycles.
+ Profiling is disabled by default. \n
+
+ @note1hang Enabling profiling does not automatically reset the cycle counts -- this must be
+ done explicitly by calling the reset operations before starting cycle counting.
+ Cycle counting starts the instant profiling is enabled with this API, and
+ halts when profiling is disabled.
+
+ @param[in] enable Profiling. Values: \n
+ - 0 -- Disable profiling \n
+ - 1 -- Enable profiling @tablebulletend
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+/* ======================================================================*/
+void qurt_profile_enable (int enable);
+
+/*======================================================================*/
+/**@ingroup func_qurt_get_hthread_pcycles
+ @xreflabel{hdr:qurt_get_hthread_pcycles}
+ Reads the GCYCLE_nT register to allow performance measurement when n threads are in run mode.\n
+
+ @note1hang Returns 0 when the architecture is earlier than v67, or for an invalid hardware thread ID.
+
+ @param[in] n Threads in run mode. Valid values are 1 through .
+
+
+ @return
+ Value read from the GCYCLE_nT register. This value indicates the total number of pcycles executed
+ from reset to the current point of execution while n threads are in run mode.
+
+ @dependencies
+ PMU must be enabled.
+*/
+/* ======================================================================*/
+unsigned int qurt_get_hthread_pcycles(int n);
+
+/*======================================================================*/
+/**@ingroup func_qurt_get_hthread_commits
+ @xreflabel{hdr:qurt_get_hthread_commits}
+ Reads the GCOMMIT_nT register to allow performance measurement when n threads are in run mode.\n
+
+ @note1hang Returns 0 when the architecture is earlier than v67, or for an invalid hardware thread ID.
+
+ @param[in] n Threads in run mode. Valid values: 1 through .
+
+ @return
+ Value read from the GCOMMIT_nT register. This value indicates the total number of packets
+ committed from reset to the current point of execution while n threads are in run mode.
+
+ @dependencies
+ PMU must be enabled.
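+
+ @note1cont Illustrative sketch (not part of the API): pairing this register with
+ GCYCLE_nT to estimate packets committed per cycle while two hardware threads are in
+ run mode (both reads return 0 on pre-v67 targets):
+ @code
+ unsigned int cycles  = qurt_get_hthread_pcycles(2);
+ unsigned int commits = qurt_get_hthread_commits(2);
+ double packets_per_cycle = (cycles != 0U) ? (double)commits / (double)cycles : 0.0;
+ @endcode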
+*/
+/* ======================================================================*/
+unsigned int qurt_get_hthread_commits(int n);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_devtree.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_devtree.h
new file mode 100755
index 0000000000000..4adee45bb44a2
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_devtree.h
@@ -0,0 +1,161 @@
+#ifndef QURT_DEVTREE_H
+#define QURT_DEVTREE_H
+/**
+ @file qurt_devtree.h
+ @brief Prototypes and structures for device tree aware QuRT library functions.
+
+Copyright (c) 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+*/
+/* qurt_callback is included by qurt_qdi_driver.h and depends on NULL being defined.
+   The callback is not used here, so define NULL here to avoid including the world. */
+#ifndef NULL
+#define NULL ((void *) 0)
+#endif
+
+#include "libfdt.h"
+#include "DTBExtnLib.h"
+#include "qurt_qdi_ext.h"
+#include "qurt_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define INVALID_BLOB_ID (-1)
+#define DEFAULT_BLOB_ID 0
+
+/** QuRT device tree mapping macros. */
+#define QURT_DT_MAPPING_FAILED (-1)
+#define QURT_DT_FLAG_ISLAND    0x1
+#define QURT_DT_FLAG_PHYSADDR  0x2
+
+/** Device tree type for the root PD device tree.
+    The root PD device tree will typically describe the hardware in the subsystem.
+    This is the /soc portion of the device tree. */
+#define QURT_DT_BLOB_TYPE_ROOT 0
+
+/** Device tree type for the local device tree.
+    The local device tree will typically contain the software settings.
+    This is the /sw portion of the device tree. */
+#define QURT_DT_BLOB_TYPE_LOCAL 1
+
+int qurt_devtree_init(void);
+
+/**@ingroup func_qurt_dt_mapping_create
+ Creates a memory mapping from the specified property of the specified device
+ tree node. Returns virtual addresses and sizes.
+
+ @param[in] devtreeNode Device tree node.
+ @param[in] flags Flags to configure memory. Overloaded as the property
+ index if regionName is NULL.
+ @param[in] regionName Identifies the property to use for mapping; should
+ resemble a region.
+ @param[in] regionIdx Index of the range to use for the mapping.
+ @param[out] vaddr Return pointer for the virtual region address.
+ @param[out] size Return pointer for the virtual region size.
+
+ @return
+ Result code indicating success or failure. \n
+*/
+int qurt_dt_mapping_create(fdt_node_handle *devtreeNode, int flags, char *regionName, int regionIdx,
+                           unsigned long long *vaddr, unsigned long long *size);
+
+/**@ingroup func_qurt_dt_mapping_create2
+
+ Creates a memory mapping from the specified property of the specified device
+ tree node.
+
+ Returns virtual addresses and sizes according to the architecture (that is, either 32-bit or 64-bit).
+
+ @param[in] devtreeNode Device tree node.
+
+ @param[in] dt_map_flags Flags that configure the memory mapping; reserved for future use.
+ (0) - Default value; assumes the details from the DT node are a physical address and size.
+ QURT_DT_FLAG_ISLAND
+
+ NOTE: The PA needs to be added to the corresponding island spec to create an island mapping.
+
+ @param[in] regionName NULL, or the name of the index in the range to return; should
+ resemble a region. Ex: reg-names = "base", "rx", "tx";
+
+ @param[in] regionIdx Index of the range to return. Ex: reg = <0x1000 0x20>, <0x10000 0x100>, <0x18000 0x100>;
+
+ NOTE: If the client specifies both regionName and regionIdx, the
+ region name takes precedence and the region index is ignored.
+
+ @param[in] dt_map_perm Mapping access permissions (R/W):
+ QURT_PERM_READ
+ QURT_PERM_WRITE
+
+ @param[in] cache_attr QuRT cache mode types:
+ QURT_MEM_CACHE_DEVICE
+ QURT_MEM_CACHE_WRITEBACK
+ Other required cache type enums in qurt_types.h can also be passed.
+
+ NOTE: There is no default value for the cache and permission attributes.
+ The client must always pass one of the defined flags.
+
+ @param[out] vaddr Return pointer to the variable that holds the virtual address.
+ @param[out] size Return pointer for the virtual region size.
+
+ @return
+ #QURT_EOK Success; the mapping was created properly.
+ #QURT_DT_MAPPING_FAILED Failed to create the mapping.
+ #QURT_EINVALID Mismatch in the architecture.
+
+ else an FdtLib or third-party error code.
+
+*/
+int qurt_dt_mapping_create2(fdt_node_handle *devtreeNode, unsigned int dt_map_flags,
+        char *regionName, int regionIdx, unsigned int dt_map_perm, int cache_attr, void **vaddr, size_t *size);
+
+/**@ingroup func_qurt_dt_isr_register
+ Device tree aware registration of an interrupt service routine (ISR) to an ISR thread.
+ The interrupt defined in the specified device tree node is enabled when this function returns success.
+
+ @datatypes
+ #qurt_thread_t \n
+ #fdt_node_handle
+
+ @param[in] dt_node Device tree node that specifies the interrupt property.
+ @param[in] dt_int_index Index of the specific interrupt to use within the device tree node structure.
+ Specify either this or dt_int_name; use -1 if the name is used.
+ @param[in] dt_int_name Name of the specific interrupt to use within the device tree node structure.
+ Specify either this or dt_int_index; use NULL if the index is used.
+ @param[in] isr_thread_id ISR thread ID, returned from qurt_isr_create(), defined by qurt_isr_register2().
+ @param[in] prio Priority of the ISR, defined by qurt_isr_register2().
+ @param[in] flags Defines the ACK type. Values: \n
+ #QURT_INT_NON_DELAYED_ACK - ISR is acknowledged by the interrupt handle routine
+ in the kernel.
+ #QURT_INT_DELAYED_ACK - Client chooses to acknowledge.
+ Defined by qurt_isr_register2().
+ @param[in] isr ISR with the prototype void isr (void *arg, int int_num), defined by qurt_isr_register2().
+ @param[in] arg First argument of the ISR when it is called to service the interrupt, defined by qurt_isr_register2().
+
+ @return
+ #QURT_EOK -- Successfully registered the ISR for the interrupt \n
+ #QURT_EINT -- Interrupt not configured \n
+ #QURT_EINVALID -- Invalid thread ID \n
+ #QURT_EDISABLED -- The feature is disabled \n
+ #QURT_EDUPLICATE -- Interrupt is already registered
+
+ @dependencies
+ Create the thread ID with qurt_isr_create().
+ Complete the ISR registration with qurt_isr_register2().
+ */
+int qurt_dt_isr_register(fdt_node_handle *dt_node, int dt_int_index, char * dt_int_name, qurt_thread_t isr_thread_id,
+                         unsigned short prio, unsigned short flags, void (*isr) (void *, int), void *arg);
+
+/**@ingroup func_qurt_dt_blob_id_get
+ Returns the blob ID for the blob type passed.
+ The value returned from this API can be passed as the blob ID parameter to DTBExtnLib APIs.
+
+ @param[in] blob_type Blob type to look up.
+ @return Blob ID for the passed blob type.
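+
+ @note1cont Illustrative sketch (not part of the API) of feeding the returned ID to a
+ DTBExtnLib lookup; what is done with the ID after the check is left abstract:
+ @code
+ int blob_id = qurt_dt_blob_id_get(QURT_DT_BLOB_TYPE_LOCAL);
+ if (blob_id != INVALID_BLOB_ID) {
+     // pass blob_id as the blob ID parameter to DTBExtnLib APIs
+     // to look up software settings under the /sw portion of the tree
+ }
+ @endcode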
+*/ +int qurt_dt_blob_id_get(unsigned int blob_type); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_ecc.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_ecc.h new file mode 100755 index 0000000000000..09312684e99af --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_ecc.h @@ -0,0 +1,168 @@ +#ifndef QURT_ECC_H +#define QURT_ECC_H + + +/*===================================================================== + + @file qurt_ecc.h + @brief Prototypes of QuRT memory ECC API functions + + Copyright (c) 2018, 2020-2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + TYPEDEFS +=============================================================================*/ +/** @addtogroup exception_handling_types +@{ */ +// ECC memory definition +typedef enum { + QURT_ECC_MEM_L1_ICACHE = 0, /**< ECC memory L1 ICache. */ + QURT_ECC_MEM_L1_DCACHE = 1, /**< ECC memory L1 DCache.*/ + QURT_ECC_MEM_L2_CACHE = 2, /**< ECC memory L2 Cache.*/ + QURT_ECC_MEM_VTCM = 3 /**< ECC memory VTCM.*/ +} qurt_ecc_memory_t; +/** @} */ /* end_addtogroup exception_handling_types */ + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/** @addtogroup exception_handling_macros +@{ */ + +#define QURT_ECC_ERR_DETECTED_STATUS 0 /**< ECC error detected. */ +#define QURT_ECC_ERR_TYPE 1 /**< ECC error type.*/ +// ECC status type + +#define QURT_ECC_CORRECTABLE_COUNT (1<<0) /**< ECC correctable count.*/ +#define QURT_ECC_UNCORRECTABLE_COUNT (1<<1) /**< ECC uncorrectable count.*/ +#define QURT_ECC_REGION_LOGGING (1<<2) /**< ECC region logging.*/ +// ECC enable/disable definition + +#define QURT_ECC_PROTECTION_DISABLE (0<<0) /**< Bit 0. */ +#define QURT_ECC_PROTECTION_ENABLE (1<<0) /**< Bit 0. */ +/** @} */ /* end_addtogroup exception_handling_macros */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ + + +/**@ingroup func_qurt_ecc_enable + Enables or disables ECC protection on a specified memory. + + @datatypes + #qurt_ecc_memory_t + + @param[in] memory Set to one of the following values: + - #QURT_ECC_MEM_L1_ICACHE + - #QURT_ECC_MEM_L1_DCACHE + - #QURT_ECC_MEM_L2_CACHE + - #QURT_ECC_MEM_VTCM @tablebulletend + + @param[in] enable Set to one of the following values: + - #QURT_ECC_PROTECTION_ENABLE + - #QURT_ECC_PROTECTION_DISABLE @tablebulletend + + @return + - #QURT_EOK -- ECC enabling or disabling setup is performed successfully + - Others -- Failure + + @dependencies + None. + */ +int qurt_ecc_enable( qurt_ecc_memory_t memory, unsigned int enable ); + + +/**@ingroup func_qurt_ecc_get_error_status + Gets ECC error status for a specified memory. 
+ + @datatypes + #qurt_ecc_memory_t + + @param[in] memory Set to one of the following: + - #QURT_ECC_MEM_L1_ICACHE + - #QURT_ECC_MEM_L1_DCACHE + - #QURT_ECC_MEM_L2_CACHE + - #QURT_ECC_MEM_VTCM @tablebulletend + + @param[in] type Set to one of the following: + - #QURT_ECC_ERR_DETECTED_STATUS + - #QURT_ECC_ERR_TYPE @tablebulletend + + @return + Returns the following when the type is #QURT_ECC_ERR_DETECTED_STATUS: + - 0 -- No error detected \n + - 1 -- At least one error detected \n + Returns the following when the type is #QURT_ECC_ERR_TYPE: \n + - 0 through 1 -- Correctable error \n + - 2 -- Uncorrectable error + + @dependencies + None. + */ +int qurt_ecc_get_error_status( qurt_ecc_memory_t memory, unsigned int type ); + + +/**@ingroup func_qurt_ecc_get_error_count + Gets the ECC error count for a specified memory. + + @datatypes + #qurt_ecc_memory_t + + @param[in] memory Set to one of the following values:\n + - #QURT_ECC_MEM_L1_ICACHE \n + - #QURT_ECC_MEM_L1_DCACHE \n + - #QURT_ECC_MEM_L2_CACHE \n + - #QURT_ECC_MEM_VTCM @tablebulletend + + @param[in] type Set to one of the following values: \n + - #QURT_ECC_CORRECTABLE_COUNT \n + - #QURT_ECC_UNCORRECTABLE_COUNT @tablebulletend + + @return + Error count for the specified error type. + + @dependencies + None. + */ +int qurt_ecc_get_error_count( qurt_ecc_memory_t memory, unsigned int type ); + + +/**@ingroup func_qurt_ecc_clear_error_count + Clears ECC error count or region logging for a specified memory. + + @datatypes + #qurt_ecc_memory_t + + @param[in] memory Set to one of the following values: \n + - #QURT_ECC_MEM_L1_ICACHE \n + - #QURT_ECC_MEM_L1_DCACHE \n + - #QURT_ECC_MEM_L2_CACHE \n + - #QURT_ECC_MEM_VTCM @tablebulletend + + @param[in] type Set to one or multiple OR'ed of the following values: \n + - #QURT_ECC_CORRECTABLE_COUNT \n + - #QURT_ECC_UNCORRECTABLE_COUNT \n + - #QURT_ECC_REGION_LOGGING @tablebulletend + + @return + #QURT_EOK -- Error count successfully cleared \n + Others -- Failure at clearing the error count + + @dependencies + None. + */ +int qurt_ecc_clear_error_count( qurt_ecc_memory_t memory, unsigned int type ); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ECC_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_error.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_error.h new file mode 100755 index 0000000000000..f4666b396c378 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_error.h @@ -0,0 +1,149 @@ +#ifndef QURT_ERROR_H +#define QURT_ERROR_H + +/** + @file qurt_error.h + Error results- QURT defines a set of standard symbols for the error result values. This file lists the + symbols and their corresponding values. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021-2022 , 2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc.. + ======================================================================*/ +#include "qurt_except.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup chapter_error +@{ */ + +/*===================================================================== +Constants and macros +======================================================================*/ +#define QURT_EOK 0 /**< Operation successfully performed. */ +#define QURT_EVAL 1 /**< Wrong values for the parameters. The specified page does not exist. 
*/ +#define QURT_EMEM 2 /**< Not enough memory to perform the operation.*/ + +#define QURT_EINVALID 4 /**< Invalid argument value; invalid key. */ +/** @cond */ +#define QURT_EUNKNOWN 6 /**< Defined but never used in QuRT. */ +#define QURT_ENOMSGS 7 /**< Message queue is empty. */ +#define QURT_EBADF 9 /**< Bad message queue descriptor. */ +/** @endcond */ +#define QURT_EFAILED 12 /**< Operation failed. */ + +#define QURT_ENOTALLOWED 13 /**< Operation not allowed. */ + +/** @cond */ +#define QURT_EDUPCLSID 14 /*< Duplicate class ID. */ +/** @endcond */ +/** @cond rest_reg_dist */ +#define QURT_ENOREGISTERED 20 /**< No registered interrupts.*/ +/** @endcond */ + + +/** @cond */ +#define QURT_EISDB 21 /*< Power collapse failed due to ISDB being enabled. */ +#define QURT_ESTM 22 /*< Power collapse failed in a Single-threaded mode check. */ +/** @endcond */ + + +/** @cond rest_reg_dist */ +#define QURT_ETLSAVAIL 23 /**< No free TLS key is available. */ +#define QURT_ETLSENTRY 24 /**< TLS key is not already free. */ +/** @endcond */ + +#define QURT_EINT 26 /**< Invalid interrupt number (not registered). */ +/** @cond rest_reg_dist */ +#define QURT_ESIG 27 /**< Invalid signal bitmask (cannot set more than one signal at a time). */ +/** @endcond */ + +/** @cond */ +#define QURT_EHEAP 28 /**< No heap space is available. */ +#define QURT_ENOSPC 28 /**< No space to create another queue in the system. */ +#define QURT_EMEMMAP 29 /**< Physical address layout is not supported by the kernel. */ +/** @endcond */ +/** @cond rest_reg_dist */ +#define QURT_ENOTHREAD 30 /**< Thread no longer exists. */ +/** @endcond */ +/** @cond */ +#define QURT_EL2CACHE 31 /**< L2cachable is not supported in kernel invalidate/cleaninv. */ +/** @endcond */ +/** @cond rest_reg_dist */ +#define QURT_EALIGN 32 /**< Not aligned. */ +#define QURT_EDEREGISTERED 33 /**< Interrupt is already deregistered.*/ +/** @endcond */ + +/** @cond internal_only */ + +#define QURT_ETLBCREATESIZE 34 /**< TLB create error -- Incorrect size.*/ +#define QURT_ETLBCREATEUNALIGNED 35 /**< TLB create error -- Unaligned address.*/ +/** @endcond */ +/** @cond rest_reg_dist*/ +#define QURT_EEXISTS 35 /**< File or message queue already exists. */ +#define QURT_ENAMETOOLONG 36 /**< Name too long for message queue creation. */ +#define QURT_EPRIVILEGE 36 /**< Caller does not have privilege for this operation.*/ + +#define QURT_ECANCEL 37 /**< A cancellable request was canceled because the associated process was asked to exit.*/ +/** @endcond */ + +/** @cond */ +#define QURT_EISLANDTRAP 38 /*< Unsupported TRAP is called in Island mode.*/ + +#define QURT_ERMUTEXUNLOCKNONHOLDER 39 /*< Rmutex unlock by a non-holder.*/ +#define QURT_ERMUTEXUNLOCKFATAL 40 /*< Rmutex unlock error, all except the non-holder error.*/ +#define QURT_EMUTEXUNLOCKNONHOLDER 41 /*< Mutex unlock by a non-holder.*/ +#define QURT_EMUTEXUNLOCKFATAL 42 /*< Mutex unlock error, all except the non-holder error.*/ +#define QURT_EINVALIDPOWERCOLLAPSE 43 /*< Invalid power collapse mode requested. */ +/** @endcond */ +#define QURT_EISLANDUSEREXIT 44 /**< User call has resulted in island exit.*/ +#define QURT_ENOISLANDENTRY 45 /**< Island mode had not yet been entered.*/ +#define QURT_EISLANDINVALIDINT 46 /**< Exited Island mode due to an invalid island interrupt.*/ +/** @cond rest_reg_dist */ +#define QURT_ETIMEDOUT 47 /**< Operation timed-out. */ +#define QURT_EALREADY 48 /**< Operation already in progress. */ +/** @endcond */ + +#define QURT_ERETRY 49 /*< Retry the operation. 
*/
+#define QURT_EDISABLED         50 /*< Resource disabled. */
+#define QURT_EDUPLICATE        51 /*< Duplicate resource. */
+#define QURT_EBADR             53 /*< Invalid request descriptor. */
+#define QURT_ETLB              54 /*< Exceeded maximum allowed TLBs. */
+#define QURT_ENOTSUPPORTED     55 /*< Operation not supported. */
+/** @cond rest_reg_dist */
+#define QURT_ENORESOURCE       56 /**< No resource. */
+/** @endcond */
+
+#define QURT_EDTINIT           57 /**< Problem with device tree initialization. */
+#define QURT_EBUFLOCK          58 /*< Buffer lock failed because it was already locked many times. */
+#define QURT_ELOCKED           59 /**< Current operation failed as the buffer is locked. */
+#define QURT_EMSGSIZE          90 /*< Message queue msg_len is greater than the mq_msgsize attribute of the message queue. */
+
+
+#define QURT_ENOTCONFIGURED    91 /*< Interrupt is NOT configured. */
+
+#define QURT_EBANDWIDTHLIMIT   92 /*< Message queue send exceeds the bandwidth limit. */
+
+#define QURT_ECFIVIOLATION     93 /*< CFI violation detected. */
+
+#define QURT_EDESTROY          94 /**< A destroy request was made to waiting threads.*/
+
+#define QURT_EHMXNOTAVAIL      95 /**< HMX is not available to the target thread.*/
+#define QURT_EHMXNOTDETACHABLE 96 /**< HMX is not detachable from the target thread.*/
+
+#define QURT_EFATAL            -1 /**< Fatal error. */
+
+/** @} */ /* end_addtogroup chapter_error */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_ERROR_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_event.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_event.h
new file mode 100755
index 0000000000000..987f0fe79f227
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_event.h
@@ -0,0 +1,452 @@
+#ifndef QURT_EVENT_H
+#define QURT_EVENT_H
+/**
+ @file qurt_event.h
+ @brief Prototypes of kernel event API functions.
+
+ EXTERNALIZED FUNCTIONS
+ none
+
+ INITIALIZATION AND SEQUENCING REQUIREMENTS
+ none
+
+ Copyright (c) 2018-2021, 2023 Qualcomm Technologies, Inc. All Rights Reserved.
+ Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#include "qurt_consts.h"
+#include "qurt_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * System environment object types.
+ */
+/**@addtogroup sys_env_types
+@{ */
+/** QuRT swap pool information type. */
+typedef struct qurt_sysenv_swap_pools {
+    /** @cond */
+    unsigned int spoolsize; /* Swap pool size. */
+    unsigned int spooladdr; /* Swap pool start address. */
+    /** @endcond */
+}qurt_sysenv_swap_pools_t;
+
+/** QuRT application heap information type. */
+typedef struct qurt_sysenv_app_heap {
+    /** @cond */
+    unsigned int heap_base;  /* Heap base address. */
+    unsigned int heap_limit; /* Heap end address. */
+    /** @endcond */
+} qurt_sysenv_app_heap_t ;
+
+/** QuRT architecture version information type. */
+typedef struct qurt_sysenv_arch_version {
+    /** @cond */
+    unsigned int arch_version; /* Architecture version. */
+    /** @endcond */
+}qurt_arch_version_t;
+
+/** QuRT maximum hardware threads information type. */
+typedef struct qurt_sysenv_max_hthreads {
+    /** @cond */
+    unsigned int max_hthreads; /* Maximum number of hardware threads. */
+    /** @endcond */
+}qurt_sysenv_max_hthreads_t;
+
+/** QuRT active hardware threads information type.
*/ +typedef struct qurt_sysenv_hthreads { + /** @cond */ + unsigned int hthreads; /*Maximum number of hardware threads.*/ + /** @endcond */ +}qurt_sysenv_hthreads_t; + +/** QuRT maximum pi priority information type. */ +typedef struct qurt_sysenv_max_pi_prio { + /** @cond */ + unsigned int max_pi_prio; /*Maximum pi priority.*/ + /** @endcond */ +}qurt_sysenv_max_pi_prio_t; + +/** QuRT process name information type. */ +typedef struct qurt_sysenv_procname { + /** @cond */ + union { + unsigned int asid; /*Address space ID.*/ + unsigned int pid; /*Process ID.*/ + }; + char name[QURT_MAX_NAME_LEN]; /* Process name.*/ + /** @endcond */ +}qurt_sysenv_procname_t; + +/** QuRT stack profile count information type. */ +typedef struct qurt_sysenv_stack_profile_count { + /** @cond */ + unsigned int count; /*Stack profile count for usage.*/ + unsigned int count_watermark; /*Stack profile count for watermark.*/ + /** @endcond */ +}qurt_sysenv_stack_profile_count_t; + +/** + QuRT system error event type. + */ +typedef struct _qurt_sysevent_error_t +{ + unsigned int thread_id; /**< Thread ID. */ + unsigned int fault_pc; /**< Fault PC. */ + unsigned int sp; /**< Stack pointer. */ + unsigned int badva; /**< Virtual data address where the exception occurred. */ + unsigned int cause; /**< QuRT error result. */ + unsigned int ssr; /**< Supervisor status register. */ + unsigned int fp; /**< Frame pointer. */ + unsigned int lr; /**< Link register. */ + unsigned int pid; /**< PID of the process to which this thread belongs.*/ + } qurt_sysevent_error_t ; + +typedef struct _qurt_sysevent_error_1_t +{ + unsigned int thread_id; /**< Thread ID. */ + unsigned int fault_pc; /**< Fault PC. */ + unsigned int sp; /**< Stack pointer. */ + unsigned int badva; /**< Virtual data address where the exception occurred. */ + unsigned int cause; /**< QuRT error result. */ + unsigned int ssr; /**< Supervisor status register. */ + unsigned int fp; /**< Frame pointer. */ + unsigned int lr; /**< Link register. */ + unsigned int pid; /**< PID of the process to which this thread belongs.*/ + unsigned int fkey; /**< Framekey.*/ + unsigned int reserved1; /**< Reserved.*/ + unsigned int reserved2; /**< Reserved.*/ + unsigned int reserved3; /**< Reserved.*/ + } qurt_sysevent_error_1_t ; + +/** QuRT page fault error event information type. */ +typedef struct qurt_sysevent_pagefault { + qurt_thread_t thread_id; /**< Thread ID of the page fault thread. */ + unsigned int fault_addr; /**< Accessed address that caused the page fault. */ + unsigned int ssr_cause; /**< SSR cause code for the page fault. */ +} qurt_sysevent_pagefault_t ; +/** @} */ /* @endaddtogroup sys_env_types */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/*======================================================================*/ +/** + Gets the environment swap pool 0 information from the kernel. + + @datatypes + #qurt_sysenv_swap_pools_t + + @param[out] pools Pointer to the pools information. + + @return + #QURT_EOK -- Success. + + @dependencies + None. +*/ +int qurt_sysenv_get_swap_spool0 (qurt_sysenv_swap_pools_t *pools ); + +/* + Gets the environment swap pool 1 information from the kernel. + + @datatypes + #qurt_sysenv_swap_pools_t + + @param[out] pools Pointer to the pools information. + + @return + #QURT_EOK -- Success. + + @dependencies + None. 
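+
+   An illustrative call (a sketch; the field names are those of #qurt_sysenv_swap_pools_t above):
+
+     qurt_sysenv_swap_pools_t pools;
+     if (qurt_sysenv_get_swap_spool1(&pools) == QURT_EOK) {
+         // pools.spooladdr and pools.spoolsize describe swap pool 1
+     }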
+*/ +int qurt_sysenv_get_swap_spool1(qurt_sysenv_swap_pools_t *pools ); + +/**@ingroup func_qurt_sysenv_get_app_heap + Gets information on the program heap from the kernel. + + @datatypes + #qurt_sysenv_app_heap_t + + @param[out] aheap Pointer to information on the program heap. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_app_heap(qurt_sysenv_app_heap_t *aheap ); + +/**@ingroup func_qurt_sysenv_get_arch_version + Gets the Hexagon processor architecture version from the kernel. + + @datatypes + #qurt_arch_version_t + + @param[out] vers Pointer to the Hexagon processor architecture version. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter + + @dependencies + None. +*/ +int qurt_sysenv_get_arch_version(qurt_arch_version_t *vers); + +/**@ingroup func_qurt_sysenv_get_max_hw_threads + Gets the maximum number of hardware threads supported in the Hexagon processor. + The API includes the disabled hardware threads to reflect the maximum + hardware thread count. + For example, if the image is configured for four hardware threads and hthread_mask is set to 0x5 in + cust_config.xml, only HW0 and HW2 are initialized by QuRT. + HW1 and HW3 are not used at all. Under such a scenario, + qurt_sysenv_get_max_hw_threads() still returns four. + + @datatypes + #qurt_sysenv_max_hthreads_t + + @param[out] mhwt Pointer to the maximum number of hardware threads supported in the Hexagon processor. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_max_hw_threads(qurt_sysenv_max_hthreads_t *mhwt ); + +/**@ingroup func_qurt_sysenv_get_hw_threads + Gets the number of hardware threads initialized by QuRT in Hexagon processor. + For example, if the image is configured for four hardware threads and hthread_mask is set to 0x5 in + cust_config.xml, QuRT only initializes HW0 and HW2. + HW1 and HW3 are not used. In this scenario, qurt_sysenv_get_hw_threads() returns 2. + + @datatypes + #qurt_sysenv_hthreads_t + + @param[out] mhwt Pointer to the number of hardware threads active in the Hexagon processor. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_hw_threads(qurt_sysenv_hthreads_t *mhwt ); + +/**@ingroup func_qurt_sysenv_get_max_pi_prio + Gets the maximum priority inheritance mutex priority from the kernel. + + @datatypes + #qurt_sysenv_max_pi_prio_t + + @param[out] mpip Pointer to the maximum priority inheritance mutex priority. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_max_pi_prio(qurt_sysenv_max_pi_prio_t *mpip ); + +/**@ingroup func_qurt_sysenv_get_process_name2 + Gets information on the system environment process names based on the client_handle argument. + + @datatypes + #qurt_sysenv_procname_t + + @param[in] client_handle Obtained from the current invocation function (Section 3.4.1). + @param[out] pname Pointer to information on the process names in the system. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_process_name2(int client_handle, qurt_sysenv_procname_t *pname ); + +/**@ingroup func_qurt_sysenv_get_process_name + Gets information on the system environment process names from the kernel. 
+ + @datatypes + #qurt_sysenv_procname_t + + @param[out] pname Pointer to information on the process names in the system. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_process_name(qurt_sysenv_procname_t *pname ); + +/**@ingroup func_qurt_sysenv_get_stack_profile_count + Gets information on the stack profile count from the kernel. + + @datatypes + #qurt_sysenv_stack_profile_count_t + + @param[out] count Pointer to information on the stack profile count. + + @return + #QURT_EOK -- Success. + + @dependencies + None. +*/ +int qurt_sysenv_get_stack_profile_count(qurt_sysenv_stack_profile_count_t *count ); + +/**@ingroup func_qurt_exception_wait + Registers the program exception handler. + This function assigns the current thread as the QuRT program exception handler and suspends the + thread until a program exception occurs. + + When a program exception occurs, the thread is awakened with error information + assigned to the parameters of this operation. + + @note1hang If no program exception handler is registered, or if the registered handler + calls exit, QuRT raises a kernel exception. + If a thread runs in Supervisor mode, any errors are treated as kernel + exceptions. + + @param[out] ip Pointer to the instruction memory address where the exception occurred. + @param[out] sp Stack pointer. + @param[out] badva Pointer to the virtual data address where the exception occurred. + @param[out] cause Pointer to the QuRT error result code. + + @return + Registry status: \n + Thread identifier -- Handler successfully registered. \n + #QURT_EFATAL -- Registration failed. + + @dependencies + None. +*/ +unsigned int qurt_exception_wait (unsigned int *ip, unsigned int *sp, + unsigned int *badva, unsigned int *cause); + +unsigned int qurt_exception_wait_ext (qurt_sysevent_error_t * sys_err); + +/**@ingroup func_qurt_exception_wait3 + Registers the current thread as the QuRT program exception handler, and suspends the thread until a + program exception occurs. + When a program exception occurs, the thread is awakened with error information assigned to the specified + error event record. + If a program exception is raised when no handler is registered (or when a handler is registered, but it calls + exit), the exception is treated as fatal.\n + @note1hang If a thread runs in Monitor mode, all exceptions are treated as kernel exceptions.\n + @note1cont This function differs from qurt_exception_wait() by returning the error information in a data + structure rather than as individual variables. It also returns additional information (for example, SSR, FP, and LR). + + @param[out] sys_err Pointer to the qurt_sysevent_error_1_t type structure. + @param[in] sys_err_size Size of the qurt_sysevent_error_1_t structure. + + @return + Registry status: \n + - #QURT_EFATAL -- Failure. \n + - Thread ID -- Success. + + @dependencies + None. +*/ + +unsigned int qurt_exception_wait3(void * sys_err, unsigned int sys_err_size); + +/**@ingroup func_qurt_exception_raise_nonfatal + Raises a nonfatal program exception in the QuRT program system. + + For more information on program exceptions, see Section @xref{dox:exception_handling}. + + This operation never returns -- the program exception handler is assumed to perform all + exception handling before terminating or reloading the QuRT program system. + + @note1hang The C library function abort() calls this operation to indicate software + errors. 
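+
+ @note1cont Illustrative sketch (not part of the API); resource_init() is a
+ hypothetical function whose failure the program treats as a software error:
+ @code
+ if (resource_init() != QURT_EOK) {
+     qurt_exception_raise_nonfatal(QURT_EFAILED);   // does not return
+ }
+ @endcode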
+
+ @param[in] error QuRT error result code (Section @xref{dox:error_results}).
+
+ @return
+ Integer -- Unused.
+
+ @dependencies
+ None.
+*/
+int qurt_exception_raise_nonfatal (int error) __attribute__((noreturn));
+
+
+/**@ingroup func_qurt_exception_raise_fatal
+ Raises a fatal program exception in the QuRT system.
+
+ Fatal program exceptions terminate the execution of the QuRT system without invoking
+ the program exception handler.
+
+ For more information on fatal program exceptions, see Section @xref{dox:exception_handling}.
+
+ This operation always returns, so the calling program can perform the necessary shutdown
+ operations (data logging, and so on).
+
+ @note1hang Context switches do not work after this operation has been called.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+void qurt_exception_raise_fatal (void);
+
+unsigned int qurt_enable_floating_point_exception(unsigned int mask);
+
+/**@ingroup func_qurt_exception_enable_fp_exceptions
+ Enables the specified floating point exceptions as QuRT program exceptions.
+
+ The exceptions are enabled by setting the corresponding bits in the Hexagon
+ control user status register (USR).
+
+ The mask argument specifies a mask value identifying the individual floating
+ point exceptions to set. The exceptions are represented as defined symbols
+ that map into bits 0 through 31 of the 32-bit flag value.
+ Multiple floating point exceptions are specified by OR'ing together the individual
+ exception symbols.\n
+ @note1hang This function must be called before performing any floating point operations.
+
+ @param[in] mask Floating point exception types. Values: \n
+ - #QURT_FP_EXCEPTION_ALL \n
+ - #QURT_FP_EXCEPTION_INEXACT \n
+ - #QURT_FP_EXCEPTION_UNDERFLOW \n
+ - #QURT_FP_EXCEPTION_OVERFLOW \n
+ - #QURT_FP_EXCEPTION_DIVIDE0 \n
+ - #QURT_FP_EXCEPTION_INVALID @tablebulletend
+
+ @return
+ Updated contents of the USR.
+
+ @dependencies
+ None.
+*/
+
+static inline unsigned int qurt_exception_enable_fp_exceptions(unsigned int mask)
+{
+    return qurt_enable_floating_point_exception(mask);
+}
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_EVENT_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_except.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_except.h
new file mode 100755
index 0000000000000..e1684c80e3d50
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_except.h
@@ -0,0 +1,185 @@
+#ifndef QURT_EXCEPT_H
+#define QURT_EXCEPT_H
+
+/**
+ @file qurt_except.h
+ @brief Defines Cause and Cause2 codes for error handling.
+
+EXTERNAL FUNCTIONS
+ None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None.
+
+ Copyright (c) 2021-2022 by Qualcomm Technologies, Inc. All Rights Reserved.
+
+ Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ QuRT supports error handling to handle CPU-detected exceptions and software errors.
+ QuRT treats all errors as either fatal errors or nonfatal errors.
+
+ @section sec1 Fatal errors
+ All supervisor mode exceptions are treated as fatal errors.
+ If a registered exception handler calls qurt_exit(), it is treated as a fatal error.
+ Fatal errors result in saving the context of the primary hardware thread to QURT_error_info and the rest of the thread contexts to the corresponding TCBs.
+   All hardware threads are eventually stopped and the cache is flushed.
+   An NMI exception is treated a little differently from other fatal errors: QuRT saves the contexts of all the hardware threads into QURT_error_info.\n
+
+   @subsection subsection1 Debugging fatal errors
+   - QURT_error_info.status.status -- Indicates that an error occurred.
+   - QURT_error_info.status.cause -- Cause code for the fatal error; Cause and Cause2 details are listed below.
+   - QURT_error_info.status.cause2 -- Cause2 code for the fatal error; Cause and Cause2 details are listed below.
+   - QURT_error_info.status.fatal -- Indicates whether a fatal error occurred. A user error can result in a fatal error if the exception handler is not registered.
+   - QURT_error_info.status.hw_tnum -- Indicates the index of QURT_error_info.locregs[], where the context is saved when the error is a fatal error.
+   - QURT_error_info.global_regs -- Contains the values of the global registers of Q6.
+   - QURT_error_info.local_regs[QURT_error_info.status.hw_tnum] -- Provides the CPU context when the error is a supervisor error.
+
+   @subsection subsection2 Debugging nonfatal errors
+   - QURT_error_info.user_errors -- All user errors are logged here.
+   - QURT_error_info.user_errors.counter -- Index to the last logged error.
+   - QURT_error_info.user_errors.entry[0...counter] -- Structure for the logged error.
+   - QURT_error_info.user_errors.entry[0...counter].error_tcb -- TCB for the user error.
+   - QURT_error_info.user_errors.entry[0...counter].error_tcb.error -- Information about the error: Cause, Cause2, Badva, and hardware thread ID.
+   - QURT_error_info.user_errors.entry[0...counter].error_code -- ((cause2 << 8) | (cause)); Cause and Cause2 details are listed below.
+   - QURT_error_info.user_errors.entry[0...counter].hw_thread -- Hardware thread ID for the error.
+   - QURT_error_info.user_errors.entry[0...counter].pcycle -- Pcycle for the error.
+
+@note
+   Important usage note:
+   Cause and Cause2 are error codes used to distinguish multiple errors.
+   SSR and BADVA are inconclusive without the vector number.
+   Both cause and cause2 can range from 1 to 255, and every cause can have 1 to 255 error codes.
+   Hence the system can have up to 255 * 255 unique error codes.
+   The combination is represented as ((cause2 << 8) | (cause)), a bitwise OR.
+   Some Cause2 codes are statically defined, whereas others are obtained from the SSR[7:0] cause codes, depending on the cause code.
+   SSR cause codes are defined in the Hexagon reference manual.
+   All possible combinations are listed below.
+*/
+/** @addtogroup chapter_error
+@{ */
+/* cause - error type - 8-bits*/
+#define QURT_EXCEPT_PRECISE             0x01U  /**< Precise exception occurred. For this cause code, Cause2 is SSR[7:0].*/
+#define QURT_EXCEPT_NMI                 0x02U  /**< NMI occurred; Cause2 is not defined. */
+#define QURT_EXCEPT_TLBMISS             0x03U  /**< TLBMISS RW occurred; for this cause code, Cause2 is SSR[7:0]. */
+#define QURT_EXCEPT_RSVD_VECTOR         0x04U  /**< Interrupt raised on a reserved vector, which must never occur. Cause2 is not defined. */
+#define QURT_EXCEPT_ASSERT              0x05U  /**< Kernel assert. Cause2 QURT_ABORT_* are listed below. */
+#define QURT_EXCEPT_BADTRAP             0x06U  /**< trap0(num) called with unsupported num. Cause2 is 0. */
+#define QURT_EXCEPT_UNDEF_TRAP1         0x07U  /**< Trap1 is not supported. Using Trap1 causes this error. Cause2 is not defined. */
+#define QURT_EXCEPT_EXIT                0x08U  /**< Application called qurt_exit() or qurt_exception_raise_nonfatal(). Can be called from the C library.
Cause2 is "[Argument passed to qurt_exception_raise_nonfatal() & 0xFF]". */
+#define QURT_EXCEPT_TLBMISS_X           0x0AU  /**< TLBMISS X (execution) occurred. Cause2 is not defined. */
+#define QURT_EXCEPT_STOPPED             0x0BU  /**< Running thread stopped due to a fatal error on another hardware thread. Cause2 is not defined. */
+#define QURT_EXCEPT_FATAL_EXIT          0x0CU  /**< Application called qurt_fatal_exit(). Cause2 is not defined. */
+#define QURT_EXCEPT_INVALID_INT         0x0DU  /**< Kernel received an invalid L1 interrupt. Cause2 is not defined. */
+#define QURT_EXCEPT_FLOATING_POINT      0x0EU  /**< Kernel received a floating-point error. Cause2 is not defined. */
+#define QURT_EXCEPT_DBG_SINGLE_STEP     0x0FU  /**< Cause2 is not defined. */
+#define QURT_EXCEPT_TLBMISS_RW_ISLAND   0x10U  /**< Read/write miss in Island mode. Cause2 QURT_TLB_MISS_RW_MEM* are listed below. */
+#define QURT_EXCEPT_TLBMISS_X_ISLAND    0x11U  /**< Execute miss in Island mode. For this cause code, Cause2 is SSR[7:0]. */
+#define QURT_EXCEPT_SYNTHETIC_FAULT     0x12U  /**< Synthetic fault with user request that the kernel detected. Cause2 QURT_SYNTH_* are listed below. */
+#define QURT_EXCEPT_INVALID_ISLAND_TRAP 0x13U  /**< Invalid trap in Island mode. Cause2 is the trap number. */
+#define QURT_EXCEPT_UNDEF_TRAP0         0x14U  /**< trap0(num) was called with unsupported num. Cause2 is the trap number. */
+#define QURT_EXCEPT_PRECISE_DMA_ERROR   0x28U  /**< Precise DMA error. Cause2 is DM4[15:8]. Badva is the DM5 register. */
+
+#define QURT_ECODE_UPPER_LIBC          (0U << 16)  /**< Upper 16 bits is 0 for libc. */
+#define QURT_ECODE_UPPER_QURT          (0U << 16)  /**< Upper 16 bits is 0 for QuRT. */
+#define QURT_ECODE_UPPER_ERR_SERVICES  (2U << 16)  /**< Upper 16 bits is 2 for the error service. */
+/** @cond */
+#define QURT_ECODE_ISLAND_INVALID_QDI   3U  /**< Passing an invalid QDI method in island. */
+/** @endcond */
+
+/* Cause2 for QURT_EXCEPT_SYNTHETIC_FAULT cause - 8 bits */
+#define QURT_SYNTH_ERR                   0x01U  /**< */
+#define QURT_SYNTH_INVALID_OP            0x02U  /**< */
+#define QURT_SYNTH_DATA_ALIGNMENT_FAULT  0x03U  /**< */
+#define QURT_SYNTH_FUTEX_INUSE           0x04U  /**< */
+#define QURT_SYNTH_FUTEX_BOGUS           0x05U  /**< */
+#define QURT_SYNTH_FUTEX_ISLAND          0x06U  /**< */
+#define QURT_SYNTH_FUTEX_DESTROYED       0x07U  /**< */
+#define QURT_SYNTH_PRIVILEGE_ERR         0x08U  /**< */
+
+/* Cause2 - Abort cause reason - 8 bits */
+/* ERR_ASSERT cause */
+#define QURT_ABORT_FUTEX_WAKE_MULTIPLE          0x01U  /**< Abort cause - futex wake multiple. */
+#define QURT_ABORT_WAIT_WAKEUP_SINGLE_MODE      0x02U  /**< Abort cause - thread waiting to wake up in Single Threaded mode. */
+#define QURT_ABORT_TCXO_SHUTDOWN_NOEXIT         0x03U  /**< Abort cause - call TCXO shutdown without exit. */
+#define QURT_ABORT_FUTEX_ALLOC_QUEUE_FAIL       0x04U  /**< Abort cause - futex allocation queue failure - QURTK_futexhash_lifo empty. */
+#define QURT_ABORT_INVALID_CALL_QURTK_WARM_INIT 0x05U  /**< Abort cause - invalid call to QURTK_warm_init() in non-CONFIG_POWER_MGMT mode. */
+#define QURT_ABORT_THREAD_SCHEDULE_SANITY       0x06U  /**< Abort cause - sanity check failed; the scheduled thread is not supposed to run on the current hardware thread. */
+#define QURT_ABORT_REMAP                        0x07U  /**< Remap in the page table; the correct behavior is to remove the mapping first if necessary. */
+#define QURT_ABORT_NOMAP                        0x08U  /**< No mapping in the page table when removing a user mapping. */
+#define QURT_ABORT_OUT_OF_SPACES                0x09U
+#define QURT_ABORT_INVALID_MEM_MAPPING_TYPE     0x0AU  /**< Invalid memory mapping type when creating qmemory. */
+#define QURT_ABORT_NOPOOL                       0x0BU  /**< No pool available to attach.
*/ +#define QURT_ABORT_LIFO_REMOVE_NON_EXIST_ITEM 0x0CU /**< Cannot allocate more futex waiting queue. */ +#define QURT_ABORT_ARG_ERROR 0x0DU +#define QURT_ABORT_ASSERT 0x0EU /**< Assert abort. */ +#define QURT_ABORT_FATAL 0x0FU /**< Fatal error; must never occur. */ +#define QURT_ABORT_FUTEX_RESUME_INVALID_QUEUE 0x10U /**< Abort cause - invalid queue ID in futex resume. */ +#define QURT_ABORT_FUTEX_WAIT_INVALID_QUEUE 0x11U /**< Abort cause - invalid queue ID in futex wait. */ +#define QURT_ABORT_FUTEX_RESUME_INVALID_FUTEX 0x12U /**< Abort cause - invalid futex object in hashtable. */ +#define QURT_ABORT_NO_ERHNDLR 0x13U /**< No registered error handler. */ +#define QURT_ABORT_ERR_REAPER 0x14U /**< Exception in the reaper thread. */ +#define QURT_ABORT_FREEZE_UNKNOWN_CAUSE 0x15U /**< Abort in thread freeze operation. */ +#define QURT_ABORT_FUTEX_WAIT_WRITE_FAILURE 0x16U /**< During futex wait processing, could not perform a necessary write operation to userland data; most likely due to a DLPager eviction. */ +#define QURT_ABORT_ERR_ISLAND_EXP_HANDLER 0x17U /**< Exception in Island exception handler task. */ +#define QURT_ABORT_L2_TAG_DATA_CHECK_FAIL 0x18U /**< Detected error in L2 tag/data during warm boot. The L2 tag/data check is done when CONFIG_DEBUG_L2_POWER_COLLAPSE is enabled. */ +#define QURT_ABORT_ERR_SECURE_PROCESS 0x19U /**< Abort error in secure process. */ +#define QURT_ABORT_ERR_EXP_HANDLER 0x20U /**< No exception handler, or the handler caused an exception. */ +#define QURT_ABORT_ERR_NO_PCB 0x21U /**< PCB of the thread context failed initialization, PCB was NULL. */ +#define QURT_ABORT_NO_PHYS_ADDR 0x22U /**< Unable to find the physical address for the virtual address. */ +#define QURT_ABORT_OUT_OF_FASTINT_CONTEXTS 0x23U /**< Fast interrupt contexts exhausted. */ +#define QURT_ABORT_CLADE_ERR 0x24U /**< Fatal error seen with CLADE interrupt. */ +#define QURT_ABORT_ETM_ERR 0x25U /**< Fatal error seen with ETM interrupt. */ +#define QURT_ABORT_ECC_DED_ASSERT 0x26U /**< ECC two-bit DED error. */ +#define QURT_ABORT_VTLB_ERR 0x27U /**< Fatal error in the VTLB layer. */ +#define QURT_ABORT_TLB_ENCODE_DECODE_FAILURE 0x28U /**< Failure during the TLB encode or decode operation. */ +#define QURT_ABORT_VTLB_WALKOBJS_BOUND_FAILURE 0x29U /**< Failure to lookup entry in the page table. */ +#define QURT_ABORT_PHY_MEMORY_OWNERSHIP_FAILURE 0x30U /**< Failure to claim phy memory ownership. */ +#define QURT_ABORT_JTLB_SIZE_CHECK_FAIL 0x31U /**< JTLB size configured is more than actual size in hardware */ +#define QURT_ABORT_AUTOSTACK_ASSERT 0x32U /**< Error while handling stack flimit exception. */ + +/* Cause2 - TLB-miss_X - 8bits */ +#define QURT_TLB_MISS_X_FETCH_PC_PAGE 0x60U /**< */ +#define QURT_TLB_MISS_X_2ND_PAGE 0x61U /**< */ +#define QURT_TLB_MISS_X_ICINVA 0x62U /**< */ + +/* Cause2 - TLB-miss_RW - 8bits */ +#define QURT_TLB_MISS_RW_MEM_READ 0x70U /**< */ +#define QURT_TLB_MISS_RW_MEM_WRITE 0x71U /**< */ + +/** @cond rest_reg_dist */ +/* Cause2 - Floating point exception - 8 bits */ +#define QURT_FLOATING_POINT_EXEC_ERR 0xBFU /**< Execute floating-point. 
*/
+/** @endcond */
+
+/** Cause2 - autostackv2 - 8 bits */
+#define QURT_AUTOSTACKV2_CANARY_NOT_MATCH    0xC1U
+#define QURT_AUTOSTACKV2_POOL_IDX_OFF_RANGE  0xC2U
+
+/** Cause2 - CFI violation - 8 bits */
+#define QURT_CFI_VIOLATION  0xC3U
+
+/** @cond rest_reg_dist*/
+/* Enable floating point exceptions */
+#define QURT_FP_EXCEPTION_ALL        (0x1FU << 25)  /**< */
+#define QURT_FP_EXCEPTION_INEXACT    (0x1U << 29)   /**< */
+#define QURT_FP_EXCEPTION_UNDERFLOW  (0x1U << 28)   /**< */
+#define QURT_FP_EXCEPTION_OVERFLOW   (0x1U << 27)   /**< */
+#define QURT_FP_EXCEPTION_DIVIDE0    (0x1U << 26)   /**< */
+#define QURT_FP_EXCEPTION_INVALID    (0x1U << 25)   /**< */
+
+/** @endcond */
+/** @} */ /* end_addtogroup chapter_error */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_EXCEPT_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_fastint.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_fastint.h
new file mode 100755
index 0000000000000..ea65dc0917fc0
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_fastint.h
@@ -0,0 +1,71 @@
+#ifndef QURT_FASTINT_H
+#define QURT_FASTINT_H
+
+/**
+  @file qurt_fastint.h
+  @brief QuRT fast interrupt functions
+
+  Copyright (c) 2013-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+
+  ======================================================================*/
+
+/*======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup func_qurt_fastint_register
+  Registers the fast interrupt callback function.
+
+  A fast interrupt callback should be designed to perform the minimal necessary
+  actions for the interrupt, and/or perform some operations, such as signaling
+  another regular software thread to start any additional processing.
+  The callback should be a fast and short function. While a fast interrupt callback
+  is running, the corresponding interrupt cannot be re-enabled until the callback
+  returns.
+
+  The fast interrupt callback must not use any system blocking calls, such as
+  mutex lock or signal wait; otherwise, errors result.
+
+  The fast interrupt callback function takes a single integer argument and does
+  not return a value. The argument value passed in is the interrupt
+  number, so a single callback function can handle
+  multiple fast interrupts.
+
+  @param[in] intno  Interrupt number to register.
+  @param[in] fn     Interrupt callback function.
+
+  @return
+  #QURT_EOK -- Fast interrupt registration is successful. \n
+  #QURT_EINVALID -- Interrupt is already registered. \n
+  #QURT_EINT -- Invalid interrupt number.
+*/
+/* ======================================================================*/
+unsigned int qurt_fastint_register(int intno, void (*fn)(int));
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_fastint_deregister
+  Deregisters the fast interrupt callback function.
+
+  @param[in] intno  Level-one interrupt number to deregister. Valid range is 1 and 10 through 31
+                    (simulator only).
+
+  @return
+  #QURT_EOK -- Interrupt deregistration is successful. \n
+  #QURT_EINT -- Invalid interrupt number (not registered). \n
+  #QURT_EINVALID -- Invalid interrupt number (already deregistered).
+
+  @dependencies
+  None.
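+
+  A minimal sketch of the register/deregister pair (the interrupt number 23
+  and the callback are hypothetical; valid numbers are target-specific):
+  @code
+  // Fast interrupt handler: must be short and must not block.
+  static void my_fastint_cb(int intno)
+  {
+      (void)intno;   // for example, signal a worker thread here
+  }
+
+  void fastint_example(void)
+  {
+      if (qurt_fastint_register(23, my_fastint_cb) == QURT_EOK) {
+          // ... interrupt is live ...
+          (void)qurt_fastint_deregister(23);
+      }
+  }
+  @endcode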
+*/
+/* ======================================================================*/
+unsigned int qurt_fastint_deregister(int intno);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_FASTINT_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_fs_hub.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_fs_hub.h
new file mode 100755
index 0000000000000..aaa050a6c838b
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_fs_hub.h
@@ -0,0 +1,58 @@
+#ifndef QURT_FS_HUB_H
+#define QURT_FS_HUB_H
+
+/**
+  @file qurt_fs_hub.h
+  @brief Definitions, macros, and prototypes used when writing a
+  QDI driver that provides file-system functionality.
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+  ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+  This structure tracks a file designator for an FS-hub QDI driver.
+  A file system's QDI interface should use this object to encapsulate the
+  true file descriptor and return a QDI handle. This QDI handle
+  is used as the file descriptor by the file-system hub.
+ */
+
+typedef struct qurt_qdi_fs_obj
+{
+    qurt_qdi_obj_t qdi_obj;
+    int client_handle;
+    int fd;
+} qurt_qdi_fs_obj_t;
+
+
+/**@ingroup fs_hub_support_functions
+  This function allows a file system to register its QDI interface with the file-system hub.
+  Once registered, all file open operations for any filenames containing the mountpoint are
+  forwarded to the QDI interface.
+
+  The mountpoint string must be enclosed in forward slashes, for example, "/mountpoint/".
+
+  @param mtpoint  Mount point for the file system being registered.
+  @param opener   Opener structure for the QDI driver interface.
+
+  @return
+  QURT_EOK -- Successfully registered the QDI driver with the file-system hub. \n
+  Negative error code -- Failed to register with the file-system hub.
+ */
+int qurt_fs_hub_mtpoint_register(const char *mtpoint, qurt_qdi_obj_t *opener);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_futex.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_futex.h
new file mode 100755
index 0000000000000..1fdcc79a43f01
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_futex.h
@@ -0,0 +1,82 @@
+#ifndef QURT_FUTEX_H
+#define QURT_FUTEX_H
+/**
+  @file qurt_futex.h
+
+  @brief Prototypes of QuRT futex API functions
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2013, 2020-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+  ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+
+/**@ingroup func_qurt_futex_wait
+  Moves the caller thread into the waiting state when a memory object address
+  contains a value that is the same as the specified value.
+
+  @param[in] lock  Pointer to the object memory.
+  @param[in] val   Value to check against the object content.
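+
+  A hedged wait/wake sketch (the shared word and the values are illustrative;
+  qurt_futex_wake() is documented below in this file):
+  @code
+  static int flag = 0;   // shared object word
+
+  void waiter(void)
+  {
+      // Block only while the word still holds the expected value.
+      while (flag == 0) {
+          (void)qurt_futex_wait(&flag, 0);
+      }
+  }
+
+  void waker(void)
+  {
+      flag = 1;                         // change the value first
+      (void)qurt_futex_wake(&flag, 1);  // then wake one waiting thread
+  }
+  @endcode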
+
+  @return
+  #QURT_EOK -- Success \n
+  Other values -- Failure
+
+  @dependencies
+  None.
+ */
+int qurt_futex_wait(void *lock, int val);
+
+
+/**@ingroup func_qurt_futex_wait_cancellable
+  If a memory object address contains a value that is the same as the specified
+  value, moves the caller thread into the waiting state.
+  The kernel can cancel the waiting state when necessary.
+
+  @param[in] lock  Pointer to the object memory.
+  @param[in] val   Value to check against the object content.
+
+  @return
+  #QURT_EOK -- Success \n
+  Other values -- Failure
+
+  @dependencies
+  None.
+ */
+int qurt_futex_wait_cancellable(void *lock, int val);
+
+
+/**@ingroup func_qurt_futex_wake
+  Wakes up a specified number of threads that have been waiting
+  for the object change with qurt_futex_wait().
+
+  @param[in] lock       Pointer to the object memory.
+  @param[in] n_to_wake  Maximum number of threads to wake up.
+
+  @return
+  Number of threads woken up by this function.
+
+  @dependencies
+  None.
+ */
+int qurt_futex_wake(void *lock, int n_to_wake);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_FUTEX_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_hmx.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_hmx.h
new file mode 100755
index 0000000000000..e4037dbeae514
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_hmx.h
@@ -0,0 +1,226 @@
+#ifndef QURT_HMX_H
+#define QURT_HMX_H
+/**
+  @file qurt_hmx.h
+  @brief Prototypes of QuRT HMX API.
+
+Copyright (c) 2019-2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+ TYPEDEFS
+=============================================================================*/
+
+
+/** @addtogroup hmx_types
+@{ */
+/* HMX locking type */
+#define QURT_HMX_NON_SHARED_LOCK    0U  /**< HMX locking type.*/
+#define QURT_HMX_SHARED_LOCK        1U  /**< HMX locking type.*/
+
+/* HMX unlocking type */
+#define QURT_HMX_NON_SHARED_UNLOCK  0U  /**< HMX unlocking type.*/
+#define QURT_HMX_SHARED_UNLOCK      1U  /**< HMX unlocking type.*/
+
+/* HMX hardware context */
+#define QURT_HMX_UNIT_0             0U  /**< HMX hardware context #0 */
+#define QURT_HMX_UNIT_1             1U  /**< HMX hardware context #1 */
+/** @} */ /* end_addtogroup hmx_types */
+
+
+/*=============================================================================
+ FUNCTIONS
+=============================================================================*/
+
+
+/**@ingroup func_qurt_hmx_lock2
+  Locks an HMX unit with the specified locking type.
+
+  #QURT_HMX_NON_SHARED_LOCK:
+  - If an HMX unit is available, lock the unit and return #QURT_EOK.
+  - If the HMX unit is already locked by another thread, the caller thread is suspended
+    until the HMX is available and gets locked by this function.
+  - If there is no HMX hardware supported, return #QURT_EVAL.
+
+  #QURT_HMX_SHARED_LOCK:
+  - If an HMX unit is available, enable HMX access for the caller thread, and return
+    #QURT_EOK.
+  - If the HMX is already enabled on the caller thread, return #QURT_EFAILED.
+  - If the HMX is locked by another thread in the same user process as the caller
+    thread with locking type #QURT_HMX_SHARED_LOCK, enable HMX access for the caller
+    thread, and return #QURT_EOK.
+  - If the HMX is locked by another thread in the same user process as the caller
+    thread with locking type #QURT_HMX_NON_SHARED_LOCK, return #QURT_EFAILED.
+  - If the HMX is locked by a thread from a user process different from the
+    user process of the caller thread, return #QURT_EFAILED.
+  - If there is no HMX hardware supported, return #QURT_EVAL.
+
+  @param[in] type  Locking type.
+
+  @return
+  #QURT_EOK -- HMX lock successful.\n
+  #QURT_EFAILED -- Failure due to wrong locking condition.\n
+  #QURT_EVAL -- Failure because no HMX hardware is supported.
+
+  @dependencies
+  None.
+
+ */
+int qurt_hmx_lock2(unsigned int type);
+
+
+/**@ingroup func_qurt_hmx_unlock2
+  Unlocks an HMX unit with the specified unlocking type.
+
+  #QURT_HMX_NON_SHARED_UNLOCK:
+  - If there is an HMX unit locked by the caller thread, unlock the HMX unit and clear the
+    HMX accumulators (assuming a fixed-point type).
+  - If there is no HMX unit locked by the caller thread, return #QURT_EFAILED.
+  - If there is no HMX hardware supported, return #QURT_EVAL.
+
+  #QURT_HMX_SHARED_UNLOCK:
+  - If the caller thread has locked HMX with type #QURT_HMX_SHARED_LOCK, disable the
+    HMX access on the caller thread, and return #QURT_EOK.
+    Note: If the caller thread is the last thread that unlocks for #QURT_HMX_SHARED_LOCK
+    in its user process, the unlock function clears the HMX accumulators.
+  - If the caller thread has locked HMX with type #QURT_HMX_NON_SHARED_LOCK, return
+    #QURT_EFAILED.
+  - If the caller thread has not locked HMX, return #QURT_EFAILED.
+  - If there is no HMX hardware supported, return #QURT_EVAL.
+
+  @param[in] type  Unlocking type.
+
+  @return
+  #QURT_EOK -- HMX is unlocked successfully. \n
+  #QURT_EFAILED -- Failure due to wrong unlocking condition. \n
+  #QURT_EVAL -- Failure because no HMX hardware is supported.
+
+  @dependencies
+  None.
+
+ */
+int qurt_hmx_unlock2(unsigned int type);
+
+
+/**@ingroup func_qurt_hmx_lock
+  Locks an HMX unit.
+  If an HMX unit is available, this function locks the unit and returns right away.
+  If there is no HMX unit available, the caller is blocked until an HMX unit is available
+  and is locked by the function.
+
+  @return
+  #QURT_EOK -- HMX lock successful. \n
+  #QURT_EFAILED -- Failure due to wrong locking condition. \n
+  #QURT_EVAL -- Failure because no HMX hardware is supported.
+
+  @dependencies
+  None.
+ */
+int qurt_hmx_lock(void);
+
+
+/**@ingroup func_qurt_hmx_unlock
+  Unlocks an HMX unit.
+  If an HMX unit is locked by the caller thread, unlocks the HMX unit and clears its
+  accumulators (assuming a fixed-point type).
+  If there is no HMX unit locked by the caller thread, returns failure.
+
+  @return
+  #QURT_EOK -- HMX unlock successful. \n
+  #QURT_EFAILED -- Failure due to wrong unlocking condition. \n
+  #QURT_EVAL -- Failure because no HMX hardware is supported.
+
+  @dependencies
+  None.
+ */
+int qurt_hmx_unlock(void);
+
+
+/**@ingroup func_qurt_hmx_try_lock
+  Tries to lock an HMX unit.
+  If an HMX unit is available, this function locks the unit and returns right away;
+  if there is no HMX unit available, the function returns failure without blocking the caller.
+
+  @return
+  #QURT_EOK -- HMX lock successful \n
+  #QURT_EFAILED -- Failure due to wrong locking condition.\n
+  #QURT_EVAL -- Failure because no HMX hardware is supported.
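+
+  A minimal non-blocking sketch (the fallback path is illustrative):
+  @code
+  if (qurt_hmx_try_lock() == QURT_EOK) {
+      // ... issue HMX matrix operations here ...
+      (void)qurt_hmx_unlock();
+  } else {
+      // HMX busy or absent; use a non-HMX fallback path.
+  }
+  @endcode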
+
+  @dependencies
+  None.
+ */
+int qurt_hmx_try_lock(void);
+
+
+/**@ingroup func_qurt_hmx_assign
+  Assigns an HMX unit to a target thread specified by its thread identifier.
+  The HMX unit (HMX hardware context) is specified by hmx_unit.
+  The caller of this function is limited to the SRM process.
+  If the requested hmx_unit is already assigned to another thread with QURT_HMX_NON_SHARED_LOCK,
+  the kernel detaches it from that thread and re-assigns it to the target thread.
+  If the target thread has HVX enabled, it cannot have HMX enabled.
+
+  Locking type
+  #QURT_HMX_NON_SHARED_LOCK:
+  - If the HMX unit is available, lock the HMX unit and return #QURT_EOK.
+  - If the HMX unit is already enabled on the target thread, return #QURT_EOK.
+  - If the HMX unit is already locked by another thread, detach the HMX from that thread,
+    re-assign the HMX unit to the target thread, and return #QURT_EOK.
+
+  @param[in] thread_id  Thread identifier.
+  @param[in] type       Locking type.
+                        #QURT_HMX_NON_SHARED_LOCK -- non-shared lock
+  @param[in] hmx_unit   HMX hardware context number.
+                        #QURT_HMX_UNIT_0
+                        #QURT_HMX_UNIT_1
+
+  @return
+  #QURT_EOK -- The HMX is assigned successfully. This includes the case in which \n
+               the target thread already has HMX assigned. \n
+  #QURT_EFAILED -- Failure due to wrong assigning conditions. \n
+  #QURT_EINVALID -- Failure because no HMX hardware is supported.
+
+  @dependencies
+  None.
+ */
+int qurt_hmx_assign ( unsigned int thread_id, unsigned int type, unsigned int hmx_unit );
+
+
+/**@ingroup func_qurt_hmx_release
+  Releases an HMX unit from a target thread specified by its thread identifier.
+  The HMX unit (HMX hardware context) is specified by hmx_unit.
+  The caller of this function is limited to the SRM process.
+
+  QuRT detaches the specified HMX unit from the target thread and returns
+  #QURT_EOK. If the HMX unit is already released from the target thread, it returns #QURT_EOK.
+
+  @param[in] thread_id  Thread identifier.
+  @param[in] hmx_unit   HMX hardware context number.
+                        #QURT_HMX_UNIT_0
+                        #QURT_HMX_UNIT_1
+
+  @return
+  #QURT_EOK -- The HMX is released successfully. This includes the case in which \n
+               the target thread already has the HMX released. \n
+  #QURT_EFAILED -- Failure due to wrong assigning condition. \n
+  #QURT_EINVALID -- Failure because no HMX hardware is supported.
+
+  @dependencies
+  None.
+ */
+int qurt_hmx_release ( unsigned int thread_id, unsigned int hmx_unit );
+
+
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_HMX_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_hvx.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_hvx.h
new file mode 100755
index 0000000000000..13c213d49ac84
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_hvx.h
@@ -0,0 +1,421 @@
+#ifndef QURT_HVX_H
+#define QURT_HVX_H
+/**
+  @file qurt_hvx.h
+  @brief Prototypes of QuRT HVX API.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2021-2022 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+ TYPEDEFS
+=============================================================================*/
+/** @cond */
+
+typedef enum {
+    QURT_HVX_MODE_64B  = 0,  /**< HVX mode of 64 bytes */
+    QURT_HVX_MODE_128B = 1   /**< HVX mode of 128 bytes */
+} qurt_hvx_mode_t;
+/** @endcond */
+/*=============================================================================
+ CONSTANTS AND MACROS
+=============================================================================*/
+/** @cond internal_only*/
+/** @addtogroup hvx_macros
+@{ */
+#define QURT_HVX_HW_UNITS_2X128B_4X64B  0x00000204  /**< Bits 15 through 8 are for the number of 128B units. */
+                                                    /**< Bits 7 through 0 are for the number of 64B units. */
+#define QURT_HVX_HW_UNITS_4X128B_0X64B  0x00000400
+#define QURT_HVX_HW_UNITS_6X128B_0X64B  0x00000600
+
+/* HVX locking status */
+
+#define QURT_HVX_UNLOCKED   (0)   /* Has not locked an HVX unit */
+#define QURT_HVX_LOCKED     (1)   /* Has locked an HVX unit */
+#define QURT_HVX_ERROR      (-1)  /* Error, no HVX support */
+
+/* Input value for HVX reservation */
+
+#define QURT_HVX_RESERVE_ALL            (4)    /* All the HVX units in terms of 64B_MODE are requested to be reserved */
+#define QURT_HVX_RESERVE_ALL_AVAILABLE  (0xff) /* All remaining unlocked HVX units in terms of 64B_MODE are requested to be reserved */
+
+/* Return values for HVX reservation */
+
+#define QURT_HVX_RESERVE_NOT_SUPPORTED   (-1)  /* There is no HVX hardware, or the hardware has fewer units than requested */
+#define QURT_HVX_RESERVE_NOT_SUCCESSFUL  (-2)  /* Some HVX units are already locked/reserved by another PD, so not enough units are left for the reservation. */
+#define QURT_HVX_RESERVE_ALREADY_MADE    (-3)  /* An HVX reservation has already been made. */
+#define QURT_HVX_RESERVE_CANCEL_ERR      (-4)  /* Canceling the reservation fails because this protection domain has made no prior reservation. */
+
+// HVX set requests
+
+#define QURT_HVX_64B              0  /**< */
+#define QURT_HVX_128B             1  /**< */
+#define QURT_HVX_NO_USE           2  /**< */
+#define QURT_HVX_RELEASE_CONTEXT  3  /**< */
+#define QURT_HVX_IMMEDIATE_USE    4  /**< */
+
+// HVX set masks
+
+#define QURT_HVX_64B_PREFERRED    (1<<(QURT_HVX_64B + 8))   /**< */
+#define QURT_HVX_128B_PREFERRED   (1<<(QURT_HVX_128B + 8))  /**< */
+#define QURT_HVX_64B_ACCEPTABLE   (1<<(QURT_HVX_64B + 12))  /**< */
+#define QURT_HVX_128B_ACCEPTABLE  (1<<(QURT_HVX_128B + 12)) /**< */
+
+// HVX set return "result"
+
+#define QURT_EOK            0     /**< */
+#define QURT_HVX_SET_ERROR  0xFF  /**< */
+
+// hvx_mode_assigned for QURT_HVX_IMMEDIATE_USE
+#define QURT_HVX_64B_ASSIGNED   (1<<(QURT_HVX_64B + 8))   /**< */
+#define QURT_HVX_128B_ASSIGNED  (1<<(QURT_HVX_128B + 8))  /**< */
+
+// Sizes of HVX dump buffer
+
+#define QURT_HVX_V65_64B_VSIZE   2084U  /**< 64 x 32 + 8 x 4 + 4 (version). */
+#define QURT_HVX_V65_128B_VSIZE  4164U  /**< 128 x 32 + 16 x 4 + 4 (version). */
+#define QURT_HVX_V66_128B_VSIZE  4420U  /**< 128 x (32 +2) + 16 x 4 + 4 (version). */
+#define QURT_HVX_V68_128B_VSIZE  4164U  /**< 128 x 32 + 16 x 4 + 4 (version). */
+#define QURT_HVX_V79_128B_VSIZE  4740U  /**< 128 x (32+4+1) + 4 (version).
*/
+#define QURT_HVX_VREG_BUF_SIZE   QURT_HVX_V79_128B_VSIZE  /**< */
+
+// HVX dump versions
+
+#define QURT_HVX_DUMP_V65_64B   1U  /**< */
+#define QURT_HVX_DUMP_V65_128B  2U  /**< */
+#define QURT_HVX_DUMP_V66_128B  3U  /**< */
+#define QURT_HVX_DUMP_V68_128B  4U  /**< */
+#define QURT_HVX_DUMP_V79_128B  5U  /**< */
+/** @} */ /* end_addtogroup hvx_macros */
+/** @endcond */
+/** @cond */
+// QuRT data struct for hvx_set input
+typedef struct qurt_hvx_set_struct_ {
+    unsigned char set_req;   // LSB
+    struct {
+        unsigned char preferred_mask:4;
+        unsigned char acceptable_mask:4;
+    };
+    unsigned short resvd;    // MSB
+} qurt_hvx_set_struct_t;     // 4 bytes
+
+
+// QuRT data struct for hvx_set return
+typedef struct qurt_hvx_set_return_str_ {
+    unsigned char result;    // LSB
+    unsigned char hvx_mode_assigned;
+    unsigned short resvd;    // MSB
+} qurt_hvx_set_return_struct_t;  // 4 bytes
+/** @endcond */
+
+
+/*=============================================================================
+ FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_hvx_lock
+  Locks one HVX unit specified by the HVX mode.
+
+  @note1hang The input variable can be 128B_MODE or 64B_MODE. If an HVX unit in this mode
+             is available, this function locks the unit and returns right away.
+             If the current HVX mode is different from the requested mode, the current
+             thread is blocked. When all HVX units become idle, QuRT changes
+             the mode, locks the HVX unit, and returns.
+
+             Starting from Q6v65 with HVX context switch support, qurt_hvx_lock() is
+             mapped as qurt_hvx_set(64_BYTE or 128_BYTE).
+
+  @datatypes
+  #qurt_hvx_mode_t
+
+  @param[in] lock_mode #QURT_HVX_MODE_64B or #QURT_HVX_MODE_128B.
+
+  @return
+  #QURT_EOK -- Success \n
+  Other value -- Failure
+
+  @dependencies
+  None.
+
+ */
+int qurt_hvx_lock(qurt_hvx_mode_t lock_mode);
+
+/**@ingroup func_qurt_hvx_unlock
+  Unlocks the HVX unit held by this software thread.
+
+  @note1hang Starting from Q6v65 with HVX context switch support, qurt_hvx_unlock()
+             maps as qurt_hvx_set(QURT_HVX_RELEASE_CONTEXT).
+
+  @return
+  #QURT_EOK -- Successful return \n
+  Other values -- Failure
+
+  @dependencies
+  None.
+
+ */
+int qurt_hvx_unlock(void);
+
+/**@ingroup func_qurt_hvx_try_lock
+  Tries to lock one HVX unit specified by the HVX mode.
+
+  @note1hang The input variable can be 128B_MODE or 64B_MODE. If an HVX unit in this mode
+             is available, this function locks the unit and returns #QURT_EOK; otherwise,
+             the function returns a failure, but does not block the current software
+             thread to wait for the HVX unit.
+             Starting from Q6v65 with HVX context switch support, qurt_hvx_try_lock()
+             maps to qurt_hvx_set(FOR_IMMEDIATE_USE | preferred_mask | acceptable_mask);
+
+  @datatypes
+  #qurt_hvx_mode_t
+
+  @param[in] lock_mode #QURT_HVX_MODE_64B or #QURT_HVX_MODE_128B.
+
+  @return
+  #QURT_EOK -- Successful return \n
+  Other values -- Failure
+
+  @dependencies
+  None.
+
+ */
+int qurt_hvx_try_lock(qurt_hvx_mode_t lock_mode);
+
+/**@ingroup func_qurt_hvx_get_mode
+  Gets the current HVX mode configured by QuRT.
+
+  @note1hang Returns #QURT_HVX_MODE_128B or #QURT_HVX_MODE_64B, based on
+             the current HVX configuration.
+
+  @return
+  #QURT_HVX_MODE_128B \n
+  #QURT_HVX_MODE_64B \n
+  -1 -- Not available.
+
+  @dependencies
+  None.
+ */
+int qurt_hvx_get_mode(void);
+
+
+/**@ingroup func_qurt_hvx_get_units
+  Gets the HVX hardware configuration that the chipset supports.
+
+  @note1hang The function returns the HVX hardware configuration supported by the chipset.
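+
+  An illustrative decoding of the return value, assuming the bit layout
+  described for #QURT_HVX_HW_UNITS_2X128B_4X64B above:
+  @code
+  int units = qurt_hvx_get_units();
+  if (units > 0) {
+      unsigned num_128b = ((unsigned)units >> 8) & 0xFFU;  // 128-byte contexts
+      unsigned num_64b  = (unsigned)units & 0xFFU;         // 64-byte contexts
+  }
+  @endcode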
+
+  @return
+  Bitmask of the units: 1X64, 2X64, 4X64, 1X128, 2X128, and so on.\n
+  - QURT_HVX_HW_UNITS_2X128B_4X64B -- V60, V62, or V65 HVX \n
+  - QURT_HVX_HW_UNITS_4X128B_0X64B -- V66 CDSP or newer \n
+  - 0 -- Not available
+
+  @dependencies
+  None.
+
+ */
+int qurt_hvx_get_units(void);
+
+
+/**@ingroup func_qurt_hvx_reserve
+  Reserves HVX units in terms of 64-byte mode for the protection domain (PD) of the caller.
+
+  @note1hang Only one HVX reservation in the system is supported.
+             If one HVX unit is already locked by the application in the same PD, the unit is
+             added to the returned count as one reserved unit for the PD.
+             Starting from Q6v65 with HVX context switch support, qurt_hvx_reserve()
+             only does basic sanity checks on HVX units.
+
+  @datatypes
+  None.
+
+  @param[in] num_units  Number of HVX units in terms of 64B_MODE to reserve for the PD.
+                        QURT_HVX_RESERVE_ALL to reserve all the HVX units.
+                        QURT_HVX_RESERVE_ALL_AVAILABLE to reserve the remaining unlocked units.
+
+  @return
+  Number of units successfully reserved, including the units already locked in the same PD. \n
+  #QURT_HVX_RESERVE_NOT_SUPPORTED \n
+  #QURT_HVX_RESERVE_NOT_SUCCESSFUL \n
+  #QURT_HVX_RESERVE_ALREADY_MADE
+
+
+  @dependencies
+  None.
+
+ */
+int qurt_hvx_reserve(int num_units);
+
+
+/**@ingroup func_qurt_hvx_cancel_reserve
+  Cancels the HVX reservation in the protection domain (PD) of the caller.
+
+  @note1hang Only one HVX reservation in the system is supported.
+
+  @return
+  0 -- Success \n
+  #QURT_HVX_RESERVE_CANCEL_ERR -- Failure
+
+  @dependencies
+  None.
+
+ */
+int qurt_hvx_cancel_reserve(void);
+
+
+/**@ingroup func_qurt_hvx_get_lock_val
+  Gets the HVX locking status value of the caller thread.
+
+  @note1hang Returns the status of whether the caller thread has already locked an HVX unit.
+
+  @datatypes
+  None.
+
+  @return
+  #QURT_HVX_UNLOCKED \n
+  #QURT_HVX_LOCKED \n
+  #QURT_HVX_ERROR
+
+  @dependencies
+  None.
+ */
+int qurt_hvx_get_lock_val(void);
+
+/** @cond internal_only*/
+/**@ingroup func_qurt_hvx_set
+  Sets the HVX configuration for the software thread of the caller.
+
+  @datatypes
+  None.
+
+  @param[in] input_arg  Composed of set_request | hvx_preferred_mode_mask
+                        | hvx_acceptable_mode_mask where set_request can be set to: \n
+                        - #QURT_HVX_64B \n
+                        - #QURT_HVX_128B \n
+                        - #QURT_HVX_NO_USE \n
+                        - #QURT_HVX_RELEASE_CONTEXT \n
+                        - #QURT_HVX_IMMEDIATE_USE \n
+                        When set_request is QURT_HVX_IMMEDIATE_USE,
+                        hvx_preferred_mode_mask can be set to: \n
+                        - #QURT_HVX_64B_PREFERRED \n
+                        - #QURT_HVX_128B_PREFERRED
+                        When set_request is QURT_HVX_IMMEDIATE_USE,
+                        hvx_acceptable_mode_mask can be set to: \n
+                        - #QURT_HVX_64B_ACCEPTABLE \n
+                        - #QURT_HVX_128B_ACCEPTABLE @tablebulletend
+
+  @return
+  Result of the HVX setting in the least significant 8 bits of the returned data. \n
+  #QURT_EOK -- 0 \n
+  #QURT_HVX_SET_ERROR -- 0xFF \n
+  When #QURT_HVX_IMMEDIATE_USE has a result of #QURT_EOK,
+  bit 8 to bit 15 of the returned data contain hvx_mode_assigned:\n
+  - #QURT_HVX_64B_ASSIGNED \n
+  - #QURT_HVX_128B_ASSIGNED
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_hvx_set(unsigned int input_arg);
+
+
+/**@ingroup func_qurt_system_hvx_regs_get_maxsize
+  Returns the maximum buffer size for saving HVX registers.
+
+  @datatypes
+  None.
+
+  @return
+  0 -- No HVX supported in the target. \n
+  #QURT_HVX_VREG_BUF_SIZE -- Maximum buffer size for saving HVX registers.
+
+  @dependencies
+  None.
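+
+  A hedged sizing sketch (static buffer; the 256-byte slack and the alignment
+  handling follow the qurt_system_hvx_regs_get() description below):
+  @code
+  static unsigned char vbuf[QURT_HVX_VREG_BUF_SIZE + 256];
+
+  void size_check_example(void)
+  {
+      unsigned int max = qurt_system_hvx_regs_get_maxsize();
+      if ((max != 0U) && (max <= sizeof(vbuf))) {
+          // vbuf can hold any HVX register dump on this target.
+      }
+  }
+  @endcode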
+ */
+unsigned int qurt_system_hvx_regs_get_maxsize(void);
+
+
+/**@ingroup func_qurt_system_hvx_regs_get_size
+  Returns the buffer size for saving HVX registers for a specified thread.
+
+  @param[in] thread_id  Thread ID of the target thread.
+
+  @return
+  0 -- No HVX assigned to the thread. \n
+  size -- Size of the buffer in bytes for saving HVX registers for the specified thread: \n
+          - #QURT_HVX_V65_64B_VSIZE -- 64 x 32 + 8 x 4 + 4 (version) \n
+          - #QURT_HVX_V65_128B_VSIZE -- 128 x 32 + 16 x 4 + 4 (version) \n
+          - #QURT_HVX_V66_128B_VSIZE -- 128 x (32 +2) + 16 x 4 + 4 (version) \n
+          - #QURT_HVX_V68_128B_VSIZE -- 128 x 32 + 16 x 4 + 4 (version) \n
+          - #QURT_HVX_V79_128B_VSIZE -- 128 x (32+4+1) + 4 (version)
+
+
+  @dependencies
+  None.
+
+ */
+unsigned int qurt_system_hvx_regs_get_size(unsigned int thread_id);
+
+
+
+/**@ingroup func_qurt_system_hvx_regs_get
+  Saves the HVX registers into the specified buffer.
+  Returns the size of the data saved into the buffer.
+  After calling this function for the first time on a specified thread_id, the QuRT kernel removes the internal HVX saving buffer
+  from the specified thread. When calling the function on the same thread_id for the second time, this function returns 0.
+
+  @param[in] thread_id  Thread ID of the target thread.
+  @param[in] pBuf  Pointer to the buffer for HVX register saving.
+                   The first four bytes of the buffer are for saving the HVX version. HVX registers are saved from
+                   the fifth byte of the buffer. The address of the fifth byte should be 256-byte aligned.
+                   For example, a buffer can be declared at first as: \n
+                   unsigned char vbuf[QURT_HVX_VREG_BUF_SIZE+256];\n
+                   unsigned char *pBuf; \n
+                   then align the buffer pointer to: \n
+                   pBuf = vbuf; \n
+                   pBuf += (256 - 4 - (unsigned)pBuf%256);
+  @param[in] size  Size of the buffer provided, which is pointed to by *pBuf. The buffer size must not be smaller than that
+                   returned from qurt_system_hvx_regs_get_size(), and pBuf must be aligned as described above.
+  @param[out] pBuf Buffer returned with the saved HVX registers (unsigned char hvx_regs[];), which are saved from the fifth
+                   byte of the buffer, and the HVX version (unsigned int hvx_version;) in the first four bytes,
+                   containing one of the HVX dump versions:\n
+                   - #QURT_HVX_DUMP_V65_64B \n
+                   - #QURT_HVX_DUMP_V65_128B \n
+                   - #QURT_HVX_DUMP_V66_128B \n
+                   - #QURT_HVX_DUMP_V68_128B \n
+                   - #QURT_HVX_DUMP_V79_128B \n
+                   @tablebulletend
+
+  @return
+  Total bytes of the data saved in the provided buffer. \n
+  0 -- No HVX assigned to the thread \n
+  #QURT_HVX_V65_64B_VSIZE -- 64 x 32 + 8 x 4 + 4 (version) \n
+  #QURT_HVX_V65_128B_VSIZE -- 128 x 32 + 16 x 4 + 4 (version) \n
+  #QURT_HVX_V66_128B_VSIZE -- 128 x (32 +2) + 16 x 4 + 4 (version) \n
+  #QURT_HVX_V68_128B_VSIZE -- 128 x 32 + 16 x 4 + 4 (version) \n
+  #QURT_HVX_V79_128B_VSIZE -- 128 x (32+4+1) + 4 (version)
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_system_hvx_regs_get(unsigned int thread_id, void *pBuf, size_t size);
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_HVX_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_int.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_int.h
new file mode 100755
index 0000000000000..386aeda1051eb
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_int.h
@@ -0,0 +1,509 @@
+#ifndef QURT_INT_H
+#define QURT_INT_H
+/**
+  @file qurt_int.h
+  @brief QuRT interrupt functions.
+ + + + Copyright (c) 2013-2021, 2023 Qualcomm Technologies, Inc. + All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ + + +/** @cond rest_reg_dist */ +/** @addtogroup interrupts_constants +@{ */ +#define SIG_INT_ABORT 0x80000000 /**< */ +#define QURT_INT_NON_DELAYED_ACK 0 +#define QURT_INT_DELAYED_ACK 1 +#define QURT_INT_ACK_DEFAULT QURT_INT_NON_DELAYED_ACK +#define QURT_INT_DRV_DEFAULT 0 +#define QURT_INT_PRIORITY_DEFAULT 0xFF + +/** QuRT interrupt property. */ +#define QURT_INT_CONFIGID_POLARITY 0x1U /**< */ +#define QURT_INT_CONFIGID_LOCK 0x2U /**< */ + +/** QuRT interrupt lock.*/ +#define QURT_INT_LOCK_DEFAULT 0x0 /**< Default. */ +#define QURT_INT_LOCK_DISABLE 0x0 /**< Interrupt can be enabled or disabled or deregistered. */ +#define QURT_INT_LOCK_ENABLE 0x1 /**< Interrupt is locked and cannot be enabled, disabled, or deregistered.*/ +/** @} */ /* end_addtogroup interrupts_constants */ + +/** @addtogroup Qurt_interrupt_type +@{ */ +/** Trigger type bit fields for a PDC interrupt:\n + @verbatim + Polarity Edge Output\n + 0 00 Level sensitive active low + 0 01 Rising edge sensitive + 0 10 Falling edge sensitive + 0 11 Dual edge sensitive + 1 00 Level sensitive active high + 1 01 Falling edge sensitive + 1 10 Rising edge sensitive + 1 11 Dual edge sensitive + @endverbatim +*/ +#define QURT_INT_TRIGGER_TYPE_SET(pol, edge) ((((pol) & 0x01U) << 2) | ((edge) & 0x03U)) /**< */ + +#define QURT_INT_TRIGGER_LEVEL_LOW QURT_INT_TRIGGER_TYPE_SET(0U, 0x00U) /**< */ +#define QURT_INT_TRIGGER_LEVEL_HIGH QURT_INT_TRIGGER_TYPE_SET(1U, 0x00U) /**< */ +#define QURT_INT_TRIGGER_RISING_EDGE QURT_INT_TRIGGER_TYPE_SET(1U, 0x02U) /**< */ +#define QURT_INT_TRIGGER_FALLING_EDGE QURT_INT_TRIGGER_TYPE_SET(0U, 0x02U) /**< */ +#define QURT_INT_TRIGGER_DUAL_EDGE QURT_INT_TRIGGER_TYPE_SET(0U, 0x03U) /**< */ +#define QURT_INT_TRIGGER_USE_DEFAULT 0xffU /**< */ +/** @} */ /* end_addtogroup Qurt_interrupt_type */ + +/*===================================================================== + Functions +======================================================================*/ + +/**@ingroup func_qurt_interrupt_register + @xreflabel{sec:interrupt_register} + Registers the interrupt.\n + Enables the specified interrupt and associates it with the specified QuRT signal object and + signal mask. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be waited on, and 0 indicates not to wait. + + When the interrupt occurs, the signal specified in the signal mask is set in the signal + object. An IST conventionally waits on that signal to + handle the interrupt. The thread that registers the interrupt is set as the IST. + + Up to 31 separate interrupts can be registered to a single signal object, as determined by + the number of individual signals the object can store. QuRT reserves signal 31. Thus a + single IST can handle several different interrupts. + + QuRT reserves some interrupts for internal use -- the remainder are available for use by + applications, and thus are valid interrupt numbers. If the specified interrupt number is + outside the valid range, the register operation returns the status value QURT_EINT. 
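+
+  An illustrative IST sketch under these rules (a hedged example: interrupt
+  number 45, the signal bit, and the processing step are hypothetical, and
+  qurt_anysignal_init(), qurt_anysignal_wait(), and qurt_anysignal_clear()
+  are assumed from the QuRT any-signal API):
+  @code
+  static qurt_anysignal_t isr_sig;       // signal object owned by the IST
+  #define MY_INT_SIGMASK 0x1U            // one signal bit for this interrupt
+
+  void ist_loop(void)
+  {
+      qurt_anysignal_init(&isr_sig);
+      if (qurt_interrupt_register(45, &isr_sig, MY_INT_SIGMASK) != QURT_EOK) {
+          return;
+      }
+      for (;;) {
+          unsigned int sigs = qurt_anysignal_wait(&isr_sig,
+                                                  MY_INT_SIGMASK | SIG_INT_ABORT);
+          if (sigs & SIG_INT_ABORT) {
+              break;                     // no interrupts registered anymore
+          }
+          qurt_anysignal_clear(&isr_sig, MY_INT_SIGMASK);
+          // ... process the interrupt ...
+          (void)qurt_interrupt_acknowledge(45);
+      }
+  }
+  @endcode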
+
+  Only one thread can be registered at a time to a specific interrupt. Attempting to register
+  an already-registered interrupt returns the status value QURT_EVAL.
+
+  Only one signal bit in a signal object can be registered at a time to a specific interrupt.
+  Attempting to register multiple signal bits to an interrupt returns the status value
+  QURT_ESIG.
+
+  When a signal object is registered for an interrupt, only QuRT can set its signal bits,
+  which it does when receiving the interrupt. The QuRT signal API called from another
+  software thread cannot set the signal, even for unused signal bits.
+
+  @note1hang The valid range for an interrupt number can differ on target execution
+             environments other than the simulator. For more information, see the
+             appropriate hardware document.
+
+  @datatypes
+  #qurt_anysignal_t
+
+  @param[in] int_num      L2VIC interrupt to register; valid range is 0 to 1023.
+  @param[in] int_signal   Any-signal object to wait on (Section @xref{dox:any_signals}).
+  @param[in] signal_mask  Signal mask value indicating the signal to receive the interrupt.
+
+  @return
+  #QURT_EOK -- Interrupt successfully registered.\n
+  #QURT_EINT -- Invalid interrupt number. \n
+  #QURT_ESIG -- Invalid signal bitmask (cannot set more than one
+                signal at a time). \n
+  #QURT_EVAL -- Interrupt already registered.
+
+  @dependencies
+  None.
+*/
+ unsigned int qurt_interrupt_register(int int_num, qurt_anysignal_t *int_signal, int signal_mask);
+
+/**@ingroup func_qurt_interrupt_register2
+  @xreflabel{sec:interrupt_register2}
+  Registers the interrupt.\n
+  Enables the specified interrupt, associates it with the specified QuRT signal object and
+  signal mask, and sets the interrupt flags.
+
+  Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1
+  indicates that a signal must be waited on, and 0 indicates not to wait.
+
+  When the interrupt occurs, the signal specified in the signal mask is set in the signal
+  object. An IST conventionally waits on that signal to
+  handle the interrupt. The thread that registers the interrupt is set as the IST.
+
+  Up to 31 separate interrupts can be registered to a single signal object, as determined by
+  the number of individual signals that the object can store. QuRT reserves signal 31. Thus a
+  single IST can handle several different interrupts.
+
+  QuRT reserves some interrupts for internal use -- the remainder are available for use by
+  applications, and thus are valid interrupt numbers. If the specified interrupt number is
+  outside the valid range, the register operation returns the status value #QURT_EINT.
+
+  Only one thread can be registered at a time to a specific interrupt. Attempting to register
+  an already-registered interrupt returns the status value #QURT_EVAL.
+
+  Only one signal bit in a signal object can be registered at a time to a specific interrupt.
+  Attempting to register multiple signal bits to an interrupt returns the status value
+  #QURT_ESIG.
+
+  When a signal object is registered for an interrupt, only QuRT can set its signal bits,
+  which it does when receiving the interrupt. The QuRT signal API called from another
+  software thread cannot set the signal, even for unused signal bits.
+
+  @note1hang The valid range for an interrupt number can differ on target execution
+             environments other than the simulator. For more information, see the
+             appropriate hardware document.
+
+  @datatypes
+  #qurt_anysignal_t
+
+  @param[in] int_num      L2VIC interrupt to register; valid range is 0 to 1023.
+  @param[in] int_signal   Any-signal object to wait on (Section @xref{dox:any_signals}).
+  @param[in] signal_mask  Signal mask value indicating the signal to receive the interrupt.
+  @param[in] flags        Defines an interrupt property; the supported property is interrupt lock enable/disable.
+                          Possible values for flags: \n
+                          - #QURT_INT_LOCK_ENABLE
+                          - #QURT_INT_LOCK_DISABLE @tablebulletend
+
+  @return
+  #QURT_EOK -- Interrupt successfully registered.\n
+  #QURT_EINT -- Invalid interrupt number. \n
+  #QURT_ESIG -- Invalid signal bitmask (cannot set more than one
+                signal at a time). \n
+  #QURT_EVAL -- Interrupt already registered.
+
+  @dependencies
+  None.
+*/
+ unsigned int qurt_interrupt_register2(int int_num, qurt_anysignal_t *int_signal, int signal_mask, unsigned int flags);
+/*
+ * Waits for a registered interrupt signal
+
+ * Suspends the current thread until one of its registered interrupts occurs. The second input, mask,
+ * contains the interrupt signals that the IST expects to receive. The interrupt signals are registered
+ * with interrupts via the qurt_interrupt_register API.
+ *
+ * The signals returned in the signal variable indicate which interrupts occurred. Use the function
+ * qurt_anysignal_get to read the signals. The IST must locally maintain a table that maps a signal to
+ * a specific interrupt. The IST also checks whether signal #SIG_INT_ABORT is received. If so, the IST
+ * must quit the interrupt-receiving loop.
+ *
+ * For detailed information on this API, see QuRT User Manual Section 4.2.5
+ *
+ * Prototype
+ *
+ * unsigned int qurt_anysignal_wait(qurt_anysignal_t *int_signal, unsigned int mask)
+ */
+
+/**@ingroup func_qurt_interrupt_acknowledge
+  Acknowledges an interrupt after it has been processed.\n
+  Re-enables an interrupt and clears its pending status. This is done after an interrupt is
+  processed by an IST.
+
+  Interrupts are automatically disabled after they occur. To re-enable an interrupt, an IST
+  performs the acknowledge operation after it has finished processing the interrupt and
+  just before suspending itself (such as by waiting on the interrupt signal).
+
+  @note1hang To prevent losing or reprocessing subsequent occurrences of the interrupt,
+             an IST must clear the interrupt signal (Section @xref{sec:anysignal_clear}) before
+             acknowledging the interrupt.
+
+  @param[in] int_num  Interrupt that is being re-enabled.
+
+  @return
+  #QURT_EOK -- Interrupt acknowledge was successful. \n
+  #QURT_EDEREGISTERED -- Interrupt is already de-registered.
+
+  @dependencies
+  None.
+*/
+int qurt_interrupt_acknowledge(int int_num);
+
+/**@ingroup func_qurt_interrupt_deregister
+  Disables the specified interrupt and disassociates it from a QuRT signal object.
+  If the specified interrupt was never registered (Section @xref{sec:interrupt_register}), the deregister operation
+  returns the status value #QURT_EINT.
+
+  @note1hang If an interrupt is deregistered while an IST waits
+             to receive it, the IST might wait indefinitely for the interrupt to occur. To avoid
+             this problem, the QuRT kernel sends the signal #SIG_INT_ABORT to awaken an
+             IST after determining that it has no interrupts registered.
+
+  @param[in] int_num  L2VIC interrupt to deregister; valid range is 0 to 1023.
+
+  @return
+  #QURT_EOK -- Success.\n
+  #QURT_EINT -- Invalid interrupt number (not registered).
+
+  @dependencies
+  None.
+
+*/
+unsigned int qurt_interrupt_deregister(int int_num);
+/** @endcond */
+
+/**@ingroup func_qurt_interrupt_disable
+  Disables an interrupt with its interrupt number.\n
+  The interrupt must be registered prior to calling this function.
+  After qurt_interrupt_disable() returns, the Hexagon subsystem
+  can no longer send the corresponding interrupt to the Hexagon
+  core, until qurt_interrupt_enable() is called
+  for the same interrupt.
+
+  Avoid calling qurt_interrupt_disable() and qurt_interrupt_enable() frequently within
+  a short period of time.\n
+  - A pending interrupt can already be in the Hexagon core when qurt_interrupt_disable()
+    is called. Therefore, some time later, the pending interrupt is received on a Hexagon
+    hardware thread.\n
+  - After the Hexagon subsystem sends an interrupt to the Hexagon core, the Hexagon
+    hardware automatically disables the interrupt until kernel software re-enables the interrupt
+    at the interrupt acknowledgement stage. If qurt_interrupt_enable() is called from a certain
+    thread at an earlier time, the interrupt is re-enabled earlier and can trigger
+    sending a new interrupt to the Hexagon core while kernel software is still processing
+    the previous interrupt.
+
+  @param[in] int_num  Interrupt number.
+
+  @return
+  #QURT_EOK -- Interrupt successfully disabled.\n
+  #QURT_EINT -- Invalid interrupt number.\n
+  #QURT_ENOTALLOWED -- Interrupt is locked. \n
+  #QURT_EVAL -- Interrupt is not registered.
+
+  @dependencies
+  None.
+*/
+ unsigned int qurt_interrupt_disable(int int_num);
+
+
+/**@ingroup func_qurt_interrupt_enable
+  Enables an interrupt with its interrupt number.\n
+  The interrupt must be registered prior to calling this function.
+
+  @param[in] int_num  Interrupt number.
+
+  @return
+  #QURT_EOK -- Interrupt successfully enabled.\n
+  #QURT_EINT -- Invalid interrupt number.\n
+  #QURT_ENOTALLOWED -- Interrupt is locked. \n
+  #QURT_EVAL -- Interrupt is not registered.
+
+  @dependencies
+  None.
+
+*/
+ unsigned int qurt_interrupt_enable(int int_num);
+
+
+/**@ingroup func_qurt_interrupt_status
+  Returns a value that indicates the pending status of the specified interrupt.
+
+  @param[in] int_num  Interrupt number that is being checked.
+  @param[out] status  Interrupt status; 1 indicates that an interrupt is
+                      pending, 0 indicates that an interrupt is not pending.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EINT -- Failure; invalid interrupt number.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_interrupt_status(int int_num, int *status);
+
+
+/**@ingroup func_qurt_interrupt_get_status
+  Gets the status of the specified interrupt in L2VIC.
+
+  @param[in] int_num      Interrupt number that is being checked.
+  @param[in] status_type  0 -- interrupt pending status \n
+                          1 -- interrupt enable status
+  @param[out] status      0 -- OFF \n
+                          1 -- ON
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EINT -- Failure; invalid interrupt number.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_interrupt_get_status(int int_num, int status_type, int *status);
+
+/** @cond rest_reg_dist */
+/**@ingroup func_qurt_interrupt_clear
+  Clears the pending status of the specified interrupt.
+
+  @note1hang This operation is intended for system-level use, and must be used with care.
+
+  @param[in] int_num  Interrupt that is being cleared.
+
+  @return
+  #QURT_EOK -- Success.\n
+  #QURT_EINT -- Invalid interrupt number.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_interrupt_clear(int int_num);
+
+
+/**@ingroup func_qurt_interrupt_get_config
+  Gets the L2VIC interrupt configuration. \n
+  This function returns the type and polarity of the specified L2VIC interrupt.
+
+  @param[in] int_num        L2VIC interrupt that is being queried.
+  @param[out] int_type      Pointer to the interrupt type. \n
+                            0 -- Level-triggered interrupt \n
+                            1 -- Edge-triggered interrupt
+  @param[out] int_polarity  Pointer to the interrupt polarity.\n
+                            0 -- Active-high interrupt \n
+                            1 -- Active-low interrupt
+
+  @return
+  #QURT_EOK -- Configuration successfully returned.\n
+  #QURT_EINT -- Invalid interrupt number.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_interrupt_get_config(unsigned int int_num, unsigned int *int_type, unsigned int *int_polarity);
+
+/**@ingroup func_qurt_interrupt_set_config
+  Sets the type and polarity of the specified L2VIC interrupt.
+
+  @note1hang Deregister L2VIC interrupts before reconfiguring them.
+
+  @param[in] int_num       L2VIC interrupt that is being configured.
+  @param[in] int_type      Interrupt type. \n
+                           0 -- Level-triggered interrupt\n
+                           1 -- Edge-triggered interrupt
+  @param[in] int_polarity  Interrupt polarity. \n
+                           0 -- Active-high interrupt \n
+                           1 -- Active-low interrupt
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_ENOTALLOWED -- Not allowed; the interrupt is still registered.\n
+  #QURT_EINT -- Invalid interrupt number.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_interrupt_set_config(unsigned int int_num, unsigned int int_type, unsigned int int_polarity);
+
+/**@ingroup func_qurt_interrupt_set_config2
+  Sets the type and polarity of the specified L2VIC interrupt.
+
+  @note1hang L2VIC interrupts must be deregistered before they can be reconfigured.
+
+  @param[in] int_num   L2VIC interrupt that is being configured.
+  @param[in] int_type  Notified to the hardware configuration callback function and used to
+                       modify the L2VIC type. Possible values: \n
+                       - #QURT_INT_TRIGGER_USE_DEFAULT \n
+                       - #QURT_INT_TRIGGER_LEVEL_HIGH \n
+                       - #QURT_INT_TRIGGER_LEVEL_LOW \n
+                       - #QURT_INT_TRIGGER_RISING_EDGE \n
+                       - #QURT_INT_TRIGGER_FALLING_EDGE \n
+                       - #QURT_INT_TRIGGER_DUAL_EDGE @tablebulletend
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_ENOTALLOWED -- Not allowed; the interrupt is still registered.\n
+  #QURT_EINT -- Invalid interrupt number.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_interrupt_set_config2(unsigned int int_num, unsigned int int_type);
+
+/**@ingroup func_qurt_interrupt_set_config3
+  Sets the specified configuration value for the specified property of the specified L2VIC interrupt.
+
+  @note1hang L2VIC interrupts must be deregistered before they can be reconfigured for polarity.
+
+  @param[in] int_num     L2VIC interrupt that is being configured.
+  @param[in] config_id   Property to configure: \n
+                         - #QURT_INT_CONFIGID_POLARITY \n
+                         - #QURT_INT_CONFIGID_LOCK @tablebulletend
+  @param[in] config_val  Value to set; depends on the second argument, config_id. \n
+                         Values for #QURT_INT_CONFIGID_POLARITY: \n
+                         - #QURT_INT_TRIGGER_USE_DEFAULT \n
+                         - #QURT_INT_TRIGGER_LEVEL_HIGH \n
+                         - #QURT_INT_TRIGGER_LEVEL_LOW \n
+                         - #QURT_INT_TRIGGER_RISING_EDGE \n
+                         - #QURT_INT_TRIGGER_FALLING_EDGE \n
+                         - #QURT_INT_TRIGGER_DUAL_EDGE \n
+
+                         Values for #QURT_INT_CONFIGID_LOCK: \n
+                         - #QURT_INT_LOCK_ENABLE\n
+                         - #QURT_INT_LOCK_DISABLE @tablebulletend
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_ENOTALLOWED -- Not allowed; the interrupt is still registered or is locked for enable/disable.\n
+  #QURT_EINT -- Invalid interrupt number.
+
+  @dependencies
+  None.
+*/
+unsigned int qurt_interrupt_set_config3(unsigned int int_num, unsigned int config_id, unsigned int config_val);
+
+
+/**@ingroup func_qurt_interrupt_raise
+  Raises the interrupt. \n
+  This function triggers a level-triggered L2VIC
+  interrupt, and accepts interrupt numbers in the range of 0 to 1023.
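+
+  An illustrative sketch (interrupt number 100 is hypothetical and must be
+  a level-triggered L2VIC interrupt on the target):
+  @code
+  if (qurt_interrupt_raise(100U) == QURT_EOK) {
+      // The IST registered for interrupt 100 will see its signal set.
+  }
+  @endcode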
+
+
+/**@ingroup func_qurt_interrupt_raise
+ Raises the interrupt. \n
+ This function triggers a level-triggered L2VIC
+ interrupt, and accepts interrupt numbers in the range of 0 to 1023.
+
+ @param[in] interrupt_num Interrupt number.
+
+ @return
+ #QURT_EOK -- Success. \n
+ -1 -- Failure; the interrupt is not supported.
+
+ @dependencies
+ None.
+ */
+int qurt_interrupt_raise(unsigned int interrupt_num);
+
+/**@ingroup func_qurt_interrupt_raise2
+ Raises the interrupt and returns the current pcycle value.
+
+ @param[in] interrupt_num Interrupt number.
+
+ @return
+ 0xFFFFFFFFFFFFFFFF -- Failure; the interrupt is not supported.\n
+ Other value -- pcycle count at the time the interrupt is raised.
+
+ @dependencies
+ None.
+ */
+unsigned long long qurt_interrupt_raise2(unsigned int interrupt_num);
+/** @endcond */
+
+/** @cond internal_only */
+/**@ingroup func_qurt_isr_subcall
+ Indicates whether the current function is called from a callback procedure (either short or long).
+
+ @return
+ #QURT_EOK -- TRUE \n
+ #QURT_EVAL -- FALSE.
+
+ @dependencies
+ None.
+ */
+int qurt_isr_subcall(void);
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_INT_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_island.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_island.h
new file mode 100755
index 0000000000000..f0c8ee27cf8b0
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_island.h
@@ -0,0 +1,122 @@
+#ifndef QURT_ISLAND_H
+#define QURT_ISLAND_H
+
+/**
+ @file qurt_island.h
+ @brief Prototypes of the power API.
+ The APIs allow entering and exiting island mode, where memory
+ accesses are limited to local memory.
+
+ EXTERNAL FUNCTIONS
+ None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None.
+
+Copyright (c) 2018-2021,2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+
+=============================================================================*/
+
+#include
+#include
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup func_qurt_island_get_status
+ Gets the Island mode status.
+
+ Returns a value that indicates whether the QuRT system executes in Island mode.
+
+ @return
+ 0 -- Normal mode. \n
+ 1 -- Island mode.
+
+ @dependencies
+ None.
+*/
+unsigned int qurt_island_get_status (void);
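+
+/*
+ Usage sketch (illustrative only): branch on the current island state
+ before touching memory outside island-resident RAM.
+
+ if (qurt_island_get_status() == 1U) {
+ // Island mode: restrict accesses to island-resident memory.
+ } else {
+ // Normal mode: full memory access is available.
+ }
+*/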
+
+/**@ingroup func_qurt_island_get_status2
+ Gets the Island mode status, differentiating between island partial exit and complete exit.
+
+ Returns a value that indicates the current state.
+
+ @note1hang The transition from NORMAL mode to ISLAND mode happens in
+ single-threaded mode, whereas transitions from ISLAND mode to other
+ modes happen in multi-threaded mode. A thread that reads the island
+ mode status as NORMAL can therefore assume the same status for as long
+ as it continues to run. A thread that reads the status as ISLAND must
+ assume that the status can change to EXITING or NORMAL while it
+ runs. A thread that reads the status as EXITING must assume that the
+ status can change to NORMAL while it runs. If a thread goes into a
+ wait state after reading the status, it must read the island mode
+ status again and not assume the previous state.
+ @note2hang This API returns more intrinsic states than qurt_island_get_status();
+ when qurt_island_get_status() returns 0, this API can return
+ #QURT_ISLAND_MODE_EXITING or #QURT_ISLAND_MODE_ISLAND.
+
+ @param[in,out] data Reserved for future use. If a NULL pointer is passed,
+ the field is ignored. If a valid pointer is passed,
+ QuRT returns a bitmask that is interpreted as follows: \n
+ data[31] -- Valid bit. Set to 1 to indicate data[30:0] are valid;
+ otherwise set to 0. \n
+ data[30:0] -- Reserved for future definition.
+
+ @return
+ #QURT_ISLAND_MODE_NORMAL -- Normal mode \n
+ #QURT_ISLAND_MODE_ISLAND -- Island mode \n
+ #QURT_ISLAND_MODE_EXITING -- Exiting Island mode \n
+
+ @dependencies
+ None.
+*/
+unsigned int qurt_island_get_status2 (unsigned int *data);
+
+
+
+/**@ingroup func_qurt_island_get_exit_status
+ Gets the reason for the last Island mode exit.
+
+ @param[out] cause_code Pointer that returns the cause code of the last
+ island exit reason. \n
+ - #QURT_EISLANDUSEREXIT -- Island exit due to a user call for island exit.\n
+ - #QURT_ENOISLANDENTRY -- API called before exiting island. \n
+ - #QURT_EISLANDINVALIDINT -- Island exit due to an invalid interrupt in Island mode. @tablebulletend
+
+ @param[out] int_num Pointer that holds the invalid interrupt number that caused
+ island exit when the cause code is #QURT_EISLANDINVALIDINT.
+ For other cases, it is -1.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+void qurt_island_get_exit_status(unsigned int *cause_code, int *int_num);
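+
+/*
+ Usage sketch (illustrative only): after an unexpected island exit, query
+ the cause and capture the offending interrupt, if any.
+
+ unsigned int cause;
+ int bad_int;
+ qurt_island_get_exit_status(&cause, &bad_int);
+ if (cause == QURT_EISLANDINVALIDINT) {
+ // bad_int holds the invalid interrupt number; otherwise it is -1.
+ }
+*/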
+
+/**@ingroup func_qurt_island_get_enter_timestamp
+ Gets the most recent timestamp recorded when the system exited STM during island entry.
+
+ @param[out] island_enter_timestamp Returns a pointer to the most recent timestamp
+ recorded after the system exits STM during island entry. If the system has never
+ attempted to enter island mode, the island_enter_timestamp return pointer holds a value
+ of zero.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+void qurt_island_get_enter_timestamp(unsigned long long *island_enter_timestamp);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_ISLAND_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_isr.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_isr.h
new file mode 100755
index 0000000000000..db29ea2f265d7
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_isr.h
@@ -0,0 +1,177 @@
+#ifndef QURT_ISR_H
+#define QURT_ISR_H
+
+/*=====================================================================
+
+ @file qurt_isr.h
+
+ @brief Prototypes of the QuRT ISR API functions
+
+ EXTERNALIZED FUNCTIONS
+ none
+
+ INITIALIZATION AND SEQUENCING REQUIREMENTS
+ none
+
+ Copyright (c) 2017, 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+ Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+ Functions
+=============================================================================*/
+
+
+/**@ingroup func_qurt_isr_set_hw_config_callback
+ Sets the callback function for interrupt hardware configuration.
+ In a process, the callback function can be set only once.
+
+ @param[in] cb_addr Address of the callback function.
+
+ @return
+ #QURT_EOK -- The callback function is set successfully. \n
+ #QURT_EFAILED -- Failure; the callback function was already set.
+
+ @dependencies
+ None.
+ */
+int qurt_isr_set_hw_config_callback(unsigned int cb_addr);
+
+
+/**@ingroup func_qurt_isr_set_hw_enable_callback
+ Sets the callback function for enabling the interrupt hardware configuration.
+ In a process, the callback function can be set only once.
+
+ @param[in] cb_addr Address of the callback function.
+
+ @return
+ #QURT_EOK -- The callback function is set successfully. \n
+ #QURT_EFAILED -- Failure; the callback function was already set.
+
+ @dependencies
+ None.
+ */
+int qurt_isr_set_hw_enable_callback(unsigned int cb_addr);
+
+
+/**@ingroup func_qurt_isr_set_hw_disable_callback
+ Sets the callback function for disabling the interrupt hardware configuration.
+ In a process, the callback function can be set only once.
+
+ @param[in] cb_addr Address of the callback function.
+
+ @return
+ #QURT_EOK -- The callback function is set successfully. \n
+ #QURT_EFAILED -- Failure; the callback function was already set.
+
+ @dependencies
+ None.
+ */
+int qurt_isr_set_hw_disable_callback(unsigned int cb_addr);
+
+
+/**@ingroup func_qurt_isr_create
+ Creates an ISR thread with the specified attributes, and makes it executable.
+
+ @datatypes
+ #qurt_thread_t \n
+ #qurt_thread_attr_t
+
+ @param[out] thread_id Returns a pointer to the thread identifier if the thread was
+ successfully created.
+ @param[in] pAttr Pointer to the initialized thread attribute structure that specifies
+ the attributes of the created thread.
+
+ @return
+ #QURT_EVAL -- Invalid arguments. \n
+ #QURT_EOK -- Thread created. \n
+ #QURT_EFAILED -- Thread not created.
+
+ @dependencies
+ None.
+ */
+int qurt_isr_create (qurt_thread_t *thread_id, qurt_thread_attr_t *pAttr);
+
+/**@ingroup func_qurt_isr_register2
+ Registers an interrupt service routine (ISR) with an ISR thread, using the specified attributes.
+ The interrupt is enabled when this function returns success.
+
+ @datatypes
+ #qurt_thread_t
+
+ @param[in] isr_thread_id ISR thread ID, returned from qurt_isr_create().
+ @param[in] int_num The interrupt number.
+ @param[in] prio Priority of the ISR.
+ @param[in] flags Defines the ACK type. Values: \n
+ - #QURT_INT_NON_DELAYED_ACK -- The ISR is acknowledged by the interrupt handling routine
+ in the kernel. \n
+ - #QURT_INT_DELAYED_ACK -- The client chooses when to acknowledge. @tablebulletend
+ @param[in] int_type Interrupt type, notified to the registered hardware configuration
+ callback function. Values: \n
+ - #QURT_INT_TRIGGER_USE_DEFAULT \n
+ - #QURT_INT_TRIGGER_LEVEL_HIGH \n
+ - #QURT_INT_TRIGGER_LEVEL_LOW \n
+ - #QURT_INT_TRIGGER_RISING_EDGE \n
+ - #QURT_INT_TRIGGER_FALLING_EDGE \n
+ - #QURT_INT_TRIGGER_DUAL_EDGE @tablebulletend
+ @param[in] isr Interrupt service routine with prototype void isr(void *arg, int int_num).
+ @param[in] arg First argument passed to the ISR when it is called to service the interrupt.
+
+ @return
+ #QURT_EOK -- Successfully registered the ISR for the interrupt. \n
+ #QURT_EINT -- Interrupt not configured. \n
+ #QURT_EINVALID -- Invalid thread ID. \n
+ #QURT_EDISABLED -- The feature is disabled. \n
+ #QURT_EDUPLICATE -- Interrupt is already registered.
+
+ @dependencies
+ The thread ID must be created using qurt_isr_create().
+ */
+int qurt_isr_register2 (qurt_thread_t isr_thread_id, int int_num, unsigned short prio, unsigned short flags, unsigned int int_type, void (*isr) (void *, int), void *arg);
+
+/**@ingroup func_qurt_isr_deregister2
+ Deregisters the ISR for the specified interrupt.
+ The interrupt is disabled when this function returns success.
+
+ @param[in] int_num The interrupt number.
+
+ @return
+ #QURT_EOK -- ISR deregistered successfully. \n
+ #QURT_ENOREGISTERED -- No ISR is registered for the interrupt int_num.
+
+ @dependencies
+ None.
+ */
+int qurt_isr_deregister2 (int int_num);
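+
+/*
+ Usage sketch (illustrative only): create an ISR thread, register a
+ handler for interrupt 23, and later tear everything down. The handler,
+ priority, and interrupt number are assumptions for the example.
+
+ void my_isr(void *arg, int int_num); // services the device
+
+ qurt_thread_t tid;
+ qurt_thread_attr_t attr;
+ qurt_thread_attr_init(&attr);
+ if (qurt_isr_create(&tid, &attr) == QURT_EOK &&
+ qurt_isr_register2(tid, 23, 100, QURT_INT_NON_DELAYED_ACK,
+ QURT_INT_TRIGGER_USE_DEFAULT, my_isr, NULL) == QURT_EOK) {
+ // ... interrupt 23 is now serviced on the ISR thread ...
+ (void)qurt_isr_deregister2(23);
+ (void)qurt_isr_delete(tid);
+ }
+*/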
+
+/**@ingroup func_qurt_isr_delete
+ Causes the ISR thread to exit and releases its kernel resources.
+
+ @note1hang The ISR thread must not be actively processing interrupts;
+ otherwise the call fails and returns an error.
+
+ @param[in] isr_tid Thread ID of the ISR thread to delete.
+
+ @return
+ #QURT_ENOTALLOWED -- ISR thread is processing an interrupt. \n
+ #QURT_EINVALID -- Invalid ISR thread ID. \n
+ #QURT_EOK -- Success.
+
+ @dependencies
+ The thread ID must be created using qurt_isr_create().
+ */
+int qurt_isr_delete (qurt_thread_t isr_tid);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_ISR_H */
+
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_l2cfg.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_l2cfg.h
new file mode 100755
index 0000000000000..7e26b30a580d9
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_l2cfg.h
@@ -0,0 +1,98 @@
+#ifndef QURT_L2CFG_H
+#define QURT_L2CFG_H
+/**
+ @file qurt_l2cfg.h
+ @brief QuRT APIs for L2 configuration and system configuration
+
+EXTERNAL FUNCTIONS
+ qurt_l2cfg_set
+ qurt_l2cfg_get
+ qurt_system_config_get
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None.
+
+Copyright (c) 2019-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+ CONSTANTS AND MACROS
+=============================================================================*/
+
+/* Definitions for system configuration */
+/** @addtogroup l2cfg_macros
+@{ */
+#define QURT_CORE_CFG_HMX_INT8_SPATIAL 0x78 /**< HMX fixed-point spatial size */
+#define QURT_CORE_CFG_HMX_INT8_DEPTH 0x7C /**< HMX fixed-point output depth */
+/** @} */ /* end_addtogroup l2cfg_macros */
+/*=============================================================================
+ FUNCTIONS
+=============================================================================*/
+/**@ingroup func_qurt_l2cfg_set
+ Sets the value of an L2 configuration register. A register can be set only if its
+ initial value is configured.
+
+ @param[in] offset Offset of the L2 configuration register; must be a multiple of 4.
+ @param[in] value Value to set the register to.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EFAILED -- Internal mapping that covers the L2CFG register file is absent; likely
+ a configuration problem. \n
+ #QURT_EINVALID -- Argument error. \n
+ #QURT_ENOTALLOWED -- Setting this register is prohibited.
+
+ @dependencies
+ None.
+ */
+int qurt_l2cfg_set (unsigned short offset, unsigned int value);
+
+/**@ingroup func_qurt_l2cfg_get
+ Gets the value of an L2 configuration register.
+
+ @param[in] offset Offset of the L2 configuration register; must be a multiple of 4.
+ @param[out] value Pointer to the value of the register.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EFAILED -- Internal mapping that covers the L2CFG register file is absent;
+ likely a configuration problem. \n
+ #QURT_EINVALID -- Argument error.
+
+ @dependencies
+ None.
+
+ */
+int qurt_l2cfg_get (unsigned short offset, unsigned int * value);
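+
+/*
+ Usage sketch (illustrative only): read an L2 configuration register and
+ write it back with one bit set. The offset 0x10 and the bit are
+ assumptions for the example; only registers whose initial value is
+ configured can be written.
+
+ unsigned int v;
+ if (qurt_l2cfg_get(0x10, &v) == QURT_EOK) {
+ (void)qurt_l2cfg_set(0x10, v | 0x1u);
+ }
+*/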
+
+
+/**@ingroup func_qurt_system_config_get
+ Gets the system configuration information.
+
+ @param[in] index Index to the system configuration. Values:\n
+ - #QURT_CORE_CFG_HMX_INT8_SPATIAL \n
+ - #QURT_CORE_CFG_HMX_INT8_DEPTH @tablebulletend
+
+ @param[out] data Pointer to a word for the returned data.
+
+ @return
+ #QURT_EOK -- Configuration data successfully returned. \n
+ Other values -- Failure (no such configuration available).
+
+ @dependencies
+ None.
+
+ */
+int qurt_system_config_get(int index, unsigned int *data);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_L2CFG_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_lifo.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_lifo.h
new file mode 100755
index 0000000000000..dc399fccc5f0f
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_lifo.h
@@ -0,0 +1,71 @@
+#ifndef QURT_LIFO_H
+#define QURT_LIFO_H
+/**
+ @file qurt_lifo.h
+
+ @brief
+ Provides a lock-free last-in-first-out (LIFO) algorithm, which can be used in a
+ variety of situations to allocate and free fixed-size buffers.
+ This implementation touches the first word of a FREED buffer. Even
+ though it does not matter how the buffer is used while it is allocated, be
+ careful not to put a MAGIC number in the first field, because the field
+ does not hold the magic value while the buffer is freed.
+
+ EXTERNALIZED FUNCTIONS
+ None
+
+ INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None
+
+ Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+ Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ /*=====================================================================
+ Functions
+ ======================================================================*/
+
+/*======================================================================*/
+/**
+ Pops an element out of the LIFO.
+
+ @param[in] freelist Pointer to the head of your list.
+
+ @return
+ Top object from the list.
+
+ @dependencies
+ None.
+*/
+/* ======================================================================*/
+void * qurt_lifo_pop(void *freelist);
+
+
+/*======================================================================*/
+/**
+ Pushes an element into the LIFO.
+
+ @param[in] freelist Pointer to the head of your list.
+ @param[in] buf Pointer to your buffer to push into the list.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+/* ======================================================================*/
+void qurt_lifo_push(void *freelist, void *buf);
+
+/*======================================================================*/
+/**
+ Removes the specified element from the LIFO.
+
+ @param[in] freelist Pointer to the head of your list.
+ @param[in] buf Pointer to the buffer to remove from the list.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+/* ======================================================================*/
+void qurt_lifo_remove(void *freelist, void *buf);
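+
+/*
+ Usage sketch (illustrative only): keep a freelist of fixed-size buffers.
+ The head variable and buffer are assumptions for the example; the first
+ word of each freed buffer is used internally as the link.
+
+ void *head = NULL; // freelist head, initially empty
+ static unsigned buf_mem[16]; // one fixed-size buffer
+ qurt_lifo_push(&head, buf_mem); // free: push the buffer onto the list
+ void *buf = qurt_lifo_pop(&head); // allocate: pop the most recent buffer
+*/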
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_LIFO_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_mailbox.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_mailbox.h
new file mode 100755
index 0000000000000..a6cd91c611782
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_mailbox.h
@@ -0,0 +1,176 @@
+#ifndef QURT_MAILBOX_H
+#define QURT_MAILBOX_H
+
+/**
+ @file qurt_mailbox.h
+ @brief Definitions, macros, and prototypes used for the QuRT mailbox
+
+ EXTERNALIZED FUNCTIONS
+ none
+
+ INITIALIZATION AND SEQUENCING REQUIREMENTS
+ none
+
+ Copyright (c) 2015, 2021-2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+ Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+ CONSTANTS AND MACROS
+=============================================================================*/
+/* Definitions of typedefs and return values */
+
+#define QURT_MAILBOX_ID_NULL 0
+#define QURT_MAILBOX_ERROR -1
+#define QURT_MAILBOX_ID_ERROR -2
+#define QURT_MAILBOX_NON_VALID_DATA -3
+#define QURT_MAILBOX_FULL -4
+#define QURT_MAILBOX_DELETED -5
+#define QURT_MAILBOX_RECEIVE_HALTED -6
+#define QURT_MAILBOX_BANDWIDTH_LIMIT -7
+
+
+/*=============================================================================
+ FORWARD DECLARATIONS & TYPEDEFS
+=============================================================================*/
+
+#define QURT_MAILBOX_AT_QURTOS 0U // Receiver is QuRT OS
+#define QURT_MAILBOX_AT_ROOTPD 1U // Receiver is RootPD (ASID=0)
+#define QURT_MAILBOX_AT_USERPD 2U // Receiver is User PD (ASID!=0)
+#define QURT_MAILBOX_AT_SECUREPD 3U // Receiver is Secure PD
+
+typedef unsigned char qurt_mailbox_receiver_cfg_t;
+
+#define QURT_MAILBOX_SEND_OVERWRITE 0U // When there is already valid content, overwrite it
+#define QURT_MAILBOX_SEND_NON_OVERWRITE 1U // When there is already valid content, return failure
+
+typedef unsigned char qurt_mailbox_send_option_t;
+
+
+#define QURT_MAILBOX_RECV_WAITING 0U // When there is no valid content, wait for it
+#define QURT_MAILBOX_RECV_NON_WAITING 1U // When there is no valid content, return failure immediately
+#define QURT_MAILBOX_RECV_PEEK_NON_WAITING 2U // Read the content, but do not remove it from the mailbox. No waiting.
+
+typedef unsigned char qurt_mailbox_recv_option_t;
+
+
+/*=============================================================================
+ EXTERNS & FUNCTIONS
+=============================================================================*/
+/* Function prototypes */
+
+/**@ingroup qurt_mailbox_create
+ Creates a QuRT mailbox.
+
+ @param name Mailbox name, up to 8 characters.
+ @param recv_opt Configuration of the receiver process.
+
+ @return
+ Mailbox ID -- Mailbox identifier. \n
+ #QURT_MAILBOX_ID_NULL -- NULL; failure creating the mailbox.
+
+ @dependencies
+ None.
+*/
+unsigned long long qurt_mailbox_create(char *name, qurt_mailbox_receiver_cfg_t recv_opt);
+
+
+/**@ingroup qurt_mailbox_get_id
+ Gets a QuRT mailbox identifier.
+
+ @param name Mailbox name, up to 8 characters.
+
+ @return
+ Mailbox ID -- Mailbox identifier. \n
+ #QURT_MAILBOX_ID_NULL -- NULL; failure getting the mailbox ID.
+
+ @dependencies
+ None.
+*/
+unsigned long long qurt_mailbox_get_id(char *name);
+
+
+/**@ingroup qurt_mailbox_send
+ Sends data to a QuRT mailbox.
+
+ @param mailbox_id Mailbox identifier.
+ @param send_opt Option for mailbox send.
+ @param data Data to send.
+
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_MAILBOX_ID_ERROR -- Mailbox ID error.\n
+ #QURT_MAILBOX_ERROR -- Other errors.\n
+ #QURT_MAILBOX_FULL -- Valid data already exists, non-overwriting.\n
+ #QURT_MAILBOX_BANDWIDTH_LIMIT -- Reached the bandwidth limitation.
+
+ @dependencies
+ None.
+*/
+int qurt_mailbox_send(unsigned long long mailbox_id, qurt_mailbox_send_option_t send_opt, unsigned long long data);
+
+
+/**@ingroup qurt_mailbox_receive
+ Receives data from a QuRT mailbox.
+
+ @param mailbox_id Mailbox identifier.
+ @param recv_opt Option for mailbox receive.
+ @param data Pointer to the data buffer for receiving.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_MAILBOX_ID_ERROR -- Mailbox ID error. \n
+ #QURT_MAILBOX_ERROR -- Other errors. \n
+ #QURT_MAILBOX_NON_VALID_DATA -- No valid data currently; the previous content is placed in the buffer. \n
+ #QURT_MAILBOX_RECEIVE_HALTED -- Receive halted; the waiting thread is woken up. \n
+ #QURT_MAILBOX_DELETED -- Mailbox is deleted; the waiting thread is woken up.
+
+ @dependencies
+ None.
+*/
+int qurt_mailbox_receive(unsigned long long mailbox_id, qurt_mailbox_recv_option_t recv_opt, unsigned long long *data);
+
+
+/**@ingroup qurt_mailbox_delete
+ Deletes a QuRT mailbox.
+
+ A mailbox can only be deleted from the process that created the mailbox.
+
+ @param mailbox_id Mailbox identifier.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_MAILBOX_ID_ERROR -- Mailbox ID error. \n
+ #QURT_MAILBOX_ERROR -- Other errors.
+
+ @dependencies
+ None.
+*/
+int qurt_mailbox_delete(unsigned long long mailbox_id);
+
+
+/**@ingroup qurt_mailbox_receive_halt
+ Halts receiving on a QuRT mailbox and wakes up waiting threads.
+
+ @param mailbox_id Mailbox identifier.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_MAILBOX_ID_ERROR -- Mailbox ID error.\n
+ #QURT_MAILBOX_ERROR -- Other errors.
+
+ @dependencies
+ None.
+*/
+int qurt_mailbox_receive_halt(unsigned long long mailbox_id);
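+
+/*
+ Usage sketch (illustrative only): producer/consumer exchange of one
+ 64-bit word. The mailbox name "mbox0" is an assumption for the example.
+
+ unsigned long long id = qurt_mailbox_create("mbox0", QURT_MAILBOX_AT_USERPD);
+ if (id != QURT_MAILBOX_ID_NULL) {
+ (void)qurt_mailbox_send(id, QURT_MAILBOX_SEND_OVERWRITE, 0x1234ULL);
+ unsigned long long msg;
+ if (qurt_mailbox_receive(id, QURT_MAILBOX_RECV_WAITING, &msg) == QURT_EOK) {
+ // msg now holds the value
+ }
+ }
+*/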
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif // QURT_MAILBOX_H
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_memory.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_memory.h
new file mode 100755
index 0000000000000..90ce2586fec50
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_memory.h
@@ -0,0 +1,1487 @@
+#ifndef QURT_MEMORY_H
+#define QURT_MEMORY_H
+/**
+ @file qurt_memory.h
+ @brief Prototypes of the kernel memory API functions.
+
+ EXTERNALIZED FUNCTIONS
+ none
+
+ INITIALIZATION AND SEQUENCING REQUIREMENTS
+ none
+
+ Copyright (c) Qualcomm Technologies, Inc.
+ All Rights Reserved.
+ Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+
+#include
+#include
+//#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup memory_management_macros
+@{ */
+#define QURT_SYSTEM_ALLOC_VIRTUAL 1 /**< Allocates available virtual memory in the address space of all
+ processes.*/
+/** @} */ /* end_addtogroup memory_management_macros */
+/**@cond rest_reg_dist */
+/** @addtogroup memory_management_types
+@{ */
+/** @xreflabel{hdr:qurt_mem_default_pool} */
+extern qurt_mem_pool_t qurt_mem_default_pool __attribute__((section(".data"))); /**< Memory pool object.*/
+/** @} */ /* end_addtogroup memory_management_types */
+
+/** @cond rest_reg_dist */
+/** Mapping attribute information. */
+typedef struct{
+ qurt_paddr_64_t paddr;
+ qurt_size_t size;
+ qurt_mem_cache_mode_t cache_mode;
+ qurt_perm_t perms;
+}qurt_mapping_attr_t;
+/** @endcond */
+/** @} */ /* end_addtogroup mapping_attribute_types*/
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+/**@ingroup func_qurt_mem_cache_clean
+ Performs a cache clean operation on the data stored in the specified memory area.
+ Performs a syncht on all the data cache operations when the Hexagon processor version is V60 or greater.
+
+ @note1hang Perform the flush all operation only on the data cache.
+
+ @note1cont This operation flushes and invalidates the contents of all cache lines from the start address
+ to the end address (start address + size). The contents of an adjoining buffer can be
+ flushed and invalidated if they fall in any of these cache lines.
+
+ @datatypes
+ #qurt_addr_t \n
+ #qurt_size_t \n
+ #qurt_mem_cache_op_t \n
+ #qurt_mem_cache_type_t
+
+ @param[in] addr Address of data to flush.
+ @param[in] size Size (in bytes) of data to flush.
+ @param[in] opcode Type of cache clean operation. Values:
+ - #QURT_MEM_CACHE_FLUSH
+ - #QURT_MEM_CACHE_INVALIDATE
+ - #QURT_MEM_CACHE_FLUSH_INVALIDATE
+ - #QURT_MEM_CACHE_FLUSH_ALL\n
+ @note1 #QURT_MEM_CACHE_FLUSH_ALL is valid only when the type is #QURT_MEM_DCACHE @tablebulletend
+ @param[in] type Cache type. Values:
+ - #QURT_MEM_ICACHE
+ - #QURT_MEM_DCACHE @tablebulletend
+
+ @return
+ #QURT_EOK -- Cache operation performed successfully.\n
+ #QURT_EVAL -- Invalid cache type.\n
+
+ @dependencies
+ None.
+*/
+int qurt_mem_cache_clean(qurt_addr_t addr, qurt_size_t size, qurt_mem_cache_op_t opcode, qurt_mem_cache_type_t type);
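+
+/*
+ Usage sketch (illustrative only): flush a buffer from the data cache
+ after the CPU writes it and before a DMA engine reads it. The buffer is
+ an assumption for the example.
+
+ static unsigned char dma_buf[256];
+ // ... CPU fills dma_buf ...
+ (void)qurt_mem_cache_clean((qurt_addr_t)dma_buf, sizeof(dma_buf),
+ QURT_MEM_CACHE_FLUSH, QURT_MEM_DCACHE);
+*/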
+
+/**@ingroup func_qurt_mem_cache_clean2
+ Performs a data cache clean operation on the data stored in the specified memory area.
+
+ This API only performs the following data cache operations:\n
+ - #QURT_MEM_CACHE_FLUSH\n
+ - #QURT_MEM_CACHE_INVALIDATE\n
+ - #QURT_MEM_CACHE_FLUSH_INVALIDATE -- flushes/invalidates the contents of all cache lines from the start address
+ to the end address (start address + size). The contents of an adjoining buffer can be
+ flushed/invalidated if they fall in any of these cache lines.
+
+ @datatypes
+ #qurt_addr_t \n
+ #qurt_size_t \n
+ #qurt_mem_cache_op_t \n
+ #qurt_mem_cache_type_t
+
+ @param[in] addr Address of data to flush.
+ @param[in] size Size (in bytes) of data to flush.
+ @param[in] opcode Type of cache clean operation. Values:\n #QURT_MEM_CACHE_FLUSH\n #QURT_MEM_CACHE_INVALIDATE\n
+ #QURT_MEM_CACHE_FLUSH_INVALIDATE
+ @param[in] type Cache type. Values: \n #QURT_MEM_DCACHE
+
+ @return
+ #QURT_EOK -- Cache operation performed successfully.\n
+ #QURT_EVAL -- Invalid cache type.
+
+ @dependencies
+ None.
+*/
+int qurt_mem_cache_clean2(qurt_addr_t addr, qurt_size_t size, qurt_mem_cache_op_t opcode, qurt_mem_cache_type_t type);
+
+/**@ingroup func_qurt_mem_cache_phys_clean
+ Performs a cache clean operation on the data stored in the specified memory area, based on an address match and mask.
+ The operation acts on a cache line when (LINE.PhysicalPageNumber & mask) == addrmatch.
+
+ @note1hang The addrmatch value should be the upper 24-bit physical address to match against.
+
+ @datatypes
+ #qurt_mem_cache_op_t \n
+
+ @param[in] mask 24-bit address mask.
+ @param[in] addrmatch Physical page number (24 bits) of memory to use as an address match.
+ @param[in] opcode Type of cache clean operation. Values:
+ - #QURT_MEM_CACHE_FLUSH
+ - #QURT_MEM_CACHE_INVALIDATE @tablebulletend
+
+ @return
+ #QURT_EOK -- Cache operation performed successfully.\n
+ #QURT_EVAL -- Invalid operation.
+
+ @dependencies
+ None.
+*/
+
+int qurt_mem_cache_phys_clean(unsigned int mask, unsigned int addrmatch, qurt_mem_cache_op_t opcode);
+
+/**@ingroup func_qurt_mem_l2cache_line_lock
+ Performs an L2 cache line locking operation. This function locks selective lines in the L2 cache memory.
+
+ @note1hang Perform the line lock operation only on a 32-byte aligned size and address.
+
+ @datatypes
+ #qurt_addr_t \n
+ #qurt_size_t
+
+ @param[in] addr Address of the L2 cache memory line to lock; the address must be 32-byte aligned.
+ @param[in] size Size (in bytes) of L2 cache memory to line lock; the size must be a multiple of 32 bytes.
+
+ @return
+ #QURT_EOK -- Success.\n
+ #QURT_EALIGN -- Data alignment or address failure. \n
+ #QURT_EINVALID -- Improper addr and size passed (for example, integer overflow due to addr + size). \n
+ #QURT_EFAILED -- Failed to lock the cache line because all the ways were locked for the corresponding set of an address
+ in the range addr to addr+size, or the address range is not L2 cacheable.
+ @dependencies
+ None.
+*/
+int qurt_mem_l2cache_line_lock(qurt_addr_t addr, qurt_size_t size);
+
+/**@ingroup func_qurt_mem_l2cache_line_unlock
+ Performs an L2 cache line unlocking operation. This function unlocks selective lines in the L2 cache memory.
+
+ @note1hang Perform the line unlock operation only on a 32-byte aligned size and address.
+
+ @datatypes
+ #qurt_addr_t \n
+ #qurt_size_t
+
+ @param[in] addr Address of the L2 cache memory line to unlock; the address must be 32-byte aligned.
+ @param[in] size Size (in bytes) of the L2 cache memory line to unlock; the size must be a multiple of 32 bytes.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EALIGN -- Aligning data or address failure. \n
+ #QURT_EFAILED -- Operation failed; cannot find the matching tag.
+
+ @dependencies
+ None.
+*/
+int qurt_mem_l2cache_line_unlock(qurt_addr_t addr, qurt_size_t size);
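+
+/*
+ Usage sketch (illustrative only): lock a 32-byte aligned buffer into the
+ L2 cache for latency-critical access, then unlock it. The buffer is an
+ assumption for the example.
+
+ static unsigned char hot_buf[64] __attribute__((aligned(32)));
+ if (qurt_mem_l2cache_line_lock((qurt_addr_t)hot_buf, sizeof(hot_buf)) == QURT_EOK) {
+ // ... latency-critical accesses to hot_buf ...
+ (void)qurt_mem_l2cache_line_unlock((qurt_addr_t)hot_buf, sizeof(hot_buf));
+ }
+*/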
+
+/**@ingroup func_qurt_mem_region_attr_init
+ @xreflabel{sec:qurt_mem_region_attr_init}
+ Initializes the specified memory region attribute structure with default attribute values: \n
+ - Mapping -- #QURT_MEM_MAPPING_VIRTUAL \n
+ - Cache mode -- #QURT_MEM_CACHE_WRITEBACK \n
+ - Physical address -- -1 \n
+ - Virtual address -- -1 \n
+ - Memory type -- #QURT_MEM_REGION_LOCAL \n
+ - Size -- -1
+
+ @note1hang The memory physical address attribute must be explicitly set by calling the
+ qurt_mem_region_attr_set_physaddr() function. The size and pool attributes are set directly
+ as parameters in the memory region create operation.
+
+ @datatypes
+ #qurt_mem_region_attr_t
+
+ @param[in,out] attr Pointer to the destination structure for the memory region attributes.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+void qurt_mem_region_attr_init(qurt_mem_region_attr_t *attr);
+
+/**@ingroup func_qurt_mem_pool_attach
+ Initializes a memory pool object to attach to a pool predefined in the system
+ configuration file.
+
+ Memory pool objects assign memory regions to physical memory in different
+ Hexagon memory units. They are specified in memory region create operations
+ (Section @xref{sec:mem_region_create}).
+
+ @note1hang QuRT predefines the memory pool object #qurt_mem_default_pool
+ (Section @xref{dox:mem_management}) for allocating memory regions in SMI memory. The pool attach
+ operation is necessary only when allocating memory regions in nonstandard
+ memory units such as TCM.
+
+ @datatypes
+ #qurt_mem_pool_t
+
+ @param[in] name Pointer to the memory pool name.
+ @param[out] pool Pointer to the memory pool object.
+
+ @return
+ #QURT_EOK -- Attach operation successful.
+
+ @dependencies
+ None.
+*/
+int qurt_mem_pool_attach(char *name, qurt_mem_pool_t *pool);
+
+/**@ingroup func_qurt_mem_pool_attach2
+ Gets the identifier that corresponds to a pool object created specifically for a client, for example, HLOS_PHYSPOOL.
+ The client_handle is used to look up the client-specific pool.
+
+ Memory pool objects assign memory regions to physical memory in different
+ Hexagon memory units. Memory pool objects are specified during mapping creation operations
+ (qurt_mem_mmap() and qurt_mem_region_create()).
+
+ @note1hang QuRT predefines the memory pool object #qurt_mem_default_pool
+ (Section @xref{dox:mem_management}) for allocating memory regions in SMI memory. The pool_attach2
+ operation is necessary only when allocating memory regions in memory units specific to the client.
+
+ @datatypes
+ #qurt_mem_pool_t
+
+ @param[in] client_handle Client identifier used by the OS to look up the identifier
+ of the client-specific pool.
+ @param[in] name Pointer to the memory pool name.
+ @param[out] pool Pointer to the memory pool object.
+
+ @return
+ #QURT_EOK -- Attach operation successful.
+
+ @dependencies
+ None.
+*/
+int qurt_mem_pool_attach2(int client_handle, char *name, qurt_mem_pool_t *pool);
+
+/**@ingroup func_qurt_mem_pool_create
+ @xreflabel{hdr:qurt_mem_pool_create}
+ Dynamically creates a memory pool object from a physical address range.
+
+ The pool is assigned a single memory region with the specified base address and size.
+
+ The base address and size values passed to this function must be aligned to 4K byte
+ boundaries, and must be expressed as the actual base address and size values divided by 4K.
+
+ For example, the function call:
+ @code
+ qurt_mem_pool_create ("TCM_PHYSPOOL", 0xd8020, 0x20, &pool)
+ @endcode
+ ... is equivalent to the following static pool definition in the QuRT system configuration file:
+ @code
+ <physical_pool name="TCM_PHYSPOOL">
+ <region base="0xd8020000" size="0x20000" />
+ </physical_pool>
+ @endcode
+
+ @cond rest_dist For more information on the system configuration file, see @xhyperref{80VB41979,80-VB419-79}. @endcond
+
+ @note1hang Dynamically created pools are not identical to static pools. In particular,
+ qurt_mem_pool_attr_get() is not valid with dynamically created pools.
+
+ @note1cont Dynamic pool creation permanently consumes system resources, and cannot be undone.
+
+ @datatypes
+ #qurt_mem_pool_t
+
+ @param[in] name Pointer to the memory pool name.
+ @param[in] base Base address of the memory region (divided by 4K).
+ @param[in] size Size (in bytes) of the memory region (divided by 4K).
+ @param[out] pool Pointer to the memory pool object.
+
+ @return
+ #QURT_EOK -- Success.
+
+ @dependencies
+ None.
+*/
+int qurt_mem_pool_create(char *name, unsigned base, unsigned size, qurt_mem_pool_t *pool);
+
+/**@ingroup func_qurt_mem_pool_add_pages
+ Adds a physical address range to the specified memory pool object.\n
+
+ @note1hang Call this operation only with root privileges (guest-OS mode).
+
+ @datatypes
+ #qurt_mem_pool_t
+
+ @param[in] pool Memory pool object.
+ @param[in] first_pageno First page number of the physical address range (equivalent to address >> 12).
+ @param[in] size_in_pages Number of pages in the physical address range (equivalent to size >> 12).
+
+ @return
+ #QURT_EOK -- Pages successfully added.
+
+ @dependencies
+ None.
+*/
+int qurt_mem_pool_add_pages(qurt_mem_pool_t pool,
+ unsigned first_pageno,
+ unsigned size_in_pages);
+
+/**@ingroup func_qurt_mem_pool_remove_pages
+ Removes a physical address range from the specified memory pool object.
+
+ If any part of the address range is in use, this operation returns an
+ error without changing the state.
+
+ @note1hang Call this operation only with root privileges (guest-OS mode).
+
+ @note1cont In the future, this operation will support (via the flags parameter) the
+ removal of a physical address range when part of the range is in use.
+
+ @datatypes
+ #qurt_mem_pool_t
+
+ @param[in] pool Memory pool object.
+ @param[in] first_pageno First page number of the physical address range (equivalent to address >> 12).
+ @param[in] size_in_pages Number of pages in the physical address range (equivalent to size >> 12).
+ @param[in] flags Remove options. Values: \n
+ - 0 -- Skip holes in the range that are not part of the pool (default) \n
+ - #QURT_POOL_REMOVE_ALL_OR_NONE -- Pages are removed only if the specified
+ physical address range is entirely contained (with no holes) in the
+ pool free space. @tablebulletend
+ @param[in] callback Callback procedure called when pages are successfully removed.
+ Not called if the operation fails. Passing 0 as the parameter
+ value causes the callback not to be called.
+ @param[in] arg Value passed as an argument to the callback procedure.
+
+ @return
+ #QURT_EOK -- Pages successfully removed.
+
+ @dependencies
+ None.
+*/
+int qurt_mem_pool_remove_pages(qurt_mem_pool_t pool,
+ unsigned first_pageno,
+ unsigned size_in_pages,
+ unsigned flags,
+ void (*callback)(void *),
+ void *arg);
+/**@ingroup memory_management_types*/
+#define QURT_POOL_REMOVE_ALL_OR_NONE 1 /**< Remove pages only if the entire range is free. */
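+
+/*
+ Usage sketch (illustrative only, guest-OS privilege assumed): donate a
+ 64 KB physical range to a pool and take it back later. The pool object
+ and the physical base address 0xAB000000 are assumptions for the
+ example; both page arguments are expressed in 4K pages.
+
+ unsigned first_page = 0xAB000000u >> 12;
+ unsigned num_pages = 0x10000u >> 12;
+ if (qurt_mem_pool_add_pages(pool, first_page, num_pages) == QURT_EOK) {
+ // ... allocate regions from the enlarged pool ...
+ (void)qurt_mem_pool_remove_pages(pool, first_page, num_pages,
+ QURT_POOL_REMOVE_ALL_OR_NONE, 0, NULL);
+ }
+*/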
+
+/**@ingroup func_qurt_mem_pool_attr_get
+ Gets the memory pool attributes. \n
+ Retrieves pool configurations based on the pool handle, and fills in
+ the attribute structure with configuration values.
+
+ @datatypes
+ #qurt_mem_pool_t \n
+ #qurt_mem_pool_attr_t
+
+ @param[in] pool Pool handle obtained from qurt_mem_pool_attach().
+ @param[out] attr Pointer to the memory region attribute structure.
+
+ @return
+ 0 -- Success. \n
+ #QURT_EINVALID -- Corrupt handle; pool handle is invalid.
+*/
+int qurt_mem_pool_attr_get (qurt_mem_pool_t pool, qurt_mem_pool_attr_t *attr);
+
+/**@ingroup func_qurt_mem_pool_attr_get_size
+ Gets the size of the specified memory pool range.
+
+ @datatypes
+ #qurt_mem_pool_attr_t \n
+ #qurt_size_t
+
+ @param[in] attr Pointer to the memory pool attribute structure.
+ @param[in] range_id Memory pool range key.
+ @param[out] size Pointer to the destination variable for the range size.
+
+ @return
+ 0 -- Success. \n
+ #QURT_EINVALID -- Range is invalid.
+
+ @dependencies
+ None.
+*/
+static inline int qurt_mem_pool_attr_get_size (qurt_mem_pool_attr_t *attr, int range_id, qurt_size_t *size){
+ if ((range_id >= MAX_POOL_RANGES) || (range_id < 0)){
+ (*size) = 0;
+ return QURT_EINVALID;
+ }
+ else {
+ (*size) = attr->ranges[range_id].size;
+ }
+ return QURT_EOK;
+}
+
+/**@ingroup func_qurt_mem_pool_attr_get_addr
+ Gets the start address of the specified memory pool range.
+
+ @datatypes
+ #qurt_mem_pool_attr_t \n
+ #qurt_addr_t
+
+ @param[in] attr Pointer to the memory pool attribute structure.
+ @param[in] range_id Memory pool range key.
+ @param[out] addr Pointer to the destination variable for the range start address.
+
+ @return
+ 0 -- Success. \n
+ #QURT_EINVALID -- Range is invalid.
+
+ @dependencies
+ None.
+*/
+static inline int qurt_mem_pool_attr_get_addr (qurt_mem_pool_attr_t *attr, int range_id, qurt_addr_t *addr){
+ if ((range_id >= MAX_POOL_RANGES) || (range_id < 0)){
+ (*addr) = 0;
+ return QURT_EINVALID;
+ }
+ else {
+ (*addr) = (attr->ranges[range_id].start)<<12;
+ }
+ return QURT_EOK;
+}
+
+/**@ingroup func_qurt_mem_pool_attr_get_addr_64
+ Gets the 64-bit start address of the specified memory pool range.
+
+ @datatypes
+ #qurt_mem_pool_attr_t \n
+ #qurt_addr_64_t
+
+ @param[in] attr Pointer to the memory pool attribute structure.
+ @param[in] range_id Memory pool range key.
+ @param[out] addr Pointer to the destination variable for the range start address.
+
+ @return
+ 0 -- Success. \n
+ #QURT_EINVALID -- Range is invalid.
+
+ @dependencies
+ None.
+*/
+static inline int qurt_mem_pool_attr_get_addr_64 (qurt_mem_pool_attr_t *attr, int range_id, qurt_addr_64_t *addr){
+ if ((range_id >= MAX_POOL_RANGES) || (range_id < 0)){
+ (*addr) = 0;
+ return QURT_EINVALID;
+ }
+ else {
+ (*addr) = ((qurt_addr_64_t)attr->ranges[range_id].start)<<12;
+ }
+ return QURT_EOK;
+}
+
+
+/**@ingroup func_qurt_mem_pool_status_get
+ Gets the memory pool status. \n
+ Based on the pool handle, retrieves the largest contiguous free memory,
+ total free memory, and total memory declared for the pool, in bytes. Fills in
+ the memory status structure with the values.
+
+ @datatypes
+ #qurt_mem_pool_t \n
+ #qurt_mem_pool_status_t
+
+ @param[in] pool Pool handle.
+ @param[out] status Pointer to the memory pool status structure.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EINVALID -- Corrupt handle; pool handle is invalid.
+*/
+int qurt_mem_pool_status_get (qurt_mem_pool_t pool, qurt_mem_pool_status_t *status);
+
+
+/**@ingroup func_qurt_mem_pool_is_available
+ Checks whether the number of pages that the page_count argument indicates
+ can be allocated from the specified pool.
+
+ @datatypes
+ #qurt_mem_pool_t \n
+ #qurt_mem_mapping_t \n
+
+ @param[in] pool Pool handle obtained from qurt_mem_pool_attach().
+ @param[in] page_count Number of 4K pages.
+ @param[in] mapping_type Variable of type qurt_mem_mapping_t.
+
+ @return
+ 0 -- Success. \n
+ #QURT_EINVALID -- mapping_type is invalid. \n
+ #QURT_EMEM -- Specified pages cannot be allocated from the pool.
+
+ @dependencies
+ None.
+*/
+int qurt_mem_pool_is_available(qurt_mem_pool_t pool, int page_count, qurt_mem_mapping_t mapping_type);
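+
+/*
+ Usage sketch (illustrative only): attach to a named pool and read the
+ size of its first range. The pool name is an assumption for the example.
+
+ qurt_mem_pool_t pool;
+ qurt_mem_pool_attr_t pattr;
+ qurt_size_t range_size;
+ if (qurt_mem_pool_attach("DEFAULT_PHYSPOOL", &pool) == QURT_EOK &&
+ qurt_mem_pool_attr_get(pool, &pattr) == QURT_EOK &&
+ qurt_mem_pool_attr_get_size(&pattr, 0, &range_size) == QURT_EOK) {
+ // range_size holds the size in bytes of range 0
+ }
+*/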
+
+
+/**@ingroup func_qurt_mem_region_create
+ @xreflabel{sec:mem_region_create}
+ Creates a memory region with the specified attributes.
+
+ The application initializes the memory region attribute structure with
+ qurt_mem_region_attr_init() and qurt_mem_region_attr_set_bus_attr().
+
+ If the virtual address attribute is set to its default value
+ (Section @xref{sec:qurt_mem_region_attr_init}), the virtual address of the memory region is
+ automatically assigned any available virtual address value.
+
+ If the memory mapping attribute is set to virtual mapping, the physical address of the memory region
+ is also automatically assigned.\n
+
+ @note1hang The physical address attribute is explicitly set in the attribute structure only
+ for memory regions with physical-contiguous-mapped mapping.
+
+ Memory regions are always assigned to memory pools. The pool value specifies the memory pool
+ that the memory region is assigned to.
+
+ @note1hang If attr is specified as NULL, the memory region is created with default
+ attribute values (Section @xref{sec:qurt_mem_region_attr_init}).
+ QuRT predefines the memory pool object #qurt_mem_default_pool
+ (Section @xref{dox:mem_management}), which allocates memory regions in SMI memory.
+
+ @datatypes
+ #qurt_mem_region_t \n
+ #qurt_size_t \n
+ #qurt_mem_pool_t \n
+ #qurt_mem_region_attr_t
+
+ @param[out] region Pointer to the memory region object.
+ @param[in] size Memory region size (in bytes). If size is not an integral multiple of 4K,
+ it is rounded up to a 4K boundary.
+ @param[in] pool Memory pool of the region.
+ @param[in] attr Pointer to the memory region attribute structure.
+
+ @return
+ #QURT_EOK -- Memory region successfully created.\n
+ #QURT_EMEM -- Not enough memory to create the region. \n
+ #QURT_EINVALID -- Invalid cache attributes / permissions provided in the attribute structure.
+
+ @dependencies
+ None.
+*/
+int qurt_mem_region_create(qurt_mem_region_t *region, qurt_size_t size, qurt_mem_pool_t pool, qurt_mem_region_attr_t *attr);
+
+/**@ingroup func_qurt_mem_region_delete
+ Deletes the specified memory region.
+
+ If the caller application created the memory region, it is removed and the system reclaims its
+ assigned memory.
+
+ If a different application created the memory region (and shared it with the caller
+ application), only the local memory mapping to the region is removed; the system does
+ not reclaim the memory.
+
+ @datatypes
+ #qurt_mem_region_t
+
+ @param[in] region Memory region object.
+
+ @return
+ #QURT_EOK -- Region successfully deleted. \n
+ #QURT_ELOCKED -- Buffer is locked; mapping delete failed.
+
+ @dependencies
+ None.
+*/
+int qurt_mem_region_delete(qurt_mem_region_t region);
+
+
+/**@ingroup func_qurt_mem_region_attr_get
+ @xreflabel{sec:mem_region_attr_get}
+ Gets the memory attributes of the specified memory region.
+ After a memory region is created, its attributes cannot be changed.
+
+ @datatypes
+ #qurt_mem_region_t \n
+ #qurt_mem_region_attr_t
+
+ @param[in] region Memory region object.
+ @param[out] attr Pointer to the destination structure for memory region attributes.
+
+ @return
+ #QURT_EOK -- Operation successfully performed. \n
+ Error code -- Failure.
+
+ @dependencies
+ None.
+*/
+int qurt_mem_region_attr_get(qurt_mem_region_t region, qurt_mem_region_attr_t *attr);
+
+
+/**@ingroup func_qurt_mem_region_attr_set_type
+ Sets the memory type in the specified memory region attribute structure.
+
+ The type indicates whether the memory region is local to an application or shared between
+ applications.
+ @cond rest_dist For more information, see @xhyperref{80VB41992,80-VB419-92}. @endcond
+
+ @datatypes
+ #qurt_mem_region_attr_t \n
+ #qurt_mem_region_type_t
+
+ @param[in,out] attr Pointer to the memory region attribute structure.
+ @param[in] type Memory type.
Values: \n + - #QURT_MEM_REGION_LOCAL \n + - #QURT_MEM_REGION_SHARED @tablebulletend + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_type(qurt_mem_region_attr_t *attr, qurt_mem_region_type_t type){ + attr->type = type; +} + +/**@ingroup func_qurt_mem_region_attr_get_size + Gets the memory region size from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_size_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] size Pointer to the destination variable for memory region size. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_size(qurt_mem_region_attr_t *attr, qurt_size_t *size){ + (*size) = attr->size; +} + +/**@ingroup func_qurt_mem_region_attr_get_type + Gets the memory type from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_region_type_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] type Pointer to the destination variable for the memory type. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_type(qurt_mem_region_attr_t *attr, qurt_mem_region_type_t *type){ + (*type) = attr->type; +} + +/**@ingroup func_qurt_mem_region_attr_set_physaddr + Sets the memory region 32-bit physical address in the specified memory attribute structure. + + @note1hang The physical address attribute is explicitly set only for memory regions with + physical contiguous mapping. Otherwise QuRT automatically sets it + when the memory region is created. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_paddr_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] addr Memory region physical address. + + @return + None. + */ +static inline void qurt_mem_region_attr_set_physaddr(qurt_mem_region_attr_t *attr, qurt_paddr_t addr){ + attr->ppn = (unsigned)(((unsigned)(addr))>>12); +} + +/**@ingroup func_qurt_mem_region_attr_get_physaddr + Gets the memory region physical address from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] addr Pointer to the destination variable for memory region physical address. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_physaddr(qurt_mem_region_attr_t *attr, unsigned int *addr){ + (*addr) = (unsigned)(((unsigned) (attr->ppn))<<12); +} + +/**@ingroup func_qurt_mem_region_attr_set_virtaddr + Sets the memory region virtual address in the specified memory attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_addr_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] addr Memory region virtual address. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_virtaddr(qurt_mem_region_attr_t *attr, qurt_addr_t addr){ + attr->virtaddr = addr; +} + +/**@ingroup func_qurt_mem_region_attr_get_virtaddr + Gets the memory region virtual address from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] addr Pointer to the destination variable for the memory region virtual address. + + @return + None. + + @dependencies + None. 
+ */ +static inline void qurt_mem_region_attr_get_virtaddr(qurt_mem_region_attr_t *attr, unsigned int *addr){ + (*addr) = (unsigned int)(attr->virtaddr); +} + +/**@ingroup func_qurt_mem_region_attr_set_mapping + Sets the memory mapping in the specified memory region attribute structure. + + The mapping value indicates how the memory region is mapped in virtual memory. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_mapping_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] mapping Mapping. Values: + - #QURT_MEM_MAPPING_VIRTUAL + - #QURT_MEM_MAPPING_PHYS_CONTIGUOUS + - #QURT_MEM_MAPPING_IDEMPOTENT + - #QURT_MEM_MAPPING_VIRTUAL_FIXED + - #QURT_MEM_MAPPING_NONE + - #QURT_MEM_MAPPING_VIRTUAL_RANDOM + - #QURT_MEM_MAPPING_INVALID @tablebulletend + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_mapping(qurt_mem_region_attr_t *attr, qurt_mem_mapping_t mapping){ + attr->mapping_type = mapping; +} + +/**@ingroup func_qurt_mem_region_attr_get_mapping + Gets the memory mapping from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_mapping_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] mapping Pointer to the destination variable for memory mapping. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_mapping(qurt_mem_region_attr_t *attr, qurt_mem_mapping_t *mapping){ + (*mapping) = attr->mapping_type; +} + +/**@ingroup func_qurt_mem_region_attr_set_cache_mode + Sets the cache operation mode in the specified memory region attribute structure. + + @cond rest_dist For more information on the cache, see @xhyperref{80VB41992,80-VB419-92}.@endcond + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_cache_mode_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] mode Cache mode. Values: \n + - #QURT_MEM_CACHE_WRITEBACK \n + - #QURT_MEM_CACHE_WRITETHROUGH\n + - #QURT_MEM_CACHE_WRITEBACK_NONL2CACHEABLE\n + - #QURT_MEM_CACHE_WRITETHROUGH_NONL2CACHEABLE\n + - #QURT_MEM_CACHE_WRITEBACK_L2CACHEABLE\n + - #QURT_MEM_CACHE_WRITETHROUGH_L2CACHEABLE\n + - #QURT_MEM_CACHE_NONE @tablebulletend + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_cache_mode(qurt_mem_region_attr_t *attr, qurt_mem_cache_mode_t mode){ + QURT_PGATTR_C_SET(attr->pga, (unsigned)mode); +} + +/**@ingroup func_qurt_mem_region_attr_get_cache_mode + Gets the cache operation mode from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_cache_mode_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] mode Pointer to the destination variable for cache mode. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_cache_mode(qurt_mem_region_attr_t *attr, qurt_mem_cache_mode_t *mode){ + unsigned int mode_temp = QURT_PGATTR_C_GET(attr->pga); + (*mode) = (qurt_mem_cache_mode_t)mode_temp; +} + +/**@ingroup func_qurt_mem_region_attr_set_bus_attr + Sets the (A1, A0) bus attribute bits in the specified memory region attribute structure. + + @cond rest_dist For more information on the bus attribute bits, see the @xhyperref{80VB41992,80-VB419-92}. @endcond + + @datatypes + #qurt_mem_region_attr_t + + @param[in,out] attr Pointer to the memory region attribute structure. 
+ @param[in] abits The (A1, A0) bits to use with the memory region, expressed as a 2-bit binary number. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_bus_attr(qurt_mem_region_attr_t *attr, unsigned abits){ + QURT_PGATTR_A_SET(attr->pga, abits); +} + +/**@ingroup func_qurt_mem_region_attr_get_bus_attr + Gets the (A1, A0) bus attribute bits from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] pbits Pointer to an unsigned integer that is filled in with + the (A1, A0) bits from the memory region attribute structure, expressed as a 2-bit binary number. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_bus_attr(qurt_mem_region_attr_t *attr, unsigned *pbits){ + (*pbits) = QURT_PGATTR_A_GET(attr->pga); +} + +void qurt_mem_region_attr_set_owner(qurt_mem_region_attr_t *attr, int handle); +void qurt_mem_region_attr_get_owner(qurt_mem_region_attr_t *attr, int *p_handle); +void qurt_mem_region_attr_set_perms(qurt_mem_region_attr_t *attr, unsigned perms); +void qurt_mem_region_attr_get_perms(qurt_mem_region_attr_t *attr, unsigned *p_perms); + +/**@ingroup func_qurt_mem_map_static_query + Determines whether a memory page is statically mapped. + Pages are specified by the following attributes: physical address, page size, cache mode, + and memory permissions. \n + - If the specified page is statically mapped, vaddr returns the virtual + address of the page. \n + - If the page is not statically mapped (or if it does not exist as specified), vaddr + returns -1 as the virtual address value.\n + The system configuration file defines QuRT memory maps. + + @datatypes + #qurt_addr_t \n + #qurt_mem_cache_mode_t \n + #qurt_perm_t + + @param[out] vaddr Virtual address corresponding to paddr. + @param[in] paddr Physical address. + @param[in] page_size Size of the mapped memory page. + @param[in] cache_attribs Cache mode (writeback, and so on). + @param[in] perm Access permissions. + + @return + #QURT_EOK -- Specified page is statically mapped, vaddr returns the virtual address. \n + #QURT_EMEM -- Specified page is not statically mapped, vaddr returns -1. \n + #QURT_EVAL -- Specified page does not exist. + + @dependencies + None. + */ +int qurt_mem_map_static_query(qurt_addr_t *vaddr, qurt_addr_t paddr, unsigned int page_size, qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perm); + + +/**@ingroup func_qurt_mem_region_query + Queries a memory region. \n + This function determines whether a dynamically-created memory region (Section @xref{sec:mem_region_create}) exists for the + specified virtual or physical address. + When a memory region has been determined to exist, its attributes are + accessible (Section @xref{sec:mem_region_attr_get}). + + @note1hang This function returns #QURT_EFATAL if #QURT_EINVALID is passed to both + vaddr and paddr (or to neither). + + @datatypes + #qurt_mem_region_t \n + #qurt_paddr_t + + @param[out] region_handle Pointer to the memory region object (if it exists). + @param[in] vaddr Virtual address to query; if vaddr is specified, paddr must be set to + the value #QURT_EINVALID. + @param[in] paddr Physical address to query; if paddr is specified, vaddr must be set to + the value #QURT_EINVALID. + + @return + #QURT_EOK -- Query successfully performed. \n + #QURT_EMEM -- Region not found for the specified address. \n + #QURT_EFATAL -- Invalid input parameters. 
+
+ @dependencies
+ None.
+ */
+int qurt_mem_region_query(qurt_mem_region_t *region_handle, qurt_addr_t vaddr, qurt_paddr_t paddr);
+
+
+/**@ingroup func_qurt_mapping_create
+ @xreflabel{hdr:qurt_mapping_create}
+ Creates a memory mapping in the page table.
+ Not supported if called from a user process; it always returns #QURT_EMEM in that case.
+
+ @datatypes
+ #qurt_addr_t \n
+ #qurt_size_t \n
+ #qurt_mem_cache_mode_t \n
+ #qurt_perm_t
+
+ @param[in] vaddr Virtual address.
+ @param[in] paddr Physical address.
+ @param[in] size Size (4K-aligned) of the mapped memory page.
+ @param[in] cache_attribs Cache mode (writeback, and so on).
+ @param[in] perm Access permissions.
+
+ @return
+ #QURT_EOK -- Mapping created. \n
+ #QURT_EMEM -- Failed to create the mapping. \n
+ #QURT_EINVALID -- Invalid cache attributes / permissions provided.
+
+ @dependencies
+ None.
+*/
+int qurt_mapping_create(qurt_addr_t vaddr, qurt_addr_t paddr, qurt_size_t size,
+ qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perm);
+
+/**@ingroup func_qurt_mapping_remove
+ @xreflabel{hdr:qurt_mapping_remove}
+ Deletes the specified memory mapping from the page table.
+
+ @datatypes
+ #qurt_addr_t \n
+ #qurt_size_t
+
+ @param[in] vaddr Virtual address.
+ @param[in] paddr Physical address.
+ @param[in] size Size of the mapped memory page (4K-aligned).
+
+ @return
+ #QURT_EOK -- Mapping removed. \n
+ #QURT_ELOCKED -- Buffer is locked; mapping delete failed.
+
+ @dependencies
+ None.
+
+ */
+int qurt_mapping_remove(qurt_addr_t vaddr, qurt_addr_t paddr, qurt_size_t size);
+
+/**@ingroup func_qurt_lookup_physaddr
+ Translates a virtual memory address to the physical memory address to which it maps. \n
+ The lookup occurs in the process of the caller. Use qurt_lookup_physaddr2() to look up the
+ physical address of another process.
+
+
+ @datatypes
+ #qurt_addr_t \n
+ #qurt_paddr_t
+
+ @param[in] vaddr Virtual address.
+
+ @return
+ Nonzero -- Physical address to which the virtual address is mapped.\n
+ 0 -- Virtual address not mapped.
+
+ @dependencies
+ None.
+*/
+qurt_paddr_t qurt_lookup_physaddr (qurt_addr_t vaddr);
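+
+/*
+ Usage sketch (illustrative only, guest-OS privilege assumed): map a
+ device page uncached, use it, then unmap it. The addresses are
+ assumptions for the example.
+
+ qurt_addr_t va = 0x20000000;
+ qurt_addr_t pa = 0xAB000000;
+ if (qurt_mapping_create(va, pa, 0x1000, QURT_MEM_CACHE_NONE,
+ QURT_PERM_READ | QURT_PERM_WRITE) == QURT_EOK) {
+ // ... access the device page through va ...
+ (void)qurt_mapping_remove(va, pa, 0x1000);
+ }
+*/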
+ */
+static inline void qurt_mem_region_attr_get_physaddr_64(qurt_mem_region_attr_t *attr, qurt_paddr_64_t *addr_64){
+ (*addr_64) = (unsigned long long)(((unsigned long long)(attr->ppn))<<12);
+}
+
+/**@ingroup func_qurt_mem_map_static_query_64
+ Determines whether a memory page is statically mapped.
+ Pages are specified by the following attributes: 64-bit physical address, page size, cache mode,
+ and memory permissions. \n
+ If the specified page is statically mapped, vaddr returns the virtual
+ address of the page.
+ If the page is not statically mapped (or if it does not exist as specified), vaddr
+ returns -1 as the virtual address value.\n
+ QuRT memory maps are defined in the system configuration file.
+
+ @datatypes
+ #qurt_addr_t \n
+ #qurt_paddr_64_t \n
+ #qurt_mem_cache_mode_t \n
+ #qurt_perm_t
+
+ @param[out] vaddr Virtual address corresponding to paddr.
+ @param[in] paddr_64 64-bit physical address.
+ @param[in] page_size Size of the mapped memory page.
+ @param[in] cache_attribs Cache mode (writeback, and so on).
+ @param[in] perm Access permissions.
+
+ @return
+ #QURT_EOK -- Specified page is statically mapped; a virtual address is returned in vaddr. \n
+ #QURT_EMEM -- Specified page is not statically mapped; -1 is returned in vaddr. \n
+ #QURT_EVAL -- Specified page does not exist.
+
+ @dependencies
+ None.
+ */
+int qurt_mem_map_static_query_64(qurt_addr_t *vaddr, qurt_paddr_64_t paddr_64, unsigned int page_size, qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perm);
+
+/**@ingroup func_qurt_mem_region_query_64
+ Determines whether a dynamically created memory region (Section @xref{sec:mem_region_create}) exists for the
+ specified virtual or physical address. When a memory region has been determined to exist, its attributes are
+ accessible (Section @xref{sec:mem_region_attr_get}).
+
+ @note1hang This function returns #QURT_EFATAL if #QURT_EINVALID is passed to both
+ vaddr and paddr (or to neither).
+
+ @datatypes
+ #qurt_mem_region_t \n
+ #qurt_addr_t \n
+ #qurt_paddr_64_t
+
+ @param[out] region_handle Pointer to the memory region object (if it exists).
+ @param[in] vaddr Virtual address to query; if vaddr is specified, paddr must be set to
+ the value #QURT_EINVALID.
+ @param[in] paddr_64 64-bit physical address to query; if paddr is specified, vaddr must be set to
+ the value #QURT_EINVALID.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EMEM -- Region not found for the specified address. \n
+ #QURT_EFATAL -- Invalid input parameters.
+
+ @dependencies
+ None.
+ */
+int qurt_mem_region_query_64(qurt_mem_region_t *region_handle, qurt_addr_t vaddr, qurt_paddr_64_t paddr_64);
+
+/**@ingroup func_qurt_mapping_create_64
+ @xreflabel{hdr:qurt_mapping_create_64}
+ Creates a memory mapping in the page table.
+ Not supported if called from a user process; in that case it always returns #QURT_EMEM.
+
+ @datatypes
+ #qurt_addr_t \n
+ #qurt_paddr_64_t \n
+ #qurt_size_t \n
+ #qurt_mem_cache_mode_t \n
+ #qurt_perm_t
+
+ @param[in] vaddr Virtual address.
+ @param[in] paddr_64 64-bit physical address.
+ @param[in] size Size (4K-aligned) of the mapped memory page.
+ @param[in] cache_attribs Cache mode (writeback, and so on).
+ @param[in] perm Access permissions.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EMEM -- Failure. \n
+ #QURT_EINVALID -- Invalid cache attributes or permissions provided.
+
+ @dependencies
+ None.
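+
+ A minimal create/remove sketch (addresses are hypothetical; QURT_MEM_CACHE_WRITEBACK
+ is assumed to be defined in qurt_types.h, and the call succeeds only from contexts
+ where mapping creation is permitted):
+ @code
+ qurt_addr_t va = 0x20000000u;          // 4K-aligned virtual address
+ qurt_paddr_64_t pa64 = 0x180000000ULL; // 4K-aligned 64-bit physical address
+ if (qurt_mapping_create_64(va, pa64, 0x1000u, QURT_MEM_CACHE_WRITEBACK,
+                            QURT_PERM_READ | QURT_PERM_WRITE) == QURT_EOK) {
+     // ... use the mapping ...
+     (void)qurt_mapping_remove_64(va, pa64, 0x1000u);
+ }
+ @endcode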
+*/
+int qurt_mapping_create_64(qurt_addr_t vaddr, qurt_paddr_64_t paddr_64, qurt_size_t size,
+ qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perm);
+
+/**@ingroup func_qurt_mapping_remove_64
+ @xreflabel{hdr:qurt_mapping_remove_64}
+ Deletes the specified memory mapping from the page table.
+
+ @datatypes
+ #qurt_addr_t \n
+ #qurt_paddr_64_t \n
+ #qurt_size_t
+
+ @param[in] vaddr Virtual address.
+ @param[in] paddr_64 64-bit physical address.
+ @param[in] size Size of the mapped memory page (4K-aligned).
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_ELOCKED -- Buffer is locked; mapping delete failed.
+
+ @dependencies
+ None.
+
+ */
+int qurt_mapping_remove_64(qurt_addr_t vaddr, qurt_paddr_64_t paddr_64, qurt_size_t size);
+
+/**@ingroup func_qurt_lookup_physaddr_64
+ Translates a virtual memory address to the 64-bit physical memory address it is mapped to. \n
+ The lookup occurs in the process of the caller. Use qurt_lookup_physaddr2() to look up the
+ physical address of another process.
+
+ @datatypes
+ #qurt_paddr_64_t \n
+ #qurt_addr_t
+
+ @param[in] vaddr Virtual address.
+
+ @return
+ Nonzero -- 64-bit physical address to which the virtual address is mapped. \n
+ 0 -- Virtual address has not been mapped.
+
+ @dependencies
+ None.
+*/
+qurt_paddr_64_t qurt_lookup_physaddr_64 (qurt_addr_t vaddr);
+/** @endcond */
+
+/** @cond internal_only */
+/**@ingroup func_qurt_mapping_reclaim
+ Deallocates all QuRT resources associated with the specified virtual
+ memory area, making it available for user memory management:\n
+ - The associated physical memory areas are freed and added to the
+ specified physical pool.\n
+ - The associated TLB entries are deleted and made available for TLB
+ management.\n
+ - The virtual memory area is not freed -- it is left in
+ place as allocated, but unmapped virtual memory. Access to this
+ memory area generates an exception.\n
+
+ The virtual memory area must be statically allocated.
+ If no pool is specified, the freed physical memory is not added to any pool.
+
+ @note1hang The virtual memory area is restricted to being filled with locked
+ TLB entries that are contiguous within the memory area, and contained by it.
+
+ @datatypes
+ #qurt_addr_t \n
+ #qurt_size_t \n
+ #qurt_mem_pool_t
+
+ @param[in] vaddr Virtual address of the memory area to free.
+ @param[in] vsize Size (in bytes) of the memory area to free.
+ @param[in] pool Handle to the physical pool where freed physical memory is added.
+ If set to 0, freed physical memory is not added to any pool.
+
+ @return
+ 0 -- Success. \n
+ Nonzero -- Failure, indicating either a partial success or a malformed request.
+
+ @note1hang The expected behavior is that QuRT logs messages related to the failure,
+ and callers are free to ignore the return value.
+
+ @dependencies
+ None.
+*/
+int qurt_mapping_reclaim(qurt_addr_t vaddr, qurt_size_t vsize, qurt_mem_pool_t pool);
+/** @endcond */
+/** @cond rest_reg_dist */
+/**@ingroup func_qurt_mem_configure_cache_partition
+ Configures the Hexagon cache partition at the system level.
+
+ A partition size value of #SEVEN_EIGHTHS_SIZE is applicable only to the L2 cache.
+
+ The L1 cache partition is not supported on Hexagon processor version V60 or greater.
+
+ @note1hang Call this operation only with QuRT OS privilege.
+
+ @datatypes
+ #qurt_cache_type_t \n
+ #qurt_cache_partition_size_t
+
+ @param[in] cache_type Cache type for partition configuration. Values: \n
+ - #HEXAGON_L1_I_CACHE \n
+ - #HEXAGON_L1_D_CACHE \n
+ - #HEXAGON_L2_CACHE @tablebulletend
+
+ @param[in] partition_size Cache partition size. Values: \n
+ - #FULL_SIZE \n
+ - #HALF_SIZE \n
+ - #THREE_QUARTER_SIZE \n
+ - #SEVEN_EIGHTHS_SIZE @tablebulletend
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EVAL -- Error.
+
+ @dependencies
+ None.
+ */
+int qurt_mem_configure_cache_partition(qurt_cache_type_t cache_type, qurt_cache_partition_size_t partition_size);
+
+
+/**@ingroup func_qurt_mem_syncht
+ @xreflabel{hdr:qurt_mem_syncht}
+ Performs heavy-weight synchronization of memory transactions.
+
+ This operation does not return until all previous memory transactions (cached and uncached load/store,
+ mem_locked, and so on) that originated from the current thread are complete and globally observable.
+
+ @note1hang This operation is implemented as a wrapper for the Hexagon syncht instruction.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+static inline void qurt_mem_syncht(void){
+ #ifdef __HEXAGON_ARCH__
+ __asm__ __volatile__ (" SYNCHT \n");
+ #endif
+}
+
+/**@ingroup func_qurt_mem_barrier
+ @xreflabel{hdr:qurt_mem_barrier}
+ Creates a barrier for memory transactions.
+
+ This operation ensures that all previous memory transactions are globally observable before any
+ future memory transactions are globally observable.
+
+ @note1hang This operation is implemented as a wrapper for the Hexagon barrier instruction.
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+static inline void qurt_mem_barrier(void){
+ #ifdef __HEXAGON_ARCH__
+ __asm__ __volatile__ (" BARRIER \n");
+ #endif
+}
+/** @endcond */
+
+/** @cond internal_only */
+/**@ingroup func_qurt_system_mem_alloc
+ Requests that the kernel allocate memory from the kernel-owned pool.
+
+ @param[in] size Size in bytes (aligned to 4K) to allocate.
+ @param[in] align Any alignment that must be considered for the allocation.
+ @param[in] flags Supports the #QURT_SYSTEM_ALLOC_VIRTUAL flag; allocates
+ available virtual memory in the address space of all processes.
+
+ @return
+ #QURT_EFATAL -- Allocation failed. \n
+ Otherwise -- Start address of the successful allocation.
+
+ @dependencies
+ None.
+*/
+unsigned qurt_system_mem_alloc(unsigned size, unsigned align, unsigned flags);
+/** @endcond */
+/** @cond rest_reg_dist*/
+/**@ingroup func_qurt_lookup_physaddr2
+ Translates the virtual memory address of the specified process to the 64-bit
+ physical memory address to which it is mapped.
+
+ @datatypes
+ #qurt_addr_t \n
+ #qurt_paddr_64_t
+
+ @param[in] vaddr Virtual address.
+ @param[in] pid PID.
+
+ @return
+ Nonzero -- 64-bit physical address to which the virtual address is mapped. \n
+ 0 -- Virtual address is not mapped.
+
+ @dependencies
+ None.
+*/
+qurt_paddr_64_t qurt_lookup_physaddr2(qurt_addr_t vaddr, unsigned int pid);
+/** @endcond */
+
+/**@ingroup func_qurt_mapping_attr_get
+ Gets the mapping attributes for a given virtual address and PID.
+
+ @datatypes
+ #qurt_addr_t \n
+ #qurt_mapping_attr_t
+
+ @param[in] vaddr Virtual address for which the attributes are required.
+ @param[in] pid Process ID of the target process.
+ @param[out] attr Pointer to the mapping attribute structure.
+
+ @return
+ 0 -- Success. \n
+ #QURT_EINVALID -- Incorrect virtual address or PID.
+*/
+int qurt_mapping_attr_get(qurt_addr_t vaddr, unsigned int pid, qurt_mapping_attr_t *attr);
+
+
+/**@ingroup func_qurt_mapping_attr_get_cache_mode
+ Gets the cache operation mode in the specified memory mapping attribute structure.
+
+
+ @datatypes
+ #qurt_mapping_attr_t \n
+ #qurt_mem_cache_mode_t
+
+ @param[in] attr Pointer to the memory mapping attribute structure.
+ @param[out] cache_mode Pointer to the destination variable for the cache mode.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+static inline void qurt_mapping_attr_get_cache_mode(qurt_mapping_attr_t *attr, qurt_mem_cache_mode_t *cache_mode)
+{
+ (*cache_mode) = attr->cache_mode;
+}
+
+/**@ingroup func_qurt_mapping_attr_get_physaddr
+ Gets the physical memory address in the specified memory mapping attribute structure.
+
+
+ @datatypes
+ #qurt_mapping_attr_t \n
+ #qurt_paddr_64_t
+
+ @param[in] attr Pointer to the memory mapping attribute structure.
+ @param[out] physaddr Pointer to the destination variable for the physical address.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+static inline void qurt_mapping_attr_get_physaddr(qurt_mapping_attr_t *attr, qurt_paddr_64_t *physaddr)
+{
+ (*physaddr) = attr->paddr;
+}
+
+/**@ingroup func_qurt_mapping_attr_get_perms
+ Gets the permissions in the specified memory mapping attribute structure.
+
+
+ @datatypes
+ #qurt_mapping_attr_t \n
+ #qurt_perm_t
+
+ @param[in] attr Pointer to the memory mapping attribute structure.
+ @param[out] perms Pointer to the destination variable for the permissions.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+static inline void qurt_mapping_attr_get_perms(qurt_mapping_attr_t *attr, qurt_perm_t *perms)
+{
+ (*perms) = attr->perms;
+}
+
+/**@ingroup func_qurt_mapping_attr_get_size
+ Gets the size in the specified memory mapping attribute structure. This represents the size of the
+ TLB entry that covers the virtual address.
+
+
+ @datatypes
+ #qurt_mapping_attr_t \n
+ #unsigned int
+
+ @param[in] attr Pointer to the memory mapping attribute structure.
+ @param[out] size Pointer to the destination variable for the size.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline void qurt_mapping_attr_get_size(qurt_mapping_attr_t *attr, unsigned int *size)
+{
+ (*size) = attr->size;
+}
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_MEMORY_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_mmap.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_mmap.h
new file mode 100755
index 0000000000000..c3bd875910af7
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_mmap.h
@@ -0,0 +1,359 @@
+#ifndef QURT_MMAP_H
+#define QURT_MMAP_H
+/**
+ @file qurt_mmap.h
+ @brief Prototypes of memory mapping/unmapping APIs.
+ The APIs allow the user to map, unmap, and change permissions
+ on memory regions.
+
+ EXTERNAL FUNCTIONS
+ None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None.
+
+Copyright (c) 2018-2021, 2022, 2023 Qualcomm Technologies, Inc.
+All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup func_qurt_mem_mmap
+ Creates a memory mapping with the specified attributes.
+ This API allows a root process caller to create a mapping on behalf of a user
+ process. If the client_handle belongs to a valid user process, the resulting
+ mapping is created for that process.
+ If -1 is passed in place of client_handle, the API creates the mapping
+ for the calling process itself.
+
+ @note1hang If the specified attributes are not valid, an error result is returned.
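+
+ A minimal anonymous-allocation sketch (illustrative only; the pool handle value,
+ flag choice, and size are hypothetical):
+ @code
+ qurt_mem_pool_t pool = (qurt_mem_pool_t)0; // hypothetical: default pool handle
+ void *va = qurt_mem_mmap(-1, pool, NULL, NULL, 0x1000u,
+                          QURT_PROT_READ | QURT_PROT_WRITE,
+                          QURT_MAP_ANON, -1, 0ULL);
+ if (va != QURT_MAP_FAILED) {
+     // Mapping was created for the caller's own process (client_handle == -1).
+ }
+ @endcode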
+
+ @param[in] client_handle Client handle to use for this mapping (optional).
+ @param[in] pool Optional argument that specifies a pool handle
+ if the user wants to allocate memory from a specific pool.
+ The default value for this argument is NULL.
+ @param[in] pRegion Map region. This argument is unused, and the default value is NULL.
+ @param[in] addr Virtual memory address.
+ @param[in] length Size of mapping in bytes.
+ @param[in] prot Mapping access permissions (R/W/X).
+ @param[in] flags Mapping modes.\n
+ - #QURT_MAP_NAMED_MEMSECTION
+ - #QURT_MAP_FIXED \n
+ - #QURT_MAP_NONPROCESS_VPOOL \n
+ - #QURT_MAP_TRYFIXED \n
+ - #QURT_MAP_ANON \n
+ - #QURT_MAP_PHYSADDR \n
+ - #QURT_MAP_VA_ONLY @tablebulletend
+ @param[in] fd File designator.
+ @param[in] offset Offset in file.
+
+ @return
+ Valid virtual address -- Success.\n
+ #QURT_MAP_FAILED -- Mapping creation failed.
+ */
+void *qurt_mem_mmap(int client_handle,
+ qurt_mem_pool_t pool,
+ qurt_mem_region_t *pRegion,
+ void *addr,
+ size_t length,
+ int prot,
+ int flags,
+ int fd,
+ unsigned long long offset);
+
+/**@ingroup func_qurt_mem_mmap2
+ Creates a memory mapping with the specified attributes, and returns a more descriptive
+ error code in case of failure.
+ This API allows a root process caller to create a mapping on behalf of a user
+ process. If the client_handle belongs to a valid user process, the resulting
+ mapping is created for that process.
+ If -1 is passed in place of client_handle, the API creates the mapping
+ for the calling process itself.
+
+ @note1hang If the specified attributes are not valid, an error result is returned.
+
+ @param[in] client_handle Client handle to use for this mapping (optional).
+ @param[in] pool Optional argument that allows the user to specify a pool handle
+ when the user wants to allocate memory from a specific pool.
+ The default value for this argument is NULL.
+ @param[in] pRegion Map region (unused argument); the default value is NULL.
+ @param[in] addr Virtual memory address.
+ @param[in] length Size of mapping in bytes.
+ @param[in] prot Mapping access permissions (R/W/X),
+ cache attributes, bus attributes, and user mode.
+ @param[in] flags Mapping modes:
+ shared, private, or anonymous.
+ @param[in] fd File designator.
+ @param[in] offset Offset in file.
+
+ @return
+ Valid virtual address -- Success.\n
+ #QURT_EMEM -- Physical address is not available. \n
+ #QURT_EFAILED -- VA is not available or mapping failed.\n
+ #QURT_EINVALID -- Invalid argument was passed (for example, an unaligned VA/PA).
+ */
+void *qurt_mem_mmap2(int client_handle,
+ qurt_mem_pool_t pool,
+ qurt_mem_region_t *pRegion,
+ void *addr,
+ size_t length,
+ int prot,
+ int flags,
+ int fd,
+ unsigned long long offset);
+
+/**@ingroup func_qurt_mem_mmap_by_name
+ Creates a memory mapping for a named memsection using the specified attributes.
+ The named memsection must be specified in cust_config.xml.
+
+ @note1hang If the specified attributes are not valid or the named memsection is not found,
+ an error result is returned.
+
+ @param[in] name Name of the memsection in cust_config.xml that specifies
+ this mapping. Must be less than 25 characters.
+ @param[in] addr Virtual memory address.
+ @param[in] length Size of mapping in bytes.
+ @param[in] prot Mapping access permissions (R/W/X),
+ cache attributes, bus attributes, and user mode.
+ @param[in] flags Mapping modes, such as
+ shared, private, or anonymous.
+ @param[in] offset Offset relative to the physical address range specified in the memsection.
+ If offset + length exceeds the size of the memsection, failure is
+ returned.
+ @return
+ Valid virtual address -- Success.\n
+ #QURT_MAP_FAILED -- Mapping creation failed.
+ */
+void *qurt_mem_mmap_by_name(const char* name,
+ void *addr,
+ size_t length,
+ int prot,
+ int flags,
+ unsigned long long offset);
+
+/**@ingroup func_qurt_mem_mprotect2
+ Changes access permissions and attributes on an existing mapping, based on the client_handle argument.
+
+ @note1hang If the specified virtual address is not found or invalid attributes are passed,
+ an error code is returned.
+
+ @note2 When an error is returned, the attributes/permissions might have been changed for part of the
+ mapping while remaining unchanged for the rest. Clients must not continue to use such mappings.
+
+ @param[in] client_handle Obtained from the current invocation function (Section 3.4.1).
+ @param[in] addr Virtual memory address.
+ @param[in] length Size of mapping in bytes.
+ @param[in] prot Mapping access permissions (R/W/X),
+ cache attributes, bus attributes, and user mode.
+ @return
+ #QURT_EOK -- Successfully changed permissions on the mapping.\n
+ #QURT_EFATAL -- Failed to change permissions on the mapping. \n
+ #QURT_EINVALID -- Requested attributes or permissions are invalid.
+ */
+int qurt_mem_mprotect2(int client_handle, const void *addr,
+ size_t length,
+ int prot);
+
+/**@ingroup func_qurt_mem_mprotect
+ Changes access permissions and attributes on an existing mapping.
+
+ @note1hang If the specified virtual address is not found or invalid attributes are passed,
+ an error code is returned.\n
+
+ @note2 When an error is returned, the attributes/permissions might have been changed for part of the
+ mapping while remaining unchanged for the rest. Clients must not continue to use such mappings.
+
+ @param[in] addr Virtual memory address.
+ @param[in] length Size of mapping in bytes.
+ @param[in] prot Mapping access permissions (R/W/X),
+ cache attributes, bus attributes, and user mode.
+ @return
+ #QURT_EOK -- Successfully changed permissions on the mapping. \n
+ #QURT_EFATAL -- Failed to change permissions on the mapping. \n
+ #QURT_EINVALID -- Requested attributes or permissions are invalid.
+ */
+int qurt_mem_mprotect(const void *addr,
+ size_t length,
+ int prot);
+
+/**@ingroup func_qurt_mem_munmap
+ Removes an existing mapping.
+
+ @note1hang If the specified mapping is not found in the context of the caller process
+ or invalid attributes are passed, an error code is returned.
+
+ @param[in] addr Virtual memory address.
+ @param[in] length Size of mapping in bytes.
+
+ @return
+ #QURT_EOK -- Successfully removed the mapping. \n
+ #QURT_EFATAL -- Failed to remove the mapping. \n
+ #QURT_ELOCKED -- Buffer is locked; mapping delete failed.
+ */
+int qurt_mem_munmap(void *addr,
+ size_t length);
+
+/**@ingroup func_qurt_mem_munmap2
+ Removes an existing mapping for a specified process.
+
+ @note1hang This API allows a root process entity, such as a driver, to remove a mapping
+ that was created for a user process. If the specified mapping is not found in the context
+ of the client handle or invalid attributes are passed, an error code is returned.
+
+ @param[in] client_handle Client handle of the user process that owns this mapping.
+ @param[in] addr Virtual memory address.
+ @param[in] length Size of mapping in bytes.
+
+ @return
+ #QURT_EOK -- Successfully removed the mapping. \n
+ #QURT_EFATAL -- Failed to remove the mapping. \n
+ #QURT_ELOCKED -- Buffer is locked; mapping delete failed.
+ */
+int qurt_mem_munmap2(int client_handle,
+ void *addr,
+ size_t length);
+
+/**@ingroup func_qurt_mem_munmap3
+ Removes an existing mapping or reservation for a specified process.
+
+ @param[in] client_handle Client handle of the user process that owns this mapping.
+ @param[in] addr Pointer to a virtual memory address.
+ @param[in] length Size of mapping in bytes.
+ @param[in] flags Specifies the flags.
+
+ @return
+ #QURT_EOK -- Successfully removed the mapping or reservation. \n
+ #QURT_EFATAL -- Failed to remove the mapping. \n
+ #QURT_ELOCKED -- Buffer is locked; mapping delete failed.
+ */
+int qurt_mem_munmap3(int client_handle,
+ void *addr,
+ size_t length,
+ int flags);
+
+/*
+|| The macros here follow the style of the standard mmap() macros, but with
+|| QURT_ prepended to avoid name conflicts, and to avoid having a dependency
+|| on sys/mman.h.
+||
+|| Wherever possible, any values here that are also present in sys/mman.h
+|| should have the same value in both places so that we can accept "mmap"
+|| calls without having to remap parameters to new values.
+||
+|| In the future, it would be desirable to have a regression test that
+|| checks, for instance, that these macros match. Example:
+||
+|| assert(QURT_MAP_FAILED == MAP_FAILED);
+|| ... repeat as needed ...
+*/
+
+/** @addtogroup memory_mapping_macros
+@{ */
+/** @cond */
+#define QURT_PROT_NONE 0x00U /**< */
+#define QURT_PROT_READ 0x01U /**< */
+#define QURT_PROT_WRITE 0x02U /**< */
+#define QURT_PROT_EXEC 0x04U /**< */
+#define QURT_PROT_NODUMP 0x08U /**< Skip dumping the mapping. During PD dump, some mappings
+ on host memory must be skipped to avoid a race condition
+ where the memory is removed from the host and the DSP process
+ crashes before the mapping is removed.*/
+#define QURT_PROT_ISLAND 0x10U /**< Island mapping. */
+
+#define QURT_MAP_SHARED 0x0001U /**< Shared. */
+#define QURT_MAP_PRIVATE 0x0002U /**< Private. */
+/** @endcond */
+#define QURT_MAP_NAMED_MEMSECTION 0x0004U /**< Named memsection. */
+#define QURT_MAP_FIXED 0x0010U /**< Fixed virtual address. */
+#define QURT_MAP_RENAME 0x0020U /**< Rename. */
+#define QURT_MAP_NORESERVE 0x0040U /**< No reserve. */
+#define QURT_MAP_INHERIT 0x0080U /**< Inherit. */
+#define QURT_MAP_NONPROCESS_VPOOL 0x0100U /**< Use a virtual address outside of the default range of the
+ processes. This option is only supported in the root process,
+ and only when the virtual memory split is enabled in the XML.
+ The root process can use this flag to create a mapping for a
+ user process; for example, if the virtual address space is configured
+ for a 3G/1G split, the root process can use this flag to create a
+ mapping in the top 1 GB area for the user process or the
+ lower 3 GB area for the root process. This is useful for
+ shared buffer use cases. */
+#define QURT_MAP_HASSEMAPHORE 0x0200U /**< Has semaphore. */
+#define QURT_MAP_TRYFIXED 0x0400U /**< Try to create a mapping at the virtual address that was passed.
+ If the passed virtual address fails, use a random virtual address. */
+#define QURT_MAP_WIRED 0x0800U /**< Wired. */
+#define QURT_MAP_FILE 0x0000U /**< File. */
+#define QURT_MAP_ANON 0x1000U /**< Allocate physical memory from the pool that was passed.
+ By default, memory is allocated from the default physpool. */
+#define QURT_MAP_VA_ONLY 0x2000U /**< Reserve a virtual address without
+ mapping it.
*/ + +/** @cond */ +#define QURT_MAP_ALIGNED(n) ((n) << QURT_MAP_ALIGNMENT_SHIFT) +#define QURT_MAP_ALIGNMENT_SHIFT 24 + + +#define QURT_MAP_ALIGNMENT_MASK QURT_MAP_ALIGNED(0xff) /**< */ +#define QURT_MAP_ALIGNMENT_64KB QURT_MAP_ALIGNED(16) /**< */ +#define QURT_MAP_ALIGNMENT_16MB QURT_MAP_ALIGNED(24) /**< */ +#define QURT_MAP_ALIGNMENT_4GB QURT_MAP_ALIGNED(32) /**< */ +#define QURT_MAP_ALIGNMENT_1TB QURT_MAP_ALIGNED(40) /**< */ +#define QURT_MAP_ALIGNMENT_256TB QURT_MAP_ALIGNED(48) /**< */ +#define QURT_MAP_ALIGNMENT_64PB QURT_MAP_ALIGNED(56) /**< */ +/** @endcond */ +#define QURT_MAP_FAILED ((void *) -1) /**< Mapping creation failed. */ + +/* +|| The macros below are extensions beyond the standard mmap flags, but follow +|| the style of the mmap flags. +*/ +/** @cond */ +// Describe bitfields in (prot) +#define QURT_PROT_CACHE_BOUNDS 16U,19U,7U /**< Bits 16 through 19 are cache attribute, default is 0. */ +#define QURT_PROT_BUS_BOUNDS 20U,21U,0U /**< Bits 20 through 21 are bus attributes, default is 0. */ +#define QURT_PROT_USER_BOUNDS 22U,23U,3U /**< Bits 22 through 23 are user mode, default is 3; + default of 3 means to derive user mode setting from the + default mode of the client. */ + +// Describe bitfields in (flags) +#define QURT_MAP_PHYSADDR_BOUNDS 15U,15U,0U /**< Bits 15 through 15 are physaddr, default is 0. */ +#define QURT_MAP_TYPE_BOUNDS 16U,19U,0U /**< Bits 16 through 19 are mapping type, default is 0. */ +#define QURT_MAP_REGION_BOUNDS 20U,23U,0U /**< Bits 20 through 23 are region type, default is 0. */ +/** @endcond */ + +// These macros get OR'ed into (prot) +#define QURT_PROT_CACHE_MODE(n) QURT_MMAP_BUILD(QURT_PROT_CACHE_BOUNDS,(n)) /**< */ +#define QURT_PROT_BUS_ATTR(n) QURT_MMAP_BUILD(QURT_PROT_BUS_BOUNDS,(n)) /**< */ +#define QURT_PROT_USER_MODE(n) QURT_MMAP_BUILD(QURT_PROT_USER_BOUNDS,(n)) /**< */ +// These macros get OR'ed into (flags) + +#define QURT_MAP_PHYSADDR QURT_MMAP_BUILD(QURT_MAP_PHYSADDR_BOUNDS,1U) /**< Use the physical address that was passed in offset field. + This is allowed only for root process. */ +#define QURT_MAP_TYPE(n) QURT_MMAP_BUILD(QURT_MAP_TYPE_BOUNDS,(n)) /**< */ +#define QURT_MAP_REGION(n) QURT_MMAP_BUILD(QURT_MAP_REGION_BOUNDS,(n)) /**< */ +/** @} */ /* end_addtogroup memory_mapping_macros */ +/** @cond */ +// These macros extract fields from (prot) +#define QURT_PROT_GET_CACHE_MODE(n) QURT_MMAP_EXTRACT(QURT_PROT_CACHE_BOUNDS,(n)) /**< */ +#define QURT_PROT_GET_BUS_ATTR(n) QURT_MMAP_EXTRACT(QURT_PROT_BUS_BOUNDS,(n)) /**< */ +#define QURT_PROT_GET_USER_MODE(n) QURT_MMAP_EXTRACT(QURT_PROT_USER_BOUNDS,(n)) /**< */ + +// These macros extract fields from (flags) +#define QURT_MAP_GET_TYPE(n) QURT_MMAP_EXTRACT(QURT_MAP_TYPE_BOUNDS,(n)) /**< */ +#define QURT_MAP_GET_REGION(n) QURT_MMAP_EXTRACT(QURT_MAP_REGION_BOUNDS,(n)) /**< */ + +// Macros for bitfield insertion and extraction +#define QURT_MMAP_MASK(lo,hi) (~((~0u) << ((hi)-(lo)+1U))) /**< Mask of same size as [lo..hi]. 
*/
+#define QURT_MMAP_BUILD_(lo,hi,def,n) ((((n)^(def))&QURT_MMAP_MASK((lo),(hi)))<<(lo)) /**< */
+#define QURT_MMAP_EXTRACT_(lo,hi,def,n) ((((n)>>(lo))&QURT_MMAP_MASK((lo),(hi)))^(def)) /**< */
+#define QURT_MMAP_BUILD(a,b) QURT_MMAP_BUILD_(a,b) /**< */
+#define QURT_MMAP_EXTRACT(a,b) QURT_MMAP_EXTRACT_(a,b) /**< */
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_mq.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_mq.h
new file mode 100755
index 0000000000000..580c83d3de41a
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_mq.h
@@ -0,0 +1,458 @@
+#ifndef QURT_MQ_H
+#define QURT_MQ_H
+/**
+ @file qurt_mq.h
+
+ @brief Prototypes of secure message queues API functions.
+
+ EXTERNALIZED FUNCTIONS
+ None
+
+ INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None
+
+ Copyright (c) 2019-2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+ Confidential and Proprietary - Qualcomm Technologies, Inc.
+======================================================================*/
+#include
+#include
+
+#ifdef __cplusplus
extern "C" {
+#endif
+
+/*=============================================================================
+ CONSTANTS AND MACROS
+=============================================================================*/
+#define QURT_MQ_NAME_MAXLEN 16U /**< Maximum name length. */
+
+
+/*=============================================================================
+ FORWARD DECLARATIONS & TYPEDEFS
+=============================================================================*/
+/* This enum must be generated in accordance with the process class numbers.
+ For now it is made to match the generated version; do not change this unless
+ there is a corresponding change in process_class.py. Indices start from 0;
+ basically: QURT_MQ_SECURITY_SCOPE_ = (1 << QURTK_process_class_index_)
+*/
+typedef enum {
+ QURT_MQ_SECURITY_SCOPE_KERNEL = ( 1U << 0 ),
+ QURT_MQ_SECURITY_SCOPE_SRM = ( 1U << 1 ),
+ QURT_MQ_SECURITY_SCOPE_SECURE = ( 1U << 2 ),
+ QURT_MQ_SECURITY_SCOPE_CPZ = ( 1U << 3 ),
+ QURT_MQ_SECURITY_SCOPE_ROOT = ( 1U << 4 ),
+ QURT_MQ_SECURITY_SCOPE_SIGNED = ( 1U << 5 ),
+ QURT_MQ_SECURITY_SCOPE_UNSIGNED = ( 1U << 6 ),
+ QURT_MQ_SECURITY_SCOPE_SECURE_ROOT = ( 1U << 7 )
+} qurt_mq_security_scope_t;
+
+typedef enum {
+ QURT_MQ_CARDINALITY_PTP = (1U << 0),
+ QURT_MQ_CARDINALITY_MTO = (1U << 1)
+}qurt_mq_cardinality_t;
+
+typedef unsigned int qurt_mqd_t;
+
+typedef union{
+ struct {
+ unsigned int perms:2;
+ unsigned int cardinality:1;
+ unsigned int blocking:1;
+
+ qurt_mq_security_scope_t creator_scope: 8;
+ qurt_mq_security_scope_t allowed_scope: 8; // can be a bitmask in the MTO case
+ unsigned int queue_closed: 1;
+ unsigned int reserved: 11;
+ }; // anonymous struct
+ unsigned int raw;
+} qurt_mq_flags_t;
+
+
+/* permissions are from qurt_types.h , block X though */
+#if 0
+/** Memory access permission. */
+typedef enum {
+ QURT_PERM_READ=0x1U, /**< */
+ QURT_PERM_WRITE=0x2U, /**< */
+ QURT_PERM_EXECUTE=0x4U, /**< */
+ QURT_PERM_FULL=QURT_PERM_READ|QURT_PERM_WRITE|QURT_PERM_EXECUTE, /**< */
+} qurt_perm_t;
+#endif
+
+struct qurt_mq_attr {
+ unsigned flags; /**< Configured flags. Only meaningful with get_attr(); used only for qurt_mq_flags_t.perms. */
+ unsigned mq_maxmsg; /**< Maximum number of messages. Used with create() and get_attr. */
+ unsigned short mq_send_msgsize; /**< Maximum size (bytes) of a message in the receiver-facing queue,
+ from sender to receiver. */
+ unsigned short mq_recv_msgsize; /**< Maximum size (bytes) of a message in the sender-facing queue,
+ from receiver to sender. */
+ unsigned client_pid; /**< Process ID of the client that is allowed to open the message queue
+ that was created using qurt_mq_create(). */
+ qurt_mq_cardinality_t cardinality; /**< Cardinality of the message queue connection, see below. */
+ qurt_mq_security_scope_t scope; /**< Security scope of the senders to the queue. */
+};
+
+
+/*=============================================================================
+ EXTERNS & FUNCTIONS
+=============================================================================*/
+/**@ingroup func_qurt_mq_attr_init
+ Initializes attributes to the default values used for creating the queue.
+
+ The initialize operation sets the following default attribute values: \n
+ - flags - QURT_PERM_READ | QURT_PERM_WRITE \n
+ - maxmsg - 1 \n
+ - mq_send_msgsize - 8 \n
+ - mq_recv_msgsize - 8 \n
+ - client_pid - -1 \n
+ - cardinality - QURT_MQ_CARDINALITY_PTP \n
+ - scope - QURT_MQ_SECURITY_SCOPE_SIGNED \n
+
+ @datatypes
+ #qurt_mq_attr
+
+ @param[in,out] attr Pointer to the initialized message queue object.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+void qurt_mq_attr_init(struct qurt_mq_attr * attr);
+
+/**@ingroup qurt_mq_attr_set_send_msgsize
+ Sets the maximum message size, in bytes, that the sender can send.
+ The maximum message length is configurable using the XML configuration; however, it is
+ limited to a maximum of 62 bytes.
+
+ @datatypes
+ #qurt_mq_attr
+
+ @param[in,out] attr Pointer to the message queue object.
+ @param[in] len Length of message in bytes.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+void qurt_mq_attr_set_send_msgsize (struct qurt_mq_attr *attr, size_t len);
+
+/**@ingroup qurt_mq_attr_set_recv_msgsize
+ Sets the maximum message size, in bytes, that the receiver can read.
+ The maximum message length is configurable using the XML configuration; however, it is
+ limited to a maximum of 62 bytes.
+
+ @datatypes
+ #qurt_mq_attr
+
+ @param[in,out] attr Pointer to the message queue object.
+ @param[in] len Length of message in bytes.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+void qurt_mq_attr_set_recv_msgsize (struct qurt_mq_attr *attr, size_t len);
+
+/**@ingroup qurt_mq_attr_set_maxmsg
+ Sets the maximum number of messages that can be queued in the message queue.
+ The message depth is configurable using the XML configuration.
+
+ @datatypes
+ #qurt_mq_attr
+
+ @param[in,out] attr Pointer to the message queue object.
+ @param[in] depth Maximum number of messages that can be queued.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+void qurt_mq_attr_set_maxmsg (struct qurt_mq_attr *attr, unsigned int depth);
+
+/**@ingroup qurt_mq_attr_set_scope
+ Sets the scope of the message queue. A message queue created with a security
+ scope allows only a process class of that scope to open the message queue.
+
+ @datatypes
+ #qurt_mq_attr \n
+ #qurt_mq_security_scope_t
+
+ @param[in,out] attr Pointer to the message queue object.
+ @param[in] scope Scope of the message queue: \n
+ #QURT_MQ_SECURITY_SCOPE_KERNEL \n
+ #QURT_MQ_SECURITY_SCOPE_SRM \n
+ #QURT_MQ_SECURITY_SCOPE_SECURE \n
+ #QURT_MQ_SECURITY_SCOPE_CPZ \n
+ #QURT_MQ_SECURITY_SCOPE_ROOT \n
+ #QURT_MQ_SECURITY_SCOPE_SIGNED \n
+ #QURT_MQ_SECURITY_SCOPE_UNSIGNED
+
+ @return
+ None.
+
+ @dependencies
+ None.
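+
+ A typical attribute-setup sketch (sizes, depth, and the queue name are hypothetical):
+ @code
+ struct qurt_mq_attr attr;
+ qurt_mqd_t mqd;
+ qurt_mq_attr_init(&attr);                 // start from the defaults listed above
+ qurt_mq_attr_set_send_msgsize(&attr, 32); // <= 62 bytes
+ qurt_mq_attr_set_recv_msgsize(&attr, 32);
+ qurt_mq_attr_set_maxmsg(&attr, 4);
+ qurt_mq_attr_set_scope(&attr, QURT_MQ_SECURITY_SCOPE_SIGNED);
+ if (qurt_mq_create(&mqd, "my_queue", &attr) == QURT_EOK) {
+     // Queue created; a client can now open it by name.
+ }
+ @endcode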
+*/
+void qurt_mq_attr_set_scope (struct qurt_mq_attr *attr, qurt_mq_security_scope_t scope);
+
+
+/**@ingroup qurt_mq_attr_set_client_pid
+ Sets the client PID that can open this message queue.
+ If client_pid is set, the allowed scope is not considered when opening the message queue.
+
+ @datatypes
+ #qurt_mq_attr
+
+ @param[in,out] attr Pointer to the message queue object.
+ @param[in] client_pid Valid PID of the client process.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+void qurt_mq_attr_set_client_pid (struct qurt_mq_attr *attr, unsigned client_pid);
+
+/**@ingroup qurt_mq_attr_set_flags
+ Sets the properties of the message queue.
+ The current implementation uses this only to set the permissions for the message queue through the flags attribute.
+ The default is #QURT_PERM_READ | #QURT_PERM_WRITE; explicit permissions are not implemented.
+
+ @datatypes
+ #qurt_mq_attr
+
+ @param[in,out] attr Pointer to the message queue object.
+ @param[in] flags Permissions for the message queue.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+void qurt_mq_attr_set_flags (struct qurt_mq_attr *attr, unsigned int flags);
+
+/**@ingroup qurt_mq_create
+ Creates a message queue with the provided name and attributes.
+ The calling process becomes the owner of the queue.
+ The name of the message queue is limited to 16 characters, including the NULL terminator.
+
+ @datatypes
+ #qurt_mq_attr \n
+ #qurt_mqd_t
+
+ @param[out] mqd Returns a pointer to the message queue identifier if
+ the message queue was successfully created.
+ @param[in] name String identifier of the message queue.
+ @param[in] attr Pointer to the initialized message queue attribute
+ structure that specifies the attributes of the created message queue.
+
+ @return
+ #QURT_EOK -- Message queue created. \n
+ #QURT_EINVALID -- Invalid arguments. \n
+ #QURT_ENOSPC -- Maximum number of queues in the system exceeded.
+
+ @dependencies
+ None.
+*/
+int qurt_mq_create(qurt_mqd_t *mqd, const char *name, struct qurt_mq_attr * attr);
+
+/**@ingroup qurt_mq_open
+ Opens a message queue connection between a process and a created message queue.
+
+ @datatypes
+ #qurt_mq_attr \n
+ #qurt_mqd_t
+
+ @param[out] mqd Returns a pointer to the message queue
+ identifier if the message queue was successfully opened.
+ @param[in] name String identifier of the message queue.
+ @param[in] flags Flags that define the behavior of the message queue connection.
+ Permissions:\n
+ #QURT_PERM_READ \n
+ #QURT_PERM_WRITE \n
+ #QURT_PERM_READ | QURT_PERM_WRITE @tablebulletend
+ Default is QURT_PERM_READ | QURT_PERM_WRITE; explicit permissions are not implemented. \n
+ Cardinality: \n
+ #QURT_MQ_CARDINALITY_PTP (default) \n
+ #QURT_MQ_CARDINALITY_MTO (not implemented) \n
+ Blocking: suspends the thread until the message queue with the specified name is created. \n
+ Scope: security boundary to which the message queue and its users are constrained;
+ it is coupled with the process privilege level/scope.\n
+ #QURT_MQ_SECURITY_SCOPE_KERNEL \n
+ #QURT_MQ_SECURITY_SCOPE_SRM \n
+ #QURT_MQ_SECURITY_SCOPE_SECURE \n
+ #QURT_MQ_SECURITY_SCOPE_CPZ \n
+ #QURT_MQ_SECURITY_SCOPE_ROOT \n
+ #QURT_MQ_SECURITY_SCOPE_SIGNED \n
+ #QURT_MQ_SECURITY_SCOPE_UNSIGNED @tablebulletend
+
+ @return
+ #QURT_EOK -- Message queue connection successfully opened. \n
+ #QURT_EFAILED -- Message queue connection failed (for a non-blocking message queue). \n
+ #QURT_ENOTALLOWED -- Open failed due to security scope mismatch.
+
+ @dependencies
+ None.
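+
+ A client-side sketch (the queue name and payload are hypothetical; the flag
+ initialization pattern shown is illustrative only):
+ @code
+ qurt_mqd_t mqd;
+ qurt_mq_flags_t flags;
+ flags.raw = 0u;
+ flags.perms = QURT_PERM_READ | QURT_PERM_WRITE; // default permissions
+ if (qurt_mq_open(&mqd, "my_queue", flags) == QURT_EOK) {
+     const char msg[] = "ping";
+     (void)qurt_mq_send(mqd, msg, sizeof(msg));
+     qurt_mq_close(mqd);
+ }
+ @endcode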
+*/
+int qurt_mq_open (qurt_mqd_t *mqd, const char *name, qurt_mq_flags_t flags);
+
+/**@ingroup qurt_mq_send
+ Sends a message over the message queue.\n
+ - If the message queue is full, the calling thread shall be
+ suspended until space becomes available to enqueue the message. \n
+ - If there exists a thread suspended on an empty queue
+ to receive a message, qurt_mq_send shall resume that thread.
+
+ @datatypes
+ #qurt_mqd_t
+
+ @param[in] mqd Pointer to the message queue identifier.
+ @param[in] msg_ptr Pointer to the message buffer.
+ @param[in] msg_len Length of the message buffer in bytes.
+
+ @return
+ #QURT_EOK -- Message queue send was successful.\n
+ #QURT_EMSGSIZE -- Message size in the msg_len field is greater than the max_message_len specified during queue creation.\n
+ #QURT_ENOTALLOWED -- Send failed due to security scope mismatch.
+
+ @dependencies
+ None.
+*/
+int qurt_mq_send(qurt_mqd_t mqd, const char *msg_ptr, size_t msg_len);
+
+/**@ingroup qurt_mq_send_timed
+ Sends a message over the message queue.\n
+ - If the message queue is full, the calling thread shall be
+ suspended until space becomes available to enqueue the message or until the timeout is reached. \n
+ - If there exists a thread suspended on an empty queue
+ to receive a message, qurt_mq_send_timed shall resume that thread.\n
+ - If the timeout is reached, qurt_mq_send_timed shall return #QURT_ETIMEDOUT.
+
+ @datatypes
+ #qurt_mqd_t
+
+ @param[in] mqd Pointer to the message queue identifier.
+ @param[in] msg_ptr Pointer to the message buffer.
+ @param[in] duration Timeout interval (in microseconds); the duration value must be
+ between #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+ @param[in] msg_len Length of the message buffer in bytes.
+
+ @return
+ #QURT_EOK -- Message queue send was successful. \n
+ #QURT_EMSGSIZE -- Message size in the msg_len field is greater than the max_message_len specified during queue creation.\n
+ #QURT_ENOTALLOWED -- Send failed due to security scope mismatch. \n
+ #QURT_ETIMEDOUT -- Timeout.
+
+ @dependencies
+ None.
+*/
+int qurt_mq_send_timed(qurt_mqd_t mqd, const char *msg_ptr, unsigned long long int duration, size_t msg_len);
+
+ /**@ingroup qurt_mq_recv
+ Receives a message from the message queue. \n
+ - If the message queue is empty, the calling thread shall be
+ suspended until a message is enqueued in the message queue. \n
+ - If there exists a thread suspended on a full queue to
+ send a message, qurt_mq_recv shall resume that thread.
+
+ @datatypes
+ #qurt_mqd_t
+
+ @param[in] mqd Pointer to the message queue identifier.
+ @param[in] msg_ptr Pointer to the message buffer.
+ @param[in,out] msg_len Pointer to the length of the message buffer.
+
+ @return
+ #QURT_EOK -- Message successfully received.\n
+ #QURT_EINVALID -- Message pointer or msg_len pointer is NULL. \n
+ #QURT_EBADR -- Message queue descriptor (mqd) is invalid. \n
+ #QURT_EBADF -- Sender closed the message queue.
+
+ @dependencies
+ None.
+*/
+int qurt_mq_recv(qurt_mqd_t mqd, unsigned char *msg_ptr, size_t *msg_len);
+
+ /**@ingroup qurt_mq_recv_timed
+ Receives a message from the message queue. \n
+ - If the message queue is empty, the calling thread shall be
+ suspended until a message is enqueued in the message queue or until the timeout is reached.\n
+ - If there exists a thread suspended on a full queue to
+ send a message, qurt_mq_recv_timed shall resume that thread.\n
+ - If the timeout is reached, qurt_mq_recv_timed shall return #QURT_ETIMEDOUT.
+
+ @datatypes
+ #qurt_mqd_t
+
+ @param[in] mqd Pointer to the message queue identifier.
+ @param[in] msg_ptr Pointer to the message buffer.
+ @param[in] duration Timeout interval (in microseconds); the duration value must be
+ between #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+ @param[in,out] msg_len Pointer to the length of the message buffer.
+
+ @return
+ #QURT_EOK -- Message successfully received.\n
+ #QURT_EINVALID -- Message pointer or msg_len pointer is NULL. \n
+ #QURT_EBADR -- Message queue descriptor (mqd) is invalid.\n
+ #QURT_EBADF -- Sender closed the message queue. \n
+ #QURT_ETIMEDOUT -- Timeout.
+
+ @dependencies
+ None.
+*/
+int qurt_mq_recv_timed(qurt_mqd_t mqd, unsigned char *msg_ptr, unsigned long long int duration, size_t *msg_len);
+
+ /**@ingroup qurt_mq_close
+ Closes the message queue and disassociates the calling process (client) from the message queue
+ under this descriptor. Marks the queue as closed for the receiver.
+ This function is expected to be called from the client side. If called
+ from the server side, the function reduces to a no-op and returns success.
+
+ @datatypes
+ #qurt_mqd_t
+
+ @param[in] mqd Pointer to the message queue identifier.
+
+ @return
+ #QURT_EOK -- Message queue closed successfully.\n
+ #QURT_EBADR -- Invalid descriptor.\n
+ #QURT_ENOTALLOWED -- Message queue close was not called from the client side.
+
+ @dependencies
+ None.
+*/
+int qurt_mq_close(qurt_mqd_t mqd);
+
+ /**@ingroup qurt_mq_destroy
+ Destroys the message queue. This function must be
+ called from the process that called qurt_mq_create().
+
+ @datatypes
+ #qurt_mqd_t
+
+ @param[in] mqd Pointer to the message queue identifier.
+
+ @return
+ #QURT_EOK -- Message queue destroyed successfully.\n
+ #QURT_EBADR -- Invalid descriptor.\n
+ #QURT_ENOTALLOWED -- Message queue destroy was not called from the process that created the queue.
+
+ @dependencies
+ None.
+*/
+int qurt_mq_destroy(qurt_mqd_t mqd);
+
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+#endif //QURT_MQ_H
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_mutex.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_mutex.h
new file mode 100755
index 0000000000000..4ad6b270cdde6
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_mutex.h
@@ -0,0 +1,211 @@
+#ifndef QURT_MUTEX_H
+#define QURT_MUTEX_H
+/**
+ @file qurt_mutex.h
+ @brief Prototypes of the mutex API.
+ This is mostly a user-space mutex, but it calls into the
+ kernel to block if the mutex is taken.
+
+EXTERNAL FUNCTIONS
+ None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None.
+
+Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup mutex_types
+@{ */
+/*=============================================================================
+ TYPEDEFS
+=============================================================================*/
+
+/** QuRT mutex type.
+
+ Both non-recursive mutex lock and unlock, and recursive
+ mutex lock and unlock can be applied to this type.
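+
+ An illustrative declaration sketch (static initialization shown; dynamic
+ initialization with qurt_mutex_init() requires a matching qurt_mutex_destroy()):
+ @code
+ qurt_mutex_t my_lock = QURT_MUTEX_INIT; // QURT_MUTEX_INIT is defined below
+ @endcode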
+ */ +typedef union qurt_mutex_aligned8{ + /** @cond */ + struct { + unsigned int holder; + unsigned int count; + unsigned int queue; + unsigned int wait_count; + }; + unsigned long long int raw; + /** @endcond */ +} qurt_mutex_t; +/** @} */ /* end_addtogroup mutex_types */ +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/* @addtogroup mutex_const_macros +@{ */ +#define MUTEX_MAGIC 0xfe /**< */ +#define QURTK_FUTEX_FREE_MAGIC 0x1F // 11111 /**< */ +#define QURT_MUTEX_INIT {{MUTEX_MAGIC, 0, QURTK_FUTEX_FREE_MAGIC,0}} /**< Suitable as an initializer for a + variable of type qurt_mutex_t. */ +/* @} */ /* end_addtogroup mutex_const_macros */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ +/**@ingroup func_qurt_mutex_init + Initializes a mutex object. + The mutex is initially unlocked. + + @note1hang Each mutex-based object has one or more kernel resources associated with it; + to prevent resource leaks, call qurt_mutex_destroy() + when this object is not used anymore + @datatypes + #qurt_mutex_t + + @param[out] lock Pointer to the mutex object. Returns the initialized object. + + @return + None. + + @dependencies + None. + + */ +void qurt_mutex_init(qurt_mutex_t *lock); + +/**@ingroup func_qurt_mutex_destroy + Destroys the specified mutex. + + @note1hang Mutexes must be destroyed when they are no longer in use. Failure to do this + causes resource leaks in the QuRT kernel.\n + @note1cont Mutexes must not be destroyed while they are still in use. If this occurs, the + behavior of QuRT is undefined. + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the mutex object to destroy. + + @return + None. + + @dependencies + None. + + */ +void qurt_mutex_destroy(qurt_mutex_t *lock); + +/**@ingroup func_qurt_mutex_lock + Locks the specified mutex. + If a thread performs a lock operation on a mutex that is not in use, the thread gains + access to the shared resource that is protected by the mutex, and continues executing. + + If a thread performs a lock operation on a mutex that is already in use by another + thread, the thread is suspended. When the mutex becomes available again (because the + other thread has unlocked it), the thread is awakened and given access to the shared + resource. + + @note1hang A thread is suspended indefinitely if it locks a mutex that it has already + locked. Avoid this by using recursive mutexes (Section @xref{dox:recursive_mutexes}). + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the mutex object. Specifies the mutex to lock. + + @return + None. + + @dependencies + None. + */ +void qurt_mutex_lock(qurt_mutex_t *lock); /* blocking */ + +/**@ingroup func_qurt_mutex_lock_timed + Locks the specified mutex. + When a thread performs a lock operation on a mutex that is not in use, the thread gains + access to the shared resource that is protected by the mutex, and continues executing. + + When a thread performs a lock operation on a mutex that is already in use by another + thread, the thread is suspended. When the mutex becomes available again (because the + other thread has unlocked it), the thread is awakened and given access to the shared + resource. If the duration of suspension exceeds the timeout duration, wait is + terminated and no access to mutex is granted. 
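+
+ A minimal sketch (the 1000-microsecond timeout and the my_lock variable,
+ assumed initialized elsewhere, are hypothetical):
+ @code
+ if (qurt_mutex_lock_timed(&my_lock, 1000uLL) == QURT_EOK) {
+     // ... critical section ...
+     qurt_mutex_unlock(&my_lock);
+ }
+ @endcode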
+
+ @datatypes
+ #qurt_mutex_t
+
+ @param[in] lock Pointer to the mutex object; specifies the mutex to lock.
+ @param[in] duration Timeout interval (in microseconds); the duration value must be between
+ #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_ETIMEDOUT -- Timeout.
+
+ @dependencies
+ None.
+ */
+int qurt_mutex_lock_timed (qurt_mutex_t * lock, unsigned long long int duration);
+
+/**@ingroup func_qurt_mutex_unlock
+ Unlocks the specified mutex. \n
+ More than one thread can be suspended on a mutex. When the mutex is unlocked, only the
+ highest-priority thread waiting on the mutex is awakened. If the awakened thread has
+ higher priority than the current thread, a context switch occurs.
+
+ @note1hang The behavior of QuRT is undefined if a thread unlocks a mutex it did not first
+ lock.
+
+ @datatypes
+ #qurt_mutex_t
+
+ @param[in] lock Pointer to the mutex object. Specifies the mutex to unlock.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+void qurt_mutex_unlock(qurt_mutex_t *lock); /* unlock */
+
+/**@ingroup func_qurt_mutex_try_lock
+ @xreflabel{hdr:qurt_mutex_try_lock}
+ Attempts to lock the specified mutex.
+ If a thread performs a try_lock operation on a mutex that is not in use, the thread gains
+ access to the shared resource that is protected by the mutex, and continues executing.
+
+ @note1hang If a thread performs a try_lock operation on a mutex that it has already locked
+ or that is in use by another thread, qurt_mutex_try_lock immediately returns with a
+ nonzero result value.
+
+
+ @datatypes
+ #qurt_mutex_t
+
+ @param[in] lock Pointer to the mutex object. Specifies the mutex to lock.
+
+ @return
+ 0 -- Success. \n
+ Nonzero -- Failure.
+
+ @dependencies
+ None.
+ */
+int qurt_mutex_try_lock(qurt_mutex_t *lock);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_MUTEX_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_os_services.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_os_services.h
new file mode 100755
index 0000000000000..cbc4c239e9620
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_os_services.h
@@ -0,0 +1,24 @@
+/*=============================================================================
+
+ qurt_os_services.h
+
+GENERAL DESCRIPTION
+
+EXTERNAL FUNCTIONS
+ None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None.
+
+ Copyright (c) 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+=============================================================================*/
+
+#define QURT_OS_SERVICE_THREAD "/os/thread" /**< Thread service */
+#define QURT_OS_SERVICE_FS_HUB "/os/fs_hub" /**< File-system hub */
+#define QURT_OS_SERVICE_CALLBACK "/os/callback" /**< QDI callback service */
+#define QURT_OS_SERVICE_INTERRUPTS "/os/interrupt" /**< Interrupt service */
+#define QURT_OS_SERVICE_PROXY "/os/proxy" /**< QDI proxy service */
+#define QURT_OS_SERVICE_MEMORY "/os/memory" /**< Memory management service */
+#define QURT_OS_SERVICE_MEMPOOL "/os/mempool" /**< Pool management service */
+#define QURT_OS_SERVICE_PROCESS "/os/process" /**< Process management service */
+#define QURT_OS_SERVICE_MMAP "/os/mem_mapper" /**< Memory mapper service */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_pimutex.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_pimutex.h
new file mode 100755
index 0000000000000..61aee5cba7ce8
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_pimutex.h
@@ -0,0 +1,200 @@
+#ifndef QURT_PIMUTEX_H
+#define QURT_PIMUTEX_H 1
+/**
+ @file qurt_pimutex.h
+ @brief Prototypes of the qurt_pimutex API.
+
+EXTERNAL FUNCTIONS
+ None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None.
+
+Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+ FUNCTIONS
+=============================================================================*/
+/**@ingroup func_qurt_pimutex_init
+ Initializes a priority inheritance mutex object.
+ The priority inheritance mutex is initially unlocked.
+
+ This function works the same as qurt_mutex_init().
+
+ @note1hang Each pimutex-based object has one or more kernel resources associated with it;
+ to prevent resource leaks, call qurt_pimutex_destroy()
+ when the object is no longer in use.
+
+ @datatypes
+ #qurt_mutex_t
+
+ @param[out] lock Pointer to the priority inheritance mutex object.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+
+ */
+void qurt_pimutex_init(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_pimutex_destroy
+ Destroys the specified priority inheritance mutex.
+
+ @note1hang Priority inheritance mutexes must be destroyed when they are no longer in
+ use. Failure to do this causes resource leaks in the QuRT kernel.\n
+ @note1cont Priority inheritance mutexes must not be destroyed while they are still in use.
+ If this occurs, the behavior of QuRT is undefined.
+
+ @datatypes
+ #qurt_mutex_t
+
+ @param[in] lock Pointer to the priority inheritance mutex object to destroy.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+
+ */
+void qurt_pimutex_destroy(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_pimutex_lock
+ Requests access to a shared resource. If a thread performs a lock operation on a mutex
+ that is not in use, the thread gains access to the shared resource that the mutex protects,
+ and continues executing.
+
+ If a thread performs a lock operation on a mutex that is already in use by another
+ thread, the thread is suspended. When the mutex becomes available again (because the
+ other thread has unlocked it), the thread is awakened and given access to the shared resource.
+
+ If a thread is suspended on a priority inheritance mutex, and the priority of the suspended
+ thread is higher than the priority of the thread that has locked the mutex, the thread
+ that holds the mutex acquires the higher priority of the suspended thread. The requesting
+ thread remains blocked until the lock becomes available.
+
+ @note1hang A thread is not suspended if it locks a priority inheritance mutex that it has
+ already locked. However, the mutex does not become available to other
+ threads until the thread performs a balanced number of unlocks on the mutex.\n
+ @note1cont When multiple threads compete for a mutex, the lock operation for a priority
+ inheritance mutex is slower than it is for a recursive mutex.
+ In particular, it is about 10 times slower when the mutex is available for locking,
+ and slower (with greatly varying times) when the mutex is already locked.
+
+ @datatypes
+ #qurt_mutex_t
+
+ @param[in] lock Pointer to the priority inheritance mutex object to lock.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+
+ */
+void qurt_pimutex_lock(qurt_mutex_t *lock);
+
+
+/**@ingroup func_qurt_pimutex_lock_timed
+ Locks a priority inheritance mutex with a timeout.
+
+ A thread can lock a priority inheritance mutex multiple times. The mutex is not
+ available to other threads until the thread performs the same number of mutex unlock
+ operations.
+
+ If a thread performs a lock operation on a mutex that is already locked by another thread,
+ the thread is moved to the waiting state. When the mutex becomes available again (because the
+ other thread has unlocked the mutex), the thread is awakened and tries to lock the mutex.
+
+ If a thread is waiting on a priority inheritance mutex, and the priority of the waiting thread
+ is higher than the priority of the thread that has locked the mutex, the priority of the thread
+ that has locked the mutex is raised to the priority of the waiting thread.
+
+ If the duration of waiting exceeds the timeout duration, the waiting is terminated, and
+ the function returns #QURT_ETIMEDOUT to indicate that the mutex lock failed.
+
+
+ @datatypes
+ #qurt_mutex_t
+
+ @param[in] lock Pointer to the mutex object to lock.
+ @param[in] duration Duration (in microseconds) to wait. The duration value must be between
+ #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_ETIMEDOUT -- Timeout. \n
+ #QURT_EINVALID -- Duration is out of range.
+
+ @dependencies
+ None.
+
+ */
+int qurt_pimutex_lock_timed(qurt_mutex_t *lock, unsigned long long int duration);
+
+
+/**@ingroup func_qurt_pimutex_unlock
+ Releases access to a shared resource; unlocks the specified priority inheritance mutex. \n
+ More than one thread can be suspended on a priority inheritance mutex. When the mutex
+ is unlocked, only the highest-priority thread waiting on the mutex is awakened. If the
+ awakened thread has higher priority than the current thread, a context switch occurs.
+
+ When a thread unlocks a priority inheritance mutex, its thread priority is restored to its
+ original value from any higher priority value that it acquired from another thread
+ suspended on the mutex.
+
+ @datatypes
+ #qurt_mutex_t
+
+ @param[in] lock Pointer to the priority inheritance mutex object to unlock.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+
+ */
+void qurt_pimutex_unlock(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_pimutex_try_lock
+ Requests access to a shared resource without suspending.
Attempts to lock the specified priority inheritance mutex.\n + If a thread performs a try_lock operation on a priority inheritance mutex that is not in + use, the thread gains access to the shared resource that is protected by the mutex, and + continues executing. + If a thread performs a try_lock operation on a priority inheritance mutex that is already + in use by another thread, qurt_pimutex_try_lock immediately returns with a + nonzero result value. + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the priority inheritance mutex object to lock. + + @return + 0 -- Success. \n + Nonzero -- Failure. + + @dependencies + None. + */ +int qurt_pimutex_try_lock(qurt_mutex_t *lock); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_PIMUTEX_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_pimutex2.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_pimutex2.h new file mode 100755 index 0000000000000..b809f163cbfd2 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_pimutex2.h @@ -0,0 +1,162 @@ +#ifndef QURT_PIMUTEX2_H +#define QURT_PIMUTEX2_H +/** + @file qurt_pimutex2.h + @brief Prototypes of pimutex2 API + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2013, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + + +#include +#include +#include + +/*============================================================================= + FUNCTIONS +=============================================================================*/ +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup func_qurt_pimutex2_init + Initializes a recursive mutex object. + + @deprecated use #qurt_pimutex_init instead. + + The recursive mutex is initially unlocked. + + Objects of type pimutex2 solve a potential race condition between + unlock() and destroy() operations. + + @datatypes + #qurt_rmutex2_t + + @param[out] lock Pointer to the recursive mutex object. + + @return + None. + + @dependencies + None. + + */ +void qurt_pimutex2_init(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_pimutex2_destroy + + @deprecated use #qurt_pimutex_destroy instead. + + Destroys the specified recursive mutex. \n + @note1cont Recursive mutexes must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + @note1cont In general, application code should destroy an pimutex2 object prior to + deallocating it; calling qurt_pimutex2_destroy() before deallocating it ensures + that all qurt_pimutex2_unlock() calls complete. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to destroy. + + @return + None. + + @dependencies + None. + + */ +void qurt_pimutex2_destroy(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_pimutex2_lock + + @deprecated use #qurt_pimutex_lock instead. + + Locks the specified recursive mutex. \n + + If a thread performs a lock operation on a recursive mutex that is not being used, the + thread gains access to the shared resource that is protected by the mutex, and continues + executing. + + If a thread performs a lock operation on a recursive mutex that is already being used by + another thread, the thread is suspended. 
When the mutex becomes available again + (because the other thread has unlocked it), the thread is awakened and given access to the + shared resource. + + @note1hang A thread is not suspended if it locks a recursive mutex that it has already + locked, but the mutex does not become available until the thread performs a + balanced number of unlocks on the mutex. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to lock. + + @return + None. + + @dependencies + None. + + */ +void qurt_pimutex2_lock(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_pimutex2_unlock + + @deprecated use #qurt_pimutex_unlock instead. + + Unlocks the specified recursive mutex. \n + More than one thread can be suspended on a recursive mutex. When the mutex is + unlocked, only the highest-priority thread waiting on the mutex is awakened. If the + awakened thread has higher priority than the current thread, a context switch occurs. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to unlock. + + @return + None. + + @dependencies + None. + + */ +void qurt_pimutex2_unlock(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_rmutex2_try_lock + + @deprecated use #qurt_pimutex_try_lock instead. + + Attempts to lock the specified recursive mutex.\n + + Non-blocking version of qurt_pimutex2_lock(). If a call to qurt_pimutex2_lock() would + succeed immediately, this function behaves similarly, and returns 0 for success. + If a call to qurt_pimutex2_lock() would not succeed immediately, this function has + no effect and returns non-zero for failure. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to lock. + + @return + 0 -- Success. \n + Nonzero -- Failure. + + */ +int qurt_pimutex2_try_lock(qurt_rmutex2_t *lock); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_PIMUTEX2_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_pipe.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_pipe.h new file mode 100755 index 0000000000000..6bdaa044f8640 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_pipe.h @@ -0,0 +1,479 @@ +#ifndef QURT_PIPE_H +#define QURT_PIPE_H +/** + @file qurt_pipe.h + @brief Prototypes of the pipe interface API + This is a pipe or message queue + It blocks when too full (send) or empty (receive). + Unless using a nonblocking option, all datagrams are 64 bits. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2021,2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup pipe_types +@{ */ +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +#define QURT_PIPE_MAGIC 0xF1FEF1FE /**< Magic. */ +#define QURT_PIPE_ATTR_MEM_PARTITION_RAM 0 /**< RAM. */ +#define QURT_PIPE_ATTR_MEM_PARTITION_TCM 1 /**< TCM. */ + +/*============================================================================= + TYPEDEFS +=============================================================================*/ +/** QuRT pipe data values type. 
*/ +typedef unsigned long long int qurt_pipe_data_t; + +/** QuRT pipe type.*/ +typedef struct { + /** @cond */ + qurt_mutex_t pipe_lock; + qurt_sem_t senders; + qurt_sem_t receiver; + unsigned int size; + unsigned int sendidx; + unsigned int recvidx; + void (*lock_func)(qurt_mutex_t *); + void (*unlock_func)(qurt_mutex_t *); + int (*try_lock_func)(qurt_mutex_t *); + void (*destroy_lock_func)(qurt_mutex_t *); + unsigned int magic; + qurt_pipe_data_t *data; + /** @endcond */ +} qurt_pipe_t; + +/** QuRT pipe attributes type. */ +typedef struct { + /** @cond */ + qurt_pipe_data_t *buffer; + unsigned int elements; + unsigned char mem_partition; + /** @endcond */ +} qurt_pipe_attr_t; + +/** @} */ /* end_addtogroup pipe_types */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ +/**@ingroup func_qurt_pipe_attr_init + @xreflabel{hdr:qurt_pipe_attr_init} + Initializes the structure that sets the pipe attributes when a pipe is created. + + After an attribute structure is initialized, the individual attributes in the structure are + explicitly set using the pipe attribute operations. + + The attribute structure is assigned the following default values: \n + - buffer -- 0 \n + - elements -- 0 \n + - mem_partition -- #QURT_PIPE_ATTR_MEM_PARTITION_RAM + + @datatypes + #qurt_pipe_attr_t + + @param[in,out] attr Pointer to the pipe attribute structure. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_pipe_attr_init(qurt_pipe_attr_t *attr) +{ + attr->buffer = NULL; + attr->elements = 0; + attr->mem_partition = QURT_PIPE_ATTR_MEM_PARTITION_RAM; +} + +/**@ingroup func_qurt_pipe_attr_set_buffer + @xreflabel{sec:qurt_pipe_attr_set_buffer} + Sets the pipe buffer address attribute.\n + Specifies the base address of the memory area to use for the data buffer of a pipe. + + The base address and size (Section @xref{sec:qurt_pipe_attr_set_elements}) specify the + memory area used as a pipe data buffer. The user is responsible for allocating the + memory area used for the buffer. + + @datatypes + #qurt_pipe_attr_t \n + #qurt_pipe_data_t + + @param[in,out] attr Pointer to the pipe attribute structure. + @param[in] buffer Pointer to the buffer base address. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_pipe_attr_set_buffer(qurt_pipe_attr_t *attr, qurt_pipe_data_t *buffer) +{ + attr->buffer = buffer; +} + +/**@ingroup func_qurt_pipe_attr_set_elements + @xreflabel{sec:qurt_pipe_attr_set_elements} + Specifies the length of the memory area to use for the data buffer of a pipe. + + The length is expressed in terms of the number of 64-bit data elements that + can be stored in the buffer. + + The base address (Section @xref{sec:qurt_pipe_attr_set_buffer}) and size specify + the memory area used as a pipe data buffer. The user is responsible for + allocating the memory area used for the buffer. + + @datatypes + #qurt_pipe_attr_t + + @param[in,out] attr Pointer to the pipe attribute structure. + @param[in] elements Pipe length (64-bit elements). + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_pipe_attr_set_elements(qurt_pipe_attr_t *attr, unsigned int elements) +{ + attr->elements = elements; +} + +/**@ingroup func_qurt_pipe_attr_set_buffer_partition + @xreflabel{sec:qurt_pipe_attr_set_buffer_partition} + Specifies the memory type where a pipe's buffer is allocated. + Allocate pipes in RAM or TCM/LPM. 
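A short sketch of the attribute setters just described; the 16-element depth is illustrative:

```c
/* Sketch: describe a 16-element pipe backed by caller-owned storage. */
#define MY_PIPE_DEPTH 16u                      /* illustrative depth */

static qurt_pipe_data_t my_pipe_buf[MY_PIPE_DEPTH];

void describe_pipe(qurt_pipe_attr_t *attr)
{
    qurt_pipe_attr_init(attr);                     /* buffer=0, elements=0, RAM */
    qurt_pipe_attr_set_buffer(attr, my_pipe_buf);  /* caller allocates storage */
    qurt_pipe_attr_set_elements(attr, MY_PIPE_DEPTH);
}
```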
+ + @note1hang If a pipe is specified as allocated in TCM/LPM, it must be created + with the qurt_pipe_init() operation. The qurt_pipe_create() operation results in an error. + + @datatypes + #qurt_pipe_attr_t + + @param[in,out] attr Pointer to the pipe attribute structure. + @param[in] mem_partition Pipe memory partition. Values: \n + - #QURT_PIPE_ATTR_MEM_PARTITION_RAM -- Pipe resides in RAM \n + - #QURT_PIPE_ATTR_MEM_PARTITION_TCM -- Pipe resides in TCM/LCM @tablebulletend + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_pipe_attr_set_buffer_partition(qurt_pipe_attr_t *attr, unsigned char mem_partition) +{ + attr->mem_partition = mem_partition; +} + +/**@ingroup func_qurt_pipe_create + Creates a pipe.\n + Allocates a pipe object and its associated data buffer, and initializes the pipe object. + + @note1hang The buffer address and size stored in the attribute structure specify how the + pipe data buffer is allocated. + + @note1cont If a pipe is specified as allocated in TCM/LPM, it must be created + using the qurt_pipe_init() operation. The qurt_pipe_create() operation results in an error. + + @datatypes + #qurt_pipe_t \n + #qurt_pipe_attr_t + + @param[out] pipe Pointer to the created pipe object. + @param[in] attr Pointer to the attribute structure used to create the pipe. + + @return + #QURT_EOK -- Pipe created. \n + #QURT_EFAILED -- Pipe not created. \n + #QURT_ENOTALLOWED -- Pipe cannot be created in TCM/LPM. + + @dependencies + None. + */ +int qurt_pipe_create(qurt_pipe_t **pipe, qurt_pipe_attr_t *attr); + +/**@ingroup func_qurt_pipe_init + Initializes a pipe object using an existing data buffer. + + @note1hang The buffer address and size stored in the attribute structure must + specify a data buffer that the user has already allocated. + + @datatypes + #qurt_pipe_t \n + #qurt_pipe_attr_t + + @param[out] pipe Pointer to the pipe object to initialize. + @param[in] attr Pointer to the pipe attribute structure used to initialize the pipe. + + @return + #QURT_EOK -- Success. \n + #QURT_EFAILED -- Failure. + + @dependencies + None. + */ +int qurt_pipe_init(qurt_pipe_t *pipe, qurt_pipe_attr_t *attr); + +/**@ingroup func_qurt_pipe_destroy + @xreflabel{sec:qurt_pipe_destroy} + Destroys the specified pipe. + + @note1hang Pipes must be destroyed when they are no longer in use. Failure + to do this causes resource leaks in the QuRT kernel. + Pipes must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + + @datatypes + #qurt_pipe_t + + @param[in] pipe Pointer to the pipe object to destroy. + + @return + None. + + @dependencies + None. + */ +void qurt_pipe_destroy(qurt_pipe_t *pipe); + +/**@ingroup func_qurt_pipe_delete + Deletes the pipe.\n + Destroys the specified pipe (Section @xref{sec:qurt_pipe_destroy}) and deallocates the pipe object and its + associated data buffer. + + @note1hang Delete pipes only if they were created using qurt_pipe_create + (and not qurt_pipe_init). Otherwise the behavior of QuRT is undefined. \n + @note1cont Pipes must be deleted when they are no longer in use. Failure to do this + causes resource leaks in the QuRT kernel.\n + @note1cont Pipes must not be deleted while they are still in use. If this occurs, the + behavior of QuRT is undefined. + + @datatypes + #qurt_pipe_t + + @param[in] pipe Pointer to the pipe object to destroy. + + @return + None. + + @dependencies + None. 
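A hedged sketch contrasting the two construction paths described above, where qurt_pipe_create() allocates and qurt_pipe_init() reuses caller storage; error handling is abbreviated:

```c
/* Sketch: the two construction paths. Leaving the buffer NULL so that
   qurt_pipe_create() allocates it is an assumption based on the note above. */
int make_pipes(void)
{
    qurt_pipe_attr_t attr;
    qurt_pipe_t *dyn_pipe;                 /* allocated by qurt_pipe_create() */
    static qurt_pipe_t my_pipe;            /* caller-owned pipe object */
    static qurt_pipe_data_t buf[8];        /* caller-owned data buffer */

    qurt_pipe_attr_init(&attr);
    qurt_pipe_attr_set_elements(&attr, 8);
    if (qurt_pipe_create(&dyn_pipe, &attr) != QURT_EOK)  /* allocates buffer */
        return -1;

    qurt_pipe_attr_set_buffer(&attr, buf);               /* reuse our storage */
    if (qurt_pipe_init(&my_pipe, &attr) != QURT_EOK)
        return -1;

    qurt_pipe_delete(dyn_pipe);    /* created pipes: delete (frees memory) */
    qurt_pipe_destroy(&my_pipe);   /* initialized pipes: destroy only */
    return 0;
}
```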
+ */ +void qurt_pipe_delete(qurt_pipe_t *pipe); + +/**@ingroup func_qurt_pipe_send + Writes a data item to the specified pipe. \n + If a thread writes to a full pipe, it is suspended on the pipe. When another thread reads + from the pipe, the suspended thread is awakened and can then write data to the pipe. + + Pipe data items are defined as 64-bit values. Pipe writes are limited to transferring a single + 64-bit data item per operation. + + @note1hang Transfer data items larger than 64 bits by reading and writing + pointers to the data, or by transferring the data in consecutive 64-bit chunks. + + @datatypes + #qurt_pipe_t \n + #qurt_pipe_data_t + + @param[in] pipe Pointer to the pipe object to write to. + @param[in] data Data item to write. + + @return + None. + + @dependencies + None. +*/ +void qurt_pipe_send(qurt_pipe_t *pipe, qurt_pipe_data_t data); + +/**@ingroup func_qurt_pipe_receive + Reads a data item from the specified pipe. + + If a thread reads from an empty pipe, it is suspended on the pipe. When another thread + writes to the pipe, the suspended thread is awakened and can then read data from the pipe. + Pipe data items are defined as 64-bit values. Pipe reads are limited to transferring a single + 64-bit data item per operation. + + @note1hang Transfer data items larger than 64 bits by reading and writing + pointers to the data, or by transferring the data in consecutive 64-bit chunks. + + @datatypes + #qurt_pipe_t + + @param[in] pipe Pointer to the pipe object to read from. + + @return + Integer containing the 64-bit data item from pipe. + + @dependencies + None. +*/ +qurt_pipe_data_t qurt_pipe_receive(qurt_pipe_t *pipe); + +/**@ingroup func_qurt_pipe_try_send + Writes a data item to the specified pipe (without suspending the thread if the pipe is full).\n + + If a thread writes to a full pipe, the operation returns immediately with success set to -1. + Otherwise, success is always set to 0 to indicate a successful write operation. + + Pipe data items are defined as 64-bit values. Pipe writes are limited to transferring a single + 64-bit data item per operation. + + @note1hang Transfer data items larger than 64 bits by reading and writing + pointers to the data, or by transferring the data in consecutive 64-bit chunks. + + @datatypes + #qurt_pipe_t \n + #qurt_pipe_data_t + + @param[in] pipe Pointer to the pipe object to write to. + @param[in] data Data item to write. + + @return + 0 -- Success. \n + -1 -- Failure (pipe full). + + @dependencies + None. +*/ +int qurt_pipe_try_send(qurt_pipe_t *pipe, qurt_pipe_data_t data); + +/**@ingroup func_qurt_pipe_try_receive + Reads a data item from the specified pipe (without suspending the thread if the pipe is + empty).\n + If a thread reads from an empty pipe, the operation returns immediately with success set + to -1. Otherwise, success is always set to 0 to indicate a successful read operation.\n + + Pipe data items are defined as 64-bit values. Pipe reads are limited to transferring a single + 64-bit data item per operation. + + @note1hang Transfer data items larger than 64 bits by reading and writing + pointers to the data, or by transferring the data in consecutive 64-bit chunks. + + @datatypes + #qurt_pipe_t + + @param[in] pipe Pointer to the pipe object to read from. + @param[out] success Pointer to the operation status result. + + @return + Integer containing a 64-bit data item from pipe. + + @dependencies + None. 
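Since datagrams are fixed at 64 bits, larger payloads are usually passed by pointer, as the notes above suggest; a sketch (struct msg is a placeholder type):

```c
#include <stdint.h>

struct msg { int payload; };   /* placeholder payload type */

/* blocks while the pipe is full */
void producer(qurt_pipe_t *p, struct msg *m)
{
    qurt_pipe_send(p, (qurt_pipe_data_t)(uintptr_t)m);
}

/* blocks while the pipe is empty */
struct msg *consumer(qurt_pipe_t *p)
{
    return (struct msg *)(uintptr_t)qurt_pipe_receive(p);
}
```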
+*/
+qurt_pipe_data_t qurt_pipe_try_receive(qurt_pipe_t *pipe, int *success);
+
+/**@ingroup func_qurt_pipe_receive_cancellable
+  Reads a data item from the specified pipe (with suspend), cancellable.
+
+  If a thread reads from an empty pipe, it is suspended on the pipe. When another thread
+  writes to the pipe, the suspended thread is awakened and can then read data from the pipe.
+  The operation is canceled if the user process of the calling thread is killed,
+  or if the calling thread must finish its current QDI invocation and return to user space.
+  A root PD thread can use this API to wait on the pipe for receiving; it is resumed with
+  QURT_EDESTROY if the pipe is destroyed.
+  Pipe data items are defined as 64-bit values. Pipe reads are limited to transferring a single
+  64-bit data item per operation.
+
+  @note1hang Transfer data items larger than 64 bits by reading and writing
+  pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+  @datatypes
+  #qurt_pipe_t \n
+  #qurt_pipe_data_t
+
+  @param[in] pipe Pointer to the pipe object to read from.
+  @param[out] result Pointer to the integer containing the 64-bit data item from the pipe.
+
+  @return
+  #QURT_EOK -- Receive completed. \n
+  #QURT_ECANCEL -- Receive canceled. \n
+  #QURT_EDESTROY -- Pipe destroyed. \n
+  #QURT_ENOTALLOWED -- Pipe is not initialized.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+int qurt_pipe_receive_cancellable(qurt_pipe_t *pipe, qurt_pipe_data_t *result);
+
+/**@ingroup func_qurt_pipe_send_cancellable
+  @xreflabel{hdr:qurt_pipe_send_cancellable}
+  Writes a data item to the specified pipe (with suspend), cancellable. \n
+  If a thread writes to a full pipe, it is suspended on the pipe. When another thread reads
+  from the pipe, the suspended thread is awakened and can then write data to the pipe.
+  The operation is canceled if the user process of the calling thread is killed, or if the
+  calling thread must finish its current QDI invocation and return to user space.
+  A root PD thread can use this API to wait on the pipe for sending; it is resumed with
+  QURT_EDESTROY if the pipe is destroyed.
+
+  Pipe data items are defined as 64-bit values. Pipe writes are limited to transferring a single
+  64-bit data item per operation.
+
+  @note1hang Transfer data items larger than 64 bits by reading and writing
+  pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+  @datatypes
+  #qurt_pipe_t \n
+  #qurt_pipe_data_t
+
+  @param[in] pipe Pointer to the pipe object to write to.
+  @param[in] data Data item to write.
+
+  @return
+  #QURT_EOK -- Send completed. \n
+  #QURT_ECANCEL -- Send canceled. \n
+  #QURT_EDESTROY -- Pipe destroyed. \n
+  #QURT_ENOTALLOWED -- Pipe is not initialized.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+int qurt_pipe_send_cancellable(qurt_pipe_t *pipe, qurt_pipe_data_t data);
+
+/**@ingroup func_qurt_pipe_is_empty
+  Returns a value indicating whether the specified pipe contains any data.
+
+  @datatypes
+  #qurt_pipe_t
+
+  @param[in] pipe Pointer to the pipe object to read from.
+
+  @return
+  1 -- Pipe contains no data. \n
+  0 -- Pipe contains data.
+
+  @dependencies
+  None.
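A sketch of a non-blocking drain loop built on the try_receive status convention documented above (0 on success, -1 when the pipe is empty):

```c
/* Sketch: drain a pipe without ever suspending the caller. */
int drain(qurt_pipe_t *p)
{
    int ok;
    int n = 0;
    for (;;) {
        qurt_pipe_data_t d = qurt_pipe_try_receive(p, &ok);
        if (ok != 0)
            break;          /* -1: pipe is empty, nothing more to read */
        (void)d;            /* ... process the 64-bit datagram here ... */
        n++;
    }
    return n;               /* number of items drained */
}
```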
+*/
+int qurt_pipe_is_empty(qurt_pipe_t *pipe);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PIPE_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_pmem_manager.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_pmem_manager.h
new file mode 100755
index 0000000000000..8c8da985228b9
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_pmem_manager.h
@@ -0,0 +1,82 @@
+#ifndef QURT_PMEM_MANAGER_H
+#define QURT_PMEM_MANAGER_H
+/**
+  @file qurt_pmem_manager.h
+  Prototypes of kernel physical memory manager APIs
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=====================================================================
+ Constants and macros
+ ======================================================================*/
+
+/* Physical memory API return error codes */
+#define QURT_PMEM_SUCCESS 0
+#define QURT_PMEM_NO_PRIV 1
+#define QURT_PMEM_RETRY 2
+#define QURT_PMEM_OVERLAP 3
+#define QURT_PMEM_NOT_EXIST 4
+#define QURT_PMEM_INIT_FAILURE 5
+#define QURT_PMEM_OUTSTANDING_MAPPING 6
+#define QURT_PMEM_GENERIC_FAILURE 7
+#define QURT_PMEM_ENTRY_FOUND 8
+#define QURT_PMEM_REACH_END 9
+#define QURT_PMEM_UNCLAIMED 10
+#define QURT_PMEM_ALREADY_CLAIMED 11
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+/**@ingroup func_qurt_pmem_acquire
+  Acquires ownership of a specific physical memory region.
+
+  @note1hang The caller becomes the owner.
+
+  @param[in] ppage Starting physical page number
+  @param[in] pnum Number of physical pages
+
+  @return
+  #QURT_PMEM_NO_PRIV -- No privilege to claim ownership. \n
+  #QURT_PMEM_OVERLAP -- All or part of the range is already owned. \n
+  #QURT_PMEM_SUCCESS -- Ownership successfully claimed.
+
+  @dependencies
+  None.
+*/
+int qurt_pmem_acquire(unsigned int ppage, unsigned int pnum);
+
+/**@ingroup func_qurt_pmem_release
+  Releases ownership of a specific physical memory region.
+
+  @param[in] ppage Starting physical page number
+  @param[in] pnum Number of physical pages
+
+  @return
+  #QURT_PMEM_NO_PRIV -- No privilege to release ownership. \n
+  #QURT_PMEM_NOT_EXIST -- The physical memory range is not usable. \n
+  #QURT_PMEM_OUTSTANDING_MAPPING -- There are outstanding mappings in this range. \n
+  #QURT_PMEM_SUCCESS -- Ownership successfully released.
+
+  @dependencies
+  None.
+ */
+int qurt_pmem_release(unsigned int ppage, unsigned int pnum);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PMEM_MANAGER_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_pmu.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_pmu.h
new file mode 100755
index 0000000000000..73ea8eba04abf
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_pmu.h
@@ -0,0 +1,121 @@
+#ifndef QURT_PMU_H
+#define QURT_PMU_H
+/**
+  @file qurt_pmu.h
+  Prototypes of the PMU (performance monitoring unit) interface API.
+
+ EXTERNAL FUNCTIONS
+ None.
+
+ INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None.
+
+ Copyright (c) 2021 Qualcomm Technologies, Inc.
+ All rights reserved.
+ Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+                                    FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_pmu_set
+  Sets the value of the specified PMU register.
+
+  @note1hang Setting PMUEVTCFG automatically clears the PMU registers PMUCNT0
+  through PMUCNT3.
+
+  @param[in] reg_id PMU register. Values:
+  - #QURT_PMUCNT0
+  - #QURT_PMUCNT1
+  - #QURT_PMUCNT2
+  - #QURT_PMUCNT3
+  - #QURT_PMUCFG
+  - #QURT_PMUEVTCFG
+  - #QURT_PMUCNT4
+  - #QURT_PMUCNT5
+  - #QURT_PMUCNT6
+  - #QURT_PMUCNT7
+  - #QURT_PMUEVTCFG1 @tablebulletend
+
+  @param[in] reg_value Register value.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_pmu_set (int reg_id, unsigned int reg_value);
+
+/**@ingroup func_qurt_pmu_get
+  Gets the PMU register.\n
+  Returns the current value of the specified PMU register.
+
+  @param[in] reg_id PMU register. Values:
+  - #QURT_PMUCNT0
+  - #QURT_PMUCNT1
+  - #QURT_PMUCNT2
+  - #QURT_PMUCNT3
+  - #QURT_PMUCFG
+  - #QURT_PMUEVTCFG
+  - #QURT_PMUCNT4
+  - #QURT_PMUCNT5
+  - #QURT_PMUCNT6
+  - #QURT_PMUCNT7
+  - #QURT_PMUEVTCFG1 @tablebulletend
+
+  @return
+  Integer -- Current value of the specified PMU register.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_pmu_get (int reg_id);
+
+/**@ingroup func_qurt_pmu_enable
+  Enables or disables the Hexagon processor PMU.
+  Profiling is disabled by default.
+
+  @note1hang Enabling profiling does not automatically reset the count registers -- this must
+  be done explicitly before starting event counting.
+
+  @param[in] enable Performance monitor. Values: \n
+  - 0 -- Disable performance monitor \n
+  - 1 -- Enable performance monitor @tablebulletend
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_pmu_enable (int enable);
+
+/**@ingroup func_qurt_pmu_get_pmucnt
+  Reads PMU counters in a single trap.
+
+  @param[out] buf Pointer to a buffer to save values read from PMU counters.
+  Buffer size must be at least 32 bytes to read all eight PMU counters.
+
+  @return
+  #QURT_EOK -- Successful read.\n
+  #QURT_EFATAL -- Failure.
+
+  @dependencies
+  None.
+ */
+int qurt_pmu_get_pmucnt (void * buf);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PMU_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_power.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_power.h
new file mode 100755
index 0000000000000..2ee4d29a73976
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_power.h
@@ -0,0 +1,140 @@
+#ifndef QURT_POWER_H
+#define QURT_POWER_H
+/**
+  @file qurt_power.h
+  @brief Prototypes of power API
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2018-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
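Tying the four PMU calls above together, a hedged profiling sketch; the event code is a placeholder for a value from the Hexagon PMU documentation:

```c
/* Sketch: program one event, count it on PMUCNT0, then snapshot everything.
   my_event is a placeholder; real event IDs come from the PMU manual. */
static unsigned int snapshot[8];   /* 32 bytes, per the note above */

void pmu_profile_start(unsigned int my_event)
{
    qurt_pmu_set(QURT_PMUEVTCFG, my_event); /* also clears PMUCNT0..PMUCNT3 */
    qurt_pmu_enable(1);                     /* profiling is off by default */
}

unsigned int pmu_profile_stop(void)
{
    unsigned int count = qurt_pmu_get(QURT_PMUCNT0);
    qurt_pmu_enable(0);
    (void)qurt_pmu_get_pmucnt(snapshot);    /* all 8 counters in one trap */
    return count;
}
```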
+ +=============================================================================*/ + +/*============================================================================= + + EDIT HISTORY FOR MODULE + + This section contains comments describing changes made to the module. + Notice that changes are listed in reverse chronological order. + + +when who what, where, why +-------- --- ------------------------------------------------------------ +03/03/11 op Add header file +12/12/12 cm (Tech Pubs) Edited/added Doxygen comments and markup. +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** @cond */ +/**@ingroup func_qurt_power_shutdown_fail_exit + Returns from Power Collapse mode when power collapse cannot proceed. + + This function unmasks the global interrupt. This operation is used only when the thread is + recovering from a failed power collapse operation (Section @xref{sec:powerShutdownEnter}). + + @return + #QURT_EOK -- Operation was successfully performed. + + @dependencies + None. + */ +#define qurt_power_shutdown_fail_exit qurt_power_exit + +/**@ingroup func_qurt_power_shutdown_exit + Undoes state changes made preparing for power collapse.\n + This function unmasks the global interrupts. + + @return + #QURT_EOK --Operation was successfully performed. + + @dependencies + None. + */ +#define qurt_power_shutdown_exit qurt_power_exit +/**@endcond */ + +/**@ingroup func_qurt_system_ipend_get + Gets the IPEND register.\n + + @note1hang Returns the current value of the Hexagon processor IPEND register. The return value + is a mask value that identifies the individual interrupts that are pending. \n + + @note1hang The bit order of the mask value is identical to the order defined for the IPEND register. A + mask bit value of 1 indicates that the corresponding interrupt is pending, and 0 indicates that the + corresponding interrupt is not pending. \n + + @return + Return the IPEND register value. + + @dependencies + None. + */ +unsigned int qurt_system_ipend_get (void); + + +/**@ingroup func_qurt_system_vid_get + Gets the VID register. \n + + @note1hang Returns the current value of the Hexagon processor VID register. The return value is + the vector number of a second-level interrupt that has been accepted by the Hexagon + processor core.\n + + @return + Return the VID register value that is the L2 VIC interrupt number accepted by the processor. + Valid range is 0 to 1023. + + @dependencies + None. + */ +unsigned int qurt_system_vid_get(void); + +/**@ingroup func_qurt_power_shutdown_get_pcycles + Gets the number of power collapses and processor cycles for entering and exiting most recent + power collapse. + + @note1hang If no power collapse has occured yet, processor cycle numbers are zero. + + @param[out] enter_pcycles Number of processor cycles for entering most + recent power collapse. + @param[out] exit_pcycles Number of processor cycles for exiting most + recent power collapse. + @return + Zero -- No power collapses have occurred. \n + Nonzero -- Number of power collapses that have occurred since + the processor was reset. + + @dependencies + None. + */ +int qurt_power_shutdown_get_pcycles( unsigned long long *enter_pcycles, unsigned long long *exit_pcycles ); + +/**@ingroup func_qurt_system_tcm_set_size + Set size of TCM to save during full power collapse. + + @note1hang The size aligns to 32 bytes. If size passed is greater than the maximum size defined in + XML, the size is truncated to the size defined in XML. 
+ + @param[in] new_size Size of TCM to save. + + @return + Zero -- Size successfully set \n + -1 -- Size of 0 passed + + @dependencies + None. + */ +int qurt_system_tcm_set_size(unsigned int new_size); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_POWER_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_printf.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_printf.h new file mode 100755 index 0000000000000..a775d8a815918 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_printf.h @@ -0,0 +1,44 @@ +#ifndef QURT_PRINTF_H +#define QURT_PRINTF_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + @file qurt_printf.h + Prototypes of printf API. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + + + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/** @addtogroup chapter_function_tracing +@{ */ + +int qurt_printf(const char* format, ...); + +int qurt_vprintf(const char* format, va_list args); + +/** @} */ /* end_addtogroup chapter_function_tracing */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_PRINTF_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_process.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_process.h new file mode 100755 index 0000000000000..0df9ddc2d4a70 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_process.h @@ -0,0 +1,995 @@ +#ifndef QURT_PROCESS_H +#define QURT_PROCESS_H +/** + @file qurt_process.h + @brief Prototypes of QuRT process control APIs. + + EXTERNALIZED FUNCTIONS + None + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None + + Copyright (c) 2009-2013, 2021-2023 Qualcomm Technologies, Inc. + All rights reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ +#include "qurt_callback.h" +#include "qurt_consts.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup process_types +@{ */ +#define QURT_PROCESS_ATTR_NAME_MAXLEN QURT_MAX_NAME_LEN /**< Maximum length of the process name. */ +#define QURT_PROCESS_ATTR_BIN_PATH_MAXLEN 128 /**< Maximum length of the path of binary/ELF for this process. */ +#define QURT_PROCESS_ATTR_CAP_MAXLEN 128 /**< Maximum length for a resource name. */ + +/** QuRT process capability wildcard strings */ +#define QURT_PROCESS_ATTR_CAP_ALLOW_ALL "ALLOW_ALL" /**< Capability wild-card for full access */ +#define QURT_PROCESS_ATTR_CAP_ALLOW_NONE "ALLOW_NONE" /**< Capability wild-card for no access */ + +/** QuRT process capability states */ +#define QURT_PROCESS_ATTR_CAP_ENABLED 0x1 /**< Capability enabled*/ +#define QURT_PROCESS_ATTR_CAP_DISABLED 0x0 /**< Capability disabled*/ + +/* QuRT process thread attributes. */ +#define QURT_PROCESS_DEFAULT_CEILING_PRIO 0 /**< Default ceiling priority of the threads in the new process. */ +#define QURT_PROCESS_DEFAULT_MAX_THREADS -1 /**< Default number of threads in the new process. 
+ -1 indicates that the limit is set to the maximum supported by the system. */ + +/* QuRT process flags. */ +#define QURT_PROCESS_SUSPEND_ON_STARTUP (1U) /**< Suspend the new processes just before calling main(). */ +#define QURT_PROCESS_NON_SYSTEM_CRITICAL (1u << 1) /**< Starts the new process as non system-critical. */ +#define QURT_PROCESS_ISLAND_RESIDENT (1u << 2) /**< Process is island resident. */ +#define QURT_PROCESS_RESTARTABLE (1u << 3) /**< Indicates that the process is restartable */ +#define QURT_PROCESS_UNTRUSTED (1u << 7) /**< Starts the new process as unsigned process. */ + +/* QuRT process debugging session status.*/ +#define QURT_DEBUG_NOT_START 0 /**< Debug is not started. */ +#define QURT_DEBUG_START 1 /**< Debug has started. */ + +/** Process Suspend Options */ +#define QURT_PROCESS_SUSPEND_DEFAULT 0 + +/** Process Resume Options */ +#define QURT_PROCESS_RESUME_DEFAULT 0 + + +/* QuRT process types. */ +typedef enum { + QURT_PROCESS_TYPE_RESERVED, /**< Process type is reserved. \n */ + QURT_PROCESS_TYPE_KERNEL, /**< Kernel process. \n*/ + QURT_PROCESS_TYPE_SRM, /**< SRM process. \n*/ + QURT_PROCESS_TYPE_SECURE, /**< Secure process. \n*/ + QURT_PROCESS_TYPE_ROOT, /**< Root process. \n*/ + QURT_PROCESS_TYPE_USER, /**< User process. */ +}qurt_process_type_t; + +/** QuRT process callback types. */ +typedef enum { + QURT_PROCESS_DUMP_CB_ROOT, /**< Register the callback that executes in the + root process context. \n */ + QURT_PROCESS_DUMP_CB_ERROR, /**< Register the user process callback that is + called after threads in the process are frozen. \n */ + QURT_PROCESS_DUMP_CB_PRESTM, /**< Register the user process callback that is + called before threads in the process are frozen. \n*/ + QURT_PROCESS_DUMP_CB_MAX /**< Reserved for error checking. */ +}qurt_process_dump_cb_type_t; + +/** QuRT process dump attributes. */ +typedef struct _qurt_pd_dump_attr{ + /** @cond */ + unsigned int enabled; /**< Process dump is enabled. */ + const char *path; /**< Process dump path. */ + unsigned int path_len; /**< Length of process dump path. */ + /** @endcond */ +}qurt_pd_dump_attr_t; + +/** QuRT process capability resource type */ +enum qurt_process_cap_type_t { + QURT_PROCESS_CAP_TYPE_NUM_ENTRIES=0, /**< Number of entries in the capability structure*/ + QURT_PROCESS_CAP_TYPE_DRIVER=1, /**< Driver resource */ + QURT_PROCESS_CAP_TYPE_MAX /**< Maximum identifier */ +}; + +/** QuRT process capability structure */ +typedef struct _qurt_capability { + enum qurt_process_cap_type_t type; /**< Resource type */ + char name[QURT_PROCESS_ATTR_CAP_MAXLEN]; /**< Resource name*/ + unsigned long long cap; /**< Capabilities allowed for this resource */ +}qurt_capability_t; + +/** QuRT process attributes. */ +typedef struct _qurt_process_attr { + /** @cond */ + char name[QURT_PROCESS_ATTR_NAME_MAXLEN]; /**< Name of the new process. */ + char path[QURT_PROCESS_ATTR_BIN_PATH_MAXLEN]; /**< Path of the binary for the new process. */ + char dtb_path[QURT_PROCESS_ATTR_BIN_PATH_MAXLEN]; /**< Path of the DTB ELF for the new process. */ + int flags; /**< Flags as indicated by QuRT process flags. */ + unsigned int sw_id; /**< Software ID of the process be load. */ + unsigned sid; /**< Stream ID of the process being spawned. */ + unsigned max_threads; /**< Maximum number of threads that the new process can create. */ + unsigned short ceiling_prio; /**< Maximum priority at which threads can be + created by new process. */ + qurt_process_type_t type; /**< Process type as indicated by + #qurt_process_type_t. 
 */
+  qurt_pd_dump_attr_t dump_attr; /**< Process dump attributes for the new process
+                                      as indicated by #qurt_pd_dump_attr_t. */
+  qurt_capability_t *capabilities; /**< Pointer to an array of structures of type
+                                        #qurt_capability_t. */
+  /** @endcond */
+} qurt_process_attr_t;
+
+/** @} */ /* end_addtogroup process_types */
+
+/*=============================================================================
+FUNCTIONS
+=============================================================================*/
+ /** @cond rest_reg_dist */
+/**@ingroup func_qurt_process_create
+  Creates a process with the specified attributes, and starts the process.
+
+  The process executes the code in the specified executable ELF file.
+
+  @datatypes
+  #qurt_process_attr_t
+
+  @param[in] attr Pointer to an initialized process attribute structure, which specifies
+                  the attributes of the created process.
+
+  @return
+  A positive return value indicates the process ID.
+  A negative return value indicates one of the following errors: \n
+  #-QURT_EPRIVILEGE -- Caller does not have privilege for this operation \n
+  #-QURT_EMEM -- Not enough memory to perform the operation \n
+  #-QURT_EFAILED -- Operation failed \n
+  #-QURT_ENOTALLOWED -- Operation not allowed \n
+  #-QURT_ENOREGISTERED -- Not registered \n
+  #-QURT_ENORESOURCE -- Resource exhaustion \n
+  #-QURT_EINVALID -- Invalid argument value \n
+  #QURT_EFATAL -- attr is NULL
+
+  @dependencies
+  None.
+*/
+int qurt_process_create (qurt_process_attr_t *attr);
+
+/**@ingroup func_qurt_process_get_id
+  Returns the process identifier for the current thread.
+
+  @return
+  Process identifier for the current thread.
+
+  @dependencies
+  None.
+*/
+int qurt_process_get_id (void);
+/** @endcond */
+
+/** @cond internal_only*/
+/**@ingroup func_qurt_process_get_uid
+  Returns the user identifier for the current thread.
+
+  @return
+  User identifier for the current thread.
+
+  @dependencies
+  None.
+*/
+int qurt_process_get_uid (void);
+/** @endcond */
+/** @cond rest_reg_dist */
+/**@ingroup func_qurt_process_attr_init
+  Initializes the structure that sets the process attributes when a process is created.
+
+  After an attribute structure is initialized, the individual attributes in the structure can
+  be explicitly set using the process attribute operations.
+
+  Table @xref{tbl:processAttrDefaults} lists the default attribute values set by the initialize
+  operation.
+
+  @inputov{table_process_attribute_defaults}
+
+  @datatypes
+  #qurt_process_attr_t
+
+  @param[out] attr Pointer to the structure to initialize.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_process_attr_init (qurt_process_attr_t *attr)
+{
+    attr->name[0] = '\0';
+    attr->path[0] = '\0';
+    attr->dtb_path[0] = '\0';
+    attr->flags = 0;
+    attr->sw_id = 0;
+    attr->sid = 0;
+    attr->max_threads = (unsigned)QURT_PROCESS_DEFAULT_MAX_THREADS;
+    attr->ceiling_prio = QURT_PROCESS_DEFAULT_CEILING_PRIO;
+    attr->type = QURT_PROCESS_TYPE_RESERVED;
+    attr->dump_attr.enabled = 0;
+    attr->dump_attr.path = NULL;
+    attr->dump_attr.path_len = 0;
+    attr->capabilities = NULL;
+}
+
+/**@ingroup func_qurt_process_attr_set_executable
+  Sets the process name in the specified process attribute structure.
+
+  Process names identify process objects that are already
+  loaded in memory as part of the QuRT system.
+
+  @note1hang Process objects are incorporated into the QuRT system at build time.
+
+  @note1hang Maximum length of name string is limited to QURT_PROCESS_ATTR_NAME_MAXLEN - 1.
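A hedged sketch of the create flow documented above; the ELF name and thread cap are illustrative:

```c
/* Sketch: spawn a user process from an ELF already known to the system. */
int spawn_myapp(void)
{
    qurt_process_attr_t attr;
    int pid;

    qurt_process_attr_init(&attr);                 /* defaults per the table */
    qurt_process_attr_set_executable(&attr, "myapp.elf");  /* illustrative name */
    qurt_process_attr_set_max_threads(&attr, 16);          /* illustrative cap */

    pid = qurt_process_create(&attr);
    if (pid < 0)
        return -1;       /* negative values are -QURT_E* error codes */
    return pid;
}
```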
+ + @datatypes + #qurt_process_attr_t + + @param[in] attr Pointer to the process attribute structure. + @param[in] name Pointer to the process name. + + @return + None. + + @dependencies + None. +*/ +void qurt_process_attr_set_executable (qurt_process_attr_t *attr, const char *name); + +/**@ingroup func_qurt_process_attr_set_binary_path + Sets the binary path for the process loading in the specified process attribute structure. + + Path specifies the binary to load for this process. + + @note1hang Max length of path string is limited to QURT_PROCESS_ATTR_BIN_PATH_MAXLEN-1. + + @datatypes + #qurt_process_attr_t + + @param[in] attr Pointer to the process attribute structure. + @param[in] path Pointer to the binary path. + + @return + None. + + @dependencies + None. +*/ +void qurt_process_attr_set_binary_path(qurt_process_attr_t *attr, char *path); + +/**@ingroup func_qurt_process_attr_set_dtb_path + Sets the DTB binary path for the process loading in the specified process attribute structure. + + Path specifies the DTB binary to load for this process. + + @note1hang Max length of path string is limited to QURT_PROCESS_ATTR_BIN_PATH_MAXLEN-1. + + @datatypes + #qurt_process_attr_t + + @param[in] attr Pointer to the process attribute structure. + @param[in] path Pointer to the binary path. + + @return + None. + + @dependencies + None. +*/ +void qurt_process_attr_set_dtb_path(qurt_process_attr_t *attr, char *path); + +/**@ingroup func_qurt_process_attr_set_flags +Sets the process properties in the specified process attribute structure. +Process properties are represented as defined symbols that map into bits +0 through 31 of the 32-bit flag value. Multiple properties are specified by OR'ing +together the individual property symbols. + +@datatypes +#qurt_process_attr_t + +@param[in] attr Pointer to the process attribute structure. +@param[in] flags QURT_PROCESS_NON_SYSTEM_CRITICAL Process is considered as non system-critical. + This attribute will be used by error services, + to decide whether to kill user pd or whole subsystem. + QURT_PROCESS_ISLAND_RESIDENT Process will be marked as island resident. + QURT_PROCESS_RESTARTABLE Process will be marked as restartable. + QURT_PROCESS_UNTRUSTED Process will be marked as unsigned process. +@return +None. + +@dependencies +None. +*/ +static inline void qurt_process_attr_set_flags (qurt_process_attr_t *attr, int flags) +{ + attr->flags = flags; +} +/** @endcond */ +/** @cond internal_only*/ +/**@ingroup func_qurt_process_attr_set_sid +Sets the process streamID in the specified process attribute structure. + +@datatypes +#qurt_process_attr_t + +@param[in] attr Pointer to the process attribute structure. +@param[in] sid streamID to set for this process. + +@return +None. + +@dependencies +None. +*/ +static inline void qurt_process_attr_set_sid (qurt_process_attr_t *attr, unsigned sid) +{ + attr->sid = sid; +} +/** @endcond */ +/** @cond rest_reg_dist */ +/**@ingroup func_qurt_process_attr_set_max_threads +Sets the maximum number of threads allowed in the specified process attribute structure. + +@datatypes +#qurt_process_attr_t + +@param[in] attr Pointer to the process attribute structure. +@param[in] max_threads Maximum number of threads allowed for this process. + +@return +None. + +@dependencies +None. 
+*/ +static inline void qurt_process_attr_set_max_threads (qurt_process_attr_t *attr, unsigned max_threads) +{ + attr->max_threads = max_threads; +} + +/**@ingroup func_qurt_process_attr_set_sw_id +Sets the software ID of the process to load in the specified process attribute structure. + +@datatypes +#qurt_process_attr_t + +@param[in] attr Pointer to the process attribute structure. +@param[in] sw_id Software ID of the process, used in authentication. + +@return +None. + +@dependencies +None. +*/ +static inline void qurt_process_attr_set_sw_id(qurt_process_attr_t *attr, unsigned int sw_id) +{ + attr->sw_id = sw_id; +} + +/**@ingroup func_qurt_process_attr_set_ceiling_prio +Sets the highest thread priority allowed in the specified process attribute structure. +Refer qurt_thread.h for priority ranges. + +@datatypes +#qurt_process_attr_t + +@param[in] attr Pointer to the process attribute structure. +@param[in] prio Priority. + +@return +None. + +@dependencies +None. +*/ +static inline void qurt_process_attr_set_ceiling_prio (qurt_process_attr_t *attr, unsigned short prio) +{ + attr->ceiling_prio = prio; +} +/** @endcond */ + +/** @cond internal_only*/ +/**@ingroup func_qurt_process_attr_set_dump_status +Sets the process domain dump-enabled field in the process domain dump attributes. + +@datatypes +#qurt_process_attr_t + +@param[in] attr Pointer to the process attribute structure. +@param[in] enabled 1 -- Process domain dump is collected \n + 0 -- Process domain dump is not collected + +@return +None. + +@dependencies +None. +*/ +static inline void qurt_process_attr_set_dump_status(qurt_process_attr_t *attr, unsigned int enabled) +{ + attr->dump_attr.enabled = enabled; +} + +/**@ingroup func_qurt_process_attr_set_dump_path +Sets the process domain dump path and type. + +@datatypes +#qurt_process_attr_t + +@param[in] attr Pointer to the process attribute structure. +@param[in] path Path where the process domain dumps must be saved. +@param[in] path_len Length of the path string. + +@return +None. + +@dependencies +None. +*/ +static inline void qurt_process_attr_set_dump_path(qurt_process_attr_t *attr, const char *path, int path_len) +{ + attr->dump_attr.path = path; + attr->dump_attr.path_len = (unsigned int)path_len; +} + +/**@ingroup func_qurt_process_attr_set_capabilities +Sets list of capabilities available to this process. + +@datatypes +#qurt_process_attr_t + +@param[in] attr Pointer to the process attribute structure. +@param[in] capabilities Pointer to array of structures of type qurt_capability_t defining + resources and capabilites + +@return +None. + +@dependencies +None. +*/ +static inline void qurt_process_attr_set_capabilities(qurt_process_attr_t *attr, qurt_capability_t *capabilities) +{ + attr->capabilities = capabilities; +} + +/** @endcond */ +/** @cond rest_reg_dist */ +/**@ingroup func_qurt_process_cmdline_get +Gets the command line string associated with the current process. +The Hexagon simulator command line arguments are retrieved using +this function as long as the call is made +in the process of the QuRT installation, and with the +requirement that the program runs in a simulation environment. + +If the function modifies the provided buffer, it zero-terminates +the string. It is possible that the function does not modify the +provided buffer, so the caller must set buf[0] to a NULL +byte before making the call. A truncated command line is returned when +the command line is longer than the provided buffer. 
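A sketch honoring the buf[0] pre-termination requirement described above; the buffer size is illustrative:

```c
/* Sketch: fetch the simulator command line safely. */
static char cmdline[256];          /* size is illustrative */

void show_cmdline(void)
{
    cmdline[0] = '\0';             /* required: the call may leave buf untouched */
    qurt_process_cmdline_get(cmdline, sizeof cmdline);
    qurt_printf("cmdline: %s\n", cmdline);
}
```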
+ +@param[in] buf Pointer to a character buffer that must be filled in. +@param[in] buf_siz Size (in bytes) of the buffer pointed to by the buf argument. + +@return +None. + +@dependencies +None. +*/ +void qurt_process_cmdline_get(char *buf, unsigned buf_siz); + +/**@ingroup func_qurt_process_get_thread_count +Gets the number of threads present in the process indicated by the PID. + +@param[in] pid PID of the process for which the information is required. + +@return +Number of threads in the process indicated by PID, if positive value is obtained +Negative error code if failed include: + QURT_EFATAL - Invalid PID + -QURT_ENOTALLOWED - Current process doesnt have access to target process indicated by PID + +@dependencies +None. +*/ +int qurt_process_get_thread_count(unsigned int pid); + +/**@ingroup func_qurt_process_get_thread_ids +Gets the thread IDs for a process indicated by PID. + +@param[in] pid PID of the process for which the information is required. +@param[in] ptr Pointer to a user passed buffer that must be filled in with thread IDs. +@param[in] thread_num Number of thread IDs requested. + +@return +#QURT_EOK - Success +#QURT_EFATAL - Failed, ptr is NULL + +@dependencies +None. + */ +int qurt_process_get_thread_ids(unsigned int pid, unsigned int *ptr, unsigned thread_num); +/** @endcond */ +/** @cond internal_only*/ +/**@ingroup func_qurt_process_dump_get_mem_mappings_count +Gets the number of mappings present in the process indicated by the PID. + +@param[in] pid PID of the process for which the information is required. + +@return +Number of mappings for the process indicated by the PID. + +@dependencies +None. +*/ +int qurt_process_dump_get_mem_mappings_count(unsigned int pid); + +/**@ingroup func_qurt_process_dump_get_mappings +Gets the mappings for a specified PID. + +@note1hang This API skips device type mappings or mappings created by setting the #QURT_PERM_NODUMP attribute. + +@param[in] pid PID of the process for which the information is required. +@param[in] ptr Pointer to a buffer that must be filled in with mappings. +@param[in] count Count of mappings requested. + +@return +Number of mappings filled in the buffer passed by the user. + +@dependencies +None. +*/ +int qurt_process_dump_get_mappings(unsigned int pid, unsigned int *ptr, unsigned count); +/** @endcond */ +/** @cond rest_reg_dist */ +/**@ingroup func_qurt_process_attr_get +Gets the attributes of the process with which it was created. + +@datatypes +#qurt_process_attr_t + +@param[in] pid PID of the process for which the information is required. +@param[in,out] attr Pointer to the user allocated attribute structure. + +@return +#QURT_EOK - Success +#QURT_INVALID - Invalid PID +#QURT_EFATAL - attr is NULL + +@dependencies +None. +*/ +int qurt_process_attr_get(unsigned int pid, qurt_process_attr_t *attr); + +/**@ingroup func_qurt_process_dump_register_cb +Registers the process domain dump callback. + +@datatypes +#qurt_cb_data_t \n +#qurt_process_dump_cb_type_t + +@param[in] cb_data Pointer to the callback information. +@param[in] type Callback type; these callbacks are called in the context of the user process domain: \n + #QURT_PROCESS_DUMP_CB_PRESTM -- Before threads of the exiting process are frozen. \n + #QURT_PROCESS_DUMP_CB_ERROR -- After threads are frozen and captured. \n + #QURT_PROCESS_DUMP_CB_ROOT -- After threads are frozen and captured, and CB_ERROR type of callbacks + are called. +@param[in] priority Priority. 
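A sketch combining the two thread-enumeration calls documented above; the fixed 64-entry array is an assumption for brevity:

```c
/* Sketch: print the thread IDs of a process. */
int list_threads(unsigned int pid)
{
    int i;
    int n = qurt_process_get_thread_count(pid);
    if (n <= 0)
        return n;                          /* negative values are error codes */

    unsigned int ids[64];                  /* assume n <= 64 for this sketch */
    if (n > 64)
        n = 64;
    if (qurt_process_get_thread_ids(pid, ids, (unsigned)n) != QURT_EOK)
        return -1;

    for (i = 0; i < n; i++)
        qurt_printf("thread[%d] = 0x%x\n", i, ids[i]);
    return n;
}
```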
+ +@return +#QURT_EOK -- Success \n +Other values -- Failure + QURT_EFATAL if cb_data is NULL + QURT_EINVALID If invalid cb_type + QURT_EFAILED If invalid cb_data + +@dependencies +None. +*/ +int qurt_process_dump_register_cb(qurt_cb_data_t *cb_data, qurt_process_dump_cb_type_t type, unsigned short priority); + +/**@ingroup func_qurt_process_dump_deregister_cb +Deregisters the process domain dump callback. + +@datatypes +#qurt_cb_data_t \n +#qurt_process_dump_cb_type_t + +@param[in] cb_data Pointer to the callback information to deregister. +@param[in] type Callback type. + +@return +#QURT_EOK -- Success.\n +Other values -- Failure. + QURT_EFATAL if cb_data is NULL + QURT_EINVALID If invalid cb_type + QURT_EFAILED If invalid cb_data + +@dependencies +None. +*/ +int qurt_process_dump_deregister_cb(qurt_cb_data_t *cb_data,qurt_process_dump_cb_type_t type); + +/** @endcond */ +/** @cond internal_only*/ +/**@ingroup func_qurt_process_set_rtld_debug +Sets rtld_debug for a process. + +@param[in] pid PID of the process for which rtld_debug must be set. +@param[in] address rtld_debug address. + +@return +#QURT_EOK - Success +#QURT_EINVALID - Invalid PID +#QURT_EFATAL - Invalid address + +@dependencies +None. +*/ +int qurt_process_set_rtld_debug(unsigned int pid,unsigned int address); + +/**@ingroup func_qurt_process_get_rtld_debug +Gets rtld_debug for a process. + +@param[in] pid PID of the process for which rtld_debug must be set. +@param[in,out] address Pointer to the user passed address in which the rtld_debug address must be returned. + +@return +#QURT_EOK - Success +#QURT_EINVALID - Invalid PID +#QURT_EFATAL - Invalid address + +@dependencies +None. +*/ +int qurt_process_get_rtld_debug(unsigned int pid,unsigned int *address); +/** @endcond */ +/**@ingroup func_qurt_process_exit +Exits the current user process with an exit code. + +@param[in] exitcode Exit code. + +@return +#QURT_EFATAL -- No client found with the specified PID value \n +#QURT_EINVALID -- Invalid client \n +#QURT_ENOTALLOWED -- User does not have permission to perform this operation \n +#QURT_EOK -- Success + +@dependencies +None. +*/ +int qurt_process_exit(int exitcode); + +/**@ingroup func_qurt_process_kill +Kills the process represented by the PID with the exit code. + +@param[in] pid PID of the process to kill. +@param[in] exitcode Exit code. + +@return +#QURT_EFATAL -- No client found with the specified PID value \n +#QURT_EINVALID -- Invalid client \n +#QURT_ENOTALLOWED -- User does not have permission to perform this operation \n +#QURT_EOK -- Success + +@dependencies +None. +*/ +int qurt_process_kill(int pid, int exitcode); + + +/**@ingroup func_qurt_debugger_register_process +Registers the process indicated by the PID with the debug monitor. + +@param[in] pid PID of the process. +@param[in] adr Address. + +@return +#QURT_EOK -- Success + +@dependencies +None. +*/ +int qurt_debugger_register_process(int pid, unsigned int adr); + + +/**@ingroup func_qurt_debugger_deregister_process +Deregister the process indicated by the PID with the debug monitor. + +@param[in] pid PID of the process. + +@return +#QURT_EOK -- Success + +@dependencies +None. +*/ +int qurt_debugger_deregister_process(int pid); + +/**@ingroup func_qurt_process_exec_callback +Executes callbacks in the user process as indicated by the client_handle argument. + +@param[in] client_handle Client handle obtained from the current invocation function (Section 3.4.1). +@param[in] callback_fn Callback function to execute. 
+@param[in] stack_base Stack address to use.
+@param[in] stack_size Stack size.
+
+@return
+#QURT_EOK -- Success
+
+@dependencies
+None.
+*/
+int qurt_process_exec_callback(int client_handle,
+                               unsigned callback_fn,
+                               unsigned stack_base,
+                               unsigned stack_size);
+
+/**@ingroup func_qurt_process_get_pid
+Gets the process ID of the process that the client_handle argument represents.
+
+@note1hang This API is not supported for unsigned PDs; for an unsigned PD, use qurt_process_get_id().
+
+@param[in] client_handle Client handle obtained from the current invocation function (Section 3.4.1).
+@param[in] pid Pointer to the address to store the PID.
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EFATAL -- pid pointer passed as NULL
+
+@dependencies
+None.
+*/
+int qurt_process_get_pid(int client_handle, int * pid);
+
+/**@ingroup func_qurt_process_get_dm_status
+Gets the debugging session status of the process represented by the pid argument.
+
+@param[in] pid Process ID.
+@param[in,out] status Address to store the status: \n
+               #QURT_DEBUG_NOT_START \n
+               #QURT_DEBUG_START
+
+@return
+#QURT_EOK - Success \n
+#QURT_EINVALID - Error
+
+@dependencies
+None.
+*/
+int qurt_process_get_dm_status( unsigned int pid, unsigned int *status);
+
+
+/**@ingroup func_qurt_process_suspend_threads
+  Suspends user threads in a user process identified by its process identifier.
+  The target user process can be a signed user process or an unsigned user process.
+  The caller is from a thread in the guest OS/root process.
+  After the user threads in the target user process are suspended, they cannot be scheduled to run by the kernel
+  until they resume later.
+
+  This function has one optional argument with one default option:
+  #QURT_PROCESS_SUSPEND_DEFAULT suspends user threads in the target user process.
+
+  This function call is a synchronous call; the function returns after the relevant threads are
+  completely suspended.
+
+  If some user threads in the target user process are set as non-suspendable, this function call does
+  not suspend these threads.
+
+  If the target user process is already suspended, this function call returns success as
+  confirmation that the user process is suspended.
+
+  QuRT debugger monitor threads in the target user process are non-suspendable; this function call does
+  not suspend those threads.
+
+  If the target user process is a secure user process or a CPZ process, this function call returns an error
+  without suspending the target user process.
+
+  If a user thread in the target user process runs in the guest OS/root process via a QDI call, this function call
+  does not suspend the thread in the guest OS, but instead marks the thread as pending-suspend. The thread is suspended
+  when it exits the guest OS, before executing the first instruction in the user process.
+  In this case, the function returns success while the user thread can be running in the guest OS, and is suspended
+  when exiting the guest OS.
+
+  @param[in] process_id Process identifier.
+  @param[in] option Default option #QURT_PROCESS_SUSPEND_DEFAULT suspends user threads in the target user process.
+
+  @return
+  #QURT_EOK -- Success \n
+  #QURT_EINVALID -- Failure because of invalid process_id input \n
+  #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, on a secure process/CPZ process.
+
+  @dependencies
+  None.
+ */
+int qurt_process_suspend_threads (unsigned int process_id, unsigned int option);
+
+
+/**@ingroup func_qurt_process_resume_threads
+  Resumes a user process identified by its process identifier.
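A hedged freeze/inspect/thaw sketch using the suspend call above and the resume call documented next:

```c
/* Sketch: suspend a user process from the root PD, inspect it, resume it. */
int freeze_and_thaw(unsigned int pid)
{
    int rc = qurt_process_suspend_threads(pid, QURT_PROCESS_SUSPEND_DEFAULT);
    if (rc != QURT_EOK)
        return rc;          /* e.g. QURT_ENOTALLOWED for secure/CPZ processes */

    /* ... inspect the suspended process here ... */

    return qurt_process_resume_threads(pid, QURT_PROCESS_RESUME_DEFAULT);
}
```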
+/**@ingroup func_qurt_process_resume_threads
+ Resumes the user process identified by the process identifier.
+ The target user process can be a signed user process or an unsigned user process.
+ The caller must be a thread in the guest OS/root process.
+ After the user threads in the target user process resume, the kernel scheduler
+ can schedule the user threads to run based on their thread priorities.
+
+ This function has an optional argument, #QURT_PROCESS_RESUME_DEFAULT, which
+ resumes user threads in the target user process.
+
+ This is an asynchronous function; it returns after the kernel moves the user threads from
+ the suspended state to the runnable state. The threads are scheduled to run based on their thread priorities.
+
+ This function call does not resume threads in the target user process that have been set as non-resumable.
+
+ If the target user process has already resumed, this function call confirms that the user process is resumed
+ by returning success.
+
+ If the target user process is a secure user process or a CPZ process, this function call returns an error without
+ resuming operation.
+
+ If user threads in the target user process run in the guest OS/root process via a QDI call, this function
+ call clears the suspend-pending mark on these threads, so that the threads are not suspended when they exit
+ the guest OS.
+
+ @param[in] process_id Process identifier.
+ @param[in] option Default option #QURT_PROCESS_RESUME_DEFAULT resumes user threads in the target user process.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EINVALID -- Failure because of invalid process_id input. \n
+ #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, on a secure process/CPZ process.
+
+ @dependencies
+ None.
+ */
+int qurt_process_resume_threads (unsigned int process_id, unsigned int option);
+
+/**@ingroup func_qurt_process_vtcm_window_set
+ Sets a VTCM access window for a process.
+ The caller thread must be in the SRM process.
+
+ This is a synchronous function; it ensures that all running threads of the process have the requested
+ window in effect. The requested window takes effect for all non-running threads when they are
+ scheduled.
+
+ @param[in] pid Process identifier.
+ @param[in] enable QURT_VTCM_WINDOW_ENABLE enforces the VTCM access window defined by the high and low offsets.
+ QURT_VTCM_WINDOW_DISABLE ignores the high and low offsets and fully
+ disables VTCM access for the process.
+ @param[in] high_offset Specifies the high window offset, in 4K increments, from the base address of the VTCM.
+ QURT_VTCM_WINDOW_HI_OFFSET_DEFAULT restores the high offset to its reset value.
+ @param[in] low_offset Specifies the low window offset, in 4K increments, from the base address of the VTCM.
+ QURT_VTCM_WINDOW_LO_OFFSET_DEFAULT restores the low offset to its reset value.
+
+ @note1hang
+ When high_offset is set to QURT_VTCM_WINDOW_HI_OFFSET_DEFAULT and low_offset is set to
+ QURT_VTCM_WINDOW_LO_OFFSET_DEFAULT, the full VTCM range is accessible. Access to VTCM is controlled
+ via the MMU mapping for the process.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EVAL -- Failure because of invalid inputs. \n
+ #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation. \n
+ #QURT_ENOTSUPPORTED -- Failure because the operation is not supported due to limited hardware capabilities.
+
+ @dependencies
+ None.
+ */
+int qurt_process_vtcm_window_set(int pid, unsigned int enable, unsigned int high_offset, unsigned int low_offset);
+
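+/*
+ A minimal sketch (illustrative; assumes a guest OS/root-process caller):
+ a synchronous suspend followed by an asynchronous resume, as described above.
+*/
+#if 0   /* example only -- not compiled */
+static void example_freeze_and_thaw(unsigned int pid)
+{
+    if (qurt_process_suspend_threads(pid, QURT_PROCESS_SUSPEND_DEFAULT) == QURT_EOK) {
+        /* ... inspect or checkpoint the process while its threads are parked ... */
+        (void)qurt_process_resume_threads(pid, QURT_PROCESS_RESUME_DEFAULT);
+    }
+}
+#endif
+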
+/**@ingroup func_qurt_process_vtcm_window_get
+ Gets the VTCM window for a process.
+ The caller thread must be in the SRM process.
+
+ @param[in] pid Process identifier.
+ @param[out] enable Address in which to store the enable status.
+ @param[out] high_offset Address in which to return the high window offset, in 4K increments, from the base address of the VTCM.
+ @param[out] low_offset Address in which to return the low window offset, in 4K increments, from the base address of the VTCM.
+
+ @note1hang
+ The user must first check the returned enable value before checking the high and low offsets.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EVAL -- Failure because of invalid inputs. \n
+ #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation. \n
+ #QURT_ENOTSUPPORTED -- Failure because the operation is not supported due to limited hardware capabilities.
+
+ @dependencies
+ None.
+ */
+int qurt_process_vtcm_window_get(int pid, unsigned int *enable, unsigned int *high_offset, unsigned int *low_offset);
+
+/**@ingroup func_qurt_process_set_group_config
+ Enables thread groups in the process, with the specified ceiling priorities.
+
+ @param[in] process_id Process identifier.
+ @param[in] group_bitmask 64-bit mask of active thread groups.
+ @param[in] ceiling_priorities Array of ceiling priorities, one per thread group.
+
+ @note1hang
+ This API can be called only from the root PD, and only once for each process; otherwise it is
+ rejected. Group 0 must be enabled in group_bitmask, otherwise QuRT returns an error. After this call, all
+ existing threads are moved to group 0, and any thread whose priority is higher than the ceiling
+ priority of group 0 is lowered to the ceiling value. See the sketch after qurt_process_stid_set() below.
+ Example 1:
+ group_bitmask = 0xD7; //'b11010111
+ ceiling_priorities[] = {100, 128, 200, 0, 196, 0, 240, 20}; // 0 -- don't care
+ Example 2:
+ group_bitmask = 0x5; //'b101
+ ceiling_priorities[] = {240, 0, 20}; // 0 -- don't care
+
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EVAL -- Failure because of invalid inputs. \n
+ #QURT_ENOTALLOWED -- The group has been configured already.
+
+ @dependencies
+ None.
+ */
+int qurt_process_set_group_config(unsigned int process_id, unsigned long long group_bitmask,
+ unsigned char *ceiling_priorities);
+
+
+/**@ingroup func_qurt_process_stid_set
+ Sets the specified stid for a process or for a thread group within a process.
+
+ @param[in] pid Process identifier.
+ @param[in] group_id Group identifier.
+ @param[in] stid stid value to set.
+
+ @note1hang
+ The user can pass the default group_id (QURT_THREAD_DEFAULT_GROUP_ID) if the stid must be set at the process level.
+ All threads within the process that have the default stid (QURT_STID_DEFAULT) inherit the stid set for the process.
+ When a non-default group_id is specified, the stid is set only for that thread group.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EFATAL -- Invalid PID \n
+ #QURT_EVAL -- Failure because of invalid inputs. \n
+ #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation.
+
+ @dependencies
+ None.
+ */
+int qurt_process_stid_set(unsigned int pid, unsigned int group_id, unsigned int stid);
+
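+/*
+ A minimal sketch (illustrative; assumes a root-PD caller, and the stid value 7
+ is hypothetical): enable groups 0 and 2 as in Example 2 above, then tag
+ group 2 with an stid.
+*/
+#if 0   /* example only -- not compiled */
+static int example_configure_groups(unsigned int pid)
+{
+    unsigned char ceilings[3] = {240, 0, 20};   /* index 1 is unused ("don't care") */
+    int err;
+
+    err = qurt_process_set_group_config(pid, 0x5ULL, ceilings);  /* groups 0 and 2 */
+    if (err != QURT_EOK) {
+        return err;
+    }
+    return qurt_process_stid_set(pid, 2U, 7U);   /* stid 7 for thread group 2 */
+}
+#endif
+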
+/**@ingroup func_qurt_process_stid_get
+ Gets the stid for a process or for a thread group within a process.
+
+ @param[in] pid Process identifier.
+ @param[in] group_id Group identifier.
+ @param[out] stid Pointer to a variable in which to return the stid.
+
+ @note1hang
+ The user can pass the default group_id (QURT_THREAD_DEFAULT_GROUP_ID) to return the process-level stid.
+ When a non-default group_id is specified, the stid is returned only for that thread group.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EFATAL -- Invalid PID \n
+ #QURT_EVAL -- Failure because of invalid inputs. \n
+ #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation.
+
+ @dependencies
+ None.
+ */
+int qurt_process_stid_get(unsigned int pid, unsigned int group_id, unsigned int *stid);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_profile.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_profile.h
new file mode 100755
index 0000000000000..2a50c461440f6
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_profile.h
@@ -0,0 +1,98 @@
+#ifndef QURT_PROFILE_H
+#define QURT_PROFILE_H
+/**
+  @file qurt_profile.h
+  QuRT profiling support.
+
+EXTERNAL FUNCTIONS
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2018, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+==============================================================================*/
+#include "qurt_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup profiling_macros
+@{ */
+#define QURT_PROFILE_DISABLE 0 /**< Disable profiling. */
+#define QURT_PROFILE_ENABLE 1 /**< Enable profiling. */
+
+typedef unsigned int qurt_profile_param_t;
+
+#define QURT_PROFILE_PARAM_THREAD_READY_TIME 0U /**< Profile thread ready time. */
+
+/** @} */ /* end_addtogroup profiling_macros */
+
+/** @addtogroup profiling_types
+ @{ */
+/** Profiling results. */
+typedef union
+{
+    /** Result associated with #QURT_PROFILE_PARAM_THREAD_READY_TIME. */
+    struct
+    {
+        unsigned int ticks; /**< Cumulative ticks the thread was ready. */
+    } thread_ready_time;
+
+} qurt_profile_result_t;
+/** @} */ /* end_addtogroup profiling_types */
+
+/**@ingroup func_qurt_profile_enable2
+ * Starts profiling of a specific parameter on a specific thread (as applicable).
+ *
+ * @param[in] param     Profiling parameter.
+ * @param[in] thread_id ID of the thread (if applicable) for which the specified
+ *                      parameter must be profiled.
+ * @param[in] enable    #QURT_PROFILE_DISABLE -- disable \n #QURT_PROFILE_ENABLE --
+ *                      enable
+ *
+ * @return
+ * #QURT_EOK -- Success \n
+ * #QURT_EALREADY -- Measurement already in progress or already stopped \n
+ * #QURT_ENOTHREAD -- Thread does not exist \n
+ * #QURT_EINVALID -- Invalid profiling parameter \n
+ *
+ * @dependencies
+ * None.
+ */
+extern int qurt_profile_enable2 (
+    qurt_profile_param_t param,
+    qurt_thread_t thread_id,
+    int enable
+);
+
+/**@ingroup func_qurt_profile_get
+ * Gets the value of the profiling parameter that was previously enabled.
+ *
+ * @param[in] param      Profiling parameter.
+ * @param[in] thread_id  ID of the thread (if applicable) for which the specified
+ *                       profiling parameter must be retrieved.
+ * @param[out] result    Profiling result associated with the parameter for the specified
+ *                       thread (if applicable).
+ *
+ * @return
+ * #QURT_EOK -- Success \n
+ * #QURT_EFAILED -- Operation failed; profiling was not enabled \n
+ * #QURT_ENOTHREAD -- Thread does not exist \n
+ * #QURT_EINVALID -- Invalid profiling parameter \n
+ *
+ * @dependencies
+ * None.
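+ *
+ * A minimal usage sketch (illustrative; assumes the calling thread profiles
+ * itself via qurt_thread_get_id()):
+ * @code
+ * qurt_profile_result_t result;
+ * qurt_thread_t tid = qurt_thread_get_id();
+ *
+ * qurt_profile_enable2(QURT_PROFILE_PARAM_THREAD_READY_TIME, tid, QURT_PROFILE_ENABLE);
+ * // ... run the workload under measurement ...
+ * qurt_profile_enable2(QURT_PROFILE_PARAM_THREAD_READY_TIME, tid, QURT_PROFILE_DISABLE);
+ *
+ * if (qurt_profile_get(QURT_PROFILE_PARAM_THREAD_READY_TIME, tid, &result) == QURT_EOK) {
+ *     // result.thread_ready_time.ticks = cumulative ticks the thread was ready
+ * }
+ * @endcode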
+ */ +extern int qurt_profile_get ( + qurt_profile_param_t param, + qurt_thread_t thread_id, + qurt_profile_result_t * result +); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_ptrace.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_ptrace.h new file mode 100755 index 0000000000000..622304dd92865 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_ptrace.h @@ -0,0 +1,37 @@ +/*============================================================================= + + qurt_ptrace.h + +GENERAL DESCRIPTION + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2013 by Qualcomm Technologies, Inc. All Rights Reserved. +=============================================================================*/ +#ifndef __SYS_PTRACE_H__ +#define __SYS_PTRACE_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +enum __ptrace_request +{ + /** + Indicates that the process making this request is requesting to be traced. + */ + PTRACE_TRACEME = 0, + PTRACE_EXT_IS_DEBUG_PERMITTED = 500 +}; + +long ptrace(enum __ptrace_request request, unsigned int pid, void*addr, void *data); + +#ifdef __cplusplus +} +#endif + +#endif //__SYS_PTRACE_H__ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_qdi.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_qdi.h new file mode 100755 index 0000000000000..705408e5cfc6f --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_qdi.h @@ -0,0 +1,185 @@ +#ifndef QDI_H +#define QDI_H + +/** + @file qurt_qdi.h + @brief Prototypes of QuRT Driver Invocation API functions + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013, 2021, 2023 Qualcomm Technologies, Inc. + All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + + +#include "qurt_qdi_constants.h" +#include "qurt_qdi_imacros.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup func_qurt_qdi_open + Opens the specified driver for subsequent operations. + qurt_qdi_open() is the primary mechanism by which a driver user can + obtain a QDI handle. The user provides the name of the driver to the + qurt_qdi_open call, and gets back a handle referencing + the named driver. \n + @note1hang For reasons related to the Hexagon standard for varargs functions, the + qurt_qdi_open function prototype is not actually defined as a varargs. + + + @param[in] p Driver name. + @param[in] ... Up to nine additional device-specific arguments can be passed as parameters, + and should follow the POSIX open() convention. \n + - flags -- Optional second parameter (POSIX flags), the handle + access requested (read-only, write-only, or read-write, + for instance) and other flags such as whether the call + should create a new device or only open an existing + device. \n + - mode -- Optional third parameter (POSIX mode); permissions to + configure when a new device is created. @tablebulletend + + @return + Negative value -- Error. \n + Non-negative value -- Success, this result value serves as a handle to the + opened driver. + @dependencies + None. + */ +// int qurt_qdi_open(); +#define qurt_qdi_open(p,...) 
\ + qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC,QDI_OPEN,(p),##__VA_ARGS__) + +#define qurt_qdi_open_dt(p,q,...) \ + qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC,QDI_OPEN_FROM_DT,(p),(q),##__VA_ARGS__) + +/**@ingroup func_qurt_qdi_handle_invoke + Performs a generic driver operation, which (depending on the specified operation) can be + either be one of the predefined operations listed in @xhyperref{tbl:functionMapping,QDI function mapping} + or a driver-specific operation. + The user provides a QDI handle and an integer + method number, along with 0 to 8 optional 32-bit arguments. + The device driver invocation function is invoked with the + same method number and 0 to 8 optional arguments. The + return value from the invocation function is passed back to + the user as the return value of qurt_qdi_handle_invoke. + + @note1hang For reasons related to the Hexagon standard for varargs functions, the + qurt_qdi_handle_invoke() function prototype is not actually defined as a + varargs function (and would break if it were defined this way). + + @param[in] h Driver handle. + @param[in] m Integer number for the operation to perform. + @param[in] ... Up to eight optional arguments can be passed to the device driver as operation-specific parameters: \n + arg1 -- First parameter \n + arg2 -- Second parameter \n + arg3 -- Third parameter \n + arg4 -- Fourth parameter \n + arg5 -- Fifth parameter \n + arg6 -- Sixth parameter \n + arg7 -- Seventh parameter \n + arg8 -- Eighth parameter + + @return + Integer value defined by the device driver. \n + -1 -- Error. + + @dependencies + None. + */ +// int qurt_qdi_handle_invoke(); +#define qurt_qdi_handle_invoke(h,m,...) \ + _QDMPASTE(_QDMHI,_QDMCNT(QDI_HANDLE_LOCAL_CLIENT,h,m,##__VA_ARGS__))(QDI_HANDLE_LOCAL_CLIENT,h,m,##__VA_ARGS__) +#define _QDMHI3(a,b,c) qurt_qdi_qhi3(0,b,c) +#define _QDMHI4(a,b,c,d) qurt_qdi_qhi4(0,b,c,(int)(d)) +#define _QDMHI5(a,b,c,d,e) qurt_qdi_qhi5(0,b,c,(int)(d),(int)(e)) +#define _QDMHI6(a,b,c,d,e,f) qurt_qdi_qhi6(0,b,c,(int)(d),(int)(e),(int)(f)) +#define _QDMHI7(a,b,c,d,e,f,g) qurt_qdi_qhi7(8,b,c,(int)(d),(int)(e),(int)(f),(int)(g)) +#define _QDMHI8(a,b,c,d,e,f,g,h) qurt_qdi_qhi8(8,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h)) +#define _QDMHI9(a,b,c,d,e,f,g,h,i) qurt_qdi_qhi9(16,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i)) +#define _QDMHI10(a,b,c,d,e,f,g,h,i,j) qurt_qdi_qhi10(16,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j)) +#define _QDMHI11(a,b,c,d,e,f,g,h,i,j,k) qurt_qdi_qhi11(24,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j),(int)(k)) +#define _QDMHI12(a,b,c,d,e,f,g,h,i,j,k,l) qurt_qdi_qhi12(24,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j),(int)(k),(int)(l)) +int qurt_qdi_qhi3(int,int,int); +int qurt_qdi_qhi4(int,int,int,int); +int qurt_qdi_qhi5(int,int,int,int,int); +int qurt_qdi_qhi6(int,int,int,int,int,int); +int qurt_qdi_qhi7(int,int,int,int,int,int,int); +int qurt_qdi_qhi8(int,int,int,int,int,int,int,int); +int qurt_qdi_qhi9(int,int,int,int,int,int,int,int,int); +int qurt_qdi_qhi10(int,int,int,int,int,int,int,int,int,int); +int qurt_qdi_qhi11(int,int,int,int,int,int,int,int,int,int,int); +int qurt_qdi_qhi12(int,int,int,int,int,int,int,int,int,int,int,int); + +/**@ingroup func_qurt_qdi_write + Writes data to the specified driver. + A predefined invocation routine for drivers that + support a POSIX-like write functionality. 
+ qurt_qdi_write(handle, buf, len) is equivalent to:
+ qurt_qdi_handle_invoke(handle, QDI_WRITE, handle, buf, len);
+
+ @param[in] handle Driver handle.
+ @param[in] buf    Pointer to the memory address where the data to write is stored.
+ @param[in] len    Number of bytes of data to write.
+
+ @return
+ Non-negative integer -- Number of bytes written. \n
+ Negative error code -- Write could not take place.
+
+ @dependencies
+ None.
+ */
+int qurt_qdi_write(int handle, const void *buf, unsigned len);
+
+/**@ingroup func_qurt_qdi_read
+ User-visible API to read data from a QDI handle.
+ A predefined invocation routine for drivers that
+ support a POSIX-like read functionality.
+ qurt_qdi_read(handle, buf, len) is equivalent to:
+ qurt_qdi_handle_invoke(handle, QDI_READ, handle, buf, len);
+
+ @param[in] handle Driver handle.
+ @param[in] buf    Pointer to the memory address where the data read is stored.
+ @param[in] len    Number of bytes of data to read.
+
+ @return
+ Non-negative integer -- Number of bytes read. \n
+ Negative error code -- Read could not take place.
+
+ @dependencies
+ None.
+ */
+int qurt_qdi_read(int handle, void *buf, unsigned len);
+
+/**@ingroup func_qurt_qdi_close
+ Closes the specified driver, releasing any resources associated with the open driver.
+ User-visible API to close a QDI handle.
+
+ This API should be called when the user is done using a
+ QDI-based handle. When this function is called, the driver can release
+ any resources held and perform other necessary cleanup
+ operations. qurt_qdi_close(handle) is equivalent to:
+ qurt_qdi_handle_invoke(handle, QDI_CLOSE, handle);
+
+ @param[in] handle Driver handle.
+
+ @return
+ 0 -- Success.\n
+ Negative error code -- Failure.
+
+ @dependencies
+ None.
+ */
+int qurt_qdi_close(int handle);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_qdi_constants.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_qdi_constants.h
new file mode 100755
index 0000000000000..4866fada067f0
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_qdi_constants.h
@@ -0,0 +1,193 @@
+#ifndef QDI_CONSTANTS_H
+#define QDI_CONSTANTS_H
+
+/**
+  @file qurt_qdi_constants.h
+  @brief Predefined invocation methods for drivers.
+
+  EXTERNALIZED FUNCTIONS
+  None
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None
+
+  Copyright (c) 2013-2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+|| Method numbers used for QDI.
+||
+|| Intended grouping of method numbers for QDI
+|| including future usage:
+||
+|| Method 0 should always be unused and not responded to by
+|| any driver.
+|| Methods 1 and 2 are reserved for name registration and
+|| name lookup.
+|| Methods 3 through 31 are reserved for POSIX-type operations
+|| on open handles.
+|| Methods 32 through 127 are reserved for the QDI infrastructure
+|| and may be extended in the future to provide standard
+|| driver debug services, management services, and system
+|| notifications.
+|| Methods 128 through 255 are reserved for the use of automatically
+|| generated methods such as might be generated by an IDL (interface
+|| definition language).
The infrastructure may be extended to +|| perform services on these methods based on information provided +|| by the IDL, such as automatic buffer validation, etc. These +|| method numbers should not be used for any "ad hoc" methods. +|| Methods with number >= 256 are "private" method numbers that are +|| outside the scope of the QDI infrastructure. Drivers that want +|| to generate and consume their own "ad hoc" methods are free to +|| use these method numbers as they wish. The infrastructure does +|| not generate these method numbers or respond to them, but +|| passes them on unmolested. +|| +|| All driver implementations *should* return a value of +|| -1 when called with an unsupported method. The standard error +|| return value for POSIX APIs is -1, so we emulate that behavior +|| here. +*/ +/** @cond */ +#define QDI_UNUSED 0 +#define QDI_DEVNAME_REGISTER 1 +#define QDI_OPEN 2 +#define QDI_CLOSE 3 +#define QDI_READ 4 +#define QDI_WRITE 5 +#define QDI_IOCTL 6 +#define QDI_MMAP 7 +#define QDI_OS_FILEOPEN 8 +#define QDI_FLEN 9 +#define QDI_UNLINK 10 +#define QDI_FTELL 22 +#define QDI_SEEK 23 +#define QDI_FSTAT 24 + +#define QDI_FSNAME_REGISTER 150 +#define QDI_FS_OPEN 151 +#define QDI_MMAP2 153 +#define QDI_MPROTECT2 154 +#define QDI_MUNMAP2 155 + +#define QDI_CLIENT_HANDLE_OBJREF_GET 10 + +#define QDI_OS_PROCESS_LOAD 12 +#define QDI_OS_PROCESS_CHOOSE_ASID 13 + +#define QDI_OS_SET_GP 26 +#define QDI_CLIENT_HANDLE_CALLBACK 27 + +#define QDI_CLIENT_HANDLE_ISLAND_HANDLE_CREATE_FROM_OBJ_T 19 //reused +#define QDI_CLIENT_HANDLE_HANDLE_CREATE_FROM_OBJ_T 80 +#define QDI_CLIENT_HANDLE_HANDLE_RELEASE 81 +#define QDI_CLIENT_HANDLE_COPY_FROM_USER 82 +#define QDI_CLIENT_HANDLE_COPY_TO_USER 83 +#define QDI_CLIENT_HANDLE_SIGNAL_GROUP_CREATE 86 +#define QDI_CLIENT_HANDLE_SAFE_CACHE_OPS 87 + +#define QDI_CLIENT_HANDLE_BUFFER_LOCK 41 +#define QDI_CLIENT_HLOSPOOL_INFO_GET 90 +#define QDI_CLIENT_HLOSPOOL2_INFO_GET 96 + +#define QDI_CLIENT_PID 44 +#define QDI_CLIENT_ASID QDI_CLIENT_PID + +#define QDI_OS_CLIENT_INFO_GET 48 + +#define QDI_OS_MEM_LOOKUP_PHYSADDR 57 + +#define QDI_OS_THREAD_ITERATOR_CREATE 68 +#define QDI_OS_THREAD_ITERATOR_NEXT 69 + +#define QDI_OS_SYSENV 78 + +#define QDI_REGION_USERMALLOC_INIT 180 // This method is for generic handle + + +#define QDI_CLIENT_HANDLE_USER_MALLOC 84 +#define QDI_CLIENT_HANDLE_USER_FREE 85 + +#define QDI_SIGNAL_GROUP_SIGNAL_CREATE 96 +#define QDI_SIGNAL_GROUP_WAIT 98 +#define QDI_SIGNAL_GROUP_POLL 99 +#define QDI_SIGNAL_SET 96 +#define QDI_SIGNAL_CLEAR 97 +#define QDI_SIGNAL_WAIT 98 +#define QDI_SIGNAL_POLL 99 + +#define QDI_OS_WAIT_FOR_MAIN_REAPER 104 + +#define QDI_CLIENT_HANDLE_REFPROXY_INSTALL 105 +#define QDI_CLIENT_HANDLE_REFPROXY_ADD 106 +#define QDI_CLIENT_HANDLE_REFPROXY_REMOVE 107 + +#define QDI_CLIENT_HANDLE_DETACH 116 + +#define QDI_OS_RESERVED1 139 + +#define QDI_CLIENT_HANDLE_BUFFER_LOCK2 142 + +#define QDI_DT_REGISTER 158 +#define QDI_OPEN_DEVICE 159 +#define QDI_OPEN_FROM_DT 160 + +#define QDI_PRIVATE 256 /* Method numbers beginning at 256 + are private method numbers, which + are device-specific and available + for use by device implementors. */ +/* +|| Permission bitmasks for use with qurt_qdi_lock_buffer(). +|| +|| Make sure these match with permission values from qurt_perm_t. +*/ +/** @endcond */ + +/** @addtogroup driver_support_constants +@{ */ +#define QDI_PERM_W 2 /**< Write access. */ +#define QDI_PERM_R 1 /**< Read access. */ +#define QDI_PERM_RW (QDI_PERM_R | QDI_PERM_W) /**< Read/write access. 
*/ + +#define QDI_HANDLE_LOCAL_CLIENT 3 /**< Local client. */ +#define QDI_HANDLE_GENERIC 4 /**< Generic. */ + +#define QDI_REFCNT_BASE 0x510000 /**< */ +#define QDI_REFCNT_MAXED 0x51FFFD /**< */ +#define QDI_REFCNT_INIT 0x51FFFE /**< Driver object is temporary and is eventually deleted.*/ +#define QDI_REFCNT_PERM 0x51FFFF /**< Driver object is permanent and is never deleted. */ +/** @} */ /* end_addtogroup driver_support_constants */ + +/** @cond */ +/* +|| Flags used by process loaders. +*/ + +#define QDI_OS_PROCESS_FLAGS_ISLAND_RESIDENT 0x1 /* Set this flag to request the loaded process + to have island residency. */ +#define QDI_OS_PROCESS_FLAGS_ROOT_RESIDENT 0x2 /* Set this flag to request the loaded process + to have root residency, for example, DL Pager. */ +/* +|| Constants used for qurt_event register API, type field. +*/ + +#define QURT_PROCESS_EXIT 1 + +/* +|| Constants used by QDI extensions. +*/ + +#define QURT_QDI_SINGLETON_TYPE_TRUE 0 +#define QURT_QDI_SINGLETON_TYPE_FALSE 1 +#define QURT_QDI_SINGLETON_TYPE_PER_PROCESS 2 +/** @endcond */ +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QDI_CONSTANTS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_qdi_driver.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_qdi_driver.h new file mode 100755 index 0000000000000..e044e25f1bb72 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_qdi_driver.h @@ -0,0 +1,868 @@ +#ifndef QURT_QDI_DRIVER_H +#define QURT_QDI_DRIVER_H + +/** + @file qurt_qdi_driver.h + @brief Definitions, macros, and prototypes used when writing a + QDI driver. + + EXTERNALIZED FUNCTIONS + None + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None + + Copyright (c) 2018, 2019-2021, 2023 Qualcomm Technologies, Inc. + All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#include "stddef.h" +#include "qurt_qdi.h" +#include "qurt_types.h" +#include "qurt_callback.h" +#include "qurt_qdi_constants.h" +#include "qurt_qdi_imacros.h" +#include "qurt_mutex.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* +|| This gives the canonical form for the arguments to a QDI +|| driver invocation function. The arguments are as follows: +|| +|| int client_handle (R0) QDI handle that represents the client +|| that made this QDI request. If the +|| client is remote, this is a +|| variable handle; if the client is local +|| (same thread and process), this is +|| set to QDI_HANDLE_LOCAL_CLIENT. +|| +|| qurt_qdi_obj_t *obj (R1) Points at the qdi_object_t structure +|| on which this QDI request is being made. +|| The qdi_object_t structure is usually +|| the first element of a larger structure +|| that contains state associated with the +|| object; because it is usually the first +|| element, the object pointers can be freely +|| interchanged through casts. +|| +|| int method (R2) Integer QDI method that represents +|| the request type. +|| +|| qurt_qdi_arg_t arg1 (R3) First three general purpose arguments +|| qurt_qdi_arg_t arg2 (R4) to the invocation function are passed in +|| qurt_qdi_arg_t arg3 (R5) these slots. +|| +|| qurt_qdi_arg_t arg4 (SP+0) Arguments beyond the first three are +|| qurt_qdi_arg_t arg5 (SP+4) passed on the stack. 
+|| qurt_qdi_arg_t arg6 (SP+8) +|| qurt_qdi_arg_t arg7 (SP+12) +|| qurt_qdi_arg_t arg8 (SP+16) +|| qurt_qdi_arg_t arg9 (SP+20) +|| +|| The canonical form of the invocation function takes a +|| total of 12 arguments, but not all of them are used. In general, +|| the QDI infrastructure only passes those arguments provided by +|| the caller; if the invocation function accesses additional +|| arguments beyond those provided by the caller, the values are not +|| useful. +*/ +/** @cond */ +#define QDI_INVOKE_ARGS \ + int, struct qdiobj *, int, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t + +#define QDI_EXT_INVOKE_ARGS \ + int, qurt_qdi_man_obj_t*, int, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t + +#define BUFFER_LOCK 1 +#define BUFFER_UNLOCK 0 + +struct qdiobj; +/** @endcond */ +/** @addtogroup driver_support_types +@{ */ +typedef union { + void *ptr; /**< Pointer to the driver handle. */ + int num; /**< Method number. */ +} qurt_qdi_arg_t; +/** @} */ /* end_addtogroup driver_support_types */ +/** @cond */ +/** QuRT QDI driver version */ +typedef union { + int num; + struct { + short major; /** Driver major version number. */ + short minor; /** Driver minor version number. */ + }; +} qurt_qdi_version_t; + +typedef int (*qurt_qdi_pfn_invoke_t)(QDI_INVOKE_ARGS); +typedef void (*qurt_qdi_pfn_release_t)(struct qdiobj *); +/** @endcond */ +/** @addtogroup driver_support_types +@{ */ +typedef struct qdiobj { + qurt_qdi_pfn_invoke_t invoke; /**< Invocation function that implements the driver methods.*/ + int refcnt; /**< Reference count, an integer value maintained by the QDI infrastructure that tracks the number of + references to a driver instance. 
*/ + qurt_qdi_pfn_release_t release; /**< Release function that performs details associated with deleting an instance + of the driver object.*/ +} qurt_qdi_obj_t; +/** @} */ /* end_addtogroup driver_support_types */ +/** @cond */ +/** QuRT QDI managed object */ +typedef struct qurt_qdi_man_obj +{ + qurt_qdi_obj_t qdi_obj; + union + { + struct qurt_qdi_ext_driver * opener_obj; + struct qurt_qdi_ext_device * device_obj; + }; +}qurt_qdi_man_obj_t; + +typedef int (*qurt_qdi_ext_pfn_create_t)(int client_id, const char *name, qurt_qdi_version_t version, qurt_qdi_man_obj_t **qdi_obj); +typedef int (*qurt_qdi_ext_pfn_create_device_t)(int client_id, const char *name, qurt_qdi_version_t version, struct qurt_qdi_ext_device * device, qurt_qdi_man_obj_t **qdi_obj); +typedef int (*qurt_qdi_ext_pfn_invoke_t)(QDI_EXT_INVOKE_ARGS); +typedef void (*qurt_qdi_ext_pfn_destroy_t)(qurt_qdi_man_obj_t *qdi_obj); +typedef int (*qurt_qdi_ext_pfn_probe_t)(void *handle, struct qurt_qdi_ext_device **device); + +typedef struct qurt_qdi_ext_obj_info{ + qurt_qdi_man_obj_t *obj; + int qdi_client_id; + struct qurt_qdi_ext_obj_info *next; +}qurt_qdi_ext_obj_info_t; +typedef struct qurt_qdi_ext_obj_info *qurt_qdi_ext_obj_info_ptr; + +/** QuRT QDI device */ +//temporarily add this back while there are still drivers who statically define this structure +struct qurt_qdi_device { + qurt_qdi_obj_t opener_obj; + const char* name; + char island_resident; + unsigned char singleton; + qurt_qdi_ext_pfn_create_t create; + qurt_qdi_ext_pfn_invoke_t invoke; + qurt_qdi_ext_pfn_destroy_t destroy; + qurt_mutex_t qurt_qdi_ext_list_lock; + qurt_qdi_ext_obj_info_ptr qurt_qdi_ext_obj_info_head; +}; +typedef struct qurt_qdi_device qurt_qdi_man_device; + +struct qurt_qdi_ext_driver { + qurt_qdi_obj_t opener_obj; + const char* name; + char island_resident; + unsigned char singleton; + qurt_qdi_ext_pfn_create_t create; + qurt_qdi_ext_pfn_invoke_t invoke; + qurt_qdi_ext_pfn_destroy_t destroy; + qurt_mutex_t qurt_qdi_ext_list_lock; + qurt_qdi_ext_obj_info_ptr qurt_qdi_ext_obj_info_head; + qurt_qdi_ext_pfn_create_device_t create_device; + qurt_qdi_version_t version; + qurt_qdi_ext_pfn_probe_t probe; + const char* compatible; + struct qurt_qdi_ext_device * device_list; + //qurt_qdi_ext_device_ptr device_list; +}; +typedef struct qurt_qdi_ext_driver qurt_qdi_ext_driver_t; +//above replaces qurt_qdi_man_device + +extern int qurt_qdi_obj_ref_inc(qurt_qdi_obj_t *); +extern int qurt_qdi_obj_ref_dec(qurt_qdi_obj_t *); + +extern int qurt_qdi_ext_opener (QDI_INVOKE_ARGS); +/** @endcond */ +/**@ingroup func_qurt_qdi_method_default + Processes a method that is unrecognized or unsupported in the driver invocation function. + All arguments passed to the current invocation function (Section @xref{sec:invocationFunction}) must be forwarded + to this function. + + @note1hang Invocation functions must process all unrecognized or unsupported methods + by calling this function. + + @return + None. + + @dependencies + None. +*/ +extern int qurt_qdi_method_default(QDI_INVOKE_ARGS); + +/**@ingroup func_qurt_qdi_handle_create_from_obj_t + Allocates a new device handle for use with the specified driver object. + + @param[in] client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param[out] obj Pointer to the driver object. + + @return + Non-negative integer -- Success; this value is the new handle. \n + Negative value -- Error. + + @dependencies + None. 
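+
+ A minimal sketch (illustrative; my_device_t, my_invoke, and my_release are
+ hypothetical driver definitions) of returning a new handle from a driver's
+ QDI_OPEN path:
+ @code
+ typedef struct { qurt_qdi_obj_t qdiobj; } my_device_t;
+
+ // inside the driver's QDI_OPEN handling, with dev pointing at a my_device_t:
+ dev->qdiobj.invoke  = my_invoke;        // driver invocation function
+ dev->qdiobj.refcnt  = QDI_REFCNT_INIT;  // temporary object, deleted when unreferenced
+ dev->qdiobj.release = my_release;       // cleanup callback
+ return qurt_qdi_handle_create_from_obj_t(client_handle, &dev->qdiobj);
+ @endcode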
+*/ +static __inline int qurt_qdi_handle_create_from_obj_t(int client_handle, qurt_qdi_obj_t *obj) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_HANDLE_CREATE_FROM_OBJ_T, + obj); +} + +/**@ingroup func_qurt_qdi_handle_invoke + Allocates a new island device handle for use with the specified driver object. + + @param[in] client_handle Client handle obtained from the current invocation function (Section 3.4.1). + @param[in] obj Pointer. + + @return + Non-negative integer value that is the new handle -- Success. \n + Negative return value -- Error. + + @dependencies + None. +*/ +static __inline int qurt_qdi_island_handle_create_from_obj_t(int client_handle, qurt_qdi_obj_t *obj) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_ISLAND_HANDLE_CREATE_FROM_OBJ_T, + obj); +} + +/**@ingroup func_qurt_qdi_handle_release + Deallocates the specified device handle. + + @param[in] client_handle Obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param[in] handle_to_release Handle to release. + + @return + 0 -- Success. \n + Negative value -- Error. + + @dependencies + None. +*/ +static __inline int qurt_qdi_handle_release(int client_handle, int handle_to_release) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_HANDLE_RELEASE, + handle_to_release); +} + +static __inline qurt_qdi_obj_t * +qurt_qdi_objref_get_from_handle(int client_handle, int object_handle) +{ + qurt_qdi_obj_t *ret; + + ret = NULL; + + qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_OBJREF_GET, + object_handle, + &ret); + + return ret; +} + +/**@ingroup func_qurt_client_add_memory + Adds a physical address range to the HLOS physpool of the caller user PD. + + @param[in] client_handle Obtained from the current invocation function (Section 3.4.1). + @param[in] phys_addr Starting address of the physical address range. + @param[in] size Size. + + @return + #QURT_EOK -- Pages successfully added. + + @dependencies + None. +*/ +int qurt_client_add_memory(int client_handle, qurt_addr_t phys_addr, qurt_size_t size); + +/**@ingroup func_qurt_client_add_memory2 + Adds a physical address range to the HLOS physpool of the caller user PD. + + @param[in] client_handle Obtained from the current invocation function (Section 3.4.1). + @param[in] phys_addr Starting 36-bit address of the physical address range. + @param[in] size Size. + + @return + #QURT_EOK -- Pages successfully added. + + @dependencies + None. +*/ +int qurt_client_add_memory2(int user_client_handle, qurt_paddr_64_t phys_addr, qurt_size_t size); + +static __inline qurt_qdi_obj_t * +qurt_qdi_objref_get_from_pointer(qurt_qdi_obj_t *objptr) +{ + qurt_qdi_obj_t * ret = NULL; + + if (qurt_qdi_obj_ref_inc(objptr) < 0) { + ret = NULL; + } else { + ret = objptr; + } + + return ret; +} + +static __inline void +qurt_qdi_objref_release(qurt_qdi_obj_t *objptr) +{ + if (qurt_qdi_obj_ref_dec(objptr) == 1) { + (*objptr->release)(objptr); + } +} + +/**@ingroup func_qurt_qdi_copy_from_user + Copies the contents of a user memory buffer into the current driver. + + @note1hang User buffer addresses are valid only for the duration of the current driver + invocation. + + @param[in] client_handle Obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param[in] dest Base address of the driver buffer. + @param[in] src Base address of the user buffer. + @param[in] len Number of bytes to copy. 
+ + @return + Negative value -- Indicates a privilege or security violation, the copy operation + has crossed a privilege boundary. + + @dependencies + None. +*/ +static __inline int qurt_qdi_copy_from_user(int client_handle, void *dest, const void *src, unsigned len) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_COPY_FROM_USER, + dest, src, len); +} + +/**@ingroup qurt_qdi_copy_string_from_user + Copies the contents of a user memory buffer into the current driver. + + @note1hang User buffer addresses are valid only for the duration of the current driver + invocation. + + @param client_handle Obtained from the current invocation function (Section 3.4.1). + @param dest Base address of the driver buffer. + @param src Base address of the user buffer. + @param len Number of bytes to copy. NOTE: This is the destination buffer length. + + @return + Negative error result -- privilege or security violation, the copy operation + has crossed a privilege boundary. + + @dependencies + None. +*/ +int qurt_qdi_copy_string_from_user(int client_handle, char *dest, const char *src, unsigned len); + +/**@ingroup func_qurt_qdi_copy_to_user + Copies the contents of a driver memory buffer to user memory. + + @note1hang User buffer addresses are valid only for the duration of the current driver + invocation. + + @param[in] client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param[in] dest Base address of the user buffer. + @param[in] src Base address of the driver buffer. + @param[in] len Number of bytes to copy. + + @return + Negative value -- Indicates a privilege or security violation, the copy operation has crossed a + privilege boundary + + @dependencies + None. +*/ +static __inline int qurt_qdi_copy_to_user(int client_handle, void *dest, const void *src, unsigned len) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_COPY_TO_USER, + dest, src, len); +} + +/**@ingroup func_qurt_qdi_safe_cache_ops + Do cache operations on user memory + + @note1hang User buffer addresses are valid only for the duration of the current driver + invocation. + + @param[in] client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param[in] addr Base address of the user memory. + @param[in] size Size of the user memory. + @param[in] opcode Cache operations (QURT_MEM_CACHE_FLUSH, QURT_MEM_CACHE_INVALIDATE...) + @param[in] type Cache type (QURT_MEM_ICACHE, QURT_MEM_DCACHE) + + @return + Negative value -- Indicates a privilege or security violation, the copy operation has crossed a + privilege boundary + + @dependencies + None. +*/ +static __inline int qurt_qdi_safe_cache_ops(int client_handle, qurt_addr_t addr, qurt_size_t size, + qurt_mem_cache_op_t opcode, qurt_mem_cache_type_t type) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_SAFE_CACHE_OPS, + addr, size, opcode, type); +} + + +/**@ingroup func_qurt_qdi_buffer_lock + Prepares for the direct manipulation of a potentially untrusted buffer provided by a QDI + client. + + This function is used to permit a trusted driver to safely access memory that is + provided by a potentially untrusted client. A driver calls this function to obtain a safe buffer + pointer for accessing the memory. + + This function performs the following security checks: \n + - Verifies that the entire buffer is accessible to the client. 
\n + - Ensures that the pointer remains valid for the remainder of the QDI driver + operation. \n + + @note1hang User buffer addresses are valid only for the duration of the current driver + invocation. + + @param[in] client_handle Obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param[in] buf Pointer to the base address of the client buffer address. + @param[in] len Buffer length (in bytes). + @param[in] perms Bitmask value that specifies the read or write access to perform on the + client buffer: \n + - #QDI_PERM_R -- Read access \n + - #QDI_PERM_W -- Write access \n + - #QDI_PERM_RW -- Read/write access @tablebulletend + @param[out] obuf Pointer to the buffer address that the driver must use to access the buffer. + + @return + Negative value -- Error; the operation crosses a privilege boundary, indicating a privilege or security violation. \n + Nonzero value -- User passed a buffer that does not fulfill the requested read/write access permission. + In this case the QDI driver call must be terminated cleanly, with an appropriate error code + returned to the client. \n + Zero -- Success; when this occurs the QDI driver must use the pointer at *obuf to access memory, and not the + pointer passed in as buf -- even if the user process changes the mapping of memory at buf, + the mapping of memory at *obuf remains valid until the driver invocation completes. + + @dependencies + None. +*/ +static __inline int qurt_qdi_buffer_lock(int client_handle, void *buf, unsigned len, + unsigned perms, void **obuf) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_BUFFER_LOCK, + buf, len, perms, obuf); +} + +/**@ingroup func_qurt_qdi_buffer_lock2 + Prepares for the direct manipulation of a possibly-untrusted buffer provided by a QDI + client. + This API permits a trusted driver to safely access memory + provided by a possibly-untrusted client. A driver calls this function to obtain a safe buffer + pointer for accessing the memory. + This function performs the following security checks: \n + -- Entire buffer is accessible to the client. \n + -- Entire buffer is mapped with permissions passed in perms field \n + -- Entire buffer is physically contiguous \n + In addition to the security checks, the API also locks the client mapping such that the client + cannot remove the mapping while the physical memory is used by the trusted + driver. \n + + @note1 Drivers are responsible for calling qurt_qdi_buffer_unlock() at appropriate time. Not + pairing qurt_qdi_buffer_unlock() with this API leads to resource leakages and + process exit failures. Drivers can keep track of which buffers are locked for + a particular client. If the client exits abruptly, the buffers can be + unlocked on driver release invocation for the exiting client. + + @note2 This API is supported in limited capacity when called from Island mode. Safe buffer + unmapping or user buffer unlock is not supported in Island mode. + + @param client_handle Obtained from the current invocation function (Section 3.4.1). + @param buf Pointer to the base address of the client buffer address. + @param len Buffer length (in bytes). + @param perms Bitmask value that specifies the read or write access to perform on the + client buffer: \n + -- #QDI_PERM_R -- Read access \n + -- #QDI_PERM_W -- Write access \n + -- #QDI_PERM_RW -- Read/write access \n + @param obuf Optional parameter that returns a pointer to the buffer address that + the driver must use to access the buffer. 
If NULL is passed, the API + only performs security checks and does not create a mapping to access the user buffer in + a safe way. + + @return + QURT_EINVALID -- Arguments passed to the API are invalid. User buffer pointer is NULL or length of the + buffer is 0. \n + QURT_EPRIVILEGE -- One of the security checks on the user buffer failed. \n + QURT_EFAILED -- Mapping cannot be created for the trusted driver. \n + QURT_EOK -- Lock operation was successful. When this occurs, the QDI driver must use the + pointer at *obuf to perform its memory accesses, and not the + pointer passed in as buf. + + @dependencies + None. +*/ +static __inline int qurt_qdi_buffer_lock2(int client_handle, void *buf, unsigned len, + unsigned perms, void **obuf) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_BUFFER_LOCK2, + BUFFER_LOCK, buf, len, perms, obuf); +} + +/**@ingroup func_qurt_qdi_buffer_unlock + This API is paired with qurt_qdi_buffer_lock2(). A temporary overlapping mapping + created for the driver is removed. Client mapping for the user buffer is + unlocked. + + @note1 Drivers are responsible for pairing this with qurt_qdi_buffer_lock(). Not + pairing qurt_qdi_buffer_lock() with this API leads to resource leakages and + process exit failures. Drivers can keep track of which buffers are locked for + a particular client, and if the client exits abruptly, all the buffers can be + unlocked on driver release invocation for the exiting client. + + @note2 This API is supported in limited capacity when called from Island mode. Actual + unmapping of driver accessible memory or unlocking of the buffer is not + supported in Island bode. + + @param client_handle Obtained from the current invocation function (Section 3.4.1). + @param buf Pointer to the base address of the client buffer address. + @param len Buffer length (in bytes). + @param obuf Safe buffer address that was returned in the obuf field after calling + qurt_qdi_buffer_lock2(). + + @return + QURT_EINVALID -- Arguments passed to the API are invalid. User buffer pointer is NULL or length of the + buffer is 0. \n + QURT_EOK -- Lock operation was successful. When this occurs, the QDI driver must use the + pointer at *obuf to perform its memory accesses, and not the + pointer passed in as buf. \n + other results -- Safe buffer unmapping failed or unlocking of user buffer failed \n. + + @dependencies + None. +*/ +static __inline int qurt_qdi_buffer_unlock(int client_handle, void *buf, unsigned len, + void *obuf) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_BUFFER_LOCK2, + BUFFER_UNLOCK, buf, len, obuf); +} + +/**@ingroup func_qurt_qdi_user_malloc + Allocates memory area in the QDI heap that is read/write accessible to both the driver and + the client. \n + @note1hang The QDI heap has a limited amount of memory available, and only the + device driver can free the allocated memory. + + @param client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param size Size. + + @return + Non-zero -- Success; this returned value points to the allocated memory area. \n + Zero -- Error. + + @dependencies + None. +*/ +void *qurt_qdi_user_malloc(int client_handle, unsigned size); + +/**@ingroup func_qurt_qdi_user_free + Deallocates memory area in the QDI heap. + + @param client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param ptr Pointer. + + @dependencies + None. 
+*/ +void qurt_qdi_user_free(int client_handle, void *ptr); + +/**@ingroup funct_qurt_qdi_client_detach + Detaches a client (a process), indicating that the client does not + participate in the qurt_wait() mechanism. This behavior + is opt-in and irrevocable. When a client is detached, it can + not be un-detached. + + @param client_handle Handle of the client to detach. + + @return + Zero -- Success. Detachable clients always return success. + Nonzero value -- client_handle did not refer to a + detachable user client. + + @dependencies + None. +*/ +static __inline int qurt_qdi_client_detach(int client_handle) +{ + return qurt_qdi_handle_invoke(client_handle, QDI_CLIENT_HANDLE_DETACH); +} + +/**@ingroup func_qurt_qdi_signal_group_create + Creates a new signal group for use in a device driver. + A QDI signal group contains up to 32 signals, which can be operated on either + individually (using the qurt_qdi_signal_* functions) or as a group (using the + qurt_qdi_signal_group_* functions). \n + @note1hang Driver implementation is responsible for using the proper signal group + handle in any given situation. \n + For more information on signals, see the Hexagon QuRT RTOS User Guide (80-VB419-78). + + @param client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param p_signal_group_handle_local Returns a handle intended for use by code that + resides in the same context and process as the created signal group + (for example, the device driver implementation that allocated the + signal group). + @param p_signal_group_handle_remote Returns a handle intended for use by code + that resides in a different context and process than the created signal group + (for example, the user-mode client of an OS driver). + + @return + Zero return value indicates success.\n + Negative return value indicates could not create signal group. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_group_create(int client_handle, + int *p_signal_group_handle_local, + int *p_signal_group_handle_remote) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_SIGNAL_GROUP_CREATE, + p_signal_group_handle_local, + p_signal_group_handle_remote); +} + +/**@ingroup func_qurt_qdi_signal_group_wait + Suspends the current thread until any of the signals are set in the specified signal group. + + If a signal is set in a signal group object, and a thread waits on the signal group object, + the thread is awakened. If the awakened thread has higher priority than the current + thread, a context switch can occur. + + @param signal_group_handle Handle of the signal group. + + @return + If the client is remote: + QURT_EOK -- Wait complete \n + QURT_ECANCEL -- Wait cancelled.\n + If the client is local, returns a 32-bit word with current signals. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_group_wait(int signal_group_handle) +{ + return qurt_qdi_handle_invoke(signal_group_handle, + QDI_SIGNAL_GROUP_WAIT); +} + +/**@ingroup func_qurt_qdi_signal_group_poll + Returns a value that indicates if any of the signals are set in the specified signal group. + + @param signal_group_handle Handle of the signal group. + + @return + 1 -- Indicates whether any of the signals are set in the signal group.\n + 0 -- Indicates that none of the signals are set. + + @dependencies + None. 
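+
+ A minimal sketch (illustrative) of creating a signal group in a driver and
+ polling it on the local handle:
+ @code
+ int grp_local, grp_remote;
+
+ if (qurt_qdi_signal_group_create(client_handle, &grp_local, &grp_remote) == 0) {
+     // the remote handle would be returned to the user-mode client
+     if (qurt_qdi_signal_group_poll(grp_local)) {
+         // at least one signal in the group is set
+     }
+ }
+ @endcode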
+*/ +static __inline int qurt_qdi_signal_group_poll(int signal_group_handle) +{ + return qurt_qdi_handle_invoke(signal_group_handle, + QDI_SIGNAL_GROUP_POLL); +} + + +/**@ingroup func_qurt_qdi_signal_create + Creates a new signal in the specified signal group. + For more information on signals, see the Hexagon QuRT RTOS User Guide (80-VB419-78). + + @note1hang Driver implementation is responsible for using the proper signal handle in + any given situation. + + @param signal_group_handle Handle of an existing signal group. + @param p_signal_handle_local Returns a handle intended for use by code that resides in + the same context and process as the created signal (for example, + the device driver implementation that allocated the signal). + @param p_signal_handle_remote Returns a handle intended for use by code that resides in + a different context and process than the created signal (for + example, the user-mode client of an OS driver). + + @return + Nonzero value -- No more signals can be created in the specified + signal group. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_create(int signal_group_handle, + int *p_signal_handle_local, + int *p_signal_handle_remote) +{ + return qurt_qdi_handle_invoke(signal_group_handle, + QDI_SIGNAL_GROUP_SIGNAL_CREATE, + p_signal_handle_local, + p_signal_handle_remote); +} + +/**@ingroup func_qurt_qdi_signal_set + Sets the signal in the specified signal object. + + @param signal_handle Handle of the signal. + + @return + Always returns 0. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_set(int signal_handle) +{ + return qurt_qdi_handle_invoke(signal_handle, + QDI_SIGNAL_SET); +} + +/**@ingroup func_qurt_qdi_signal_clear + Clears the signal in the specified signal object. + + @param signal_handle Handle of the signal. + + @return + Always returns 0. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_clear(int signal_handle) +{ + return qurt_qdi_handle_invoke(signal_handle, + QDI_SIGNAL_CLEAR); +} + +/**@ingroup func_qurt_qdi_signal_wait + Suspends the current thread until the specified signal is set. + If a signal is set in a signal object, and a thread waits on the signal object, the + thread is awakened. If the awakened thread has higher priority than the current thread, a + context switch may occur. + + @param signal_handle Handle of the signal. + + @return + If client is remote: + QURT_EOK -- Wait complete. \n + QURT_ECANCEL -- Wait cancelled.\n + If client is local, return a 32-bit word with current signals. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_wait(int signal_handle) +{ + return qurt_qdi_handle_invoke(signal_handle, + QDI_SIGNAL_WAIT); +} + +/**@ingroup func_qurt_qdi_signal_poll + Returns a value that indicates if the specified signal is set. + + @param signal_handle Handle of the signal. + + @return + 1 -- Signal is set. \n + 0 -- Signal is not set. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_poll(int signal_handle) +{ + return qurt_qdi_handle_invoke(signal_handle, + QDI_SIGNAL_POLL); +} + +/**@ingroup func_qurt_qdi_devname_register + Registers a QDI device with the generic QDI object in the + current QDI context. + + This function registers an exact name or a directory prefix with a QDI opener object. + Future invocations of qurt_qdi_open() in the context of the caller invokes the + opener object if a match is detected. + + Directory prefix names are specified by ending the name with a forward slash character. 
+ + Example of an exact name: + @code qurt_qdi_devname_register(/dev/foobar, foobar_opener);@endcode + + Example of a directory prefix: + @code qurt_qdi_devname_register(/pipedev/, pipedev_opener);@endcode + + Given the two registrations shown above, the only qurt_qdi_open() requests to + direct to the foobar_opener object are requests for the exact name + "/dev/foobar", Any request beginning with "/pipedev/" is directed to the + pipedev_opener object. + + The pipedev invocation function presumably examines the name argument to + determine exactly how to handle the request. The name is passed to the invocation + function in the a1.ptr argument (Section @xref{sec:invocationFunction}). + + @param name Device name or device name prefix. + @param opener Pointer to the opener object for the device. + + @return + 0 -- Device was successfully registered. \n + Negative error code -- Device was not registered. + + @dependencies + None. + */ +static __inline int qurt_qdi_devname_register(const char *name, + qurt_qdi_obj_t *opener) +{ + return qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC, + QDI_DEVNAME_REGISTER, + name, + opener); +} + +// Macros for backward compatibility with deprecated APIs +// (These will go away soon) + +#define qurt_qdi_register_devname(name, opener) \ + qurt_qdi_devname_register((name), (void *)(opener)) +#define qurt_qdi_new_handle_from_obj_t(handle, obj) \ + qurt_qdi_handle_create_from_obj_t((handle), (obj)) +#define qurt_qdi_release_handle(client_handle, handle) \ + qurt_qdi_handle_release((client_handle), (handle)) +#define qurt_qdi_lock_buffer(handle, buf, len, perms, obuf) \ + qurt_qdi_buffer_lock((handle), (buf), (len), (perms), (obuf)) +#define qurt_qdi_usermalloc(handle, size) \ + qurt_qdi_user_malloc((handle), (size)) +#define qurt_qdi_userfree(handle, ptr) \ + qurt_qdi_user_free((handle), (ptr)) + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_qdi_ext.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_qdi_ext.h new file mode 100755 index 0000000000000..383e1799a15d6 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_qdi_ext.h @@ -0,0 +1,58 @@ +#ifndef QURT_QDI_EXT_H +#define QURT_QDI_EXT_H + +/** + @file qurt_qdi_driver.h + @brief Definitions, macros, and prototypes used when writing a + QDI driver + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2018, 2019-2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ +#include "qurt_qdi_driver.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct qurt_qdi_ext_device { + qurt_qdi_ext_obj_info_ptr qurt_qdi_ext_obj_info_head; + struct qurt_qdi_ext_device * next; + char * instance; + fdt_node_handle context; +}; +typedef struct qurt_qdi_ext_device *qurt_qdi_ext_device_ptr; + +/**@ingroup func_qurt_qdi_dt_register + Registers a QDI device with the generic QDI object in the current QDI context, + if and only if a compatible device node is found in the device tree. This + function serves as a device tree aware wrapper for qurt_qdi_devname_register(). + + @param name Device name or device name prefix. + @param opener Pointer to QDI ext specialized opener object for the driver. + + @return + 0 -- Device was successfully registered. 
\n + Negative error code -- Device was not registered. +*/ +static __inline int qurt_qdi_dt_register(const char *name, qurt_qdi_obj_t *opener) +{ + return qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC, QDI_DT_REGISTER, name, opener); +} + +static inline void qurt_qdi_ext_deviceobj_set_name (struct qurt_qdi_ext_device * device, char * name) +{ + device->instance = name; +} + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_qdi_imacros.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_qdi_imacros.h new file mode 100755 index 0000000000000..c0a8448ac87f8 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_qdi_imacros.h @@ -0,0 +1,34 @@ +#ifndef QURT_QDI_IMACROS_H +#define QURT_QDI_IMACROS_H + +/** + @file qurt_qdi_imacros.h + @brief Internal macros used for QDI. Mostly consists of tricky (and ugly) + preprocessor hacks that permit us to do varargs function invocations + where we pass optional arguments in registers and where we can do + type casting and checking automatically. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +#define _QDMPASTE(a,b) _QDMPASTE_(a,b) +#define _QDMPASTE_(a,b) a##b +#define _QDMCNT(...) _QDMCNT_(__VA_ARGS__,12,11,10,9,8,7,6,5,4,3,2,1,0) +#define _QDMCNT_(a,b,c,d,e,f,g,h,i,j,k,l,cnt,...) cnt + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_qdi_proxy.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_qdi_proxy.h new file mode 100755 index 0000000000000..f1d8992ea8811 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_qdi_proxy.h @@ -0,0 +1,55 @@ +/*============================================================================= + + qurt_qdi_proxy.h + +GENERAL DESCRIPTION + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved. 
+=============================================================================*/
+#ifndef _QURT_QDI_PROXY_H
+#define _QURT_QDI_PROXY_H
+
+#include "qurt_qdi_driver.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* APIs that operate on the proxy object directly */
+int qurt_qdi_proxy_ref_create(void);
+
+/* APIs that operate on a proxy, given a known proxy handle
+ * 1) using the QDI handle of the object
+ *    successful return: QURT_EOK, anything else -- failure
+ */
+int qurt_qdi_proxy_ref_add_by_handle(int proxy_handle, int qdi_handle);
+int qurt_qdi_proxy_ref_sub_by_handle(int proxy_handle, int qdi_handle);
+
+/* 2) using an object reference
+ *    successful return: QURT_EOK, anything else -- failure
+ */
+int qurt_qdi_proxy_ref_add_by_object(int proxy_handle, qurt_qdi_obj_t *obj_ptr);
+int qurt_qdi_proxy_ref_sub_by_object(int proxy_handle, qurt_qdi_obj_t *obj_ptr);
+
+/* API that associates a proxy object with a particular client, given a client handle
+ * successful return: QURT_EOK, anything else -- failure
+ */
+int qurt_client_proxy_ref_install (int client_handle, int proxy_handle);
+
+/* APIs that operate on a proxy object from a user client
+ * successful return: QURT_EOK, anything else -- failure
+ */
+int qurt_client_proxy_ref_add(int qdi_handle);
+int qurt_client_proxy_ref_remove(int qdi_handle);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_QDI_PROXY_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_rmutex.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_rmutex.h
new file mode 100755
index 0000000000000..a013a0bbddb1d
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_rmutex.h
@@ -0,0 +1,200 @@
+#ifndef QURT_RMUTEX_H
+#define QURT_RMUTEX_H
+/**
+  @file qurt_rmutex.h
+  Prototypes of rmutex API.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2013 - 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+#include
+#include
+
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup func_qurt_rmutex_init
+ Initializes a recursive mutex object.
+ The recursive mutex is initialized in the unlocked state.
+
+ @datatypes
+ #qurt_mutex_t
+
+ @param[out] lock Pointer to the recursive mutex object.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+void qurt_rmutex_init(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_rmutex_destroy
+ Destroys the specified recursive mutex. \n
+ @note1hang Recursive mutexes must not be destroyed while they are still in use. If this
+ occurs, the behavior of QuRT is undefined.
+
+ @datatypes
+ #qurt_mutex_t
+
+ @param[in] lock Pointer to the recursive mutex object to destroy.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+
+ */
+void qurt_rmutex_destroy(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_rmutex_lock
+ Locks the specified recursive mutex. \n
+
+ If a thread performs a lock operation on a mutex that is not in use, the thread
+ gains access to the shared resource that the mutex protects, and continues executing.
+
+ If a thread performs a lock operation on a mutex that is already in use by another
+ thread, the thread is suspended.
+ When the mutex becomes available again (because the other thread has unlocked
+ it), the thread is awakened and given access to the shared resource.
+
+ @note1hang A thread is not suspended if it locks a recursive mutex that it has already
+ locked. However, the mutex does not become available to other threads until the
+ thread performs a balanced number of unlocks on the mutex.
+
+ @datatypes
+ #qurt_mutex_t
+
+ @param[in] lock Pointer to the recursive mutex object to lock.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+
+ */
+void qurt_rmutex_lock(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_rmutex_lock_timed
+ Locks the specified recursive mutex. The wait is terminated when the specified
+ timeout expires.\n
+
+ If a thread performs a lock operation on a mutex that is not in use, the thread
+ gains access to the shared resource that the mutex is protecting, and continues executing.
+
+ If a thread performs a lock operation on a mutex that is already in use by another
+ thread, the thread is suspended. When the mutex becomes available again (because the
+ other thread has unlocked it), the thread is awakened and given access to the shared resource.
+
+ @note1hang A thread is not suspended if it locks a recursive mutex that it has already
+ locked by itself. However, the mutex does not become available to other threads until the
+ thread performs a balanced number of unlocks on the mutex.
+ If the timeout expires, the wait is terminated and no access to the mutex is granted.
+
+ @datatypes
+ #qurt_mutex_t
+
+ @param[in] lock     Pointer to the recursive mutex object to lock.
+ @param[in] duration Interval (in microseconds); the value must be between
+                     #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_ETIMEDOUT -- Timeout.
+
+ @dependencies
+ None.
+
+ */
+int qurt_rmutex_lock_timed(qurt_mutex_t *lock, unsigned long long int duration);
+
+/**@ingroup func_qurt_rmutex_unlock
+ Unlocks the specified recursive mutex. \n
+ More than one thread can be suspended on a mutex. When the mutex is
+ unlocked, the thread waiting on the mutex awakens. If the awakened
+ thread has higher priority than the current thread, a context switch occurs.
+
+ @note1hang When a thread unlocks a recursive mutex, the mutex is not available until
+ the balanced number of locks and unlocks has been performed on the mutex.
+
+ @datatypes
+ #qurt_mutex_t
+
+ @param[in] lock Pointer to the recursive mutex object to unlock.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+
+ */
+void qurt_rmutex_unlock(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_rmutex_try_lock
+ Attempts to lock the specified recursive mutex.\n
+
+ If a thread performs a try_lock operation on a recursive mutex that is not in use, the
+ thread gains access to the shared resource that is protected by the mutex, and continues
+ executing.\n
+ If a thread performs a try_lock operation on a recursive mutex that another thread has
+ already locked, qurt_rmutex_try_lock immediately returns with a nonzero result
+ value.
+
+ @datatypes
+ #qurt_mutex_t
+
+ @param[in] lock Pointer to the recursive mutex object to lock.
+
+ @return
+ 0 -- Success. \n
+ Nonzero -- Failure.
+
+ */
+int qurt_rmutex_try_lock(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_rmutex_try_lock_block_once
+ Attempts to lock a mutex object recursively. If the mutex is available,
+ it locks the mutex and returns 0. If the mutex is held by the current thread,
+ it increments the internal counter and returns 0.
+ If the mutex is already locked by another thread, the caller thread is
+ suspended. When the mutex becomes available again (because the other
+ thread has unlocked it), the caller thread is awakened and tries to lock
+ the mutex once more. If it fails to acquire the mutex, this function returns
+ a nonzero value; if it succeeds, it returns zero.
+
+ @datatypes
+ #qurt_mutex_t
+
+ @param[in] lock Pointer to the qurt_mutex_t object.
+
+ @return
+ 0 -- Success. \n
+ Nonzero -- Failure.
+
+ @dependencies
+ None.
+ */
+int qurt_rmutex_try_lock_block_once(qurt_mutex_t *lock);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_RMUTEX_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_rmutex2.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_rmutex2.h
new file mode 100755
index 0000000000000..a37e7e4458c4b
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_rmutex2.h
@@ -0,0 +1,183 @@
+#ifndef QURT_RMUTEX2_H
+#define QURT_RMUTEX2_H
+/**
+  @file qurt_rmutex2.h
+  @brief Prototypes of rmutex2 API
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2013, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup mutex_types
+@{ */
+/*=============================================================================
+                        TYPEDEFS
+=============================================================================*/
+
+/** QuRT rmutex2 type.
+    Mutex type used with rmutex2 APIs.
+ */
+typedef struct {
+   /** @cond */
+   unsigned int holder __attribute__((aligned(8))); /* UGP value of the mutex holder. */
+   unsigned short waiters;    /* Number of waiting threads. */
+   unsigned short refs;       /* Number of references to this mutex. */
+   unsigned int queue;        /* Kernel-maintained futex queue value. */
+   unsigned int excess_locks; /* Number of excess times the holder has locked the mutex. */
+   /** @endcond */
+} qurt_rmutex2_t;
+/** @} */ /* end_addtogroup mutex_types */
+/** @cond internal_only*/
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_rmutex2_init
+
+ @deprecated use #qurt_rmutex_init instead.
+
+ Initializes a recursive mutex object.
+
+ The recursive mutex is initially unlocked.
+
+ Objects of type rmutex2 solve a potential race condition between
+ unlock() and destroy() operations.
+
+ @datatypes
+ #qurt_rmutex2_t
+
+ @param[out] lock Pointer to the recursive mutex object.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+void qurt_rmutex2_init(qurt_rmutex2_t *lock);
+
+/**@ingroup func_qurt_rmutex2_destroy
+
+ @deprecated use #qurt_rmutex_destroy instead.
+
+ Destroys the specified recursive mutex. \n
+ @note1hang Recursive mutexes must not be destroyed while they are still in use. If this
+ occurs, the behavior of QuRT is undefined.
+ @note1cont In general, application code must destroy an rmutex2 object prior to
+ deallocating it; calling qurt_rmutex2_destroy() before deallocating it ensures
+ that all qurt_rmutex2_unlock() calls complete.
+
+ @datatypes
+ #qurt_rmutex2_t
+
+ @param[in] lock Pointer to the recursive mutex object to destroy.
+ + @return + None. + + @dependencies + None. + + */ +void qurt_rmutex2_destroy(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_rmutex2_lock + + @deprecated use #qurt_rmutex_lock instead. + + Locks the specified recursive mutex. \n + + If a thread performs a lock operation on a recursive mutex that is not in use, the + thread gains access to the shared resource that the mutex protects, and continues + to execute. + + If a thread performs a lock operation on a recursive mutex that another thread is using, + the thread is suspended. When the mutex becomes available again + (because the other thread has unlocked it), the thread is awakened and given access to the + shared resource. + + @note1hang A thread is not suspended if it locks a recursive mutex that it has already + locked, but the mutex does not become available until the thread performs a + balanced number of unlocks on the mutex. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to lock. + + @return + None. + + @dependencies + None. + + */ +void qurt_rmutex2_lock(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_rmutex2_unlock + + @deprecated use #qurt_rmutex_unlock instead. + + Unlocks the specified recursive mutex. \n + More than one thread can be suspended on a recursive mutex. When the mutex is + unlocked, only the highest-priority thread waiting on the mutex awakens. If the + awakened thread has higher priority than the current thread, a context switch occurs. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to unlock. + + @return + None. + + @dependencies + None. + + */ +void qurt_rmutex2_unlock(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_rmutex2_try_lock + + @deprecated use #qurt_rmutex_try_lock instead. + + Attempts to lock the specified recursive mutex.\n + + Non-blocking version of qurt_rmutex2_lock(). When a call to qurt_rmutex2_lock() + succeeds immediately, this function behaves similarly, returning 0 for success. + When a call to qurt_rmutex2_lock() does not succeed immediately, this function has + no effect and returns nonzero for failure. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to lock. + + @return + 0 -- Success. \n + Nonzero -- Failure. + + */ +int qurt_rmutex2_try_lock(qurt_rmutex2_t *lock); +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_RMUTEX2_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_sclk.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_sclk.h new file mode 100755 index 0000000000000..a83cf5f1db889 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_sclk.h @@ -0,0 +1,145 @@ +#ifndef QURT_SCLK_H +#define QURT_SCLK_H +/** + @file qurt_sclk.h + @brief Header file describing the APIs supported by QuRT system SCLK + feature. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2018-2021, 2023 Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. 
+
+=============================================================================*/
+
+
+
+
+/*=============================================================================
+
+                        INCLUDE FILES
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+
+/**
+  Conversion from microseconds to sleep ticks.
+ */
+#define QURT_SYSCLOCK_TIMETICK_FROM_US(us) ((us) * 192ULL / 10UL)
+#define qurt_sysclock_timetick_from_us(us) QURT_SYSCLOCK_TIMETICK_FROM_US(us)
+
+
+/**
+  Conversion from timer ticks to microseconds at the nominal frequency.
+*/
+#define QURT_SYSCLOCK_TIMETICK_TO_US(ticks) qurt_timer_timetick_to_us(ticks)
+
+/**
+  Maximum Qtimer duration, expressed in microseconds (1,042,499 hours).
+*/
+#define QURT_SYSCLOCK_MAX_DURATION (1042499uLL * 3600uLL * 1000uLL * 1000uLL)
+#define qurt_sysclock_max_duration() QURT_SYSCLOCK_MAX_DURATION
+/**
+  Maximum Qtimer duration, expressed in ticks (the Qtimer clock is 19.2 MHz).
+*/
+#define QURT_SYSCLOCK_MAX_DURATION_TICKS (1042499uLL * 3600uLL * 19200000uLL)
+#define qurt_sysclock_max_duration_ticks() QURT_SYSCLOCK_MAX_DURATION_TICKS
+/**
+  Sleep timer error margin for the Qtimer: 192 ticks (approximately 10 us).
+*/
+#define QURT_SYSCLOCK_ERROR_MARGIN 192U //QURT_TIMER_MIN_DURATION*timer_freq;
+#define qurt_sysclock_error_margin() QURT_SYSCLOCK_ERROR_MARGIN
+
+/*=============================================================================
+
+                        DATA DECLARATIONS
+
+=============================================================================*/
+
+/**@ingroup func_qurt_sysclock_get_hw_ticks
+ @xreflabel{sec:qurt_sysclock_get_hw_ticks}
+ Gets the hardware tick count.\n
+ Returns the current value of a 64-bit hardware counter. The value wraps around to zero
+ when it exceeds the maximum value.
+
+ @note1hang This operation must be used with care because of the wrap-around behavior.
+
+ @return
+ Integer -- Current value of the 64-bit hardware counter.
+
+ @dependencies
+ None.
+ */
+unsigned long long qurt_sysclock_get_hw_ticks (void);
+
+
+/**@ingroup func_qurt_sysclock_get_hw_ticks_32
+ @xreflabel{sec:qurt_sysclock_get_hw_ticks_32}
+ Gets the hardware tick count in 32 bits.\n
+ Returns the current value of a 32-bit hardware counter. The value wraps around to zero
+ when it exceeds the maximum value.
+
+ @note1hang This operation is implemented as an inline C function, and should be called from a C/C++ program.
+            The returned 32 bits are the lower 32 bits of the Qtimer counter.
+
+ @return
+ Integer -- Current value of the 32-bit timer counter.
+
+ @dependencies
+ None.
+ */
+static inline unsigned long qurt_sysclock_get_hw_ticks_32 (void)
+{
+    //Beginning with v61 there is a HW register that can be read directly.
+    unsigned long count;
+    __asm__ __volatile__ (" %0 = c30 " : "=r"(count));
+    return count;
+}
+
+
+/**@ingroup func_qurt_sysclock_get_hw_ticks_16
+ @xreflabel{sec:qurt_sysclock_get_hw_ticks_16}
+ Gets the hardware tick count in 16 bits.\n
+ Returns the current value of a 16-bit timer counter. The value wraps around to zero
+ when it exceeds the maximum value.
+
+ @note1hang This operation is implemented as an inline C function, and should be called from a C/C++ program.
+            The returned 16 bits are based on the value of the lower 32 bits in the Qtimer
+            counter, right shifted by 16 bits.
+
+ @return
+ Integer -- Current value of the 16-bit timer counter, calculated from the lower 32 bits in the
+ Qtimer counter, right shifted by 16 bits.
+
+ @dependencies
+ None.
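+
+ A minimal illustrative sketch (not part of the original header): timing a
+ short interval with the modulo-2^16 tick value; do_short_work() is a
+ hypothetical placeholder for the code being measured.
+
+ @code
+ void measure_short_interval(void)
+ {
+     unsigned short t0 = qurt_sysclock_get_hw_ticks_16();
+     do_short_work();
+     // Unsigned 16-bit subtraction stays correct across a single wrap-around.
+     unsigned short dt = (unsigned short)(qurt_sysclock_get_hw_ticks_16() - t0);
+     (void)dt;
+ }
+ @endcode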
+ */
+
+
+static inline unsigned short qurt_sysclock_get_hw_ticks_16 (void)
+{
+    unsigned long ticks;
+
+    //Beginning with v61 there is a HW register that can be read directly.
+    __asm__ __volatile__ (" %0 = c30 " : "=r"(ticks));
+    __asm__ __volatile__ ( "%0 = lsr(%0, #16) \n" :"+r"(ticks));
+
+    return (unsigned short)ticks;
+}
+unsigned long long qurt_timer_timetick_to_us(unsigned long long ticks);
+#define qurt_sysclock_timetick_to_us(ticks) qurt_timer_timetick_to_us(ticks)
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif /* __cplusplus */
+
+#endif /* QURT_SCLK_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_secure_proc.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_secure_proc.h
new file mode 100755
index 0000000000000..f40c7deb9bca1
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_secure_proc.h
@@ -0,0 +1,53 @@
+#ifndef QURT_SECURE_PROC_H
+#define QURT_SECURE_PROC_H
+
+/**
+  @file qurt_secure_proc.h
+  @brief Definitions, macros, and prototypes used for handling a secure process
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2015, 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup qurt_process_migrate_secure_process
+ Migrates the user process to a QuRT secure process.
+
+ @param secure_phy_address Physical starting address of secure memory.
+ @param secure_memory_size Size of secure memory.
+ @param entry              Entry function of the secure process.
+
+ @return
+ EOK -- Success. \n
+ Negative return value -- Error.
+
+ @dependencies
+ None.
+*/
+int qurt_process_migrate_secure_process(unsigned long long secure_phy_address, unsigned int secure_memory_size, void entry(unsigned));
+
+/**@ingroup qurt_process_get_migration_mem_size
+ Gets the size of all writable memory regions in a user PD, in preparation for
+ secure process migration.
+
+ @return
+ Size of all writable memory regions in a user PD.
+
+ @dependencies
+ None.
+*/
+int qurt_process_get_migration_mem_size(void);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_sem.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_sem.h
new file mode 100755
index 0000000000000..ee5ce4b2d94ab
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_sem.h
@@ -0,0 +1,252 @@
+#ifndef QURT_SEM_H
+#define QURT_SEM_H
+/**
+  @file qurt_sem.h
+  Prototypes of semaphore API.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+                        TYPEDEFS
+=============================================================================*/
+/** @addtogroup semaphore_types
+@{ */
+
+/** QuRT semaphore type.
+ */
+typedef union {
+   /** @cond */
+   unsigned int raw[2] __attribute__((aligned(8)));
+   struct {
+      unsigned short val;       /**< */
+      unsigned short n_waiting; /**< */
+      unsigned int reserved1;   /**< */
+      unsigned int queue;       /**< */
+      unsigned int reserved2;   /**< */
+   }X; /** @endcond */
+} qurt_sem_t;
+/** @} */ /* end_addtogroup semaphore_types */
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_sem_add
+ Releases access to a shared resource (increments the semaphore count value by
+ the specified amount).\n
+ When a thread performs an add operation on a semaphore, the semaphore count is
+ incremented by the specified value. The result depends on the number of threads waiting
+ on the semaphore: \n
+ - When no threads are waiting, the current thread releases access to the shared resource
+   and continues executing. \n
+ - When one or more threads are waiting and the semaphore count value is nonzero,
+   the kernel repeatedly awakens the highest-priority waiting thread and decrements
+   the semaphore count value until either no waiting threads remain or the
+   semaphore count value is zero. If any of the awakened threads has higher priority
+   than the current thread, a context switch can occur.
+
+ @datatypes
+ #qurt_sem_t
+
+ @param[in] sem Pointer to the semaphore object to access.
+ @param[in] amt Amount to increment the semaphore count value.
+
+ @return
+ Unused integer value.
+
+ @dependencies
+ None.
+
+ */
+int qurt_sem_add(qurt_sem_t *sem, unsigned int amt);
+
+/**@ingroup func_qurt_sem_up
+ Releases access to a shared resource. When a thread performs an up operation on a semaphore,
+ the semaphore count value increments. The result depends on the number of threads waiting
+ on the semaphore: \n
+ - When no threads are waiting, the current thread releases access to the shared resource
+   and continues executing.\n
+ - When one or more threads are waiting and the semaphore count value is nonzero,
+   the kernel awakens the highest-priority waiting thread and decrements the
+   semaphore count value. If the awakened thread has higher priority than the current
+   thread, a context switch can occur.
+
+ @datatypes
+ #qurt_sem_t
+
+ @param[in] sem Pointer to the semaphore object to access.
+
+ @return
+ Unused integer value.
+
+ @dependencies
+ None.
+ */
+static inline int qurt_sem_up(qurt_sem_t *sem) { return qurt_sem_add(sem,1); }
+
+/**@ingroup func_qurt_sem_down
+ Requests access to a shared resource. When a thread performs a down operation on a
+ semaphore, the result depends on the semaphore count value: \n
+ - When the count value is nonzero, it is decremented, and the thread gains access to the
+   shared resource and continues executing.\n
+ - When the count value is zero, it is not decremented, and the thread is suspended on the
+   semaphore. When the count value becomes nonzero (because another thread
+   released the semaphore) it is decremented, and the suspended thread is awakened
+   and gains access to the shared resource.
+
+ @datatypes
+ #qurt_sem_t
+
+ @param[in] sem Pointer to the semaphore object to access.
+
+ @return
+ Unused integer value.
+
+ @dependencies
+ None.
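+
+ An illustrative sketch (not from the original header): a counting semaphore
+ that limits concurrent use of a pool of four hypothetical DMA channels.
+
+ @code
+ static qurt_sem_t dma_sem;
+
+ void dma_pool_init(void) { qurt_sem_init_val(&dma_sem, 4); }
+ void dma_acquire(void)   { (void)qurt_sem_down(&dma_sem); }  // blocks when all 4 are taken
+ void dma_release(void)   { (void)qurt_sem_up(&dma_sem); }
+ @endcode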
+ */
+int qurt_sem_down(qurt_sem_t *sem);
+
+/**@ingroup func_qurt_sem_down_timed
+ Requests access to a shared resource, waiting no longer than the specified timeout.
+ When a thread performs a down operation on a semaphore, the result depends on the
+ semaphore count value: \n
+ - When the count value is nonzero, it is decremented, and the thread gains access to the
+   shared resource and continues executing.\n
+ - When the count value is zero, it is not decremented, and the thread is suspended on the
+   semaphore. When the count value becomes nonzero (because another thread
+   released the semaphore) it is decremented, and the suspended thread is awakened
+   and gains access to the shared resource. The wait is terminated when the specified
+   timeout expires; in that case, no access to the shared resource is granted.
+
+ @datatypes
+ #qurt_sem_t
+
+ @param[in] sem      Pointer to the semaphore object to access.
+ @param[in] duration Interval (in microseconds); the value must be between
+                     #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_ETIMEDOUT -- Timeout.
+
+ @dependencies
+ None.
+ */
+int qurt_sem_down_timed(qurt_sem_t *sem, unsigned long long int duration);
+
+/**@ingroup func_qurt_sem_try_down
+ @xreflabel{hdr:qurt_sem_try_down}
+ Requests access to a shared resource (without suspend). When a thread performs a try down
+ operation on a semaphore, the result depends on the semaphore count value: \n
+ - The count value is decremented when it is nonzero. The down operation returns 0 as
+   the function result, and the thread gains access to the shared resource and is free to
+   continue executing.\n
+ - The count value is not decremented when it is zero. The down operation returns -1
+   as the function result, and the thread does not gain access to the shared resource
+   and should not continue executing.
+
+ @datatypes
+ #qurt_sem_t
+
+ @param[in] sem Pointer to the semaphore object to access.
+
+ @return
+ 0 -- Success. \n
+ -1 -- Failure.
+
+ @dependencies
+ None.
+
+ */
+int qurt_sem_try_down(qurt_sem_t *sem);
+
+/**@ingroup func_qurt_sem_init
+ Initializes a semaphore object.
+ The default initial value of the semaphore count value is 1.
+
+ @param[out] sem Pointer to the initialized semaphore object.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+
+ */
+void qurt_sem_init(qurt_sem_t *sem);
+
+/**@ingroup func_qurt_sem_destroy
+ Destroys the specified semaphore.\n
+ @note1hang Semaphores must be destroyed when they are no longer in use. Failure to do
+ this causes resource leaks in the QuRT kernel.\n
+ @note1cont Semaphores must not be destroyed while they are still in use. If this occurs,
+ the behavior of QuRT is undefined.
+
+ @datatypes
+ #qurt_sem_t
+
+ @param[in] sem Pointer to the semaphore object to destroy.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+void qurt_sem_destroy(qurt_sem_t *sem);
+
+/**@ingroup func_qurt_sem_init_val
+ Initializes a semaphore object with the specified value.
+
+ @datatypes
+ #qurt_sem_t
+
+ @param[out] sem Pointer to the initialized semaphore object.
+ @param[in]  val Initial value of the semaphore count value.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+
+ */
+void qurt_sem_init_val(qurt_sem_t *sem, unsigned short val);
+
+/**@ingroup func_qurt_sem_get_val
+ Gets the semaphore count value.\n
+ Returns the current count value of the specified semaphore.
+
+ @datatypes
+ #qurt_sem_t
+
+ @param[in] sem Pointer to the semaphore object to access.
+
+ @return
+ Integer semaphore count value.
+
+ @dependencies
+ None.
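+
+ Illustrative only (not from the original header): the returned count is a
+ snapshot and can change as soon as it is read, so treat it as advisory,
+ for example when logging diagnostics; dma_sem is a hypothetical semaphore.
+
+ @code
+ unsigned short avail = qurt_sem_get_val(&dma_sem);
+ @endcode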
+ */
+static inline unsigned short qurt_sem_get_val(qurt_sem_t *sem) { return sem->X.val; }
+int qurt_sem_down_cancellable(qurt_sem_t *sem);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_SEM_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_shmem.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_shmem.h
new file mode 100755
index 0000000000000..980557323708a
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_shmem.h
@@ -0,0 +1,89 @@
+#ifndef QURT_SHMEM_H
+#define QURT_SHMEM_H
+
+/**
+  @file qurt_shmem.h
+
+  @brief
+  Prototypes of QuRT inter-process shared memory APIs
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef MODE_T
+#define MODE_T
+typedef unsigned int mode_t;
+#endif //MODE_T
+
+/**
+ * The shm_open() function establishes a connection between a shared memory object and a file descriptor.
+ * The file descriptor is used by other functions, such as mmap(), to refer to that shared memory object.
+ *
+ *
+ * @param name  Pointer to a string naming the shared memory object. The name must start with "/shm/".
+ * @param oflag File status flags and file access modes of the open file description. The following
+ *              flags are defined in <fcntl.h> and supported:
+ *              O_RDONLY: Open for read access only
+ *              O_RDWR:   Open for read or write access
+ *              O_CREAT:  If the shared memory object doesn't exist, create one.
+ * @param mode  Permission flags (currently ignored)
+ *
+ * @return file descriptor (positive number) if operation successful.
+ *         negative error code if failed
+ *
+*/
+
+int shm_open(const char * name, int oflag, mode_t mode);
+
+/**
+ * The shm_mmap() function creates a shared memory mapping in the virtual address space of
+ * the calling process.
+ *
+ * @param addr   The starting address for the new mapping.
+ * @param len    Specifies the length of the shared memory region.
+ * @param prot   Describes the desired memory protection of the mapping. Same as in POSIX mmap().
+ * @param flags  Determines whether updates to the mapping are visible to other processes. Same as
+ *               in POSIX mmap().
+ * @param fd     File descriptor of the shared memory object (as returned by shm_open()).
+ * @param offset Unused.
+ *
+ * @return The starting address for the new mapping is returned.
+ *         negative error code if failed
+ *
+*/
+
+void *shm_mmap(void *addr, unsigned int len, int prot, int flags, int fd, unsigned int offset);
+
+/**
+ * The shm_close() function removes a connection between a shared memory object and a file descriptor.
+ * If no file descriptor remains connected to the shared memory object, the shared memory object
+ * is deleted automatically. A shared memory object has the same virtual address in every process;
+ * this is a restriction of the single virtual address space.
+ *
+ *
+ * @param fd File descriptor of shared memory object
+ *
+ * @return 0 if operation successful.
+ * negative error code if failed + * +*/ + + +int shm_close(int fd); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_signal.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_signal.h new file mode 100755 index 0000000000000..3a89c53394ad5 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_signal.h @@ -0,0 +1,518 @@ +#ifndef QURT_SIGNAL_H +#define QURT_SIGNAL_H + +/** + @file qurt_signal.h + @brief Prototypes of kernel signal API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup signals_types +@{ */ +#define QURT_SIGNAL_ATTR_WAIT_ANY 0x00000000 /**< Wait any. */ +#define QURT_SIGNAL_ATTR_WAIT_ALL 0x00000001 /**< Wait all. */ + +/*===================================================================== + Typedefs + ======================================================================*/ + + +/** QuRT signal type. + */ +typedef union { + /** @cond */ + unsigned long long int raw; + struct { + unsigned int signals; + unsigned int waiting; + unsigned int queue; + unsigned int attribute; + }X; + /** @endcond */ +} qurt_signal_t; + + +/** QuRT 64-bit signal type. + */ +typedef struct { + /** @cond */ + qurt_signal_t signal_sum; + unsigned long long signals; + unsigned long long waiting; + /** @endcond */ +} qurt_signal_64_t; +/** @} */ /* end_addtogroup signals_types */ +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_signal_init + Initializes a signal object. + Signal returns the initialized object. + The signal object is initially cleared. + + @note1hang Each signal-based object has one or more kernel resources associated with it; + to prevent resource leaks, call qurt_signal_destroy() + when this object is not used anymore + @datatypes + #qurt_signal_t + + @param[in] *signal Pointer to the initialized object. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_init(qurt_signal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_destroy + Destroys the specified signal object. + + @note1hang Signal objects must be destroyed when they are no longer in use. Failure + to do this causes resource leaks in the QuRT kernel.\n + @note1cont Signal objects must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + + @datatypes + #qurt_signal_t + + @param[in] *signal Pointer to the signal object to destroy. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_destroy(qurt_signal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_wait + @xreflabel{hdr:qurt_signal_wait} + Suspends the current thread until the specified signals are set. 
+
+ Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates
+ waiting on a signal, and 0 indicates not waiting on the signal.
+
+ If a thread is waiting on a signal object for any of the specified set of signals to be set,
+ and one or more of those signals is set in the signal object, the thread is awakened.
+
+ If a thread is waiting on a signal object for all of the specified set of signals to be set,
+ and all of those signals are set in the signal object, the thread is awakened.
+
+ The specified set of signals can be cleared when the signal is set.
+
+ @note1hang At most, one thread can wait on a signal object at any given time.
+
+ @datatypes
+ #qurt_signal_t
+
+ @param[in] signal    Pointer to the signal object to wait on.
+ @param[in] mask      Mask value identifying the individual signals in the signal object to
+                      wait on.
+ @param[in] attribute Indicates whether the thread waits for any of the signals to be set, or for
+                      all of them. \n
+                      @note1hang The wait-any and wait-all types are mutually exclusive.\n Values:\n
+                      - #QURT_SIGNAL_ATTR_WAIT_ANY \n
+                      - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend
+
+ @return
+ A 32-bit word with current signals.
+
+ @dependencies
+ None.
+*/
+/* ======================================================================*/
+unsigned int qurt_signal_wait(qurt_signal_t *signal, unsigned int mask,
+                              unsigned int attribute);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_wait_timed
+ @xreflabel{hdr:qurt_signal_wait}
+ Suspends the current thread until the specified signals are set or until the timeout expires.
+
+ Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates
+ waiting on a signal, and 0 indicates not waiting.
+
+ If a thread is waiting on a signal object for any of the specified set of signals to be set,
+ and one or more of those signals is set in the signal object, the thread is awakened.
+
+ If a thread is waiting on a signal object for all of the specified set of signals to be set,
+ and all of those signals are set in the signal object, the thread is awakened.
+
+ The specified set of signals can be cleared after the signal is set.
+
+ @note1hang At most, one thread can wait on a signal object at any given time.
+
+ @datatypes
+ #qurt_signal_t
+
+ @param[in]  signal    Pointer to the signal object to wait on.
+ @param[in]  mask      Mask value that identifies the individual signals in the signal object to wait on.
+ @param[in]  attribute Indicates whether the thread must wait until any of the signals are set, or until all of
+                       them are set. \n
+                       @note1hang The wait-any and wait-all types are mutually exclusive.\n Values:\n
+                       - #QURT_SIGNAL_ATTR_WAIT_ANY \n
+                       - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend
+ @param[out] signals   Bitmask of signals that are set.
+ @param[in]  duration  Duration (microseconds) to wait. Must be in the range
+                       [#QURT_TIMER_MIN_DURATION ... #QURT_TIMER_MAX_DURATION].
+
+ @return
+ #QURT_EOK -- Success; one or more signals were set. \n
+ #QURT_ETIMEDOUT -- Timed out. \n
+ #QURT_EINVALID -- Duration out of range.
+
+ @dependencies
+ Timed-waiting support in the kernel.
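+
+ A hedged usage sketch (not part of the original header): wait up to 10 ms
+ for either of two event bits on a previously initialized qurt_signal_t named
+ sig; the event-bit names EVT_RX and EVT_ERR are illustrative.
+
+ @code
+ #define EVT_RX  (1u << 0)
+ #define EVT_ERR (1u << 1)
+
+ unsigned int got = 0;
+ int rc = qurt_signal_wait_timed(&sig, EVT_RX | EVT_ERR,
+                                 QURT_SIGNAL_ATTR_WAIT_ANY, &got, 10000uLL);
+ if (rc == QURT_EOK) {
+     qurt_signal_clear(&sig, got);  // the wait does not clear; clear what was consumed
+ }
+ @endcode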
+*/
+/* ======================================================================*/
+int qurt_signal_wait_timed(qurt_signal_t *signal, unsigned int mask,
+                           unsigned int attribute, unsigned int *signals, unsigned long long int duration);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_wait_any
+ Suspends the current thread until any of the specified signals are set.
+
+ Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates
+ to wait on a signal, and 0 indicates not to wait on it.
+
+ If a thread is waiting on a signal object for any of the specified set of signals to be set,
+ and one or more of those signals is set in the signal object, the thread is awakened.
+
+ @note1hang At most, one thread can wait on a signal object at any given time.
+
+ @datatypes
+ #qurt_signal_t
+
+ @param[in] signal Pointer to the signal object to wait on.
+ @param[in] mask   Mask value identifying the individual signals in the signal object to
+                   wait on.
+
+ @return
+ 32-bit word with current signals.
+
+ @dependencies
+ None.
+*/
+/* ======================================================================*/
+static inline unsigned int qurt_signal_wait_any(qurt_signal_t *signal, unsigned int mask)
+{
+   return qurt_signal_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ANY);
+}
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_wait_all
+ Suspends the current thread until all of the specified signals are set.
+
+ Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates
+ to wait on a signal, and 0 indicates not to wait on it.
+
+ If a thread is waiting on a signal object for all of the specified set of signals to be set,
+ and all of those signals are set in the signal object, the thread is awakened.
+
+ @note1hang At most, one thread can wait on a signal object at any given time.
+
+ @datatypes
+ #qurt_signal_t
+
+ @param[in] signal Pointer to the signal object to wait on.
+ @param[in] mask   Mask value identifying the individual signals in the signal object to
+                   wait on.
+
+ @return
+ A 32-bit word with current signals.
+
+ @dependencies
+ None.
+*/
+/* ======================================================================*/
+static inline unsigned int qurt_signal_wait_all(qurt_signal_t *signal, unsigned int mask)
+{
+   return qurt_signal_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ALL);
+}
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_set
+ Sets signals in the specified signal object.
+
+ Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates
+ to set the signal, and 0 indicates not to set it.
+
+ @datatypes
+ #qurt_signal_t
+
+ @param[in] signal Pointer to the signal object to modify.
+ @param[in] mask   Mask value identifying the individual signals to set in the signal
+                   object.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+/* ======================================================================*/
+void qurt_signal_set(qurt_signal_t *signal, unsigned int mask);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_get
+ Gets a signal from a signal object.
+
+ Returns the current signal values of the specified signal object.
+
+ @datatypes
+ #qurt_signal_t
+
+ @param[in] *signal Pointer to the signal object to access.
+ + @return + A 32-bit word with current signals + + @dependencies + None. +*/ +/* ======================================================================*/ +unsigned int qurt_signal_get(qurt_signal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_clear + Clear signals in the specified signal object. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be cleared, and 0 indicates not to clear it. + + @note1hang Signals must be explicitly cleared by a thread when it is awakened -- the wait + operations do not automatically clear them. + + @datatypes + #qurt_signal_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to clear in the signal object. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_clear(qurt_signal_t *signal, unsigned int mask); + +/**@ingroup func_qurt_signal_wait_cancellable + @xreflabel{hdr:qurt_signal_wait_cancellable} + Suspends the current thread until either the specified signals are set or the wait operation is cancelled. + The operation is cancelled if the user process of the calling thread is killed, or if the calling thread + must finish its current QDI invocation and return to user space. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates + that a signal must be waited on, and 0 indicates not to wait on it. + + If a thread is waiting on a signal object for any of the specified set of signals to be set, and one or + more of those signals is set in the signal object, the thread is awakened. + + If a thread is waiting on a signal object for all of the specified set of signals to be set, and all of + those signals are set in the signal object, the thread is awakened. + + @note1hang At most, one thread can wait on a signal object at any given time. + + @note1cont When the operation is cancelled, the caller must assume that the signal is never set. + + @datatypes + #qurt_signal_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to + wait on. + @param[in] attribute Indicates whether the thread must wait until any of the signals are set, or until all of + them are set. Values:\n + - #QURT_SIGNAL_ATTR_WAIT_ANY \n + - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend + @param[out] return_mask Pointer to the 32-bit mask value that was originally passed to the function. + + + @return + #QURT_EOK -- Wait completed. \n + #QURT_ECANCEL -- Wait cancelled. + + + @dependencies + None. +*/ +/* ======================================================================*/ +int qurt_signal_wait_cancellable(qurt_signal_t *signal, unsigned int mask, + unsigned int attribute, + unsigned int *return_mask); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_64_init + Initializes a 64-bit signal object.\n + The signal argument returns the initialized object. + The signal object is initially cleared. + + @note1hang Each signal-based object has one or more kernel resources associated with it; + to prevent resource leaks, call qurt_signal_destroy() + when this object is not used anymore. + @datatypes + #qurt_signal_64_t + + @param[in] signal Pointer to the initialized object. 
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+/* ======================================================================*/
+void qurt_signal_64_init(qurt_signal_64_t *signal);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_64_destroy
+ Destroys the specified signal object.
+
+ @note1hang 64-bit signal objects must be destroyed when they are no longer in use. Failure
+ to do this causes resource leaks in the QuRT kernel.\n
+ @note1cont Signal objects must not be destroyed while they are still in use. If this
+ occurs, the behavior of QuRT is undefined.
+
+ @datatypes
+ #qurt_signal_64_t
+
+ @param[in] signal Pointer to the signal object to destroy.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+/* ======================================================================*/
+void qurt_signal_64_destroy(qurt_signal_64_t *signal);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_64_wait
+ Suspends the current thread until all of the specified signals are set.
+
+ Signals are represented as bits 0 through 63 in the 64-bit mask value. A mask bit value of 1 indicates
+ that a signal must be waited on, and 0 indicates not to wait on it.
+
+ If a thread is waiting on a signal object for all of the specified set of signals to be set,
+ and all of those signals are set in the signal object, the thread is awakened.
+
+ @note1hang At most, one thread can wait on a signal object at any given time.
+
+ @datatypes
+ #qurt_signal_64_t
+
+ @param[in] signal    Pointer to the signal object to wait on.
+ @param[in] mask      Mask value, which identifies the individual signals in the signal object to
+                      wait on.
+ @param[in] attribute Indicates whether the thread must wait until any of the signals are set, or until all of
+                      them are set. \n
+                      @note1hang The wait-any and wait-all types are mutually exclusive.\n Values:\n
+                      - #QURT_SIGNAL_ATTR_WAIT_ANY \n
+                      - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend
+
+ @return
+ A 64-bit word with current signals.
+
+ @dependencies
+ None.
+*/
+/* ======================================================================*/
+unsigned long long qurt_signal_64_wait(qurt_signal_64_t *signal, unsigned long long mask,
+                                       unsigned int attribute);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_64_set
+ Sets signals in the specified signal object.
+
+ Signals are represented as bits 0 through 63 in the 64-bit mask value. A mask bit value of 1 indicates
+ that a signal must be set, and 0 indicates not to set it.
+
+ @datatypes
+ #qurt_signal_64_t
+
+ @param[in] signal Pointer to the signal object to modify.
+ @param[in] mask   Mask value identifying the individual signals to set in the signal
+                   object.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+/* ======================================================================*/
+void qurt_signal_64_set(qurt_signal_64_t *signal, unsigned long long mask);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_64_get
+ Gets a signal from a signal object.
+
+ Returns the current signal values of the specified signal object.
+
+ @datatypes
+ #qurt_signal_64_t
+
+ @param[in] *signal Pointer to the signal object to access.
+
+ @return
+ A 64-bit double word with current signals.
+
+ @dependencies
+ None.
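+
+ An illustrative sketch (not from the original header): polling an event bit
+ above position 31, which is the main reason to use the 64-bit variant; sig64
+ is assumed to be a previously initialized qurt_signal_64_t.
+
+ @code
+ if (qurt_signal_64_get(&sig64) & (1uLL << 40)) {
+     qurt_signal_64_clear(&sig64, 1uLL << 40);  // consume the event
+ }
+ @endcode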
+*/ +/* ======================================================================*/ +unsigned long long qurt_signal_64_get(qurt_signal_64_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_64_clear + Clears signals in the specified signal object. + + Signals are represented as bits 0 through 63 in the 64-bit mask value. A mask bit value of 1 + indicates that a signal must be cleared, and 0 indicates not to clear it. + + @note1hang Signals must be explicitly cleared by a thread when it is awakened -- the wait + operations do not automatically clear them. + + @datatypes + #qurt_signal_64_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to clear in the signal object. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_64_clear(qurt_signal_64_t *signal, unsigned long long mask); + +#ifdef __cplusplus +} +#endif + +#endif /* QURT_SIGNAL_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_signal2.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_signal2.h new file mode 100755 index 0000000000000..43975100cbf75 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_signal2.h @@ -0,0 +1,340 @@ +#ifndef QURT_SIGNAL2_H +#define QURT_SIGNAL2_H + +/** + @file qurt_signal2.h + @brief Prototypes of kernel signal2 API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +#define QURT_SIGNAL_ATTR_WAIT_ANY 0x00000000 +#define QURT_SIGNAL_ATTR_WAIT_ALL 0x00000001 + +/*===================================================================== + Typedefs + ======================================================================*/ + +/** @addtogroup signals2_types +@{ */ +/** qurt_signal2 type. + */ +typedef union { + /** @cond */ + struct{ + unsigned int cur_mask; /* Current set of signal bits that are set. */ + unsigned int sig_state; /* Current state. */ + /* Bit 0 -- in anysignal wait. */ + /* Bit 1 -- in allsignal wait. */ + /* Bit 2 -- in interrupt wait. */ + /* Bits 31-3 -- reference count field. */ + unsigned int queue; /* Kernel-maintained futex queue value. */ + unsigned int wait_mask; /* When sig_state indicates a waiter is present, this is the wait mask. */ + }; + unsigned long long int raw; + /** @endcond */ +} qurt_signal2_t; +/* @} */ /* end_addtogroup signals2_types */ + +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_init + + @deprecated use #qurt_signal_init instead. + + Initializes a signal2 object. + Signal returns the initialized object. + The signal object is initially cleared. + + Objects of type signal2 solve a potential race condition between + set() and destroy() operations. + + @datatypes + #qurt_signal2_t + + @param[in] *signal Pointer to the initialized object. + + @return + None. 
+
+ @dependencies
+ Each signal-based object has one or more associated kernel resources; therefore,
+ users must call qurt_signal2_destroy() when this object is no longer in use.
+ */
+/* ======================================================================*/
+void qurt_signal2_init(qurt_signal2_t *signal);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal2_destroy
+
+ @deprecated use #qurt_signal_destroy instead.
+
+ Destroys the specified signal object.
+
+ @note1cont Signal objects must not be destroyed while they are still in use. If this
+ occurs, the behavior of QuRT is undefined.
+ @note1cont Application code should destroy a signal2 object prior to deallocating it.
+ Calling qurt_signal2_destroy() before deallocating a
+ signal2 object ensures completion of all qurt_signal2_set() calls.
+
+ @datatypes
+ #qurt_signal2_t
+
+ @param[in] signal Pointer to the signal object to destroy.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+/* ======================================================================*/
+void qurt_signal2_destroy(qurt_signal2_t *signal);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal2_wait
+
+ @deprecated use #qurt_signal_wait instead.
+
+ Suspends the current thread until the specified signals are set.
+
+ Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 indicates
+ a signal to wait on.
+
+ If a thread calls this API with QURT_SIGNAL_ATTR_WAIT_ANY, the thread will be awakened when
+ any of the signals specified in the mask are set.
+
+ If a thread calls this API with QURT_SIGNAL_ATTR_WAIT_ALL, the thread will be awakened only
+ when all the signals specified in the mask are set.
+
+ @note1hang At most, one thread can wait on a signal object at any given time.
+
+ @datatypes
+ #qurt_signal2_t
+
+ @param[in] signal    Pointer to the signal object to wait on.
+ @param[in] mask      Mask value identifying the individual signals in the signal object to wait on.
+ @param[in] attribute Specifies whether the thread waits for any of the signals to be set, or for all of
+                      them to be set. Values:\n
+                      - QURT_SIGNAL_ATTR_WAIT_ANY \n
+                      - QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend
+ @return
+ A 32-bit word with current signals.
+
+ @dependencies
+ None.
+*/
+/* ======================================================================*/
+unsigned int qurt_signal2_wait(qurt_signal2_t *signal, unsigned int mask,
+                               unsigned int attribute);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal2_wait_any
+
+ @deprecated use #qurt_signal_wait_any instead.
+
+ Suspends the current thread until any of the specified signals are set.
+
+ Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 indicates
+ a signal to wait on.
+
+ The thread will be awakened when any of the signals specified in the mask are set.
+
+ @note1hang At most, one thread can wait on a signal object at any given time.
+
+ @datatypes
+ #qurt_signal2_t
+
+ @param[in] signal Pointer to the signal object to wait on.
+ @param[in] mask   Mask value identifying the individual signals in the signal object to
+                   wait on.
+
+ @return
+ 32-bit word with current signals.
+
+ @dependencies
+ None.
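+
+ Migration sketch (illustrative, not from the original header): since this API
+ is deprecated, equivalent code on the qurt_signal API looks like the following,
+ assuming a previously initialized qurt_signal_t named sig.
+
+ @code
+ unsigned int cur = qurt_signal_wait_any(&sig, 0x3u);  // preferred replacement
+ @endcode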
+*/ +/* ======================================================================*/ +static inline unsigned int qurt_signal2_wait_any(qurt_signal2_t *signal, unsigned int mask) +{ + return qurt_signal2_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ANY); +} + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_wait_all + + @deprecated use #qurt_signal_wait_all instead. + + Suspends the current thread until all of the specified signals are set. + + Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 indicates + a signal to wait on. + + The thread will be awakened only when all the signals specified in the mask are set. + + @note1hang At most one thread can wait on a signal object at any given time. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to + wait on. + + @return + 32-bit word with current signals. + + @dependencies + None. +*/ +/* ======================================================================*/ +static inline unsigned int qurt_signal2_wait_all(qurt_signal2_t *signal, unsigned int mask) +{ + return qurt_signal2_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ALL); +} + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_set + + @deprecated use #qurt_signal_set instead. + + Sets signals in the specified signal object. + + Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 indicates + that a signal must be set, and 0 indicates not to set the signal. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to set in the signal + object. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_signal2_set(qurt_signal2_t *signal, unsigned int mask); + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_get + + @deprecated use #qurt_signal_get instead. + + Gets a signal from a signal object. + + Returns the current signal values of the specified signal object. + + @datatypes + #qurt_signal2_t + + @param[in] *signal Pointer to the signal object to access. + + @return + 32-bit word with current signals. + + @dependencies + None. +*/ +/* ======================================================================*/ +unsigned int qurt_signal2_get(qurt_signal2_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_clear + + @deprecated use #qurt_signal_clear instead. + + Clear signals in the specified signal object. + + Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be cleared, and 0 indicates not to clear the signal. + + @note1hang Signals must be explicitly cleared by a thread when it is awakened -- the wait + operations do not automatically clear them. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to clear in the signal object. + + @return + None. + + @dependencies + None. 
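+
+ A short sketch of the clear-after-wake pattern described in the note above
+ (the mask value is illustrative):
+
+ @code
+ extern qurt_signal2_t sig; // initialized and set elsewhere
+ unsigned int got = qurt_signal2_wait_all(&sig, 0x3u);
+ qurt_signal2_clear(&sig, got); // explicitly clear the observed signals
+ @endcode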
+ */
+/* ======================================================================*/
+void qurt_signal2_clear(qurt_signal2_t *signal, unsigned int mask);
+
+/**@ingroup func_qurt_signal2_wait_cancellable
+
+ @deprecated use #qurt_signal_wait_cancellable instead.
+
+ Suspends the current thread until either the specified signals are set or the wait operation is cancelled.
+ The operation is cancelled if the user process of the calling thread is killed, or if the calling thread
+ must finish its current QDI invocation and return to user space.
+
+ Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates
+ that a signal must be waited on, and 0 indicates not to wait on it.
+
+ If a thread is waiting on a signal object for any of the specified set of signals to be set, and one or
+ more of those signals is set in the signal object, the thread is awakened.
+
+ If a thread is waiting on a signal object for all of the specified set of signals to be set, and all of
+ those signals are set in the signal object, the thread is awakened.
+
+ @note1hang At most, one thread can wait on a signal object at any given time.
+
+ @note1cont When the operation is cancelled, the caller must assume that the signal is never set.
+
+ @datatypes
+ #qurt_signal2_t
+
+ @param[in] signal Pointer to the signal object to wait on.
+ @param[in] mask Mask value identifying the individual signals in the signal object to
+ wait on.
+ @param[in] attribute Indicates whether the thread must wait until any of the signals are set, or until all of
+ them are set. Values:\n
+ - #QURT_SIGNAL_ATTR_WAIT_ANY \n
+ - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend
+ @param[out] p_returnmask Pointer to the 32-bit mask value that was originally passed to the function.
+
+
+ @return
+ #QURT_EOK -- Wait completed. \n
+ #QURT_ECANCEL -- Wait cancelled.
+
+
+ @dependencies
+ None.
+*/
+int qurt_signal2_wait_cancellable(qurt_signal2_t *signal,
+ unsigned int mask,
+ unsigned int attribute,
+ unsigned int *p_returnmask);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_SIGNAL2_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_space.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_space.h
new file mode 100755
index 0000000000000..2c3f9e4496697
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_space.h
@@ -0,0 +1,230 @@
+#ifndef QURT_SPACE_H
+#define QURT_SPACE_H
+/**
+ @file qurt_space.h
+ @brief Prototypes of QuRT process control APIs
+
+ EXTERNALIZED FUNCTIONS
+ none
+
+ INITIALIZATION AND SEQUENCING REQUIREMENTS
+ none
+
+ Copyright (c) 2013, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+ Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#include
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** This flag is a request to the OS to suspend the process just before calling main().
+It is going to be obsoleted and replaced by QURT_PROCESS_SUSPEND_ON_STARTUP. */
+#define SPAWNN_FLAG_SUSPEND_ON_STARTUP QURT_PROCESS_SUSPEND_ON_STARTUP
+
+/**
+ * Creates and starts a process from an ELF of the specified name. The slash symbols
+ * "/" or "\" are ignored. Do not include the directory name in the input. This function
+ * accepts the SPAWN flags. Multiple SPAWN flags can be specified by OR'ing the flags.
+ *
+ * @param name ELF name of the executable.
Name shall not contain directories,
+ * use "dsp2.elf", instead of "/prj/qct/.../dsp2.elf"
+ *
+ * @return
+ Process ID -- Success \n
+ Negative error code -- Failure\n
+ #QURT_EPRIVILEGE -- Caller does not have enough privilege for this operation\n
+ #QURT_EMEM -- Not enough memory to perform the operation \n
+ #QURT_EFAILED -- Operation failed \n
+ #QURT_ENOTALLOWED -- Operation not allowed \n
+ #QURT_ENOREGISTERED -- Not registered \n
+ #QURT_ENORESOURCE -- Resource exhaustion \n
+ #QURT_EINVALID -- Invalid argument value
+*/
+
+int qurt_spawn_flags(const char * name, int flags);
+
+/**
+ Creates and starts a process from an ELF of the specified name. The slash symbols
+ "/" or "\" are ignored. Do not include the directory name in the input.
+
+ @param name ELF name of the executable. Name shall not contain directories,
+ use "dsp2.elf", instead of "/prj/qct/.../dsp2.elf".
+
+ @return
+ Process ID -- Success. \n
+ Negative error code -- Failure.
+
+*/
+static inline int qurt_spawn(const char *name)
+{
+ return qurt_spawn_flags(name,0);
+}
+
+/**
+ * Returns the process ID of the current process.
+ *
+ * @return
+ * Process ID
+ *
+*/
+#define qurt_getpid qurt_process_get_id
+
+/**
+ * The qurt_wait() function waits for a status change in a child process. The parent
+ * process can use it to block until any child process terminates.
+ *
+ * This API returns an error if there are no user processes, or if all user processes have been detached.
+ *
+ * @param status Pointer to the status variable. The variable receives the status value of the child process.
+ * The value comes from the exit() system call made by the child process.
+ *
+ * @return
+ Process ID of the child process that changes status -- Success \n
+ * Negative error code -- Failure
+ *
+*/
+
+int qurt_wait(int *status);
+
+
+/** @cond */
+/* APIs that allow registering callbacks on spawn of user pd */
+typedef void (*QURT_SPAWN_PFN)(int client_handle, void *data_ptr); //no return, since we won't be error checking it in spawn
+typedef int (*QURT_CB_PFN)(int client_handle, void *user_data, void *info);
+typedef union {
+ QURT_SPAWN_PFN spawn_pfn;
+ QURT_CB_PFN cb_pfn;
+} qurt_process_callback_pfn_t;
+/** @endcond */
+
+/** @cond internal_only */
+
+/**@ingroup func_qurt_event_register
+Sets the specified bits by mask in the signal passed by the caller. The signal gets set
+when the client handle indicated by value goes away (at process exit). Multiple clients can register for the signal
+to be set.
+
+@datatypes
+
+@param[in] type QURT_PROCESS_EXIT is the only event that can be registered for.
+@param[in] value Indicates the client handle of the process for which the event is registered.
+@param[in] psig Pointer to the signal object to set when the event occurs.
+@param[in] mask Mask bits to set in the signal.
+@param[out] data Pointer to the variable that receives the exit code of the exiting process.
+@param[in] data_size Size of the data variable.
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EMEM -- Not enough memory to allocate resources \n
+#QURT_EVAL -- Invalid values passed to the API
+
+@dependencies
+None.
+*/
+int qurt_event_register(int type, int value, qurt_signal_t *psig, unsigned int mask, void *data, unsigned int data_size);
+
+/**@ingroup func_qurt_callback_register_onspawn
+Allows registering for a callback on spawn of any user process.
+
+@datatypes
+#QURT_SPAWN_PFN
+
+@param[in] pFn Callback function to call when any user process is spawned.
+@param[in] user_data Pointer to the argument that the callback must be called with.
+
+
+@return A positive value is a handle to use when deregistering the callback.
+ Multiple clients can register for a callback on spawn, and some clients might choose to deregister.
+
+ On failure, QURT_EFATAL is returned.
+
+@dependencies
+None.
+*/
+int qurt_callback_register_onspawn(QURT_SPAWN_PFN pFn, void *user_data);
+
+/**@ingroup func_qurt_callback_deregister_onspawn
+Allows deregistering a callback on spawn.
+
+@param[in] callback_handle Handle returned by qurt_callback_register_onspawn.
+
+@return
+#QURT_EOK -- Deregistration was successful
+
+@dependencies
+None.
+*/
+int qurt_callback_deregister_onspawn(int callback_handle);
+
+/**@ingroup func_qurt_process_callback_register
+Allows registering for a callback during or after image loading.
+Generic callback types:
+ Functions similarly to qurt_callback_register_onspawn(). The callback is called after the process is
+ loaded, before the process thread starts. The callback has no return value and has no info provided
+ from the OS.
+ pFn - QURT_SPAWN_PFN
+ type - QURT_PROCESS_CB_GENERIC
+ arg1 - not used
+ arg2 - not used
+ arg3 - not used
+Note callback types:
+ The callback is called during process loading: before segment loading (QURT_PROCESS_NOTE_CB_PRE_MAP),
+ or after segment loading (QURT_PROCESS_NOTE_CB_POST_MAP). The OS provides info to the callback. The info
+ argument in the callback is populated with a pointer to the mapped note corresponding to the callback.
+ The callback has a return value; the loader fails if the callback returns a value that is not QURT_EOK.
+ pFn - QURT_CB_PFN
+ type - QURT_PROCESS_NOTE_CB_PRE_MAP or QURT_PROCESS_NOTE_CB_POST_MAP
+ arg1 - note type (ex: NOTE_TYPE_POOL_INFO, NOTE_TYPE_SEGMENT_INFO, NOTE_TYPE_ARB_INFO)
+ arg2 - note name
+ arg3 - not used
+
+@datatypes
+
+@param[in] pFn Callback function to call
+@param[in] type Callback type
+@param[in] user_data Pointer to the argument that the callback must be called with.
+@param[in] arg1 Argument interpreted by the OS based on callback type
+@param[in] arg2 Argument interpreted by the OS based on callback type
+@param[in] arg3 Argument interpreted by the OS based on callback type (currently not used)
+
+
+@return A positive value is a handle to use when deregistering the callback.
+ Multiple clients can register for a callback on spawn, and some clients might choose to deregister.
+
+ On failure, QURT_EFATAL is returned.
+
+@dependencies
+None.
+*/
+int qurt_process_callback_register(qurt_process_callback_pfn_t pFn,
+ qurt_process_cb_type_t type,
+ void *user_data,
+ qurt_process_callback_arg_t arg1,
+ qurt_process_callback_arg_t arg2,
+ qurt_process_callback_arg_t arg3);
+
+
+
+/**@ingroup func_qurt_process_callback_deregister
+Allows deregistering a callback for image loading.
+@param[in] callback_handle Handle returned by qurt_process_callback_register.
+
+@return
+#QURT_EOK -- Deregistration was successful
+
+@dependencies
+None.
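+
+A register/use/deregister sketch (illustrative only; the callback body and
+ELF name are hypothetical):
+
+@code
+static void on_spawn(int client_handle, void *data_ptr) {
+    (void)client_handle;
+    (void)data_ptr;
+    // invoked whenever a user process is spawned
+}
+
+int cb = qurt_callback_register_onspawn(on_spawn, NULL);
+int pid = qurt_spawn("dsp2.elf");
+(void)pid;
+int status;
+(void)qurt_wait(&status); // block until a child process changes state
+(void)qurt_callback_deregister_onspawn(cb);
+@endcode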
+*/ +int qurt_process_callback_deregister(int callback_handle); +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_SPACE_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_srm_consts.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_srm_consts.h new file mode 100755 index 0000000000000..48a8b6a38c402 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_srm_consts.h @@ -0,0 +1,32 @@ +#ifndef QURT_SRM_CONSTS_H +#define QURT_SRM_CONSTS_H +/** + @file qurt_srm_consts.h + @brief Type definitions for srm + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2020-2021, 2022 by Qualcomm Technologies, Inc. All Rights Reserved +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** @cond */ +#define QURT_SRM_WAKEUP_REQUEST 1U << 0 /**< Value = 1: Send wakeup request to the SRM server. */ +#define QURT_SRM_SET_HANDLE 1U << 1 /**< Value = 2: Set the client handle for a new SRM client. */ +#define QURT_SRM_ALLOC_KERNEL_PAGES 1U << 2 /**< Value = 4: Allocate pages from the kernel VA space. */ +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_SRM_CONSTS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_srm_driver.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_srm_driver.h new file mode 100755 index 0000000000000..5489e3dddbcca --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_srm_driver.h @@ -0,0 +1,140 @@ +#ifndef QURT_SRM_DRIVER_H +#define QURT_SRM_DRIVER_H +/** + @file qurt_srm_driver.h + @brief Definitions, macros, and prototypes used by SRM drivers. + + EXTERNAL FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2021-2023 by Qualcomm Technologies, Inc. All Rights Reserved. + + =============================================================================*/ +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* +|| Define qurt_srm_driver_t structure, which represents +|| the "registration" object for an SRM driver. +*/ +/** @cond internal_only */ +struct _qurt_srm_driver { + const char *name; + qurt_qdi_obj_t *obj; +}; + +typedef struct _qurt_srm_driver qurt_srm_driver_t; + +/* +|| qurt_srm_object_invoke() is an internal equivalent to qurt_qdi_handle_invoke(). +|| It behaves the same, but it takes a QDI object pointer instead of a handle. +*/ + +#define qurt_srm_object_invoke(o,m,...) 
\ + _QDMPASTE(_QDMSOI,_QDMCNT(QDI_HANDLE_LOCAL_CLIENT,o,m,##__VA_ARGS__))(QDI_HANDLE_LOCAL_CLIENT,o,m,##__VA_ARGS__) +#define _QDMSOI3(a,b,c) qurt_srm_oi3(a,b,c) +#define _QDMSOI4(a,b,c,d) qurt_srm_oi4(a,b,c,(int)(d)) +#define _QDMSOI5(a,b,c,d,e) qurt_srm_oi5(a,b,c,(int)(d),(int)(e)) +#define _QDMSOI6(a,b,c,d,e,f) qurt_srm_oi6(a,b,c,(int)(d),(int)(e),(int)(f)) +#define _QDMSOI7(a,b,c,d,e,f,g) qurt_srm_oi7(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g)) +#define _QDMSOI8(a,b,c,d,e,f,g,h) qurt_srm_oi8(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h)) +#define _QDMSOI9(a,b,c,d,e,f,g,h,i) qurt_srm_oi9(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i)) +#define _QDMSOI10(a,b,c,d,e,f,g,h,i,j) qurt_srm_oi10(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j)) +#define _QDMSOI11(a,b,c,d,e,f,g,h,i,j,k) qurt_srm_oi11(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j),(int)(k)) +#define _QDMSOI12(a,b,c,d,e,f,g,h,i,j,k,l) qurt_srm_oi12(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j),(int)(k),(int)(l)) + +int qurt_srm_oi3(int, qurt_qdi_obj_t *, int); +int qurt_srm_oi4(int, qurt_qdi_obj_t *, int, int); +int qurt_srm_oi5(int, qurt_qdi_obj_t *, int, int, int); +int qurt_srm_oi6(int, qurt_qdi_obj_t *, int, int, int, int); +int qurt_srm_oi7(int, qurt_qdi_obj_t *, int, int, int, int, int); +int qurt_srm_oi8(int, qurt_qdi_obj_t *, int, int, int, int, int, int); +int qurt_srm_oi9(int, qurt_qdi_obj_t *, int, int, int, int, int, int, int); +int qurt_srm_oi10(int, qurt_qdi_obj_t *, int, int, int, int, int, int, int, int); +int qurt_srm_oi11(int, qurt_qdi_obj_t *, int, int, int, int, int, int, int, int, int); +int qurt_srm_oi12(int, qurt_qdi_obj_t *, int, int, int, int, int, int, int, int, int, int); + +#define QDI_SRM_INIT 192 + +/* +|| QURT_SRM_DECLARE_DRIVER() declares an SRM driver to the SRM infrastructure. +|| +|| The three arguments are: +|| unique_id -- Unique C identifier, unused but must be a unique global symbol. +|| name -- Name of the driver by which an SRM client attempts to open it. +|| obj -- Pointer to the singleton object of the driver, which handles things such as +|| initialization and QDI_OPEN requests. +*/ + +#define QURT_SRM_DECLARE_DRIVER(unique_id, xname, xobj) \ + __attribute__((section(".srm.rodata.user.main.DECL"))) const qurt_srm_driver_t unique_id = \ + { .name = xname, .obj = xobj } + + +/*@ingroup func_qurt_srm_mapping_create + Creates a memory mapping in pagetable with specified attributes + + @param[in] client_handle Client handle representing the process for which + mapping would be created. + @param[in] pageno_virt pointer to the virtual page. NULL indicates SRM + would indicate the virtual memory. + @param[in] pageno_phys physical page to be used for the mapping + @param[in] page_count number of 4k pages to be mapped + @param[in] cache_attr cache attributes for the mapping + @param[in] perm permissions to be used for the mapping + + @return value greater than 0 indicates a handle which can be passed to + qdi_close() to remove the mapping. Negative value indicates + an error. + + @dependencies + None. +*/ +int qurt_srm_mapping_create(int client_handle, + unsigned *pageno_virt, + unsigned pageno_phys, + unsigned page_count, + qurt_mem_cache_mode_t cache_attr, + qurt_perm_t perm); + + +/**@ingroup func_qurt_srm_get_pid + Gets the PID for the client_handle that is passed. + + @param[in] client_handle Client handle for which PID is required. 
+
+
+ @return PID of the client.
+ Negative PID value '-1' is returned in case of error.
+
+ @dependencies
+ None.
+*/
+unsigned qurt_srm_get_pid(int client_handle);
+
+
+/*@ingroup func_qurt_srm_get_thread_id
+ Gets the thread ID of the client requesting a service from SRM.
+
+ @param[in] None.
+
+ @return Thread ID of the client thread.
+
+ @dependencies
+ None.
+*/
+qurt_thread_t qurt_srm_get_client_thread_id(void);
+
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_SRM_DRIVER_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_stid.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_stid.h
new file mode 100755
index 0000000000000..379f46aaa4b80
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_stid.h
@@ -0,0 +1,73 @@
+#ifndef QURT_STID_H
+#define QURT_STID_H
+/**
+ @file qurt_stid.h
+ Prototypes of software thread identifier (stid) interface APIs.
+ A stid is an 8-bit identifier that can be assigned to a software thread.
+ The performance monitor logic uses the stid as a counting match criterion
+ for maskable events. The stid is also used by the hardware debugger
+ (ISDB) to match breakpoints.
+
+ EXTERNAL FUNCTIONS
+ None.
+
+ INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None.
+
+ Copyright (c) 2024 Qualcomm Technologies, Inc.
+ All rights reserved.
+ Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/*=============================================================================
+ FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_stid_alloc
+ Allocates a unique stid.
+
+ @param[in] pid Process identifier
+ @param[out] stid Pointer to a variable to return the stid
+
+ @return
+ QURT_EOK - Allocation success
+ QURT_ENORESOURCE - No stid available for allocation
+ QURT_EINVALID - Invalid input
+
+ @dependencies
+ None.
+ */
+int qurt_stid_alloc(unsigned int pid, unsigned int *stid);
+
+/**@ingroup func_qurt_stid_release
+ Releases the stid.
+
+
+ @param[in] pid Process identifier
+ @param[in] stid STID to release
+
+ @note1hang
+ The user must reset the released stid in the process or thread(s)
+ to the default value (QURT_STID_DEFAULT) before releasing that stid.
+
+ @return
+ QURT_EOK - Release success
+ QURT_ENOTALLOWED - Operation not allowed for a pid
+ QURT_EINVALID - Invalid stid
+
+ @dependencies
+ None.
+ */
+int qurt_stid_release(unsigned int pid, unsigned int stid);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_STID_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_thread.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_thread.h
new file mode 100755
index 0000000000000..499699e7c72e2
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_thread.h
@@ -0,0 +1,1260 @@
+#ifndef QURT_THREAD_H
+#define QURT_THREAD_H
+/**
+ @file qurt_thread.h
+ @brief Prototypes of Thread API
+
+ EXTERNAL FUNCTIONS
+ None.
+
+ INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None.
+
+Copyright (c) 2018, 2020-2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+/* The following is for C code only */
+#ifndef __ASSEMBLER__
+#include
+#include "qurt_pmu.h"
+#include "qurt_api_version.h"
+#endif /* __ASSEMBLER__ */
+#include "qurt_consts.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+ CONSTANTS AND MACROS
+=============================================================================*/
+
+
+/*
+ Bitmask configuration to select DSP hardware threads.
+ To select all the hardware threads, use #QURT_THREAD_CFG_BITMASK_ALL
+ and the following: \n
+ - For QDSP6 V2/V3, all six hardware threads are selected \n
+ - For QDSP6 V3L, all four hardware threads are selected \n
+ - For QDSP6 V4, all three hardware threads are selected
+ */
+
+#define QURT_THREAD_CFG_BITMASK_HT0 0x00000001 /**< HT0. */
+#define QURT_THREAD_CFG_BITMASK_HT1 0x00000002 /**< HT1. */
+#define QURT_THREAD_CFG_BITMASK_HT2 0x00000004 /**< HT2. */
+#define QURT_THREAD_CFG_BITMASK_HT3 0x00000008 /**< HT3. */
+#define QURT_THREAD_CFG_BITMASK_HT4 0x00000010 /**< HT4. */
+#define QURT_THREAD_CFG_BITMASK_HT5 0x00000020 /**< HT5. */
+/** @cond rest_reg_dist */
+/** @addtogroup thread_macros
+@{ */
+/** @xreflabel{sec:qurt_thread_cfg} */
+
+#define QURT_THREAD_CFG_BITMASK_ALL 0x000000ffU /**< Select all the hardware threads. */
+/** @} */ /* end_addtogroup thread_macros */
+/** @endcond */
+
+#define QURT_THREAD_CFG_USE_RAM 0x00000000 /**< Use RAM. */
+#define QURT_THREAD_CFG_USE_TCM 0x00000100 /**< Use TCM. */
+/** @cond rest_reg_dist */
+/** @addtogroup thread_macros
+@{ */
+#define QURT_THREAD_BUS_PRIO_DISABLED 0 /**< Thread internal bus priority disabled. */
+#define QURT_THREAD_BUS_PRIO_ENABLED 1 /**< Thread internal bus priority enabled. */
+/** @} */ /* end_addtogroup thread_macros */
+/** @endcond */
+
+#define QURT_THREAD_AUTOSTACK_DISABLED 0 /**< Thread has autostack v2 feature disabled. */
+#define QURT_THREAD_AUTOSTACK_ENABLED 1 /**< Thread has autostack v2 feature enabled. */
+
+/*
+ Macros for QuRT thread attributes.
+ */
+#define QURT_HTHREAD_L1I_PREFETCH 0x1 /**< Enables hardware L1 instruction cache prefetching. */
+#define QURT_HTHREAD_L1D_PREFETCH 0x2 /**< Enables hardware L1 data cache prefetching. */
+#define QURT_HTHREAD_L2I_PREFETCH 0x4 /**< Enables hardware L2 instruction cache prefetching. */
+#define QURT_HTHREAD_L2D_PREFETCH 0x8 /**< Enables hardware L2 data cache prefetching. */
+#define QURT_HTHREAD_DCFETCH 0x10 /**< Enables DC fetch to the provided virtual address.
+ DC fetch indicates to the hardware that a data memory access is likely.
+ Instructions are dropped when there is high bus utilization. */
+/** @addtogroup thread_macros
+@{ */
+/** @xreflabel{hdr:partition_tcm} */
+/*
+ Below value is used to create legacy QuRT threads by default.
+ If a thread has this as the detach_state, the thread can be joined
+ on until it exits. When we are able to change default behavior of all
+ QuRT threads to JOINABLE (posix default), we can remove this legacy
+ behavior.
+*/
+#define QURT_THREAD_ATTR_CREATE_LEGACY 0U /**< Create a legacy QuRT thread by default. If a thread has this as a detach state, the thread can be joined on until it exits. */
+#define QURT_THREAD_ATTR_CREATE_JOINABLE 1U /**< Create a joinable thread. */
+#define QURT_THREAD_ATTR_CREATE_DETACHED 2U /**< Create a detached thread.
*/ +/** @} */ /* end_addtogroup thread_macros */ + + +#define QURT_THREAD_ATTR_NAME_MAXLEN 16 /**< Maximum name length. */ +#define QURT_THREAD_ATTR_TCB_PARTITION_RAM 0 /**< Creates threads in RAM/DDR. */ +#define QURT_THREAD_ATTR_TCB_PARTITION_TCM 1 /**< Creates threads in TCM. */ +/** @cond rest_reg_dist */ +/** @addtogroup thread_macros +@{ */ +#define QURT_THREAD_ATTR_TCB_PARTITION_DEFAULT QURT_THREAD_ATTR_TCB_PARTITION_RAM /**< Backward compatibility. */ +#define QURT_THREAD_ATTR_PRIORITY_DEFAULT 254 /**< Priority.*/ +#define QURT_THREAD_ATTR_ASID_DEFAULT 0 /**< ASID. */ +#define QURT_THREAD_ATTR_AFFINITY_DEFAULT (-1) /**< Affinity. */ +#define QURT_THREAD_ATTR_BUS_PRIO_DEFAULT 255 /**< Bus priority. */ +#define QURT_THREAD_ATTR_AUTOSTACK_DEFAULT 0 /**< Default autostack v2 disabled thread. */ +#define QURT_THREAD_ATTR_TIMETEST_ID_DEFAULT (-2) /**< Timetest ID. */ +#define QURT_THREAD_ATTR_STID_DEFAULT QURT_STID_DEFAULT /**< STID. */ +#define QURT_THREAD_ATTR_STID_ENABLE 1 /**< Indicate to allocate STID during thread creation. */ + +#define QURT_PRIORITY_FLOOR_DEFAULT 255U /**< Default floor. */ +/** @} */ /* end_addtogroup thread_macros */ + +// Option for suspending thread +#define QURT_THREAD_SUSPEND_SYNCHRONOUS 0x0U // bit#0 +#define QURT_THREAD_SUSPEND_ASYNCHRONOUS 0x1U // bit#0 +#define QURT_THREAD_SUSPEND_KEEP_HMX 0x0U // bit#1 +#define QURT_THREAD_SUSPEND_DETACH_HMX 0x2U // bit#1 + +// Option for resuming thread +#define QURT_THREAD_RESUME_DEFAULT 0x0 + +// Thread property IDs +#define QURT_THREAD_PROPERTY_SUSPENDABLE 0x0U +#define QURT_THREAD_PROPERTY_RESUMABLE 0x1 + +// Thread group +#define QURT_THREAD_DEFAULT_GROUP_ID 0x0U +#define QURT_THREAD_GROUP_ID_MASK 0x3FU + +/** @endcond*/ + + +/* The followings are for C code only */ +#ifndef __ASSEMBLER__ +/*============================================================================= + TYPEDEFS +=============================================================================*/ +/** @addtogroup thread_types +@{ */ +/** @cond rest_reg_dist */ +typedef unsigned int qurt_cache_partition_t; /**< QuRT cache partition type. */ + +#define CCCC_PARTITION 0U /**< Use the CCCC page attribute bits to determine the main or auxiliary partition. */ +#define MAIN_PARTITION 1U /**< Use the main partition. */ +#define AUX_PARTITION 2U /**< Use the auxiliary partition. */ +#define MINIMUM_PARTITION 3U /**< Use the minimum. Allocates the least amount of cache (no-allocate policy possible) for this thread. */ +/** @endcond */ + +/** Thread ID type. */ +typedef unsigned int qurt_thread_t; + +/** @cond rest_reg_dist */ +/** Thread attributes. */ +typedef struct _qurt_thread_attr { + + char name[QURT_THREAD_ATTR_NAME_MAXLEN]; /**< Thread name. */ + unsigned char tcb_partition; /**< Indicates whether the thread TCB resides in RAM or + on chip memory (TCM). */ + unsigned char stid; /**< Software thread ID used to configure the stid register + for profiling purposes. */ + unsigned short priority; /**< Thread priority. */ + unsigned char autostack:1; /**< Autostack v2 enabled thread. */ + unsigned char group_id:6; /**< Group ID. */ + unsigned char reserved:1; /**< Reserved bits. */ + unsigned char bus_priority; /**< Internal bus priority. */ + unsigned short timetest_id; /**< Timetest ID. */ + unsigned int stack_size; /**< Thread stack size. */ + void *stack_addr; /**< Pointer to the stack address base. The range of the stack is + (stack_addr, stack_addr+stack_size-1). */ + unsigned short detach_state; /**< Detach state of the thread. 
*/
+
+} qurt_thread_attr_t;
+/** @endcond */
+
+/** @cond rest_reg_dist */
+/** Dynamic TLS attributes. */
+typedef struct qurt_tls_info {
+ unsigned int module_id; /**< Module ID of the loaded dynamic linked library. */
+ unsigned int tls_start; /**< Start address of the TLS data. */
+ unsigned int tls_data_end; /**< End address of the TLS RW data. */
+ unsigned int tls_end; /**< End address of the TLS data. */
+}qurt_tls_info;
+/** @endcond */
+
+/** @} */ /* end_addtogroup thread_types */
+
+/*=============================================================================
+ FUNCTIONS
+=============================================================================*/
+/**@ingroup func_qurt_thread_attr_init
+ Initializes the structure used to set the thread attributes when a thread is created.
+ After an attribute structure is initialized, explicitly set the individual attributes in the structure
+ using the thread attribute operations.
+
+ The initialize operation sets the following default attribute values: \n
+ - Name -- NULL string \n
+ - TCB partition -- QURT_THREAD_ATTR_TCB_PARTITION_DEFAULT \n
+ - Priority -- QURT_THREAD_ATTR_PRIORITY_DEFAULT \n
+ - Autostack -- QURT_THREAD_ATTR_AUTOSTACK_DEFAULT \n
+ - Bus priority -- QURT_THREAD_ATTR_BUS_PRIO_DEFAULT \n
+ - Timetest ID -- QURT_THREAD_ATTR_TIMETEST_ID_DEFAULT \n
+ - stack_size -- 0 \n
+ - stack_addr -- NULL \n
+ - detach state -- #QURT_THREAD_ATTR_CREATE_LEGACY \n
+ - STID -- #QURT_THREAD_ATTR_STID_DEFAULT
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in,out] attr Pointer to the thread attribute structure.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline void qurt_thread_attr_init (qurt_thread_attr_t *attr)
+{
+
+ attr->name[0] = '\0';
+ attr->tcb_partition = QURT_THREAD_ATTR_TCB_PARTITION_DEFAULT;
+ attr->priority = QURT_THREAD_ATTR_PRIORITY_DEFAULT;
+ attr->autostack = QURT_THREAD_ATTR_AUTOSTACK_DEFAULT; /* Default attribute for autostack v2*/
+ attr->bus_priority = QURT_THREAD_ATTR_BUS_PRIO_DEFAULT;
+ attr->timetest_id = (unsigned short)QURT_THREAD_ATTR_TIMETEST_ID_DEFAULT;
+ attr->stack_size = 0;
+ attr->stack_addr = NULL;
+ attr->detach_state = QURT_THREAD_ATTR_CREATE_LEGACY;
+ attr->stid = QURT_THREAD_ATTR_STID_DEFAULT;
+ attr->group_id = QURT_THREAD_DEFAULT_GROUP_ID;
+}
+
+/**@ingroup func_qurt_thread_attr_set_name
+ Sets the thread name attribute.\n
+ This function specifies the name to be used by a thread.
+ Thread names identify a thread during debugging or profiling.
+ The maximum name length is 16 characters. \n
+ @note1hang Thread names differ from the kernel-generated thread identifiers used to
+ specify threads in the API thread operations.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in,out] attr Pointer to the thread attribute structure.
+ @param[in] name Pointer to the character string containing the thread name.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline void qurt_thread_attr_set_name (qurt_thread_attr_t *attr, const char *name)
+{
+ strlcpy (attr->name, name, QURT_THREAD_ATTR_NAME_MAXLEN);
+ attr->name[QURT_THREAD_ATTR_NAME_MAXLEN - 1] = '\0';
+}
+
+
+/**@ingroup func_qurt_thread_attr_set_tcb_partition
+ Sets the thread TCB partition attribute.
+ Specifies the memory type where a TCB of a thread is allocated.
+ Allocates TCBs in RAM or TCM/LPM.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in,out] attr Pointer to the thread attribute structure.
+ @param[in] tcb_partition TCB partition.
Values:\n
+ - 0 -- TCB resides in RAM \n
+ - 1 -- TCB resides in TCM/LCM @tablebulletend
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline void qurt_thread_attr_set_tcb_partition (qurt_thread_attr_t *attr, unsigned char tcb_partition)
+{
+ attr->tcb_partition = tcb_partition;
+}
+
+/**@ingroup func_qurt_thread_attr_set_priority
+ Sets the thread priority to assign to a thread.
+ Thread priorities are specified as numeric values in the range 1 to 254, with 1 representing
+ the highest priority.
+ Priority 0 and 255 are internally used by the kernel for special purposes.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in,out] attr Pointer to the thread attribute structure.
+ @param[in] priority Thread priority.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline void qurt_thread_attr_set_priority (qurt_thread_attr_t *attr, unsigned short priority)
+{
+ attr->priority = priority;
+}
+
+/**@ingroup func_qurt_thread_attr_set_detachstate
+ Sets the detach state with which the thread is created.
+ The thread detach state is either joinable or detached; specified by the following values:
+ - #QURT_THREAD_ATTR_CREATE_JOINABLE \n
+ - #QURT_THREAD_ATTR_CREATE_DETACHED \n
+
+ When a detached thread is created (QURT_THREAD_ATTR_CREATE_DETACHED), its thread
+ ID and other resources are reclaimed as soon as the thread exits. When a joinable thread
+ is created (QURT_THREAD_ATTR_CREATE_JOINABLE), it is assumed that some
+ thread waits to join on it using a qurt_thread_join() call.
+ By default, the detach state is QURT_THREAD_ATTR_CREATE_LEGACY.
+ If the detach state is QURT_THREAD_ATTR_CREATE_LEGACY, another
+ thread can join before the thread exits, but the exiting thread does not wait for another thread to join.
+
+ @note1hang For a joinable thread (QURT_THREAD_ATTR_CREATE_JOINABLE), it is very
+ important that some thread joins on it after it terminates, otherwise
+ the resources of that thread are not reclaimed, causing memory leaks.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in,out] attr Pointer to the thread attribute structure.
+ @param[in] detachstate Thread detach state.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline void qurt_thread_attr_set_detachstate (qurt_thread_attr_t *attr, unsigned short detachstate)
+{
+ if(detachstate == QURT_THREAD_ATTR_CREATE_JOINABLE || detachstate == QURT_THREAD_ATTR_CREATE_DETACHED){
+ attr->detach_state = detachstate;
+ }
+}
+
+
+/**@ingroup func_qurt_thread_attr_set_timetest_id
+ Sets the thread timetest attribute.\n
+ Specifies the timetest identifier to be used by a thread.
+
+ Timetest identifiers are used to identify a thread during debugging or profiling. \n
+ @note1hang Timetest identifiers differ from the kernel-generated thread identifiers used to
+ specify threads in the API thread operations.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in,out] attr Pointer to the thread attribute structure.
+ @param[in] timetest_id Timetest identifier value.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+static inline void qurt_thread_attr_set_timetest_id (qurt_thread_attr_t *attr, unsigned short timetest_id)
+{
+ attr->timetest_id = timetest_id;
+}
+
+/**@ingroup func_qurt_thread_attr_set_stack_size
+ @xreflabel{sec:set_stack_size}
+ Sets the thread stack size attribute.\n
+ Specifies the size of the memory area to use for a call stack of a thread.
+
+ The thread stack address (Section @xref{sec:set_stack_addr}) and stack size specify the memory area used as a
+ call stack for the thread.
The user is responsible for allocating the memory area used for + the stack. + + @datatypes + #qurt_thread_attr_t + + @param[in,out] attr Pointer to the thread attribute structure. + @param[in] stack_size Size (in bytes) of the thread stack. + + @return + None. + + @dependencies + None. +*/ + +static inline void qurt_thread_attr_set_stack_size (qurt_thread_attr_t *attr, unsigned int stack_size) +{ + attr->stack_size = stack_size; +} + +/**@ingroup func_qurt_thread_attr_set_stack_size2 + @xreflabel{sec:set_stack_size} + Sets the thread stack size attribute for island threads that require a higher guest OS stack size than the stack size + defined in the configuration XML.\n + Specifies the size of the memory area to use for a call stack of an island thread in User and Guest mode. + + The thread stack address (Section @xref{sec:set_stack_addr}) and stack size specify the memory area used as a + call stack for the thread. The user is responsible for allocating the memory area used for + the stack. + + @datatypes + #qurt_thread_attr_t + + @param[in,out] attr Pointer to the thread attribute structure. + @param[in] user_stack_size Size (in bytes) of the stack usage in User mode. + @param[in] root_stack_size Size (in bytes) of the stack usage in Guest mode. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_thread_attr_set_stack_size2 (qurt_thread_attr_t *attr, unsigned short user_stack_size, unsigned short root_stack_size) +{ + union qurt_thread_stack_info{ + unsigned int raw_size; + struct{ + unsigned short user_stack; + unsigned short root_stack; + }; + }user_root_stack_size; + user_root_stack_size.user_stack = user_stack_size; + user_root_stack_size.root_stack = root_stack_size; + + attr->stack_size = user_root_stack_size.raw_size; +} + +/**@ingroup func_qurt_thread_attr_set_stack_addr + @xreflabel{sec:set_stack_addr} + Sets the thread stack address attribute. \n + Specifies the base address of the memory area to use for a call stack of a thread. + + stack_addr must contain an address value that is 8-byte aligned. + + The thread stack address and stack size (Section @xref{sec:set_stack_size}) specify the memory area used as a + call stack for the thread. \n + @note1hang The user is responsible for allocating the memory area used for the thread + stack. The memory area must be large enough to contain the stack that the thread + creates. + + @datatypes + #qurt_thread_attr_t + + @param[in,out] attr Pointer to the thread attribute structure. + @param[in] stack_addr Pointer to the 8-byte aligned address of the thread stack. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_thread_attr_set_stack_addr (qurt_thread_attr_t *attr, void *stack_addr) +{ + attr->stack_addr = stack_addr; +} + +/**@ingroup func_qurt_thread_attr_set_bus_priority + Sets the internal bus priority state in the Hexagon core for this software thread attribute. + Memory requests generated by the thread with bus priority enabled are + given priority over requests generated by the thread with bus priority disabled. + The default value of bus priority is disabled. + + @note1hang Sets the internal bus priority for Hexagon processor version V60 or greater. + The priority is not propagated to the bus fabric. + + @datatypes + #qurt_thread_attr_t + + @param[in] attr Pointer to the thread attribute structure. + + @param[in] bus_priority Enabling flag. Values: \n + - #QURT_THREAD_BUS_PRIO_DISABLED \n + - #QURT_THREAD_BUS_PRIO_ENABLED @tablebulletend + + @return + None + + @dependencies + None. 
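+
+ A small sketch of enabling bus priority on a thread attribute (the attribute
+ object is illustrative):
+
+ @code
+ qurt_thread_attr_t attr;
+ qurt_thread_attr_init(&attr);
+ qurt_thread_attr_set_bus_priority(&attr, QURT_THREAD_BUS_PRIO_ENABLED);
+ @endcode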
+*/
+static inline void qurt_thread_attr_set_bus_priority ( qurt_thread_attr_t *attr, unsigned short bus_priority)
+{
+ attr->bus_priority = (unsigned char)bus_priority;
+}
+
+/**@ingroup func_qurt_thread_attr_set_autostack
+ Enables the autostack v2 feature in the thread attributes.
+
+ When autostack is enabled by the subsystem and an autostack-enabled
+ thread takes a framelimit exception, the kernel allocates more stack
+ for the thread and returns to normal execution.
+
+ If autostack is not enabled by the subsystem, or it is not enabled
+ for the thread, the framelimit exception will be fatal.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in] attr Pointer to the thread attribute structure.
+ @param[in] autostack Autostack enable or disable flag. Values: \n
+ - #QURT_THREAD_AUTOSTACK_DISABLED \n
+ - #QURT_THREAD_AUTOSTACK_ENABLED @tablebulletend
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline void qurt_thread_attr_set_autostack ( qurt_thread_attr_t *attr, unsigned short autostack)
+{
+ attr->autostack = (unsigned char)autostack;
+}
+/**@ingroup qurt_thread_attr_enable_stid
+ Sets the STID in the thread attributes.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in] attr Pointer to the thread attribute structure.
+ @param[in] enable_stid STID to be set. Values: \n
+ - #QURT_THREAD_ATTR_STID_DEFAULT (0): Default STID. \n
+ - #QURT_THREAD_ATTR_STID_ENABLE (1): QuRT assigns an STID that is not already in use \n
+ - #2 through #255 : User provided STID. @tablebulletend
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline void qurt_thread_attr_enable_stid ( qurt_thread_attr_t *attr, char enable_stid)
+{
+ if (enable_stid != '\0') {
+ attr->stid = enable_stid;
+ }
+ else
+ {
+ attr->stid = QURT_THREAD_ATTR_STID_DEFAULT;
+ }
+}
+
+/**@ingroup func_qurt_thread_attr_set_stid
+ Sets the stid thread attribute.
+ The default stid value is QURT_THREAD_ATTR_STID_DEFAULT.
+
+ @note1hang When a thread is created with a non-default stid,
+ the stid set in the thread attribute is assigned to the thread.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in] attr Pointer to the thread attribute structure.
+ @param[in] stid Stid to be set for a thread.
+
+ @return
+ None
+
+ @dependencies
+ None.
+*/
+static inline void qurt_thread_attr_set_stid( qurt_thread_attr_t *attr, unsigned int stid){
+ attr->stid = stid;
+}
+
+/**@ingroup func_qurt_thread_attr_set_group_id
+ Sets the group ID in the thread attributes.
+ The primordial/first thread has group ID 0.
+ If a new thread is created without assigning group_id, it
+ inherits the group ID from its parent thread.
+
+ @note1hang
+ 1) The group ID can only be set before creating a thread. It cannot be
+ changed after the thread is created.
+ 2) If a non-activated group_id is passed, thread creation will fail.
+ 3) Only a thread with Group ID #0 can set the Group ID for its child threads.
+ 4) If a thread with a non-zero group ID sets the group ID for its child threads,
+ QuRT will ignore this parameter and the child threads will inherit the parent
+ thread's group ID. But if the passed group ID is not activated, thread creation
+ will still fail.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in] attr Pointer to the thread attribute structure.
+ @param[in] group_id Group identifier. The valid range is 0 through 63.
+
+ @return
+ None.
+
+ @dependencies
+ None.
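+
+ A short sketch (group ID 3 is an arbitrary example and must name an
+ activated group; the caller is assumed to have group ID 0):
+
+ @code
+ qurt_thread_attr_t attr;
+ qurt_thread_attr_init(&attr);
+ qurt_thread_attr_set_group_id(&attr, 3u);
+ @endcode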
+*/
+static inline void qurt_thread_attr_set_group_id(qurt_thread_attr_t *attr, unsigned int group_id)
+{
+ attr->group_id = group_id & QURT_THREAD_GROUP_ID_MASK;
+}
+
+/**@ingroup func_qurt_thread_set_autostack
+ Sets autostack enable in the TCB.
+
+ @param[in] ugp Pointer to UGP.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+
+void qurt_thread_set_autostack(void *);
+
+
+/**@ingroup func_qurt_thread_get_name
+ Gets the thread name of the current thread.\n
+ Returns the thread name of the current thread.
+ Thread names are assigned to threads as thread attributes, see qurt_thread_attr_set_name(). Thread names
+ identify a thread during debugging or profiling.
+
+ @param[out] name Pointer to a character string, which specifies the address where the returned thread name is stored.
+ @param[in] max_len Maximum length of the character string that can be returned.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+void qurt_thread_get_name (char *name, unsigned char max_len);
+
+/**@ingroup func_qurt_thread_create
+ @xreflabel{hdr:qurt_thread_create}
+ Creates a thread with the specified attributes, and makes it executable.
+
+ @datatypes
+ #qurt_thread_t \n
+ #qurt_thread_attr_t
+
+ @param[out] thread_id Returns a pointer to the thread identifier if the thread was
+ successfully created.
+ @param[in] attr Pointer to the initialized thread attribute structure that specifies
+ the attributes of the created thread.
+ @param[in] entrypoint C function pointer, which specifies the main function of a thread.
+ @param[in] arg Pointer to a thread-specific argument structure.
+
+
+ @return
+ #QURT_EOK -- Thread created. \n
+ #QURT_EFAILED -- Thread not created.
+
+ @dependencies
+ None.
+ */
+int qurt_thread_create (qurt_thread_t *thread_id, qurt_thread_attr_t *attr, void (*entrypoint) (void *), void *arg);
+
+/**@ingroup func_qurt_thread_stop
+ Stops the current thread, frees the kernel TCB, and yields to the next highest ready thread.
+
+ @return
+ void
+
+ @dependencies
+ None.
+ */
+void qurt_thread_stop(void);
+
+/** @cond internal_only */
+/**@ingroup func_qurt_thread_resume
+ When a demand-loading paging solution is enabled, this function
+ resumes the execution of a thread that was suspended due to
+ a page miss.
+
+ @param[in] thread_id Thread identifier.
+
+ @return
+ #QURT_EOK -- Thread successfully resumed. \n
+ #QURT_EFATAL -- Resume operation failed.
+
+ @dependencies
+ None.
+ */
+int qurt_thread_resume(unsigned int thread_id);
+/** @endcond */
+
+/**@ingroup func_qurt_thread_get_id
+ Gets the identifier of the current thread.\n
+ Returns the thread identifier for the current thread.
+
+ @return
+ Thread identifier -- Identifier of the current thread.
+
+ @dependencies
+ None.
+ */
+qurt_thread_t qurt_thread_get_id (void);
+
+
+/**@ingroup func_qurt_thread_get_l2cache_partition
+ Returns the current value of the L2 cache partition assigned to the caller thread.\n
+
+ @return
+ Value of the #qurt_cache_partition_t data type.
+
+ @dependencies
+ None.
+ */
+qurt_cache_partition_t qurt_thread_get_l2cache_partition (void);
+
+/**@ingroup func_qurt_thread_set_timetest_id
+ Sets the timetest identifier of the current thread.
+ Timetest identifiers are used to identify a thread during debugging or profiling.\n
+ @note1hang Timetest identifiers differ from the kernel-generated thread identifiers used to
+ specify threads in the API thread operations.
+
+ @param[in] tid Timetest identifier.
+
+ @return
+ None.
+
+ @dependencies
+ None.
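+
+ A minimal sketch of a thread tagging itself for profiling (the entry
+ function, stack, and ID value are illustrative):
+
+ @code
+ static char stack[4096] __attribute__((aligned(8)));
+
+ static void worker(void *arg) {
+     (void)arg;
+     qurt_thread_set_timetest_id(0x1234); // tag the current thread
+     // ... work ...
+     qurt_thread_exit(0);
+ }
+
+ qurt_thread_attr_t attr;
+ qurt_thread_t tid;
+ qurt_thread_attr_init(&attr);
+ qurt_thread_attr_set_name(&attr, "worker");
+ qurt_thread_attr_set_stack_addr(&attr, stack);
+ qurt_thread_attr_set_stack_size(&attr, sizeof(stack));
+ (void)qurt_thread_create(&tid, &attr, worker, NULL);
+ @endcode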
+ */
+void qurt_thread_set_timetest_id (unsigned short tid);
+
+/**@ingroup func_qurt_thread_set_cache_partition
+ Sets the cache partition for the current thread. This function uses the qurt_cache_partition_t type
+ to select the cache partition of the current thread for the L1 Icache, L1 Dcache, and L2 cache.
+
+ @datatypes
+ #qurt_cache_partition_t
+
+ @param[in] l1_icache L1 I cache partition.
+ @param[in] l1_dcache L1 D cache partition.
+ @param[in] l2_cache L2 cache partition.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+void qurt_thread_set_cache_partition(qurt_cache_partition_t l1_icache, qurt_cache_partition_t l1_dcache, qurt_cache_partition_t l2_cache);
+
+
+/**@ingroup func_qurt_thread_get_timetest_id
+ Gets the timetest identifier of the current thread.\n
+ Returns the timetest identifier of the current thread.\n
+ Timetest identifiers are used to identify a thread during debugging or profiling. \n
+ @note1hang Timetest identifiers differ from the kernel-generated thread identifiers used to
+ specify threads in the API thread operations.
+
+ @return
+ Integer -- Timetest identifier.
+
+ @dependencies
+ None.
+ */
+unsigned short qurt_thread_get_timetest_id (void);
+
+/**@ingroup func_qurt_thread_exit
+ @xreflabel{sec:qurt_thread_exit}
+ Stops the current thread, awakens threads joined to it, then destroys the stopped
+ thread.
+
+ Threads that are suspended on the current thread (by performing a thread join,
+ Section @xref{sec:thread_join}) are awakened and passed a user-defined status value
+ that indicates the status of the stopped thread.
+
+ @note1hang Exit must be called in the context of the thread to stop.
+
+ @param[in] status User-defined thread exit status value.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+void qurt_thread_exit(int status);
+
+/**@ingroup func_qurt_thread_join
+ @xreflabel{sec:thread_join}
+ Waits for a specified thread to finish; the specified thread is another thread within
+ the same process.
+ The caller thread is suspended until the specified thread exits. When the specified thread
+ exits, the caller thread is awakened. \n
+ @note1hang If the specified thread has already exited, this function returns immediately
+ with the result value #QURT_ENOTHREAD. \n
+ @note1cont Two threads cannot call qurt_thread_join to wait for the same thread to finish.
+ If this occurs, QuRT generates an exception (see Section @xref{sec:exceptionHandling}).
+
+ @param[in] tid Thread identifier.
+ @param[out] status Destination variable for thread exit status. Returns an application-defined
+ value that indicates the termination status of the specified thread.
+
+ @return
+ #QURT_ENOTHREAD -- Thread has already exited. \n
+ #QURT_EOK -- Thread successfully joined with valid status value.
+
+ @dependencies
+ None.
+ */
+int qurt_thread_join(unsigned int tid, int *status);
+
+/**@ingroup qurt_thread_detach
+ @xreflabel{sec:thread_detach}
+ Detaches a joinable thread. The specified thread is another thread within the
+ same process. Create the thread as a joinable thread; only joinable threads
+ can be detached.
+ If a joinable thread is detached, it finishes execution and exits.
+
+ @param[in] tid Thread identifier.
+
+ @return
+ #QURT_ENOTHREAD -- Thread specified by TID does not exist. \n
+ #QURT_EOK -- Thread successfully detached.
+
+ @dependencies
+ None.
+ */
+int qurt_thread_detach(unsigned int tid);
+
+
+/**@ingroup func_qurt_thread_get_priority
+ Gets the priority of the specified thread.
\n + Returns the thread priority of the specified thread.\n + Thread priorities are specified as numeric values in a range as large as 1 through 254, with lower + values representing higher priorities. 1 represents the highest possible thread priority. \n + Priority 0 and 255 are internally used by the kernel for special purposes. + + @note1hang QuRT can be configured to have different priority ranges. + + @datatypes + #qurt_thread_t + + @param[in] threadid Thread identifier. + + @return + -1 -- Invalid thread identifier. \n + 1 through 254 -- Thread priority value. + + @dependencies + None. + */ +int qurt_thread_get_priority (qurt_thread_t threadid); + +/**@ingroup func_qurt_thread_set_priority + Sets the priority of the specified thread.\n + Thread priorities are specified as numeric values in a range as large as 1 through 254, with lower + values representing higher priorities. 1 represents the highest possible thread priority. + Priority 0 and 255 are internally used by the kernel for special purposes. + + @note1hang QuRT can be configured to have different priority ranges. For more + information, see Section @xref{sec:AppDev}. + + @datatypes + #qurt_thread_t + + @param[in] threadid Thread identifier. + @param[in] newprio New thread priority value. + + @return + 0 -- Priority successfully set. \n + -1 -- Invalid thread identifier. \n + + @dependencies + None. + */ +int qurt_thread_set_priority (qurt_thread_t threadid, unsigned short newprio); + + + +/**@ingroup func_qurt_thread_attr_get + Gets the attributes of the specified thread. + + @datatypes + #qurt_thread_t \n + #qurt_thread_attr_t + + @param[in] thread_id Thread identifier. + @param[out] attr Pointer to the destination structure for thread attributes. + + @return + #QURT_EOK -- Success. \n + #QURT_EINVALID -- Invalid argument. + + @dependencies + None. + */ +int qurt_thread_attr_get (qurt_thread_t thread_id, qurt_thread_attr_t *attr); + + + +/**@ingroup func_qurt_thread_get_tls_base + Gets the base address of thread local storage (TLS) of a dynamically loaded module + for the current thread. + + @datatypes + #qurt_tls_info + + @param[in] info Pointer to the TLS information for a module. + + @return + Pointer to the TLS object for the dynamically loaded module.\n + NULL -- TLS information is invalid. + + @dependencies + None. + */ +void * qurt_thread_get_tls_base(qurt_tls_info* info); + +/**@ingroup func_qurt_thread_pktcount_get + Gets the PKTCOUNT of a specified thread. + + @datatypes + #qurt_thread_t + + @param[in] thread_id Thread identifier. + + @return + PKTCOUNT + + @dependencies + None. + */ + +long long int qurt_thread_pktcount_get (qurt_thread_t thread_id); + +/**@ingroup func_qurt_thread_pktcount_set + Sets the PKTCOUNT for the current QuRT thread. + + @return + Value to which pktcount is set. + + @dependencies + None. + */ + +long long int qurt_thread_pktcount_set (long long int); + +/**@ingroup func_qurt_thread_stid_get + Gets the STID for a specified thread. + + @datatypes + #qurt_thread_t + + @param[in] thread_id Thread identifier. + + @return + STID + + @dependencies + None. + */ + +char qurt_thread_stid_get(qurt_thread_t thread_id); + +/**@ingroup func_qurt_thread_stid_get2 + Returns the set stid for a thread + + @param[in] thread_id thread identifier + @param[out] stid Pointer to a variable to return stid + + @return + QURT_EOK - success + QURT_ENOTALLOWED - operation not allowed for a thread + QURT_EINVALID - Invalid input + + @dependencies + None. 
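+
+ A sketch combining this call with the qurt_stid.h allocation API (the
+ source of the pid and the use of qurt_getpid from qurt_space.h are
+ illustrative):
+
+ @code
+ unsigned int stid = 0u, readback = 0u;
+ unsigned int pid = (unsigned int)qurt_getpid();
+ if (qurt_stid_alloc(pid, &stid) == QURT_EOK) {
+     (void)qurt_thread_stid_set2(qurt_thread_get_id(), stid);
+     (void)qurt_thread_stid_get2(qurt_thread_get_id(), &readback); // readback == stid
+ }
+ @endcode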
+ */
+int qurt_thread_stid_get2(unsigned int thread_id, unsigned int *stid);
+
+/**@ingroup func_qurt_thread_stid_set
+ Sets the STID for the current thread.
+
+ @datatypes
+ #qurt_thread_t
+
+ @param[in] stid STID value to set.
+
+ @return
+ #QURT_EOK -- STID successfully set. \n
+ #QURT_EFAILED -- STID not set.
+
+ @dependencies
+ None.
+ */
+
+int qurt_thread_stid_set(char stid);
+
+/**@ingroup qurt_thread_stid_set2
+ Sets the stid for a specified thread.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in] thread_id Thread identifier.
+ @param[in] stid Stid to be set for a thread.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation. \n
+ #QURT_EVAL -- Failure because of invalid inputs.
+
+ @dependencies
+ None.
+*/
+int qurt_thread_stid_set2(unsigned int thread_id, unsigned int stid);
+
+/** @cond internal_only */
+/**@ingroup func_qurt_thread_get_running_ids
+ Returns the thread IDs of the running threads in the system; use only during fatal error handling.
+
+ @datatypes
+ #qurt_thread_t
+
+ @param[in,out] * Array of thread identifiers of size #QURT_MAX_HTHREAD_LIMIT + 1.
+
+ @return
+ #QURT_EINVALID -- Incorrect argument \n
+ #QURT_ENOTALLOWED -- API not called during error handling \n
+ #QURT_EOK -- Success, returns a NULL-terminated array of thread_id
+
+ @dependencies
+ None.
+ */
+int qurt_thread_get_running_ids(qurt_thread_t *);
+/** @endcond */
+
+
+/**@ingroup func_qurt_thread_get_thread_id
+ Gets the thread identifier of the thread with the matching name in the same process
+ of the caller.
+
+ @datatypes
+ #qurt_thread_t
+
+ @param[out] thread_id Pointer to the thread identifier.
+ @param[in] name Pointer to the name of the thread.
+
+ @return
+ #QURT_EINVALID -- No thread with matching name in the process of the caller \n
+ #QURT_EOK -- Success
+
+ @dependencies
+ None.
+ */
+int qurt_thread_get_thread_id (qurt_thread_t *thread_id, char *name);
+
+/**@ingroup func_qurt_sleep
+ Suspends the current thread for the specified amount of time.
+
+ @note1hang Because QuRT timers are deferrable, this call is guaranteed to block
+ at least for the specified amount of time. If power-collapse is
+ enabled, the maximum amount of time this call can block depends on
+ the earliest wakeup from power-collapse past the specified duration.
+
+ @param[in] duration Duration (in microseconds) for which the thread is suspended.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+void qurt_sleep (unsigned long long int duration);
+
+
+/**@ingroup func_qurt_system_set_priority_floor
+ Sets a priority floor to move threads with thread priority lower than the floor out of the running state.
+ Running threads with thread priority lower than the priority floor are moved into the kernel ready queue, and they
+ are not scheduled to run when the thread priority is lower than the floor.
+ Later, the caller should reset the priority floor back to the default value of QURT_PRIORITY_FLOOR_DEFAULT.
+ Threads in the kernel ready queue are scheduled to run when the thread priority is higher than the floor.
+
+ The priority floor is set and associated to the user process of the caller. When the caller gets into QuRTOS and
+ sets a new floor, the new floor is associated to its original user process, not the QuRTOS process.
+ The floor associated to the user process is reset when the user process exits or is killed, but not at the time
+ when the user thread of the caller exits.
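+
+ A typical set/reset pairing (sketch; assumes the caller's own priority is
+ above the floor being set, and the floor value 100 is illustrative):
+
+ @code
+ if (qurt_system_set_priority_floor(100u) == QURT_EOK) {
+     // ... threads with priority lower than the floor are held in the
+     // kernel ready queue and are not scheduled to run ...
+     (void)qurt_system_set_priority_floor(QURT_PRIORITY_FLOOR_DEFAULT);
+ }
+ @endcode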
+
+ The priority floor cannot be set to a priority higher than the thread priority of the caller.
+
+ The priority floor cannot be set to a priority lower than the default #QURT_PRIORITY_FLOOR_DEFAULT system floor.
+
+ This function is not supported in Island mode.
+
+ After the system floor is set above #QURT_PRIORITY_FLOOR_DEFAULT, power collapse is skipped, and the sleep task
+ is not scheduled to run.
+
+ @param[in] priority_floor Priority floor.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_ENOTALLOWED -- Floor setting is not allowed
+
+ @dependencies
+ None.
+ */
+int qurt_system_set_priority_floor (unsigned int priority_floor);
+
+
+/**@ingroup func_qurt_thread_suspend_thread
+ Suspends a QuRT thread with its thread identifier.
+ The target thread can be in a signed user process or an unsigned user process.
+ The caller thread can be a thread from the same user process as the target thread, or from its parent process.
+ After the target thread is suspended, the kernel will not schedule it to run until it is resumed later.
+
+ If the target thread is set as non-suspendable, this function call returns an error without suspending
+ the target thread.
+
+ If the target thread is already suspended, this function call returns success to confirm
+ that the target thread is suspended.
+
+ If the target thread is in a secure user process, or CPZ process, this function call returns an error without
+ suspending the target thread.
+
+ If the target thread is running in the guest OS/root process via a QDI call, this function call does not suspend
+ the target thread in the guest OS, but marks the target thread as suspend-pending. The target thread is
+ suspended when it exits the guest OS, before executing the first instruction in the user process.
+ In this case, the function returns success even with the #QURT_THREAD_SUSPEND_SYNCHRONOUS option, while the target
+ thread can run in the guest OS, and is suspended when exiting the guest OS.
+
+ QuRT debug monitor threads that are in a user process are non-suspendable. This function does not suspend
+ those threads.
+
+ @param[in] thread_id Thread identifier.
+ @param[in] option Optional argument, multiple options can be ORed. \n
+          #QURT_THREAD_SUSPEND_SYNCHRONOUS (default) -- set to synchronous function call,
+          the function returns after the thread is completely suspended.\n
+          #QURT_THREAD_SUSPEND_ASYNCHRONOUS -- set to asynchronous function call, the function returns
+          after the kernel acts to suspend the target thread. The target thread
+          might still be running before it is completely suspended. \n
+          #QURT_THREAD_SUSPEND_KEEP_HMX (default) -- keep the HMX attachment on the target thread
+          if it locks the HMX with qurt_hmx_lock(). In this case, the HMX cannot be re-used by other threads. \n
+          #QURT_THREAD_SUSPEND_DETACH_HMX -- detach the HMX from the target thread if it locks the HMX with qurt_hmx_lock().
+          Later when the target thread resumes, the HMX is re-attached to the thread. Note that this option is only
+          supported for a caller from the same user process as the target thread, not for a caller from the parent
+          process of the target thread, or other processes. With the HMX detach option, QuRT does not save the HMX
+          context. Thus, the HMX context state is lost. It is the responsibility of the caller to manage HMX operations
+          and the saving of the HMX context state when calling qurt_thread_suspend_thread() with the HMX detach option.
+          If a thread from another process uses this detach option, #QURT_EHMXNOTDETACHABLE is returned; in this
+          case, if the caller is qualified to suspend the target thread, the target thread is moved to the suspended
+          state without the HMX detached.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EINVALID -- Failure because of invalid thread_id input \n
+ #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, in a secure process/CPZ process. \n
+ #QURT_EHMXNOTDETACHABLE -- Failure because the HMX is not detachable from the target thread.
+
+ @dependencies
+ None.
+ */
+int qurt_thread_suspend_thread (unsigned int thread_id, unsigned int option);
+
+
+/**@ingroup func_qurt_thread_resume_thread
+ Resumes a QuRT thread with its thread identifier.
+ The target thread can be in a signed user process or an unsigned user process.
+ The caller thread can be a thread from the same user process as the target thread, or from its parent
+ process. After the target thread resumes, the kernel scheduler can schedule the thread to run based on
+ the thread priority.
+
+ The function takes an option argument; as of now the only option is
+ #QURT_THREAD_RESUME_DEFAULT, which resumes the target thread in the default way.
+
+ By default, this is an asynchronous function. The function returns after the kernel moves the
+ target thread from the suspended state to the runnable state. The thread is scheduled to run based on its
+ thread priority.
+
+ If the target thread is set as non-resumable, this function call does not resume the target thread.
+
+ If the target thread has already resumed, this function call returns success to confirm
+ that the target thread is resumed.
+
+ If the target thread is in a secure user process or CPZ process, this function call returns an error without
+ resuming the target thread.
+
+ If the target thread runs in the guest OS/root process via a QDI call, this function call clears the
+ suspend-pending mark on the target thread, and the target thread is not suspended when it exits the
+ guest OS.
+
+ @param[in] thread_id Thread identifier.
+ @param[in] option Optional argument, #QURT_THREAD_RESUME_DEFAULT, which resumes the target thread.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EINVALID -- Failure because of invalid thread_id input \n
+ #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, in a secure process/CPZ process. \n
+ #QURT_EHMXNOTAVAIL -- Failure because the HMX is not available/free when resuming an HMX thread.
+
+ @dependencies
+ None.
+ */
+int qurt_thread_resume_thread (unsigned int thread_id, unsigned int option);
+
+
+/**@ingroup func_qurt_thread_set_thread_property
+ Sets a QuRT thread property with its thread identifier.
+ The target thread can be in a signed user process or an unsigned user process.
+ The caller thread can be from the same user process as the target thread, or from its parent process.
+
+ If the target thread is in a secure user process, or CPZ process, this function call returns an error without
+ changing the property of the target thread.
+
+ @param[in] thread_id Thread identifier \n
+ @param[in] property_id Thread property identifier \n
+          #QURT_THREAD_PROPERTY_SUSPENDABLE -- thread is suspendable. Default is TRUE. \n
+          #QURT_THREAD_PROPERTY_RESUMEABLE -- thread is resumable.
Default is TRUE. \n
+ @param[in] value Property value: \n
+          TRUE(1) -- TRUE for the property \n
+          FALSE(0) -- FALSE for the property
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EINVALID -- Failure because of invalid thread_id input \n
+ #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, in a secure process/CPZ process.
+
+ @dependencies
+ None.
+ */
+int qurt_thread_set_thread_property( unsigned int thread_id, unsigned int property_id, unsigned int value );
+
+/**@ingroup func_qurt_thread_get_group_id
+ Gets the group ID of the thread specified by thread_id.\n
+
+ @param[in]  thread_id Thread identifier
+ @param[out] group_id  Pointer to the variable for the group identifier
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EINVALID -- Thread ID is invalid, or the process has no groups enabled \n
+ #QURT_ENOTALLOWED -- Operation is not allowed \n
+
+ @dependencies
+ None.
+*/
+int qurt_thread_get_group_id(qurt_thread_t thread_id, unsigned int* group_id);
+
+#endif /* __ASSEMBLER__ */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_THREAD_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_thread_context.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_thread_context.h
new file mode 100755
index 0000000000000..bab09deec8889
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_thread_context.h
@@ -0,0 +1,234 @@
+#ifndef QURT_THREAD_CONTEXT_H
+#define QURT_THREAD_CONTEXT_H
+/**
+  @file qurt_thread_context.h
+  @brief Kernel thread context structure
+
+EXTERNAL FUNCTIONS
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2018-2022 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @cond internal_only */
+
+#define THREAD_ITERATOR_END ((qurt_thread_t)(-1)) /**< Thread iterator is complete. */
+
+
+/**@ingroup func_qurt_thread_iterator_create
+Gives the caller the ability to enumerate threads in the system.
+
+@return
+Handle of the newly created iterator; it must be passed to
+subsequent operations on the iterator.
+
+@dependencies
+None.
+*/
+static inline int qurt_thread_iterator_create(void)
+{
+    return qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC, QDI_OS_THREAD_ITERATOR_CREATE);
+}
+
+/**@ingroup func_qurt_thread_iterator_next
+Iterates over the list of threads in the system.
+
+@datatypes
+#qurt_thread_t
+
+@param[in] iter Iterator handle returned by qurt_thread_iterator_create().
+
+@return
+#THREAD_ITERATOR_END -- iterator has reached the end of the thread list. \n
+Other values indicate a valid thread_id.
+
+@dependencies
+None.
+*/
+static inline qurt_thread_t qurt_thread_iterator_next(int iter)
+{
+    return (qurt_thread_t)qurt_qdi_handle_invoke(iter, QDI_OS_THREAD_ITERATOR_NEXT);
+}
+
+/**@ingroup func_qurt_thread_iterator_destroy
+Cleans up thread iterator resources.
+
+@param[in] iter Iterator handle returned by qurt_thread_iterator_create().
+
+@return
+#QURT_EOK -- Successful completion of operation \n
+#QURT_EFATAL -- Invalid handle passed
+
+@dependencies
+None.
+*/
+static inline int qurt_thread_iterator_destroy(int iter)
+{
+    return qurt_qdi_close(iter);
+}
+
+/**@ingroup func_qurt_thread_context_get_tname
+Gets the name of the thread from the specified thread ID.
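+
+For example, combined with the thread iterator above (an illustrative sketch only):
+@code
+int it = qurt_thread_iterator_create();
+qurt_thread_t tid;
+while ((tid = qurt_thread_iterator_next(it)) != THREAD_ITERATOR_END) {
+    char tname[QURT_MAX_NAME_LEN];
+    if (qurt_thread_context_get_tname((unsigned int)tid, tname, sizeof(tname)) == QURT_EOK) {
+        /* tname holds the NUL-terminated name of thread tid. */
+    }
+}
+(void)qurt_thread_iterator_destroy(it);
+@endcode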
+ +@param[in] thread_id Thread for which name is returned. +@param[in,out] name Pointer to the local buffer where name is copied back. +@param[in] max_len Size of the local buffer. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_tname(unsigned int thread_id, char *name, unsigned char max_len); + +/**@ingroup func_qurt_thread_context_get_prio +Gets the priority for the specified thread. + +@param[in] thread_id Thread for which priority is returned. +@param[in,out] prio Pointer to the local variable where priority is written. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_prio(unsigned int thread_id, unsigned char *prio); + +/**@ingroup func_qurt_thread_context_get_pcycles +Gets pcycles for the specified thread. + +@param[in] thread_id Thread for which processor cycles are returned. +@param[in,out] pcycles Pointer to the local variable where processor cycles are written. + +@return +#QURT_EOK -- Success \n +Failure otherwise. + +@dependencies +None. +*/ +int qurt_thread_context_get_pcycles(unsigned int thread_id, unsigned long long int *pcycles); + +/**@ingroup func_qurt_thread_context_get_stack_base +Gets the stack base address for the specified thread. + +@param[in] thread_id Thread for which stack base address is returned. +@param[in,out] sbase Pointer to the local variable where stack base address is written. + +@return +QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_stack_base(unsigned int thread_id, unsigned int *sbase); + +/**@ingroup func_qurt_thread_context_get_stack_size +Gets the stack size for the specified thread. + +@param[in] thread_id Thread for which stack size is returned. +@param[in,out] ssize Pointer to the local variable where stack size is written. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_stack_size(unsigned int thread_id, unsigned int *ssize); + +/**@ingroup func_qurt_thread_context_get_pid +Gets the process ID for the specified thread. + +@param[in] thread_id Thread for which process ID is returned. +@param[in,out] pid Pointer to the local variable where process id is written. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_pid(unsigned int thread_id, unsigned int *pid); + +/**@ingroup func_qurt_thread_context_get_pname +Gets the process name for the specified thread. + +@param[in] thread_id Represents the thread for which process name is returned. +@param[in, out] name Pointer to the local buffer where process name is copied back. +@param[in] len Length allocated to the local buffer. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_pname(unsigned int thread_id, char *name, unsigned int len); + +/** @addtogroup thread_types +@{ */ +/** Structure that defines how TCB is interpreted to crash dump tools.*/ +/* Keys are defined in consts.h */ +struct qurt_debug_thread_info { +/** @cond */ + char name[QURT_MAX_NAME_LEN]; /**< Name of the thread. */ + struct { + unsigned key; + unsigned val; + } os_info[40]; + unsigned gen_regs[32]; /**< General mode registers. */ + unsigned user_cregs[32]; /**< User mode registers. */ + unsigned guest_cregs[32]; /**< Guest mode registers. */ + unsigned monitor_cregs[64]; /**< Monitor mode registers. 
*/
+/** @endcond */
+}; /* should add up to 1K */
+/** @} */ /* end_addtogroup thread_types */
+
+
+/**@ingroup func_qurt_system_tcb_dump_get
+Gets the contents of the debug thread information structure (TCB dump) for the specified thread.
+
+@datatypes
+#qurt_thread_t
+
+@param[in] thread_id Thread on which the operation must be performed.
+@param[in, out] ptr Pointer to the local buffer where contents are written.
+@param[in] size Size of the debug thread information structure obtained by calling
+                qurt_system_tcb_dump_get_size().
+
+@return
+#QURT_EOK -- Success \n
+Failure otherwise
+
+@dependencies
+None.
+*/
+int qurt_system_tcb_dump_get(qurt_thread_t thread_id, void *ptr, size_t size);
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_THREAD_CONTEXT_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_timer.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_timer.h
new file mode 100755
index 0000000000000..7bdfdb8f3c3df
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_timer.h
@@ -0,0 +1,560 @@
+#ifndef QURT_TIMER_H
+#define QURT_TIMER_H
+/**
+ @file qurt_timer.h
+ @brief Prototypes of qurt_timer API
+
+EXTERNAL FUNCTIONS
+   None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+
+#include "qurt_anysignal.h"
+#include "qurt_signal2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+                        CONSTANTS AND MACROS
+=============================================================================*/
+/**@addtogroup timer_const_macros
+@{ */
+/**
+   Default values.
+*/
+/** @xreflabel{hdr:QURT_TIMER_ONESHOT}*/
+#define QURT_TIMER_DEFAULT_TYPE QURT_TIMER_ONESHOT /**< One shot.*/
+#define QURT_TIMER_DEFAULT_DURATION 1000uL /**< Default duration. */
+#define QURT_TIMER_DEFAULT_EXPIRY 0uL /**< Default expiration. */
+
+/**
+  Conversion from microseconds to timer ticks.
+ */
+#define QURT_TIMER_TIMETICK_FROM_US(us) QURT_SYSCLOCK_TIMETICK_FROM_US(us)
+
+/**
+  Conversion from timer ticks to microseconds at the nominal frequency.
+*/
+#define QURT_TIMER_TIMETICK_TO_US(ticks) qurt_timer_timetick_to_us(ticks)
+
+/** Minimum microseconds value is 100 microseconds (sleep timer).*/
+#define QURT_TIMER_MIN_DURATION 100uL
+
+/**
+  Maximum microseconds value for Qtimer is 1,042,499 hours.
+*/
+#define QURT_TIMER_MAX_DURATION QURT_SYSCLOCK_MAX_DURATION
+
+/**
+  Timer clock for Qtimer is 19.2 MHz.
+*/
+#define QURT_TIMER_MAX_DURATION_TICKS QURT_SYSCLOCK_MAX_DURATION_TICKS
+
+/**
+  Sleep timer error margin for Qtimer is 1,000 ticks ~52 us.
+*/
+#define QURT_TIMETICK_ERROR_MARGIN QURT_SYSCLOCK_ERROR_MARGIN
+
+/*
+  qurt_timer group defines.
+*/
+#define QURT_TIMER_MAX_GROUPS 5U /**< Maximum groups.*/
+#define QURT_TIMER_DEFAULT_GROUP 0U /**< Default group. */
+/** @} */ /* end_addtogroup timer_const_macros */
+
+/** @addtogroup timer_types
+@{ */
+/**
+  QuRT timer types.
+ */
+typedef enum
+{
+    QURT_TIMER_ONESHOT = 0, /**< One shot.*/
+    /** @xreflabel{hdr:QURT_TIMER_PERIODIC}*/
+    QURT_TIMER_PERIODIC /**< Periodic.
*/
+} qurt_timer_type_t;
+
+
+/*=============================================================================
+                        TYPEDEFS
+=============================================================================*/
+
+/** QuRT timer type.*/
+typedef unsigned int qurt_timer_t;
+
+/** QuRT timer duration type. */
+typedef unsigned long long qurt_timer_duration_t;
+
+/** QuRT timer time type. */
+typedef unsigned long long qurt_timer_time_t;
+
+typedef void (*pfn_t)(void);
+/** QuRT timer attribute type. */
+typedef struct
+{
+    /** @cond */
+    unsigned int magic; /**< Magic number used to verify the qurt_timer_attr_t pointer. */
+
+    qurt_timer_duration_t duration; /**< Specifies the duration of the new timer. */
+
+    qurt_timer_time_t expiry; /**< Specifies the absolute expiry of the new timer. */
+
+    qurt_timer_duration_t remaining; /**< Specifies the remaining time of an active timer. */
+
+    qurt_timer_type_t type; /**< Specifies the timer type; only #QURT_TIMER_ONESHOT and
+                                 #QURT_TIMER_PERIODIC are supported. */
+
+    unsigned int group; /**< Group number of the timer; the criterion used to disable or enable the set
+                             of timers. */
+    pfn_t pFn; /**< Callback other than the signal set */
+    /** @endcond */
+}
+qurt_timer_attr_t;
+
+/** @} */ /* end_addtogroup timer_types */
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_timer_stop
+ @xreflabel{sec:qurt_timer_stop}
+ Stops a running timer.
+ The timer must be a one-shot timer.
+
+ @note1hang Restart stopped timers with the timer restart operation,
+            see Section @xref{sec:qurt_timer_restart}.
+
+ @datatypes
+ #qurt_timer_t
+
+ @param[in] timer Timer object.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EINVALID -- Invalid timer ID or duration value. \n
+ #QURT_ENOTALLOWED -- Timer is not a one-shot timer. \n
+ #QURT_EMEM -- Out of memory error.
+
+ @dependencies
+ None.
+ */
+int qurt_timer_stop (qurt_timer_t timer);
+
+/**@ingroup func_qurt_timer_restart
+ @xreflabel{sec:qurt_timer_restart}
+ Restarts a stopped timer with the specified duration. The timer must be a one-shot timer.
+ Timers stop after they have expired or after they are explicitly stopped with qurt_timer_stop().
+ A restarted timer expires after the specified duration, where the starting time is when the function is called.
+
+ @note1hang Timers stop after they have expired or after they are explicitly
+            stopped with the timer stop operation, see Section @xref{sec:qurt_timer_stop}.
+
+ @datatypes
+ #qurt_timer_t \n
+ #qurt_timer_duration_t
+
+ @param[in] timer    Timer object.
+ @param[in] duration Timer duration (in microseconds) before the restarted timer
+                     expires again.
+                     The valid range is #QURT_TIMER_MIN_DURATION to
+                     #QURT_TIMER_MAX_DURATION.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EINVALID -- Invalid timer ID or duration value. \n
+ #QURT_ENOTALLOWED -- Timer is not a one-shot timer. \n
+ #QURT_EMEM -- Out-of-memory error.
+
+ @dependencies
+ None.
+ */
+int qurt_timer_restart (qurt_timer_t timer, qurt_timer_duration_t duration);
+
+
+/**@ingroup func_qurt_timer_create
+ Creates a timer.\n
+ Allocates and initializes a timer object, and starts the timer.
+
+ @note1hang A timer event handler must be defined to wait on the specified signal
+            to handle the timer event.
+
+ @datatypes
+ #qurt_timer_t \n
+ #qurt_timer_attr_t \n
+ #qurt_anysignal_t
+
+ @param[out] timer  Pointer to the created timer object.
+ @param[in]  attr   Pointer to the timer attribute structure.
+ @param[in]  signal Pointer to the signal object set when the timer expires.
+ @param[in]  mask   Signal mask, which specifies the signal to set in the signal object when the
+                    timer expires.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EMEM -- Not enough memory to create the timer. \n
+ #QURT_EINVALID -- One of the arguments in the attr field is invalid. \n
+ Other error code -- Operation failed. \n
+
+ @dependencies
+ None.
+ */
+int qurt_timer_create (qurt_timer_t *timer, const qurt_timer_attr_t *attr,
+                       const qurt_anysignal_t *signal, unsigned int mask);
+
+/* Variant of qurt_timer_create() that takes a qurt_signal2_t signal object. */
+int qurt_timer_create_sig2 (qurt_timer_t *timer, const qurt_timer_attr_t *attr,
+                            const qurt_signal2_t *signal, unsigned int mask);
+
+/**@ingroup func_qurt_timer_attr_init
+ Initializes the specified timer attribute structure with default attribute values: \n
+ - Timer duration -- #QURT_TIMER_DEFAULT_DURATION (Section @xref{dox:timers}) \n
+ - Timer type -- #QURT_TIMER_ONESHOT \n
+ - Timer group -- #QURT_TIMER_DEFAULT_GROUP
+
+ @datatypes
+ #qurt_timer_attr_t
+
+ @param[in,out] attr Pointer to the destination structure for the timer attributes.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+void qurt_timer_attr_init(qurt_timer_attr_t *attr);
+
+
+/*Tech Comm note: removed qurt_timer_attr_set_pfn from documentation 9/10/2020
+@ingroup func_qurt_timer_attr_set_pfn
+
+ @datatypes
+ #qurt_timer_attr_t
+
+ @param[in,out] attr Pointer to the destination structure for the timer attributes.
+ @param[in]     pFn  pFn.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+void qurt_timer_attr_set_pfn(qurt_timer_attr_t *attr, pfn_t pFn);
+
+
+/**@ingroup func_qurt_timer_attr_set_duration
+ Sets the timer duration in the specified timer attribute structure.\n
+
+ The timer duration specifies the interval (in microseconds) between the creation of the
+ timer object and the generation of the corresponding timer event.
+
+ The timer duration value must be between #QURT_TIMER_MIN_DURATION and
+ #QURT_TIMER_MAX_DURATION (Section @xref{dox:timers}). Otherwise, the set operation is ignored.
+
+ @datatypes
+ #qurt_timer_attr_t \n
+ #qurt_timer_duration_t
+
+ @param[in,out] attr     Pointer to the timer attribute structure.
+ @param[in]     duration Timer duration (in microseconds).
+                         Valid range is #QURT_TIMER_MIN_DURATION to
+                         #QURT_TIMER_MAX_DURATION.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+void qurt_timer_attr_set_duration(qurt_timer_attr_t *attr, qurt_timer_duration_t duration);
+
+/**@ingroup func_qurt_timer_attr_set_expiry
+ Sets the absolute expiry time in the specified timer attribute structure.\n
+ The timer expiry specifies the absolute time (in microseconds) of the generation of the
+ corresponding timer event.\n
+ Timer expiries are relative to when the system first began executing.
+
+ @datatypes
+ #qurt_timer_attr_t \n
+ #qurt_timer_time_t
+
+ @param[in,out] attr Pointer to the timer attribute structure.
+ @param[in]     time Timer expiry.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+void qurt_timer_attr_set_expiry(qurt_timer_attr_t *attr, qurt_timer_time_t time);
+
+/**@ingroup func_qurt_timer_attr_get_duration
+ Gets the timer duration from the specified timer attribute structure.
+ The value returned is the duration that was originally set for the timer.
+
+ @note1hang This function does not return the remaining time of an active timer;
+            use qurt_timer_attr_get_remaining() to get the remaining time.
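+
+ For context, a typical one-shot setup that these attribute calls support
+ (an illustrative sketch only; the 1000 us duration and signal mask are arbitrary):
+ @code
+ qurt_timer_attr_t attr;
+ qurt_anysignal_t  sig;
+ qurt_timer_t      timer;
+
+ qurt_anysignal_init(&sig);
+ qurt_timer_attr_init(&attr);
+ qurt_timer_attr_set_type(&attr, QURT_TIMER_ONESHOT);
+ qurt_timer_attr_set_duration(&attr, 1000uL);
+ if (qurt_timer_create(&timer, &attr, &sig, 0x1u) == QURT_EOK) {
+     (void)qurt_anysignal_wait(&sig, 0x1u);   /* block until the timer fires */
+     (void)qurt_timer_delete(timer);
+ }
+ @endcode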
+ + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_duration_t + + @param[in] attr Pointer to the timer attributes object + @param[out] duration Pointer to the destination variable for timer duration. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_get_duration(qurt_timer_attr_t *attr, qurt_timer_duration_t *duration); + +/**@ingroup func_qurt_timer_attr_get_remaining + Gets the timer remaining duration from the specified timer attribute structure. \n + + The timer remaining duration indicates (in microseconds) how much time remains before + the generation of the next timer event on the corresponding timer. + In most cases this function assumes that the timer attribute structure was obtained by + calling qurt_timer_get_attr(). + + @note1hang This attribute is read-only and thus has no set operation defined for it. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_duration_t + + @param[in] attr Pointer to the timer attribute object. + @param[out] remaining Pointer to the destination variable for remaining time. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_get_remaining(qurt_timer_attr_t *attr, qurt_timer_duration_t *remaining); + +/**@ingroup func_qurt_timer_attr_set_type + Sets the timer type in the specified timer attribute structure. + + The timer type specifies the functional behavior of the timer: \n + - A one-shot timer (#QURT_TIMER_ONESHOT) waits for the specified timer duration + and then generates a single timer event. After this the timer is nonfunctional. \n + - A periodic timer (#QURT_TIMER_PERIODIC) repeatedly waits for the specified + timer duration and then generates a timer event. The result is a series of timer + events with interval equal to the timer duration. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_type_t + + @param[in,out] attr Pointer to the timer attribute structure. + @param[in] type Timer type. Values are: \n + - #QURT_TIMER_ONESHOT -- One-shot timer. \n + - #QURT_TIMER_PERIODIC -- Periodic timer. @tablebulletend + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_type(qurt_timer_attr_t *attr, qurt_timer_type_t type); + +/**@ingroup func_qurt_timer_attr_get_type + Gets the timer type from the specified timer attribute structure. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_type_t + + @param[in] attr Pointer to the timer attribute structure. + @param[out] type Pointer to the destination variable for the timer type. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_get_type(qurt_timer_attr_t *attr, qurt_timer_type_t *type); + +/**@ingroup func_qurt_timer_attr_set_group + Sets the timer group identifier in the specified timer attribute structure.\n + The timer group identifier specifies the group that the timer belongs to. Timer groups are + used to enable or disable one or more timers in a single operation. \n + The timer group identifier value must be between 0 and (#QURT_TIMER_MAX_GROUPS - 1). + See Section @xref{dox:timers}. + + @datatypes + #qurt_timer_attr_t + + @param[in,out] attr Pointer to the timer attribute object. + @param[in] group Timer group identifier; + Valid range is 0 to (#QURT_TIMER_MAX_GROUPS - 1). + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_group(qurt_timer_attr_t *attr, unsigned int group); + +/**@ingroup func_qurt_timer_attr_get_group + Gets the timer group identifier from the specified timer attribute structure. 
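+
+ Timer groups are typically exercised as follows (an illustrative sketch only;
+ group 1 is an arbitrary choice, and attr is assumed to be initialized):
+ @code
+ qurt_timer_attr_set_group(&attr, 1u);   /* assign the timer to group 1 */
+ /* ... create the timer with these attributes ... */
+ (void)qurt_timer_group_disable(1u);     /* suspend all group-1 timers */
+ (void)qurt_timer_group_enable(1u);      /* enable them again */
+ @endcode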
+
+ @datatypes
+ #qurt_timer_attr_t
+
+ @param[in]  attr  Pointer to the timer attribute structure.
+ @param[out] group Pointer to the destination variable for the timer group identifier.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+void qurt_timer_attr_get_group(qurt_timer_attr_t *attr, unsigned int *group);
+
+/**@ingroup func_qurt_timer_get_attr
+ @xreflabel{hdr:qurt_timer_get_attr}
+ Gets the timer attributes that were specified when the timer was created.
+
+ @datatypes
+ #qurt_timer_t \n
+ #qurt_timer_attr_t
+
+ @param[in]  timer Timer object.
+ @param[out] attr  Pointer to the destination structure for timer attributes.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EVAL -- Argument passed is not a valid timer.
+
+ @dependencies
+ None.
+ */
+int qurt_timer_get_attr(qurt_timer_t timer, qurt_timer_attr_t *attr);
+
+/**@ingroup func_qurt_timer_delete
+ Deletes the timer.\n
+ Destroys the specified timer and deallocates the timer object.
+
+ @datatypes
+ #qurt_timer_t
+
+ @param[in] timer Timer object.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EVAL -- Argument passed is not a valid timer.
+
+ @dependencies
+ None.
+ */
+int qurt_timer_delete(qurt_timer_t timer);
+
+/**@ingroup func_qurt_timer_sleep
+ Suspends the current thread for the specified amount of time.
+ The sleep duration value must be between #QURT_TIMER_MIN_DURATION and
+ #QURT_TIMER_MAX_DURATION (Section @xref{dox:timers}).
+
+ @datatypes
+ #qurt_timer_duration_t
+
+ @param[in] duration Interval (in microseconds) between when the thread is suspended
+                     and when it is re-awakened.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EMEM -- Not enough memory to perform the operation.
+
+ @dependencies
+ None.
+ */
+
+int qurt_timer_sleep(qurt_timer_duration_t duration);
+
+/**@ingroup func_qurt_timer_group_disable
+ Disables all timers that are assigned to the specified timer group.
+ If a specified timer is already disabled, it is ignored.
+ If a specified timer has expired, it is not processed.
+ If the specified timer group is empty, nothing is done.
+
+ @note1hang When a timer is disabled its remaining time does not change, thus it
+            cannot generate a timer event.
+
+ @param[in] group Timer group identifier.
+
+ @return
+ #QURT_EOK -- Success.
+
+ @dependencies
+ None.
+ */
+int qurt_timer_group_disable (unsigned int group);
+
+/**@ingroup func_qurt_timer_group_enable
+ Enables all timers that are assigned to the specified timer group.
+ If a specified timer is already enabled, it is ignored.
+ If a specified timer has expired, it is processed.
+ If the specified timer group is empty, nothing is done.
+
+ @param[in] group Timer group identifier.
+
+ @return
+ #QURT_EOK -- Success.
+
+ @dependencies
+ None.
+ */
+int qurt_timer_group_enable (unsigned int group);
+
+
+/**
+  Notifies the timer server of recovery from power collapse. The server
+  must account for any missed interrupts during power collapse.
+ */
+void qurt_timer_recover_pc (void);
+
+/**
+  Determines whether the Qtimer is initialized.
+
+  @return
+  0 -- Not initialized. \n
+  Nonzero -- Initialized.
+ */
+static inline int qurt_timer_is_init (void) {return 1;}
+
+/**@ingroup func_qurt_timer_get_ticks
+ Gets the current ticks. The ticks are accumulated since the RTOS
+ started. Each tick is equal to a single timer clock
+ cycle, where the frequency is 32 kHz on RGPT or 19.2 MHz on Qtimer.
+
+ @return
+ Ticks since the system started.
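+
+ For example (sketch; uses the tick-to-microsecond conversion macro defined in
+ this header):
+ @code
+ unsigned long long ticks = qurt_timer_get_ticks();
+ unsigned long long us    = QURT_TIMER_TIMETICK_TO_US(ticks);  /* ticks -> us */
+ @endcode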
+ */
+unsigned long long qurt_timer_get_ticks (void);
+
+#define qurt_timer_timetick_from_us(us) QURT_SYSCLOCK_TIMETICK_FROM_US(us)
+
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_TIMER_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_tlb.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_tlb.h
new file mode 100755
index 0000000000000..b1b2d261d31c0
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_tlb.h
@@ -0,0 +1,215 @@
+#ifndef QURT_TLB_H
+#define QURT_TLB_H
+
+/**
+  @file qurt_tlb.h
+  @brief Prototypes of TLB API
+  The TLB APIs allow explicit control of the portion of the TLB between TLB_first_replaceable and TLB_LAST_REPLACEABLE.
+  Both are nonconfigurable for the time being. This portion of the TLB is permanently assigned/locked unless manually removed
+  by qurt_tlb_remove. Implementation does not change depending on the configuration, such as whether CONFIG_STATIC is set or not.
+  In CONFIG_STATIC=y, TLB_LAST_REPLACEABLE is set to the last TLB index, which indicates that the entire TLB is permanently
+  assigned and is not backed up by a page table (the page table does not exist). TLB indices are maintained through a 64-bit bitmask.
+  A new entry is placed in the first available slot.
+
+EXTERNAL FUNCTIONS
+   None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2013, 2021, 2023
+All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+=============================================================================*/
+
+#include
+
+#ifdef __cplusplus
extern "C" {
+#endif
+
+
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_tlb_entry_create
+ Creates a new TLB entry with the specified mapping attributes in the TLB of the Hexagon processor. \n
+ @note1hang If the specified attributes are not valid (such as if the address is not aligned with the
+            size), the entry is not created, and an error result is returned.\n
+ @note1cont To set the G bit in the new TLB entry, set the ASID argument to -1.
+
+ @datatypes
+ #qurt_addr_t \n
+ #qurt_paddr_t \n
+ #qurt_mem_cache_mode_t \n
+ #qurt_perm_t
+
+ @param[out] entry_id      TLB entry identifier.
+ @param[in]  vaddr         Virtual memory address.
+ @param[in]  paddr         Physical memory address.
+ @param[in]  size          Size of memory region to map (in bytes).
+ @param[in]  cache_attribs Cache mode (writeback, and so on).
+ @param[in]  perms         Access permissions.
+ @param[in]  asid          ASID (space ID).
+
+ @return
+ #QURT_EOK -- TLB entry successfully created.\n
+ #QURT_EFATAL -- Entry is not created; the TLB is full. \n
+ #QURT_ETLBCREATESIZE -- Entry is not created; the incorrect size was specified. \n
+ #QURT_ETLBCREATEUNALIGNED -- Entry is not created; an unaligned address was specified. \n
+ #QURT_EINVALID -- Invalid cache attributes / permissions provided.
+
+ */
+int qurt_tlb_entry_create (unsigned int *entry_id, qurt_addr_t vaddr, qurt_paddr_t paddr, qurt_size_t size, qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perms, int asid);
+
+/**@ingroup func_qurt_tlb_entry_create_64
+ Creates a new TLB entry with the specified mapping attributes in the TLB of the Hexagon processor.
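+
+ Usage sketch (illustrative only; the addresses, size, and attributes shown are
+ placeholders and must be valid, properly aligned values on a real target):
+ @code
+ unsigned int id;
+ int rc = qurt_tlb_entry_create_64(&id, 0x20000000u, 0x120000000ull, 0x1000u,
+                                   QURT_MEM_CACHE_WRITEBACK,
+                                   QURT_PERM_READ | QURT_PERM_WRITE, -1);
+ if (rc == QURT_EOK) {
+     /* Mapping installed; remove it later with qurt_tlb_entry_delete(id). */
+ }
+ @endcode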
\n + @note1hang If the specified attributes are not valid (the address is not aligned with the + size), the entry is not created, and an error result is returned.\n + @note1cont To set the G bit in the new TLB entry, set the asid argument to -1. + + @param[out] entry_id TLB entry identifier. + @param[in] vaddr Virtual memory address. + @param[in] paddr_64 64-bit physical memory address. + @param[in] size Size of memory region to map (in bytes). + @param[in] cache_attribs Cache mode (writeback, and so on). + @param[in] perms Access permissions. + @param[in] asid ASID (space ID). + + @return + #QURT_EOK -- TLB entry successfully created.\n + #QURT_EFATAL -- Entry was not created; the TLB is full. \n + #QURT_ETLBCREATESIZE -- Entry was not created; the incorrect size was specified. \n + #QURT_ETLBCREATEUNALIGNED -- Entry was not created; an unaligned address was specified. \n + #QURT_EINVALID -- Invalid cache attributes / permissions provided. + + */ +int qurt_tlb_entry_create_64 (unsigned int *entry_id, qurt_addr_t vaddr, qurt_paddr_64_t paddr_64, qurt_size_t size, qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perms, int asid); + +/**@ingroup func_qurt_tlb_entry_delete + Deletes the specified TLB entry from the TLB of the Hexagon processor. + If the specified entry does not exist, no deletion occurs and an error result is returned. + + @param[in] entry_id TLB entry identifier. + + @return + #QURT_EOK -- TLB entry successfully deleted. \n + #QURT_EFATAL -- TLB entry does not exist. + + @dependencies + None. + **/ +int qurt_tlb_entry_delete (unsigned int entry_id); + +/**@ingroup func_qurt_tlb_entry_query + Searches for the specified TLB entry in the TLB of the Hexagon processor. + If the TLB entry is found, its entry identifier is returned. + + @datatypes + #qurt_addr_t + + @param[out] entry_id TLB entry identifier. + @param[in] vaddr Virtual memory address. + @param[in] asid ASID (space ID). + + @return + #QURT_EOK -- TLB entry successfully returned. \n + #QURT_EFATAL -- TLB entry does not exist. + + @dependencies + None. + **/ +int qurt_tlb_entry_query (unsigned int *entry_id, qurt_addr_t vaddr, int asid); + +/**@ingroup func_qurt_tlb_entry_set + Sets the TLB entry by storing an entry at the specified location + in the TLB of the Hexagon processor. + + @param[in] entry_id TLB entry identifier. + @param[in] entry 64-bit TLB entry to store. + + @return + #QURT_EOK -- Entry successfully stored in the TLB. \n + #QURT_EFATAL -- Entry not set at the specified location. + + @dependencies + None. + **/ +int qurt_tlb_entry_set (unsigned int entry_id, unsigned long long int entry); + +/**@ingroup func_qurt_tlb_entry_get + Gets the TLB entry. \n + Returns the specified 64-bit TLB entry in the TLB of the Hexagon processor. + + @param[in] entry_id TLB entry identifier. + @param[out] entry 64-bit TLB entry. + + @return + #QURT_EOK -- TLB entry successfully returned. \n + #QURT_EFATAL -- TLB entry does not exist. + + @dependencies + None. + **/ +int qurt_tlb_entry_get (unsigned int entry_id, unsigned long long int *entry); + +/**@ingroup func_qurt_tlb_get_pager_physaddrs + Searches the TLB of the Hexagon processor, and returns all physical addresses that belong to the pager. + Each returned address indicates the starting address of an active page. + +The function return value indicates the number of addresses returned. + + @param[out] pager_phys_addrs Pointer to the return array of pager physical addresses. + + @return + Integer -- Number of addresses returned in array. + + @dependencies + None. 
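+
+ A call sketch (illustrative only; the ownership and lifetime of the returned
+ array are assumed to be managed by the kernel):
+ @code
+ unsigned int *addrs = NULL;
+ unsigned int count = qurt_tlb_get_pager_physaddr(&addrs);
+ for (unsigned int i = 0; i < count; i++) {
+     /* addrs[i] is the starting physical address of an active page. */
+ }
+ @endcode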
+*/ + +unsigned int qurt_tlb_get_pager_physaddr(unsigned int** pager_phys_addrs); + +/**@ingroup func_qurt_tlb_get_pager_virtaddr + Searches the TLB of the Hexagon processor, and returns all virtual addresses that belong to the pager. + Each returned address indicates the starting address of an active page. + +The function return value indicates the number of addresses returned. + + @param[out] pager_virt_addrs Pointer to the return array of pager virtual addresses. + + @return + Integer -- Number of addresses returned in the array. + + @dependencies + None. +*/ + +unsigned int qurt_tlb_get_pager_virtaddr(unsigned int** pager_virt_addrs); + + +/**@ingroup func_qurt_tlb_entry_set2 + Sets the TLB entry by storing an entry at the specified location + in the TLB of the Hexagon processor. An additional option can be passed + to lock the TLB entry in the TLB of the Hexagon processor. + + @param[in] id TLB entry identifier. + @param[in] tlb 64-bit TLB entry to store. + @param[in] lock Nonzero value indicates that the TLB entry must be locked in the hardware TLB. + + @return + #QURT_EOK -- Entry successfully stored in the TLB. \n + #QURT_EFATAL -- Entry not set at the specified location. + + @dependencies + None. + **/ +int qurt_tlb_entry_set2(unsigned id, unsigned long long tlb, unsigned lock); + + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_TLB_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_tls.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_tls.h new file mode 100755 index 0000000000000..6ec3b39ff5cb0 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_tls.h @@ -0,0 +1,100 @@ +#ifndef QURT_TLS_H +#define QURT_TLS_H +/** + @file qurt_tls.h + @brief Prototypes of TLS APIs + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + + +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_tls_create_key + @xreflabel{sec:tls_create_key} + Creates a key for accessing a thread local storage data item.\n + Subsequent get and set operations use the key value. + + @note1hang The destructor function performs any clean-up operations needed by a thread + local storage item when its containing thread is deleted (Section @xref{sec:qurt_thread_exit}). + + @param[out] key Pointer to the newly created thread local storage key value. + @param[in] destructor Pointer to the key-specific destructor function. Passing NULL + specifies that no destructor function is defined for the key. + + @return + #QURT_EOK -- Key successfully created. \n + #QURT_ETLSAVAIL -- No free TLS key available. + + @dependencies + None. + */ +int qurt_tls_create_key (int *key, void (*destructor)(void *)); + +/**@ingroup func_qurt_tls_set_specific + Stores a data item to thread local storage along with the specified key. + + @param[in] key Thread local storage key value. + @param[in] value Pointer to user data value to store. + + @return + #QURT_EOK -- Data item successfully stored. \n + #QURT_EINVALID -- Invalid key. \n + #QURT_EFAILED -- Invoked from a non-thread context. 
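+
+ A typical key lifecycle for reference (an illustrative sketch only; the stored
+ value is hypothetical):
+ @code
+ int key;
+ if (qurt_tls_create_key(&key, NULL) == QURT_EOK) {   /* no destructor */
+     static int my_state = 42;                        /* hypothetical per-thread data */
+     (void)qurt_tls_set_specific(key, &my_state);
+     int *p = (int *)qurt_tls_get_specific(key);
+     /* ... use p ... */
+     (void)qurt_tls_delete_key(key);
+ }
+ @endcode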
+ */ +int qurt_tls_set_specific (int key, const void *value); + +/**@ingroup func_qurt_tls_get_specific + Loads the data item from thread local storage. \n + Returns the data item that is stored in thread local storage with the specified key. + The data item is always a pointer to user data. + + @param[in] key Thread local storage key value. + + @return + Pointer -- Data item indexed by key in thread local storage. \n + 0 (NULL) -- Key out of range. + + @dependencies + None. + */ +void * __attribute__((section(".text.qurt_tls_get_specific "))) qurt_tls_get_specific (int key); + + +/**@ingroup func_qurt_tls_delete_key + Deletes the specified key from thread local storage. + + @note1hang Explicitly deleting a key does not execute any destructor function that is + associated with the key (Section @xref{sec:tls_create_key}). + + @param[in] key Thread local storage key value to delete. + + @return + #QURT_EOK -- Key successfully deleted. \n + #QURT_ETLSENTRY -- Key already free. + + @dependencies + None. + */ +int qurt_tls_delete_key (int key); + + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_TLS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_trace.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_trace.h new file mode 100755 index 0000000000000..541f8f1d34bf6 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_trace.h @@ -0,0 +1,317 @@ +#ifndef QURT_TRACE_H +#define QURT_TRACE_H +/** + @file qurt_trace.h + @brief Prototypes of system call tracing helpers API + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021-2023 by Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + GLOBAL VARIABLES +=============================================================================*/ +/** @cond internal_only */ +/** @addtogroup etm_macros +@{ */ +/* ETM trace types. */ +#define QURT_ETM_TYPE_PC_ADDR (1U<<0) /**< PC address.*/ +#define QURT_ETM_TYPE_MEMORY_ADDR (1U<<1) /**< Memory address. */ +#define QURT_ETM_TYPE_TESTBUS (1U<<2) /**< Test bus. */ +#define QURT_ETM_TYPE_CYCLE_ACCURATE (1U<<3) /**< Cycle accurate. */ +#define QURT_ETM_TYPE_CYCLE_COARSE (1U<<4) /**< Cycle coarse. */ +#define QURT_ETM_TYPE_PC_AND_MEMORY_ADDR (QURT_ETM_TYPE_PC_ADDR|QURT_ETM_TYPE_MEMORY_ADDR) /**< PC and memory address. */ +#define QURT_ETM_TYPE_PC_ADDR_AND_TESTBUS (QURT_ETM_TYPE_PC_ADDR|QURT_ETM_TYPE_TESTBUS) /**< PC address and test bus. */ +#define QURT_ETM_TYPE_MEMORY_ADDR_AND_TESTBUS (QURT_ETM_TYPE_MEMORY_ADDR|QURT_ETM_TYPE_TESTBUS) /**< Memory address and test bus.*/ +#define QURT_ETM_TYPE_PC_AND_MEMORY_ADDR_AND_TESTBUS (QURT_ETM_TYPE_PC_ADDR|QURT_ETM_TYPE_MEMORY_ADDR|QURT_ETM_TYPE_TESTBUS) /**< PC, memory address, and test bus. */ + +/* ETM routes. */ +#define QURT_ETM_ROUTE_TO_QDSS 0U /**< ETM route to QDSS. */ +#define QURT_ETM_ROUTE_TO_Q6ETB 1U /**< ETM route to Q6ETB. */ + +/* ETM filters. */ +#define QURT_ETM_TRACE_FILTER_ALL_DEFAULT 0U /*< Filter all as default. */ +#define QURT_ETM_TRACE_FILTER_HNUM0 (1U<<0) /*< Filter HNUM0. */ +#define QURT_ETM_TRACE_FILTER_HNUM1 (1U<<1) /*< Filter HNUM1. */ +#define QURT_ETM_TRACE_FILTER_HNUM2 (1U<<2) /*< Filter HNUM2. 
*/ +#define QURT_ETM_TRACE_FILTER_HNUM3 (1U<<3) /*< Filter HNUM3. */ +#define QURT_ETM_TRACE_FILTER_HNUM4 (1U<<4) /*< Filter HNUM4. */ +#define QURT_ETM_TRACE_FILTER_HNUM5 (1U<<5) /*< Filter HNUM5. */ +#define QURT_ETM_TRACE_FILTER_HNUM6 (1U<<6) /*< Filter HNUM6. */ +#define QURT_ETM_TRACE_FILTER_HNUM7 (1U<<7) /*< Filter HNUM7. */ +#define QURT_ETM_TRACE_FILTER_HNUM8 (1U<<8) /*< Filter HNUM8. */ +#define QURT_ETM_TRACE_FILTER_HNUM9 (1U<<9) /*< Filter HNUM9. */ +#define QURT_ETM_TRACE_FILTER_HNUM10 (1U<<10) /*< Filter HNUM10. */ +#define QURT_ETM_TRACE_FILTER_HNUM11 (1U<<11) /*< Filter HNUM11. */ +#define QURT_ETM_TRACE_FILTER_HNUM12 (1U<<12) /*< Filter HNUM12. */ +#define QURT_ETM_TRACE_FILTER_HNUM13 (1U<<13) /*< Filter HNUM13. */ +#define QURT_ETM_TRACE_FILTER_HNUM14 (1U<<14) /*< Filter HNUM14. */ +#define QURT_ETM_TRACE_FILTER_HNUM15 (1U<<15) /*< Filter HNUM15. */ +#define QURT_ETM_TRACE_FILTER_ALL QURT_ETM_TRACE_FILTER_ALL_DEFAULT + +#define QURT_ETM_TRACE_FILTER_CLUSTER0 (1<<16) /*< Filter trace cluster0 address. */ +#define QURT_ETM_TRACE_FILTER_CLUSTER1 (1<<17) /*< Filter trace cluster1 address. */ +#define QURT_ETM_TRACE_FILTER_PC_RANGE (1<<19) /*< Filter PC address range. */ + +/* ETM memory source - PC or data access */ +#define QURT_ETM_SOURCE_PC 0U /**< ETM memory source of SAC* is PC. */ +#define QURT_ETM_SOURCE_DATA 1U /**< ETM memory source of SAC* is data. */ + +/* Period between synchronization traces */ +#define QURT_ETM_ASYNC_PERIOD 0 /**< Async.*/ +#define QURT_ETM_ISYNC_PERIOD 1 /**< Isync.*/ +#define QURT_ETM_GSYNC_PERIOD 2 /**< Gsync. */ + +/* ETM enable flags */ +#define QURT_ETM_OFF 0U /**< ETM off. */ +#define QURT_ETM_ON 1U /**< ETM on. */ +/** @endcond */ +/** @} */ /* end_addtogroup etm_macros */ + +/** @addtogroup function_tracing_macro +@{ */ +/* ETM setup return values */ +#define QURT_ETM_SETUP_OK 0 /**< ETM setup OK. */ +#define QURT_ETM_SETUP_ERR 1 /**< ETM setup error. */ +/** @} */ /* end_addtogroup function_tracing_macro */ +/* ETM breakpoint types */ +#define QURT_ETM_READWRITE_BRKPT 0U /**< ETM read/write breakpoint. */ +#define QURT_ETM_READ_BRKPT 1U /**< ETM read breakpoint. */ +#define QURT_ETM_WRITE_BRKPT 2U /**< ETM write breakpoint. */ +#define QURT_ETM_BRKPT_INVALIDATE 3U /**< Invalidate breakpoint. */ +/** @addtogroup function_tracing_macro +@{ */ +/* ATB status flags */ +#define QURT_ATB_OFF 0 /**< ATB off. */ +#define QURT_ATB_ON 1 /**< ATB on. */ +/** @} */ /* end_addtogroup function_tracing_macro */ +/* DTM enable flags */ +#define QURT_DTM_OFF 0 /**< DTM off. */ +#define QURT_DTM_ON 1 /**< DTM on. */ + +/** @addtogroup function_tracing_datatypes +@{ */ +/**STM trace information. */ +typedef struct qurt_stm_trace_info { + /** @cond */ + unsigned int stm_port_addr[6]; /* STM port address to which trace data must be written.*/ + unsigned int thread_event_id; /* Event ID for context switches.*/ + unsigned int interrupt_event_id; /* Event ID for interrupts. */ + unsigned int marker; /* Marker value that must be written at the beginning of the trace. */ + /** @endcond */ +} qurt_stm_trace_info_t; +/** @} */ /* end_addtogroup function_tracing_datatypes */ +/*============================================================================= + GLOBAL FUNCTIONS +=============================================================================*/ + + +/**@ingroup func_qurt_trace_get_marker + Gets the kernel trace marker.\n + Returns the current value of the kernel trace marker. 
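+
+ For example, paired with qurt_trace_changed() (an illustrative sketch only; the
+ observed block is hypothetical, and mask 0x3 covers interrupt and
+ context-switch events):
+ @code
+ unsigned int marker = qurt_trace_get_marker();
+ critical_block();                        /* hypothetical code under observation */
+ if (qurt_trace_changed(marker, 0x3u) == 1) {
+     /* An interrupt or context switch was recorded during critical_block(). */
+ }
+ @endcode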
+ The marker consists of a hardware thread identifier and an index into the kernel trace
+ buffer. The trace buffer records kernel events.
+
+ @note1hang Using this function with qurt_trace_changed()
+            determines whether certain kernel events occurred in a block of code.
+
+ @return
+ Integer -- Kernel trace marker.
+
+ @dependencies
+ None.
+*/
+unsigned int qurt_trace_get_marker(void);
+
+/**@ingroup func_qurt_trace_changed
+ Determines whether specific kernel events have occurred. \n
+ Returns a value that indicates whether the specified kernel events are recorded in the
+ kernel trace buffer since the specified kernel trace marker was obtained.
+
+ The prev_trace_marker parameter specifies a kernel trace marker that was obtained by calling
+ qurt_trace_get_marker().
+ @cond rest_dist For more information on the mask value, see the description of the trace_mask element in
+ @xhyperref{80VB41992,80-VB419-92}. \n @endcond
+
+ @note1hang Used with qurt_trace_get_marker(), this function determines whether
+            certain kernel events occurred in a block of code.\n
+ @note1cont This function cannot determine whether a specific kernel event type has
+            occurred unless that event type has been enabled in the trace_mask element
+            of the system configuration file. \n
+ @note1cont QuRT supports the recording of interrupt and context switch events only (such as
+            a trace_mask value of 0x3).
+
+ @param[in] prev_trace_marker Previous kernel trace marker.
+ @param[in] trace_mask        Mask value that indicates which kernel events to check for.
+
+ @returns
+ 1 -- Kernel events of the specified type have occurred since the
+      specified trace marker was obtained.\n
+ 0 -- No kernel events of the specified type have occurred since the
+      specified trace marker was obtained.
+
+ @dependencies
+ None.
+*/
+int qurt_trace_changed(unsigned int prev_trace_marker, unsigned int trace_mask);
+
+/*=============================================================================
+                        CONSTANTS AND MACROS
+=============================================================================*/
+/** @addtogroup function_tracing_macro
+@{ */
+#ifndef QURT_DEBUG
+#define QURT_TRACE(str, ...) __VA_ARGS__
+    /**< Function tracing is implemented with the QURT_TRACE debug macro, which
+     optionally generates printf statements both before and after every function call that is
+     passed as a macro argument.
+
+     For example, the following macro call in the source code:
+     @code
+     QURT_TRACE(myfunc, my_func(33))
+
+     @endcode
+     generates the following debug output:
+     @code
+     myfile:nnn: my_func >>> calling my_func(33)
+     myfile:nnn: my_func <<< my_func(33) returned
+     @endcode
+     The debug output includes the source file and line number of the function call, along with
+     the text of the call. Compile the client source file with -D __FILENAME__
+     defined for its file name.
+
+     The library function qurt_printf() generates the debug output.
+     The QURT_DEBUG symbol controls generation of the debug output. If this symbol is
+     not defined, function tracing is not generated.\n
+     @note1hang The debug macro is accessed through the QuRT API header file.
+    */
+#else
+#define QURT_TRACE(str, ...) \
+    do { \
+        qurt_printf("%s:%d: %s: >>> calling %s\n",__FILENAME__,__LINE__,(str),#__VA_ARGS__); \
+        __VA_ARGS__; \
+        qurt_printf("%s:%d: %s: <<< %s returned\n",__FILENAME__,__LINE__,(str),#__VA_ARGS__); \
+    } while (0)
+#endif
+/** @} */ /* end_addtogroup function_tracing_macro */
+
+/**@ingroup func_qurt_etm_set_pc_range
+ Sets the PC address range for ETM filtering.
+ Depending on the Hexagon core design, a maximum of four PC ranges are supported.
+
+ @param[in] range_num 0 to 3.
+ @param[in] low_addr  Lower boundary of PC address range.
+ @param[in] high_addr Higher boundary of PC address range.
+
+ @returns
+ #QURT_ETM_SETUP_OK -- Success. \n
+ #QURT_ETM_SETUP_ERR -- Failure.
+
+ @dependencies
+ None.
+*/
+unsigned int qurt_etm_set_pc_range(unsigned int range_num, unsigned int low_addr, unsigned int high_addr);
+
+/**@ingroup func_qurt_etm_set_range
+ Sets the address range for ETM filtering.
+ Allows the user to select the source type of the addresses: #QURT_ETM_SOURCE_PC or #QURT_ETM_SOURCE_DATA.
+
+ @param[in] addr_source_type Type of the address source:\n
+            - #QURT_ETM_SOURCE_PC \n
+            - #QURT_ETM_SOURCE_DATA @tablebulletend
+ @param[in] trig_block_num 0 to 3.
+ @param[in] pid PID of the process: \n
+            - Any valid PID number enables ASID-based trace filtering. \n
+            - QURT_ETM_NO_PID -- Disables ASID-based trace filtering. @tablebulletend
+ @param[in] low_addr  Lower boundary of PC address range.
+ @param[in] high_addr Higher boundary of PC address range.
+
+ @returns
+ #QURT_ETM_SETUP_OK -- Success. \n
+ #QURT_ETM_SETUP_ERR -- Failure.
+
+ @dependencies
+ None.
+*/
+unsigned int qurt_etm_set_range(unsigned int addr_source_type, unsigned int trig_block_num, unsigned int pid, unsigned int low_addr, unsigned int high_addr);
+
+/**@ingroup func_qurt_etm_set_atb
+ Sets the advanced trace bus (ATB) state to notify QuRT that the ATB is actively enabled or disabled.
+ QuRT performs the corresponding actions at low power management.
+
+ @param[in] flag Values: \n
+            #QURT_ATB_ON \n
+            #QURT_ATB_OFF
+
+ @returns
+ #QURT_ETM_SETUP_OK -- Success. \n
+ #QURT_ETM_SETUP_ERR -- Failure.
+
+ @dependencies
+ None.
+*/
+unsigned int qurt_etm_set_atb(unsigned int flag);
+
+/**@ingroup func_qurt_etm_set_sync_period
+ Sets the period for types of synchronization trace packets. \n
+ ASYNC defines the period between alignment synchronization packets.
+ The period is in terms of bytes in the packet stream. \n
+ ISYNC defines the period between instruction synchronization packets.
+ The period is per thread and is defined as the bytes sent out for that thread. \n
+ GSYNC defines the period, in thread cycles, between GSYNC packets.
+
+ @param[in] sync_type Type of synchronization packets: \n
+            #QURT_ETM_ASYNC_PERIOD \n
+            #QURT_ETM_ISYNC_PERIOD \n
+            #QURT_ETM_GSYNC_PERIOD
+ @param[in] period Period value.
+
+ @return
+ #QURT_ETM_SETUP_OK -- Success. \n
+ #QURT_ETM_SETUP_ERR -- Failure.
+
+ @dependencies
+ None.
+ */
+unsigned int qurt_etm_set_sync_period(unsigned int sync_type, unsigned int period);
+
+/**@ingroup func_qurt_stm_trace_set_config
+ Sets up an STM port for tracing events.
+
+ @datatypes
+ #qurt_stm_trace_info_t
+
+ @param[in] stm_config_info Pointer to the STM trace information used to set up the trace
+            in the kernel.
+            The structure must have the following:\n
+            - One port address per hardware thread \n
+            - Event ID for context switches \n
+            - Event ID for interrupt tracing \n
+            - Header or marker to identify the beginning of the trace. @tablebulletend
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EINVALID -- Failure; possibly because the passed port address is not in the page table.
+
+ @dependencies
+ None.
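+
+ Setup sketch (illustrative only; the port addresses, event IDs, and marker are
+ placeholder values that must match the target's actual STM configuration):
+ @code
+ qurt_stm_trace_info_t cfg = {
+     .stm_port_addr      = { 0u, 0u, 0u, 0u, 0u, 0u },
+     .thread_event_id    = 1u,
+     .interrupt_event_id = 2u,
+     .marker             = 0x53544D31u,   /* "STM1" */
+ };
+ unsigned int rc = qurt_stm_trace_set_config(&cfg);
+ @endcode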
+ */ +unsigned int qurt_stm_trace_set_config(qurt_stm_trace_info_t *stm_config_info); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_TRACE_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_types.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_types.h new file mode 100755 index 0000000000000..bdb83a3fe2fb2 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_types.h @@ -0,0 +1,294 @@ +#ifndef QURT_TYPES_H +#define QURT_TYPES_H +/** + @file qurt_types.h + @brief Contains types common to all configurations + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + + +//#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +#define PGA_BITFIELD_MASK(hi,lo) (((~0u)>>(31U-((hi)-(lo))))<<(lo)) +#define PGA_BITFIELD_GET(x,hi,lo) (((x)&PGA_BITFIELD_MASK((hi),(lo)))>>(lo)) +#define PGA_BITFIELD_INS(hi,lo,v) (((v)<<(lo))&PGA_BITFIELD_MASK((hi),(lo))) +#define PGA_BITFIELD_SET(x,hi,lo,v) ((x)=((x)&~PGA_BITFIELD_MASK((hi),(lo)))|PGA_BITFIELD_INS((hi),(lo),(v))) +#define QURT_PGATTR_C_GET(pga) PGA_BITFIELD_GET((pga).pga_value, 3U, 0U) /* Bits 3-0: cache */ +#define QURT_PGATTR_A_GET(pga) PGA_BITFIELD_GET((pga).pga_value, 5U, 4U) /* Bits 5-4: bus attr */ +#define QURT_PGATTR_C_SET(pga,v) PGA_BITFIELD_SET((pga).pga_value, 3U, 0U, (v)) /* Bits 3-0: cache */ +#define QURT_PGATTR_A_SET(pga,v) PGA_BITFIELD_SET((pga).pga_value, 5U, 4U, (v)) /* Bits 5-4: bus attr */ +#define QURT_PGATTR_MKRAW(v) ((qurt_pgattr_t){.pga_value = (v)}) +#define QURT_PGATTR_MK(c,a) QURT_PGATTR_MKRAW(PGA_BITFIELD_INS(3U,0U,(c))|PGA_BITFIELD_INS(5U,4U,(a))) + +/*return types for qurt_island_get_status2*/ +#define QURT_ISLAND_MODE_NORMAL 0U /**< Normal operating mode */ +#define QURT_ISLAND_MODE_ISLAND 1U /**< Island mode */ +#define QURT_ISLAND_MODE_EXITING 2U /**< In transition from Island mode to Normal mode */ + +/*============================================================================= + FORWARD DECLARATIONS & TYPEDEFS +=============================================================================*/ +/** @addtogroup memory_management_types +@{ */ +typedef unsigned int qurt_addr_t; /**< QuRT address type.*/ +typedef unsigned int qurt_paddr_t; /**< QuRT physical memory address type. */ +/** @cond rest_reg_dist */ +typedef unsigned long long qurt_addr_64_t; /**< QuRT 64-bit memory address type. */ +typedef unsigned long long qurt_paddr_64_t; /**< QuRT 64-bit physical memory address type. */ +typedef unsigned int qurt_mem_region_t; /**< QuRT memory regions type. */ +typedef unsigned int qurt_mem_fs_region_t; /**< QuRT memory FS region type. */ +/**@endcond */ +typedef unsigned int qurt_mem_pool_t; /**< QuRT memory pool type.*/ +typedef unsigned int qurt_size_t; /**< QuRT size type. */ +/** @cond */ +typedef unsigned long long qurt_mmu_entry_t;/**< QuRT MMU entry type. 
 */
+#define QURT_PHYSPOOL_NAME_LEN (32)
+typedef char qurt_physpool_name_t[QURT_PHYSPOOL_NAME_LEN];
+
+
+/*
+ * Mapping type
+ *
+ * QMEM_MAPPING_VIRTUAL is the default mode, in which the system
+ * picks an available range of virtual addresses and maps it to
+ * available contiguous physical addresses. Physical-to-virtual
+ * is not guaranteed to be 1:1; both virtual and physical memory are
+ * contiguous.
+ *
+ * In QMEM_MAPPING_IDEMPOTENT mode, the user provides the physical address;
+ * the kernel allocates 1:1 physical-to-virtual memory. The primary use
+ * of this mapping is to allocate memory with a 1:1 physical-to-virtual mapping.
+ *
+ * In QMEM_MAPPING_PHYS_CONTIGUOUS mode, the virtual address might
+ * not be the same as the physical address. However, the physical address of the
+ * memory region is guaranteed to be contiguous starting at the provided
+ * address; a fixed physical address must be provided. The primary
+ * use of this mapping is to allocate physical memory from a particular
+ * address, where 1:1 physical-to-virtual is not required.
+ *
+ * QMEM_MAPPING_NONE mode must be used to reserve a virtual memory
+ * area (VMA); no physical memory is reserved or mapped to this virtual
+ * space; all standard qmem_region APIs apply to a VMA, however the physical
+ * address is always INVALID_ADDR. qmem_region_create() in this mode
+ * returns a handle to the VMA; both virt_addr and phys_addr must
+ * be set to INVALID_ADDR, and the kernel allocates any available virtual
+ * memory of the specified size. Obtain the starting virtual address
+ * of the VMA through qmem_region_attr_getvirtaddr().
+ * The primary purpose of this mapping mode is to provide a mechanism for
+ * delayed binding in QuRT, for example, to reserve virtual memory and map it at
+ * some later time to possibly discontiguous physical blocks. Thus, a
+ * single VMA can be partitioned among several physical-virtual mappings
+ * created via qmem_region_create() with QMEM_VIRTUAL_FIXED mapping mode.
+ * Each VMA keeps track of associated mapped regions.
+ * Deletion of a VMA succeeds only if all associated "virtual_fixed"
+ * regions are freed prior to VMA deletion.
+ *
+ * Use QMEM_MAPPING_VIRTUAL_FIXED mode to create a region
+ * from virtual space that has been reserved via qmem_region_create()
+ * with QMEM_MAPPING_NONE mapping. A valid virt_addr is required. If
+ * phys_addr is specified, the kernel attempts to map it accordingly;
+ * if no phys_addr is specified, the kernel maps any available physical
+ * memory. All standard qmem_region APIs apply to such a region. Remapping
+ * a virtual range without first freeing the region is not permitted.
+ * When such a region is deleted, its corresponding VMA remains intact.
+ *
+ * QMEM_MAPPING_PHYS_DISCONTIGUOUS mode can obtain contiguous
+ * virtual memory while the physical memory can be discontiguous. This method
+ * tries to combine small physical memory blocks to satisfy the requested
+ * memory and is useful in cases where there is no contiguous block
+ * of the requested size. If the client does not need contiguous physical memory
+ * (for example, if the client does not use physical addressing), this helps
+ * use smaller physical memory blocks rather than requiring contiguous memory.
+ * Note: When memory is allocated through this method, the physical address is
+ * not returned to the caller by the qurt_mem_region_attr_get() API, as there might
+ * not be a single physical address.
+ *
+ */
+/**@endcond */
+/** QuRT memory region mapping type. */
+typedef enum {
+    QURT_MEM_MAPPING_VIRTUAL=0, /**< Default mode.
The region virtual address range maps to an
+                                available contiguous area of physical memory. For the most
+                                efficient use of virtual memory, the QuRT system
+                                chooses the base address in physical memory. This works for most memory
+                                use cases.*/
+    QURT_MEM_MAPPING_PHYS_CONTIGUOUS = 1,  /**< The region virtual address space must be mapped to a
+                                contiguous area of physical memory. This is necessary when the
+                                memory region is accessed by external devices that bypass Hexagon
+                                virtual memory addressing. The base address in physical
+                                memory must be explicitly specified.*/
+    QURT_MEM_MAPPING_IDEMPOTENT=2, /**< Region virtual address space maps
+                                to the identical area of physical memory. */
+    QURT_MEM_MAPPING_VIRTUAL_FIXED=3, /**< Virtual address space of the region maps either to the
+                                specified area of physical memory or (if no area is specified)
+                                to available physical memory. Use this mapping to create
+                                regions from virtual space that was reserved by calling
+                                qurt_mem_region_create() with the QURT_MEM_MAPPING_NONE mapping. */
+    QURT_MEM_MAPPING_NONE=4, /**< Reserves a virtual memory area (VMA). Remapping a virtual range is not
+                               permitted without first deleting the memory region. When such a region is
+                               deleted, its corresponding virtual memory addressing remains intact. */
+    QURT_MEM_MAPPING_VIRTUAL_RANDOM=7, /**< System chooses a random virtual address and
+                                maps it to available contiguous physical addresses.*/
+    QURT_MEM_MAPPING_PHYS_DISCONTIGUOUS=8, /**< While virtual memory is contiguous, allocates in discontiguous physical
+                                memory blocks. This helps when only smaller contiguous blocks
+                                than the requested size are available.
+                                The physical address is not provided as part of the get_attr call. */
+    QURT_MEM_MAPPING_INVALID=10, /**< Reserved as an invalid mapping type. */
+} qurt_mem_mapping_t;
+
+
+/** QuRT cache mode type. */
+typedef enum {
+    QURT_MEM_CACHE_WRITEBACK=7, /**< Write back. */
+    QURT_MEM_CACHE_NONE_SHARED=6, /**< Normal uncached memory that can be shared with other subsystems.*/
+    QURT_MEM_CACHE_WRITETHROUGH=5, /**< Write through. */
+    QURT_MEM_CACHE_WRITEBACK_NONL2CACHEABLE=0, /**< Write back non-L2-cacheable.*/
+    QURT_MEM_CACHE_WRITETHROUGH_NONL2CACHEABLE=1, /**< Write through non-L2-cacheable. */
+    QURT_MEM_CACHE_WRITEBACK_L2CACHEABLE=QURT_MEM_CACHE_WRITEBACK, /**< Write back L2 cacheable. */
+    QURT_MEM_CACHE_WRITETHROUGH_L2CACHEABLE=QURT_MEM_CACHE_WRITETHROUGH, /**< Write through L2 cacheable. */
+    QURT_MEM_CACHE_DEVICE = 4, /**< Volatile memory-mapped device. Access to device memory cannot be cancelled by interrupts, re-ordered, or replayed.*/
+    QURT_MEM_CACHE_NONE = 4, /**< Deprecated -- use #QURT_MEM_CACHE_DEVICE instead. */
+    QURT_MEM_CACHE_DEVICE_SFC = 2, /**< Enables placing limitations on the number of outstanding transactions. */
+    QURT_MEM_CACHE_INVALID=10, /**< Reserved as an invalid cache type. */
+} qurt_mem_cache_mode_t;
+
+/** Memory access permission. */
+#define QURT_PERM_NONE    0x0U /**< No permission. */
+#define QURT_PERM_READ    0x1U /**< Read permission. */
+#define QURT_PERM_WRITE   0x2U /**< Write permission. */
+#define QURT_PERM_EXECUTE 0x4U /**< Execution permission. */
+#define QURT_PERM_NODUMP  0x8U
+                   /**< Skip dumping the mapping. During a process domain dump, some mappings
+                        on host memory must be skipped to avoid a race condition
+                        where the memory is removed from the host and the DSP process
+                        crashes before the mapping is removed. */
+#define QURT_PERM_FULL (QURT_PERM_READ | QURT_PERM_WRITE | QURT_PERM_EXECUTE) /**< Read, write, and execute permission.
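+                                For example, a plain read/write data mapping would instead pass
+                                (QURT_PERM_READ | QURT_PERM_WRITE).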
*/ + +typedef unsigned char qurt_perm_t; + + +/** @cond rest_reg_dist*/ +/** QuRT cache type; specifies data cache or instruction cache. */ +typedef enum { + QURT_MEM_ICACHE, /**< Instruction cache.*/ + QURT_MEM_DCACHE /**< Data cache.*/ +} qurt_mem_cache_type_t; + +/** QuRT cache operation code type. */ +typedef enum { + QURT_MEM_CACHE_FLUSH, /**< Flush. */ + QURT_MEM_CACHE_INVALIDATE, /**< Invalidate */ + QURT_MEM_CACHE_FLUSH_INVALIDATE, /**< Flush invalidate. */ + QURT_MEM_CACHE_FLUSH_ALL, /**< Flush all. */ + QURT_MEM_CACHE_FLUSH_INVALIDATE_ALL, /**< Flush invalidate all. */ + QURT_MEM_CACHE_TABLE_FLUSH_INVALIDATE, /**< Table flush invalidate. */ + QURT_MEM_CACHE_FLUSH_INVALIDATE_L2, /**< L2 flush invalidate.*/ +} qurt_mem_cache_op_t; + +/** QuRT memory region type. */ +typedef enum { + QURT_MEM_REGION_LOCAL=0, /**< Local. */ + QURT_MEM_REGION_SHARED=1, /**< Shared.*/ + QURT_MEM_REGION_USER_ACCESS=2, /**< User access. */ + QURT_MEM_REGION_FS=4, /**< FS. */ + QURT_MEM_REGION_INVALID=10, /**< Reserved as an invalid region type. */ +} qurt_mem_region_type_t; + +/* Cache and bus attributes are combined into a value of this type for convenience, + and macros for combining and extracting fields are defined here. */ +/** @cond */ +struct qurt_pgattr { + unsigned pga_value; /**< PGA value.*/ +}; +typedef struct qurt_pgattr qurt_pgattr_t; +/** @endcond */ +/** QuRT memory region attributes type.*/ +/* QMEM_MAPPING_IDEMPOTENT and QMEM_MAPPING_PHYS_CONTIGUOUS mode can specify physaddr. + virtaddr cannot be specified for a memory region, it can only be queried by the + qmem_attr_getvirtaddr() function. + */ +typedef struct { + /** @cond */ + qurt_mem_mapping_t mapping_type; + unsigned char perms; + unsigned short owner; + qurt_pgattr_t pga; + unsigned ppn; //physical page number (physical>>12) + qurt_addr_t virtaddr; + qurt_mem_region_type_t type; + qurt_size_t size; + /** @endcond */ +} qurt_mem_region_attr_t; + + +/** QuRT user physical memory pool type. */ +typedef struct { + /** @cond */ + char name[32]; + struct ranges{ + unsigned int start; + unsigned int size; + } ranges[MAX_POOL_RANGES]; + /** @endcond */ +} qurt_mem_pool_attr_t; + +/** QuRT memory pool status type.*/ +typedef struct _qurt_mem_pool_status { + + qurt_size_t contig_size; /**< Largest contiguous free memory in bytes. */ + qurt_size_t free_size; /**< Total free memory in bytes. */ + qurt_size_t total_size; /**< Total declared memory in bytes. */ + +} qurt_mem_pool_status_t; + +typedef enum { + HEXAGON_L1_I_CACHE = 0, /**< Hexagon L1 instruction cache. */ + HEXAGON_L1_D_CACHE = 1, /**< Hexagon L1 data cache. */ + HEXAGON_L2_CACHE = 2 /**< Hexagon L2 cache. */ +} qurt_cache_type_t; + +typedef enum { + FULL_SIZE = 0, /**< Fully shared cache, without partitioning. */ + HALF_SIZE = 1, /**< 1/2 for main, 1/2 for auxiliary. */ + THREE_QUARTER_SIZE = 2, /**< 3/4 for main, 1/4 for auxiliary. */ + SEVEN_EIGHTHS_SIZE = 3 /**< 7/8 for main, 1/8 for auxiliary; for L2 cache only. */ +} qurt_cache_partition_size_t; + +typedef enum { + QURT_PROCESS_CB_GENERIC, /**< generic unconditional cb called after image loading. */ + QURT_PROCESS_NOTE_CB_PRE_MAP, /**< note cb called before segment loading. */ + QURT_PROCESS_NOTE_CB_POST_MAP /**< note cb called after segment loading. 
*/ +} qurt_process_cb_type_t; + +typedef union { + void *ptr; + int num; +} qurt_process_callback_arg_t; + + +/**@endcond*/ + +/** @} */ /* end_addtogroup memory_management_types */ +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_TYPES_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_user_dma.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_user_dma.h new file mode 100755 index 0000000000000..e05a6429fd703 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_user_dma.h @@ -0,0 +1,44 @@ +#ifndef QURT_USER_DMA_H +#define QURT_USER_DMA_H + +/** + @file qurt_user_dma.h + @brief Definitions, macros, and prototypes used for handling user DMA. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup qurt_user_dma_dmsyncht + Sends the DMSyncht command to the user DMA engine. + + Call this function to ensure all posted DMA memory operations are + complete. + + This stalls the current thread until the instruction + is complete and returns. + + @return + QURT_EOK - On dmsyncht completion \n + QURT_ENOTSUPPORTED - User DMA not supported + + @dependencies + None. +*/ +int qurt_user_dma_dmsyncht(void); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_vtlb.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_vtlb.h new file mode 100755 index 0000000000000..e064042e447ac --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_vtlb.h @@ -0,0 +1,76 @@ +/*============================================================================= + + qurt_vtlb.h + +GENERAL DESCRIPTION + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2019, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. +=============================================================================*/ +#ifndef QURT_VTLB_H +#define QURT_VTLB_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* +|| Names starting with "qurt_i_vtlb" are the internal low-level functions. +|| These should be considered subject to change. 
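+||
+|| Illustrative call sequence (a sketch only; the tlb_lo/tlb_hi encodings
+|| are target-specific and not documented in this header):
+||
+||   unsigned idx, info[2];
+||   if (qurt_i_vtlb_entry_create(&idx, tlb_lo, tlb_hi, QURT_VTLB_EXT_DEFAULT) == 0) {
+||       (void)qurt_i_vtlb_entry_read(idx, info);   // read back the entry
+||       (void)qurt_i_vtlb_entry_delete(idx);       // release it
+||   }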
+*/ + +int qurt_i_vtlb_entry_create(unsigned *pIndex, + unsigned tlb_lo, + unsigned tlb_hi, + unsigned extension); + +int qurt_i_vtlb_entry_create_with_pid(unsigned *pIndex, + unsigned tlb_lo, + unsigned tlb_hi, + unsigned extension, + unsigned target_pid); + +int qurt_i_vtlb_entry_delete(unsigned index); + +int qurt_i_vtlb_entry_read(unsigned index, unsigned *tlbinfo); + +int qurt_i_vtlb_entry_write(unsigned index, unsigned tlb_lo, unsigned tlb_hi, unsigned extension); + +int qurt_i_vtlb_entry_write_with_pid(unsigned index, unsigned tlb_lo, unsigned tlb_hi, unsigned extension, unsigned target_pid); + +int qurt_i_vtlb_entry_probe(const void *vaddr, unsigned *tlbinfo, unsigned *pIndex); + +int qurt_i_vtlb_entry_probe_with_pid(const void *vaddr, unsigned *tlbinfo, unsigned *pIndex, unsigned target_pid); + + +int qurt_i_vtlb_statistics(unsigned *stats); // Returns stats[0] -- total number of VTLB entries + // stats[1] -- number of available VTLB entries + // stats[2] -- max size of VTLB tree since boot + +//can return index to an entry that was specialed, change it to take addresses instead of pages +int qurt_i_vtlb_set_special(int index, unsigned pageno, unsigned asid, unsigned size); + +int qurt_i_vtlb_queue_ppage(unsigned pageno, unsigned vtlb_index); + +#define QURT_VTLB_EXT_DEFAULT 0U +#define QURT_VTLB_EXT_LOCKED 1U +#define QURT_VTLB_EXT_EXCLUDE_DUMP 2U /* Temporary ability to skip certain mappings in pd dump */ +#define QURT_VTLB_EXT_FREELIST 0x800000u + +#define QURT_VTLB_ERR_OVERLAP -64 +#define QURT_VTLB_ERR_TREE_NO_SPACE -65 +#define QURT_VTLB_ERR_INVALID_SIZE -68 +#define QURT_VTLB_ERR_INVALID_EXT -69 +#define QURT_VTLB_ERR_DEL_PGT_LOCKED -70 +#define QURT_VTLB_ERR_PGT_LOCK_CNT -71 + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif // QURT_VTLB_H diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/libposix.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/libposix.a new file mode 100755 index 0000000000000..a9a8baba7faf1 Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/libposix.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/libqurt.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/libqurt.a new file mode 100755 index 0000000000000..0ba0327f99d81 Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/libqurt.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/libqurtcfs.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/libqurtcfs.a new file mode 100755 index 0000000000000..339de1f596ddb Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/libqurtcfs.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/libtimer_island.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/libtimer_island.a new file mode 100755 index 0000000000000..98d0a1128a8a4 Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/libtimer_island.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/libtimer_main.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/libtimer_main.a new file mode 100755 index 0000000000000..e95a77af5ed1a Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/libtimer_main.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/pic/libposix.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/pic/libposix.a new file mode 100755 
index 0000000000000..6aaca8da9e012
Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/pic/libposix.a differ
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/pic/libqurt.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/pic/libqurt.a
new file mode 100755
index 0000000000000..ba96bbf241f10
Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/pic/libqurt.a differ
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/pic/libqurtcfs.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/pic/libqurtcfs.a
new file mode 100755
index 0000000000000..339de1f596ddb
Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/pic/libqurtcfs.a differ
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/pic/libtimer.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/pic/libtimer.a
new file mode 100755
index 0000000000000..f2f6d0b611216
Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/pic/libtimer.a differ
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/tools/HEXAGON_Tools/.lock b/prebuilts/Hexagon_SDK/6.2.0.1/tools/HEXAGON_Tools/.lock
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/utils/examples/dsp_capabilities_utils.h b/prebuilts/Hexagon_SDK/6.2.0.1/utils/examples/dsp_capabilities_utils.h
new file mode 100755
index 0000000000000..2cafe29dfe9aa
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/utils/examples/dsp_capabilities_utils.h
@@ -0,0 +1,164 @@
+/**=============================================================================
+@file
+   dsp_capabilities_utils.h
+
+@brief
+   Wrapper functions for FastRPC Capability APIs.
+
+Copyright (c) 2020-2021 Qualcomm Technologies Incorporated.
+All Rights Reserved. Qualcomm Proprietary and Confidential.
+=============================================================================**/
+#ifndef DSP_CAPABILITIES_UTILS_H
+#define DSP_CAPABILITIES_UTILS_H
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#include
+#include
+#include "AEEStdErr.h"
+#include "remote.h"
+
+#if !defined (_WINDOWS)
+    #pragma weak remote_system_request
+#endif
+    /**
+     * Wrapper for FastRPC Capability API: query DSP support.
+     *
+     * @param[out] domain pointer to supported domain.
+     * @return 0 if query is successful.
+     *         non-zero in case of error; the return value indicates the error.
+     */
+int get_dsp_support(int *domain);
+
+    /**
+     * Wrapper for FastRPC Capability API: query VTCM information.
+     *
+     * @param[in] domain value of the domain to be queried.
+     * @param[out] capability capability value of the attribute queried.
+     * @param[in] attr value of the attribute to be queried.
+     * @return 0 if query is successful.
+     *         non-zero in case of error; the return value indicates the error.
+     */
+int get_vtcm_info(int domain, uint32_t *capability, uint32_t attr);
+
+    /**
+     * Wrapper for FastRPC Capability API: query unsigned pd support on CDSP domain.
+     *
+     * @return true if unsigned pd is supported.
+     *         false if unsigned pd is not supported or the capability query failed.
+     */
+
+bool get_unsignedpd_support(void);
+
+    /**
+     * Wrapper for FastRPC Capability API: query unsigned pd support.
+     *
+     * @param[in] domain_id value of the domain to be queried.
+     * @return true if unsigned pd is supported.
+     *         false if unsigned pd is not supported or the capability query failed.
+     */
+
+bool is_unsignedpd_supported(int domain_id);
+
+    /**
+     * is_valid_domain_id API: query whether a domain id is valid.
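+     *
+     * Example (a sketch only; CDSP_DOMAIN_ID is assumed to come from remote.h):
+     *
+     *   if (is_valid_domain_id(CDSP_DOMAIN_ID, 1)) {
+     *       // Safe to target the compute DSP.
+     *   }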
+     *
+     * @param[in] domain_id value of the domain to be queried.
+     * @param[in] compute_only when enabled, the domain value is compared only against the CDSP domains supported by the target.
+     * @return true if the value of the domain is valid.
+     *         false if the value of the domain is not valid.
+     */
+
+bool is_valid_domain_id(int domain_id, int compute_only);
+
+    /**
+     * get_domain API: get the domain struct from a domain value.
+     *
+     * @param[in] domain_id value of a domain
+     * @return Returns the domain struct of the domain if it is supported,
+     *         otherwise returns NULL.
+     *
+     */
+
+domain* get_domain(int domain_id);
+
+    /**
+     * get_domains_info API: get information for all the domains available on the device
+     *
+     * @param[in] domain_type pointer to domain type
+     * @param[in] num_domains pointer to number of domains
+     * @param[in] domains_info pointer to save discovered domains information.
+     * @return 0 if query is successful.
+     *         non-zero in case of error; the return value indicates the error.
+     *
+     * It is the user's responsibility to free the memory used to store the domains info,
+     * whose address is present in domains_info, before closing the application.
+     *
+     */
+
+int get_domains_info(char *domain_type, int *num_domains, fastrpc_domain **domains_info);
+
+    /**
+     * is_async_fastrpc_supported API: query whether a domain id supports async FastRPC
+     *
+     * @param[in] domain_id value of a domain
+     * @return Returns true or false stating support of Async FastRPC
+     *
+     */
+
+bool is_async_fastrpc_supported(int domain_id);
+
+    /**
+     * is_status_notification_supported API: query the DSP for STATUS_NOTIFICATION_SUPPORT information
+     *
+     * @param[in] domain_id value of a domain
+     * @return Returns true or false stating status notification support information
+     *
+     */
+bool is_status_notification_supported(int domain_id);
+
+    /**
+     * get_hmx_support_info API: query the DSP for HMX SUPPORT information
+     *
+     * @param[in] domain value of a domain
+     * @param[out] capability capability value of the attribute queried.
+     * @param[in] attr value of the attribute to be queried.
+     * @return 0 if query is successful.
+     *         non-zero in case of error; the return value indicates the error.
+     *
+     */
+int get_hmx_support_info(int domain, uint32_t *capability, uint32_t attr);
+
+    /**
+     * get_hex_arch_ver API: query the Hexagon processor architecture version information
+     *
+     * @param[in] domain value of a domain
+     * @param[out] capability capability value of the attribute queried.
+     *             The last byte of the capability value represents the architecture of the DSP being queried in hexadecimal format.
+     *             E.g., 0x8D73 represents a v73 architecture. The other bytes represent other capabilities depending on the device.
+     * @return 0 if query is successful.
+     *         non-zero in case of error; the return value indicates the error.
+     *
+     */
+int get_hex_arch_ver(int domain, uint32_t *capability);
+
+    /**
+     * get_hvx_support_info API: query the DSP for HVX SUPPORT information
+     *
+     * @param[in] domain value of a domain
+     * @param[out] capability capability value of the attribute queried.
+     * @param[in] attr value of the attribute to be queried.
+     * @return 0 if query is successful.
+     *         non-zero in case of error; the return value indicates the error.
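+     *
+     * Example (a sketch only; HVX_SUPPORT_128B is assumed to be a capability
+     * attribute defined in remote.h):
+     *
+     *   uint32_t cap = 0;
+     *   if (get_hvx_support_info(domain, &cap, HVX_SUPPORT_128B) == 0) {
+     *       // cap now holds the queried HVX capability value.
+     *   }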
+     *
+     */
+int get_hvx_support_info(int domain, uint32_t *capability, uint32_t attr);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //DSP_CAPABILITIES_UTILS_H
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/utils/examples/mem_utils.h b/prebuilts/Hexagon_SDK/6.2.0.1/utils/examples/mem_utils.h
new file mode 100755
index 0000000000000..16f002ef060fa
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/utils/examples/mem_utils.h
@@ -0,0 +1,54 @@
+/**=============================================================================
+@file
+   mem_utils.h
+
+@brief
+   Abstract operating system specific memory allocation APIs.
+
+Copyright (c) 2021 Qualcomm Technologies Incorporated.
+All Rights Reserved. Qualcomm Proprietary and Confidential.
+=============================================================================**/
+
+#ifndef MEM_UTILS_H_
+#define MEM_UTILS_H_
+
+
+#ifdef __cplusplus
+   extern "C" {
+#endif
+
+
+#ifdef _WINDOWS
+   #include
+#else
+#ifdef __hexagon__
+#else
+   #include
+   #include
+#endif
+#endif
+
+
+#ifndef MEMALIGN
+   #ifdef _WINDOWS
+      #define MEMALIGN(alignment,size) _aligned_malloc(size,alignment)
+   #else
+      #define MEMALIGN(alignment,size) memalign(alignment,size)
+   #endif
+#endif
+
+
+#ifndef ALIGNED_FREE
+   #ifdef _WINDOWS
+      #define ALIGNED_FREE(ptr) _aligned_free(ptr)
+   #else
+      #define ALIGNED_FREE(ptr) free(ptr)
+   #endif
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif //MEM_UTILS_H_
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/utils/examples/os_defines.h b/prebuilts/Hexagon_SDK/6.2.0.1/utils/examples/os_defines.h
new file mode 100755
index 0000000000000..a7b5947fa908e
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/utils/examples/os_defines.h
@@ -0,0 +1,97 @@
+/**=============================================================================
+@file
+   os_defines.h
+
+@brief
+   Abstract operating system specific defines, includes and global variables
+   to make it convenient for developers to code for multiple OS platforms.
+
+Copyright (c) 2021 Qualcomm Technologies Incorporated.
+All Rights Reserved. Qualcomm Proprietary and Confidential.
+=============================================================================**/
+
+#ifndef OS_DEFINES_H_
+#define OS_DEFINES_H_
+
+
+#ifdef __cplusplus
+   extern "C" {
+#endif
+
+
+/* Offset to differentiate HLOS and Hexagon error codes.
+   Stores the value of AEE_EOFFSET for Hexagon. */
+#ifndef DSP_OFFSET
+   #define DSP_OFFSET 0x80000400
+#endif
+
+
+/* Errno for connection reset by peer. */
+#ifndef ECONNRESET
+   #ifdef __hexagon__
+      #define ECONNRESET 104
+   #endif
+#endif
+
+
+/* Abstraction of different OS specific sleep APIs.
+   SLEEP accepts input in seconds. */
+#ifndef SLEEP
+   #ifdef __hexagon__
+      #define SLEEP(x) {/* Do nothing for simulator. */}
+   #else
+      #ifdef _WINDOWS
+         #define SLEEP(x) Sleep(1000*(x)) /* Sleep accepts input in milliseconds. */
+      #else
+         #define SLEEP(x) sleep(x) /* sleep accepts input in seconds. */
+      #endif
+   #endif
+#endif
+
+
+/* Include Windows specific header files. */
+#ifdef _WINDOWS
+   #include
+   #include
+   #define _CRT_SECURE_NO_WARNINGS 1
+   #define _WINSOCK_DEPRECATED_NO_WARNINGS 1
+   /* Including this file for custom implementation of getopt function. */
+   #include "getopt_custom.h"
+#endif
+
+
+/* Includes and defines for all HLOS except Windows */
+#if !defined(__hexagon__) && !defined (_WINDOWS)
+   #include "unistd.h"
+   #include
+#endif
+
+
+/* Includes and defines for Hexagon and all HLOS except Windows. */
+#if !defined (_WINDOWS)
+   /* Weak reference to remote symbol for compilation.
 */
+   #pragma weak remote_session_control
+   #pragma weak remote_handle_control
+   #pragma weak remote_handle64_control
+   #pragma weak fastrpc_mmap
+   #pragma weak fastrpc_munmap
+#endif
+
+
+/* Includes and defines for hexagon */
+#ifdef __hexagon__
+#endif
+
+
+/* Includes and defines for Android */
+#ifdef ANDROID
+#endif
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+
+#endif //OS_DEFINES_H_
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/utils/examples/pd_status_notification.h b/prebuilts/Hexagon_SDK/6.2.0.1/utils/examples/pd_status_notification.h
new file mode 100755
index 0000000000000..6579058b8e451
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/utils/examples/pd_status_notification.h
@@ -0,0 +1,25 @@
+#include
+#include
+#include "AEEStdErr.h"
+#include "remote.h"
+#include "dsp_capabilities_utils.h"
+
+
+    /**
+     * request_status_notifications_enable API: Allows users to enable status notifications from the client PD.
+     *
+     * @param[in] domain_id value of a domain
+     * @param[in] context Context of the client
+     * @param[in] notif_callback_fn Callback function for status notification
+     * @return 0 if successful.
+     *         non-zero in case of error; the return value indicates the error.
+     *
+     */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+int request_status_notifications_enable(int domain_id, void *context, int(*notif_callback_fn)(void *context, int domain, int session, remote_rpc_status_flags_t status));
+#ifdef __cplusplus
+}
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/utils/examples/time_utils.h b/prebuilts/Hexagon_SDK/6.2.0.1/utils/examples/time_utils.h
new file mode 100755
index 0000000000000..969747f26f309
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/utils/examples/time_utils.h
@@ -0,0 +1,64 @@
+/**=============================================================================
+@file
+   time_utils.h
+
+@brief
+   Abstract operating system specific timing APIs.
+
+Copyright (c) 2021 Qualcomm Technologies Incorporated.
+All Rights Reserved. Qualcomm Proprietary and Confidential.
+=============================================================================**/
+
+#ifndef TIME_UTILS_H_
+#define TIME_UTILS_H_
+
+
+#ifdef __cplusplus
+   extern "C" {
+#endif
+
+#ifdef _WINDOWS
+   #include
+#else
+#ifdef __hexagon__
+   #include "hexagon_sim_timer.h"
+#else
+   #include
+#endif
+#endif
+
+unsigned long long get_time(void);
+void sleep_in_microseconds(unsigned long long);
+
+/* Abstraction of different OS specific usleep APIs.
+   USLEEP accepts input in microseconds. */
+#ifndef USLEEP
+   #ifdef __hexagon__
+      #define USLEEP(x) {/* Do nothing for simulator. */}
+   #else
+      #ifdef _WINDOWS
+         #define USLEEP(x) sleep_in_microseconds(x)
+      #else
+         #include
+         #define USLEEP(x) usleep(x)
+      #endif
+   #endif
+#endif
+
+/* Abstraction of different OS specific timer APIs.
+   GET_TIME returns the value of time. */
+#ifndef GET_TIME
+   #ifdef __hexagon__
+      #define GET_TIME hexagon_sim_read_pcycles
+   #else
+      #define GET_TIME get_time
+   #endif
+#endif
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif //TIME_UTILS_H_
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/CPU/QnnCpuCommon.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/CPU/QnnCpuCommon.h
new file mode 100755
index 0000000000000..fdbfc1136d556
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/CPU/QnnCpuCommon.h
@@ -0,0 +1,50 @@
+//=============================================================================
+//
+//  Copyright (c) 2020-2023 Qualcomm Technologies, Inc.
+//  All Rights Reserved.
+//  Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//=============================================================================
+
+/** @file
+ *  @brief QNN CPU Common components
+ *
+ *         This file defines versioning and other identification details
+ *         and supplements QnnCommon.h for CPU backend
+ */
+
+#ifndef QNN_CPU_COMMON_H
+#define QNN_CPU_COMMON_H
+
+#include "QnnCommon.h"
+
+/// CPU Backend identifier
+#define QNN_BACKEND_ID_CPU 3
+
+/// CPU interface provider
+#define QNN_CPU_INTERFACE_PROVIDER_NAME "CPU_QTI_AISW"
+
+// CPU API Version values
+#define QNN_CPU_API_VERSION_MAJOR 1
+#define QNN_CPU_API_VERSION_MINOR 1
+#define QNN_CPU_API_VERSION_PATCH 0
+
+// clang-format off
+/// Macro to set Qnn_ApiVersion_t for CPU backend
+#define QNN_CPU_API_VERSION_INIT                                 \
+  {                                                              \
+    {                                                            \
+      QNN_API_VERSION_MAJOR,     /*coreApiVersion.major*/        \
+      QNN_API_VERSION_MINOR,     /*coreApiVersion.minor*/        \
+      QNN_API_VERSION_PATCH      /*coreApiVersion.patch*/        \
+    },                                                           \
+    {                                                            \
+      QNN_CPU_API_VERSION_MAJOR, /*backendApiVersion.major*/     \
+      QNN_CPU_API_VERSION_MINOR, /*backendApiVersion.minor*/     \
+      QNN_CPU_API_VERSION_PATCH  /*backendApiVersion.patch*/     \
+    }                                                            \
+  }
+
+// clang-format on
+
+#endif // QNN_CPU_COMMON_H
\ No newline at end of file
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/CPU/QnnCpuGraph.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/CPU/QnnCpuGraph.h
new file mode 100755
index 0000000000000..750cfd0b501f1
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/CPU/QnnCpuGraph.h
@@ -0,0 +1,117 @@
+//=============================================================================
+//
+//  Copyright (c) 2022 Qualcomm Technologies, Inc.
+//  All Rights Reserved.
+//  Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//=============================================================================
+
+/** @file
+ *  @brief QNN CPU component Graph API.
+ *
+ *         The interfaces in this file work with the top level QNN
+ *         API and supplements QnnGraph.h for CPU backend
+ */
+
+#ifndef QNN_CPU_GRAPH_H
+#define QNN_CPU_GRAPH_H
+
+#include "QnnGraph.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//=============================================================================
+// Macros
+//=============================================================================
+
+//=============================================================================
+// Data Types
+//=============================================================================
+
+/**
+ * @brief This enum provides different CPU graph configuration
+ *        options associated with QnnGraph
+ */
+typedef enum {
+  QNN_CPU_GRAPH_CONFIG_OPTION_OP_DEBUG_CALLBACK = 1,
+  QNN_CPU_GRAPH_CONFIG_OPTION_UNDEFINED         = 0x7fffffff
+} QnnCpuGraph_ConfigOption_t;
+
+/* @brief Callback function pointer to be filled in by the user.
+ *        This callback is called after each op execution.
+ *        Only the output tensor id and data buffer are valid and consumable.
+ *        Memory is owned by the backend and is valid throughout the callback.
+ *        The client should not update any parameter or argument of opConfig.
+ *        A NULL tensor/buffer indicates an invalid data buffer.
+ */
+typedef Qnn_ErrorHandle_t (*QnnCpuGraph_OpDebugCallback_t)(Qnn_OpConfig_t* opConfig,
+                                                           void* callBackParam);
+
+/* @brief Structure to be filled in by the user.
+ *        This structure holds the callback function and the callback reference data.
+ *        Memory is owned by the backend and is valid throughout the callback.
+ *        The client should not update any parameter or argument of opConfig.
+ *        A NULL callback function indicates no debug option.
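+ *
+ *        Usage sketch (hypothetical user callback; the QnnGraph custom config
+ *        plumbing is abbreviated):
+ *
+ *          static Qnn_ErrorHandle_t myOpDebugCb(Qnn_OpConfig_t* opConfig,
+ *                                               void* callBackParam) {
+ *              (void)opConfig; (void)callBackParam;  // inspect outputs here
+ *              return QNN_SUCCESS;
+ *          }
+ *
+ *          QnnCpuGraph_OpDebug_t dbg = QNN_CPU_GRAPH_OP_DEBUG_INIT;  // defined below
+ *          dbg.cpuGraphOpDebugCallback = myOpDebugCb;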
+ */
+typedef struct {
+  void* callBackParam;
+  QnnCpuGraph_OpDebugCallback_t cpuGraphOpDebugCallback;
+} QnnCpuGraph_OpDebug_t;
+
+// clang-format off
+/// QnnCpuGraph_OpDebug_t initializer macro
+#define QNN_CPU_GRAPH_OP_DEBUG_INIT        \
+  {                                        \
+    NULL, /*callBackParam*/                \
+    NULL  /*cpuGraphOpDebugCallback*/      \
+  }
+// clang-format on
+
+//=============================================================================
+// Public Functions
+//=============================================================================
+
+//------------------------------------------------------------------------------
+//   Implementation Definition
+//------------------------------------------------------------------------------
+
+/**
+ * @brief        Structure describing the set of configurations supported by graph.
+ *               Objects of this type are to be referenced through QnnGraph_CustomConfig_t.
+ *
+ *               The struct has two fields: option and a union of corresponding config values.
+ *               Based on the option, the corresponding item in the union can be used to specify
+ *               the config.
+ *               Below is the map between QnnCpuGraph_ConfigOption_t and config value
+ *
+ *               \verbatim embed:rst:leading-asterisk
+ *               +----+------------------------------------------------+------------------------------------+
+ *               | #  | Config Option                                  | Configuration Struct/value         |
+ *               +====+================================================+====================================+
+ *               | 1  | QNN_CPU_GRAPH_CONFIG_OPTION_OP_DEBUG_CALLBACK  | QnnCpuGraph_OpDebug_t              |
+ *               +----+------------------------------------------------+------------------------------------+
+ *               \endverbatim
+ */
+typedef struct {
+  QnnCpuGraph_ConfigOption_t option;
+  union UNNAMED {
+    QnnCpuGraph_OpDebug_t cpuGraphOpDebug;
+  };
+} QnnCpuGraph_CustomConfig_t;
+
+/// QnnCpuGraph_CustomConfig_t initializer macro
+#define QNN_CPU_GRAPH_CUSTOM_CONFIG_INIT                     \
+  {                                                          \
+    QNN_CPU_GRAPH_CONFIG_OPTION_UNDEFINED, /*option*/        \
+    {                                                        \
+      QNN_CPU_GRAPH_OP_DEBUG_INIT /*cpuGraphOpDebug*/        \
+    }                                                        \
+  }
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/CPU/QnnCpuOpPackage.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/CPU/QnnCpuOpPackage.h
new file mode 100755
index 0000000000000..97bdab8dfd3f9
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/CPU/QnnCpuOpPackage.h
@@ -0,0 +1,224 @@
+//==============================================================================
+//
+//  Copyright (c) 2020-2023 Qualcomm Technologies, Inc.
+//  All Rights Reserved.
+//  Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+/** @file
+ *  @brief CPU Operation Package component API
+ *
+ *         Provides interface to interact with OpPackage libraries registered
+ *         with the CPU backend.
+ */
+
+#ifndef QNN_CPU_OP_PACKAGE_H
+#define QNN_CPU_OP_PACKAGE_H
+
+#include "CPU/QnnCpuCommon.h"
+#include "QnnGraph.h"
+#include "QnnOpPackage.h"
+#include "QnnTypes.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define QNN_CPUOPPACKAGE_TENSOR_DATA_FORMAT_FLAT_BUFFER 0
+
+/**
+ * @brief A value representing a tensor data format.
+ */
+typedef uint32_t QnnCpuOpPackage_TensorDataFormat_t;
+
+/**
+ * @brief A value representing profiling data in milliseconds.
+ */
+typedef double QnnCpuOpPackage_ProfileData_t;
+
+/**
+ * @brief An enum to specify a param type.
+ */
+typedef enum {
+  QNN_CPU_PARAMTYPE_SCALAR = 0,
+  QNN_CPU_PARAMTYPE_TENSOR = 1,
+  QNN_CPU_PARAMTYPE_STRING = 2,
+  // Unused, present to ensure 32 bits.
+ QNN_CPU_PARAMTYPE_UNDEFINED = 0xFFFFFFFF +} QnnCpuOpPackage_ParamType_t; + +/** + * @brief An enum to specify tensor data type. + */ +typedef enum { + QNN_CPU_DATATYPE_BOOL_8 = 0x0508, + QNN_CPU_DATATYPE_INT_8 = 0x0008, + QNN_CPU_DATATYPE_INT_32 = 0x0032, + QNN_CPU_DATATYPE_UINT_8 = 0x0108, + QNN_CPU_DATATYPE_UINT_32 = 0x0132, + QNN_CPU_DATATYPE_FLOAT_32 = 0x0232, + // Unused, present to ensure 32 bits. + QNN_CPU_DATATYPE_UNDEFINED = 0x7FFFFFFF +} QnnCpuOpPackage_DataType_t; + +/** + * @brief An enum to specify logging level. + */ +typedef enum { + QNN_CPU_MSG_ERROR = 1, + QNN_CPU_MSG_DEBUG = 2, + QNN_CPU_MSG_LOW = 3, + QNN_CPU_MSG_MED = 4, + QNN_CPU_MSG_HIGH = 5, + // Unused, present to ensure 32 bits + QNN_CPU_MSG_UNDEFINED = 0x7FFFFFFF +} QnnCpuOpPackage_MsgType_t; + +/** + * @brief An enum to specify the profiling type. + */ +typedef enum { + QNN_CPU_PROFILE_BASIC = 1, + QNN_CPU_PROFILE_DETAILED = 2, + // Unused, present to ensure 32 bits + QNN_CPU_PROFILE_UNDEFINED = 0x7FFFFFFF +} QnnCpuOpPackage_ProfileType_t; + +/** + * @brief A struct which defines the Global infrastructure. + */ +typedef struct _QnnOpPackage_GlobalInfrastructure_t { + // Message + void (*reportMessage)(QnnCpuOpPackage_MsgType_t msgType, const char* msg, ...); + + // Profile + void (*profile)(QnnCpuOpPackage_ProfileType_t profileType, + QnnCpuOpPackage_ProfileData_t timeInMsec); +} QnnCpuOpPackage_GlobalInfra_t; + +// clang-format off +/// QnnCpuOpPackage_GlobalInfra_t initializer macro +#define QNN_CPU_OP_PACKAGE_GLOBAL_INFRA_INIT \ + { \ + NULL, /*reportMessage*/ \ + NULL /*profile*/ \ + } +// clang-format on + +typedef Qnn_ErrorHandle_t (*QnnCpuOpPackage_OpImplFn_t)(void* opPkgNodeData); + +/** + * @brief A struct which defines the OpImpl definition. + */ +typedef struct _QnnOpPackage_OpImpl_t { + QnnCpuOpPackage_OpImplFn_t opImplFn; + void* userData; +} QnnCpuOpPackage_OpImpl_t; + +// clang-format off +/// QnnCpuOpPackage_OpImpl_t initializer macro +#define QNN_CPU_OP_PACKAGE_OPIMPL_INIT \ + { \ + NULL, /*kernelFn*/ \ + NULL /*userData*/ \ + } +// clang-format on + +/** + * @brief A struct which describes the properties of a tensor. + * + */ +typedef struct { + QnnCpuOpPackage_TensorDataFormat_t dataFormat; + QnnCpuOpPackage_DataType_t dataType; + uint32_t rank; + uint32_t* maxDimensions; + uint32_t* currentDimensions; + void* data; + Qnn_QuantizeParams_t quantizeParams; +} QnnCpuOpPackage_Tensor_t; + +// clang-format off +/// QnnCpuOpPackage_Tensor_t initializer macro +#define QNN_CPU_OP_PACKAGE_TENSOR_INIT \ + { \ + QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, /*dataFormat*/ \ + QNN_CPU_DATATYPE_UNDEFINED, /*dataType*/ \ + 0, /*rank*/ \ + NULL, /*maxDimensions*/ \ + NULL, /*currentDimensions*/ \ + NULL, /*data*/ \ + QNN_QUANTIZE_PARAMS_INIT /*quantizeParams*/ \ + } +// clang-format on + +/** + * @brief A struct which describes the parameters of a node. + * + */ +typedef struct { + QnnCpuOpPackage_ParamType_t type; + const char* name; + union { + double scalarParam; + const char* string; + QnnCpuOpPackage_Tensor_t* tensorParam; + }; +} QnnCpuOpPackage_Param_t; + +// clang-format off +/// QnnCpuOpPackage_Param_t initializer macro +#define QNN_CPU_OP_PACKAGE_PARAM_INIT \ + { \ + QNN_CPU_PARAMTYPE_UNDEFINED, /*type*/ \ + NULL, /*name*/ \ + { \ + 0 /*scalarParam*/ \ + } \ + } +// clang-format on + +/** + * @brief A struct which describes the node. 
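+ *        Typically initialized with QNN_CPU_OP_PACKAGE_NODE_INIT (defined below)
+ *        and then populated by the backend; an illustrative sketch only:
+ *
+ *          QnnCpuOpPackage_Node_t node = QNN_CPU_OP_PACKAGE_NODE_INIT;
+ *          node.name     = "conv0";   // hypothetical values
+ *          node.typeName = "Conv2d";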
+ *
+ */
+typedef struct _QnnOpPackage_Node_t {
+  const char* name;
+  const char* packageName;
+  const char* typeName;
+  uint32_t numOfParams;
+  QnnCpuOpPackage_Param_t** params;
+  uint32_t numOfInputs;
+  QnnCpuOpPackage_Tensor_t** inputs;
+  uint32_t numOfOutputs;
+  QnnCpuOpPackage_Tensor_t** outputs;
+} QnnCpuOpPackage_Node_t;
+
+// clang-format off
+/// QnnCpuOpPackage_Node_t initializer macro
+#define QNN_CPU_OP_PACKAGE_NODE_INIT \
+  {                                  \
+    NULL, /*name*/                   \
+    NULL, /*packageName*/            \
+    NULL, /*typeName*/               \
+    0,    /*numOfParams*/            \
+    NULL, /*params*/                 \
+    0,    /*numOfInputs*/            \
+    NULL, /*inputs*/                 \
+    0,    /*numOfOutputs*/           \
+    NULL  /*outputs*/                \
+  }
+// clang-format on
+
+/**
+ * @brief Graph infrastructure.
+ *
+ */
+typedef _QnnOpPackage_GraphInfrastructure_t QnnCpuOpPackage_GraphInfrastructure_t;
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // QNN_CPU_OP_PACKAGE_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspBackend.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspBackend.h
new file mode 100755
index 0000000000000..e2b6c69dffbdf
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspBackend.h
@@ -0,0 +1,108 @@
+//=============================================================================
+//
+//  Copyright (c) 2022-2023 Qualcomm Technologies, Inc.
+//  All Rights Reserved.
+//  Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//=============================================================================
+
+/** @file
+ *  @brief QNN DSP component Backend API.
+ *
+ *         The interfaces in this file work with the top level QNN
+ *         API and supplements QnnBackend.h for DSP backend
+ */
+
+#ifndef QNN_DSP_BACKEND_H
+#define QNN_DSP_BACKEND_H
+
+#include "QnnBackend.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//=============================================================================
+// Macros
+//=============================================================================
+
+//=============================================================================
+// Data Types
+//=============================================================================
+
+//=============================================================================
+// Public Functions
+//=============================================================================
+
+//------------------------------------------------------------------------------
+//   Implementation Definition
+//------------------------------------------------------------------------------
+
+// clang-format off
+
+/* @brief Enum describing the set of custom configs supported by DSP backend.
+*/
+typedef enum {
+  /// The accelerator will always attempt to fold relu activation
+  /// into the immediately preceding convolution operation. This optimization
+  /// is correct when the quantization ranges for the convolution are equal to
+  /// or a subset of those of the Relu operation. For graphs where this cannot be
+  /// guaranteed, the client should set this option to true
+  QNN_DSP_BACKEND_CONFIG_OPTION_FOLD_RELU_ACTIVATION_INTO_CONV_OFF = 0,
+  /// The accelerator will always attempt to run all Convolution
+  /// operations using HMX instructions. Convolutions that have
+  /// short depth and/or weights that are not symmetric could
+  /// exhibit inaccurate results.
+  /// In such cases, clients must set this option to true to
+  /// guarantee correctness of the operation
+  QNN_DSP_BACKEND_CONFIG_OPTION_SHORT_DEPTH_CONV_ON_HMX_OFF = 1,
+  /// Every APP side user process that uses a DSP via FastRPC
+  /// has a corresponding dynamic user process domain on the DSP side.
+  /// By default, QNN opens the RPC session as an unsigned PD;
+  /// if this option is set to true,
+  /// the RPC session is opened as a signed PD (requires a signed .so).
+  QNN_DSP_BACKEND_CONFIG_OPTION_USE_SIGNED_PROCESS_DOMAIN = 2,
+  /// Set QnnDspBackend_DspArch_t for offline prepare mode
+  QNN_DSP_BACKEND_CONFIG_OPTION_ARCH = 3,
+  /// UNKNOWN enum option that must not be used
+  QNN_DSP_BACKEND_CONFIG_OPTION_UNKNOWN = 0x7fffffff
+} QnnDspBackend_ConfigOption_t;
+
+typedef enum {
+  QNN_DSP_BACKEND_DSP_ARCH_NONE    = 0,
+  QNN_DSP_BACKEND_DSP_ARCH_V65     = 65,
+  QNN_DSP_BACKEND_DSP_ARCH_V66     = 66,
+  QNN_DSP_BACKEND_DSP_ARCH_V68     = 68,
+  QNN_DSP_BACKEND_DSP_ARCH_V69     = 69,
+  QNN_DSP_BACKEND_DSP_ARCH_V73     = 73,
+  QNN_DSP_BACKEND_DSP_ARCH_UNKNOWN = 0x7fffffff
+} QnnDspBackend_DspArch_t;
+
+/**
+ * @brief        Structure describing the set of configurations supported by the backend.
+ *               Objects of this type are to be referenced through QnnBackend_CustomConfig_t.
+ */
+typedef struct QnnDspBackend_CustomConfig {
+  QnnDspBackend_ConfigOption_t option;
+  union UNNAMED {
+    bool foldReluActivationIntoConvOff;
+    bool shortDepthConvOnHmxOff;
+    bool useSignedProcessDomain;
+    QnnDspBackend_DspArch_t arch;
+  };
+} QnnDspBackend_CustomConfig_t;
+
+/// QnnDspBackend_CustomConfig_t initializer macro
+#define QNN_DSP_BACKEND_CUSTOM_CONFIG_INIT                \
+  {                                                       \
+    QNN_DSP_BACKEND_CONFIG_OPTION_UNKNOWN, /*option*/     \
+    {                                                     \
+      false /*foldReluActivationIntoConvOff*/             \
+    }                                                     \
+  }
+
+// clang-format on
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspCommon.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspCommon.h
new file mode 100755
index 0000000000000..8b5ad49d04d6e
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspCommon.h
@@ -0,0 +1,61 @@
+//=============================================================================
+//
+//  Copyright (c) 2022-2023 Qualcomm Technologies, Inc.
+//  All Rights Reserved.
+//  Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//=============================================================================
+
+/** @file
+ *  @brief QNN DSP Common components
+ *
+ *         This file defines versioning and other identification details
+ *         and supplements QnnCommon.h for DSP backend
+ */
+
+#ifndef QNN_DSP_COMMON_H
+#define QNN_DSP_COMMON_H
+
+#include "QnnCommon.h"
+
+/// DSP Backend identifier
+#define QNN_BACKEND_ID_DSP 5
+
+/// DSP interface provider
+#define QNN_DSP_INTERFACE_PROVIDER_NAME "DSP_QTI_AISW"
+
+// DSP API Version values
+#define QNN_DSP_API_VERSION_MAJOR 5
+#define QNN_DSP_API_VERSION_MINOR 0
+#define QNN_DSP_API_VERSION_PATCH 1
+
+// clang-format off
+
+/// Macro to set Qnn_ApiVersion_t for DSP backend
+#define QNN_DSP_API_VERSION_INIT                                 \
+  {                                                              \
+    {                                                            \
+      QNN_API_VERSION_MAJOR,     /*coreApiVersion.major*/        \
+      QNN_API_VERSION_MINOR,     /*coreApiVersion.minor*/        \
+      QNN_API_VERSION_PATCH      /*coreApiVersion.patch*/        \
+    },                                                           \
+    {                                                            \
+      QNN_DSP_API_VERSION_MAJOR, /*backendApiVersion.major*/     \
+      QNN_DSP_API_VERSION_MINOR, /*backendApiVersion.minor*/     \
+      QNN_DSP_API_VERSION_PATCH  /*backendApiVersion.patch*/     \
+    }                                                            \
+  }
+
+// clang-format on
+
+// DSP Binary Version values
+#define QNN_DSP_BINARY_VERSION_MAJOR 1
+#define QNN_DSP_BINARY_VERSION_MINOR 0
+#define QNN_DSP_BINARY_VERSION_PATCH 0
+
+// DSP Context blob Version values
+#define QNN_DSP_CONTEXT_BLOB_VERSION_MAJOR 1
+#define QNN_DSP_CONTEXT_BLOB_VERSION_MINOR 0
+#define QNN_DSP_CONTEXT_BLOB_VERSION_PATCH 0
+
+#endif // QNN_DSP_COMMON_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspDevice.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspDevice.h
new file mode 100755
index 0000000000000..eecf62f5cbc02
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspDevice.h
@@ -0,0 +1,46 @@
+//=============================================================================
+//
+//  Copyright (c) 2022-2023 Qualcomm Technologies, Inc.
+//  All Rights Reserved.
+//  Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//=============================================================================
+
+/** @file
+ *  @brief QNN DSP component Device API.
+ * + * The interfaces in this file work with the top level QNN + * API and supplements QnnDevice.h for DSP backend + */ +#ifndef QNN_DSP_DEVICE_H +#define QNN_DSP_DEVICE_H + +#include "QnnDevice.h" +#include "QnnDspPerfInfrastructure.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _QnnDevice_Infrastructure_t { + QnnDspPerfInfrastructure_CreatePowerConfigIdFn_t createPowerConfigId; + QnnDspPerfInfrastructure_DestroyPowerConfigIdFn_t destroyPowerConfigId; + QnnDspPerfInfrastructure_SetPowerConfigFn_t setPowerConfig; + QnnDspPerfInfrastructure_SetMemoryConfigFn_t setMemoryConfig; + QnnDspPerfInfrastructure_SetThreadConfigFn_t setThreadConfig; +} QnnDspDevice_Infrastructure_t; + +#define QNN_DSP_DEVICE_INFRASTRUCTURE_INIT \ + { \ + NULL, /*createPowerConfigId*/ \ + NULL, /*destroyPowerConfigId*/ \ + NULL, /*setPowerConfig*/ \ + NULL, /*setMemoryConfig*/ \ + NULL /*setThreadConfig*/ \ + } + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspGraph.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspGraph.h new file mode 100755 index 0000000000000..dd1c5220c8721 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspGraph.h @@ -0,0 +1,171 @@ +//============================================================================= +// +// Copyright (c) 2022-2023 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================= + +/** + * @file + * @brief QNN DSP component Graph API. + * + * The interfaces in this file work with the top level QNN + * API and supplements QnnGraph.h for DSP backend + */ + +#ifndef QNN_DSP_GRAPH_H +#define QNN_DSP_GRAPH_H + +#include "QnnGraph.h" + +#ifdef __cplusplus +extern "C" { +#endif + +//============================================================================= +// Macros +//============================================================================= + +//============================================================================= +// Data Types +//============================================================================= + +/** + * @brief This enum provides different DSP graph optimization + * options that can be used to finalize the graph + * for optimum performance. + */ +typedef enum { + QNN_DSP_GRAPH_OPTIMIZATION_TYPE_SCHEDULE_THRESHOLD = 1, + QNN_DSP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_RETRIES = 2, + QNN_DSP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG = 3, + QNN_DSP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC = 4, + QNN_DSP_GRAPH_OPTIMIZATION_TYPE_UNKNOWN = 0x7fffffff +} QnnDspGraph_OptimizationType_t; + +// clang-format off + +/** + * @brief Struct describing the set of optimization types + * and the values associated with each optimization type. 
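+ *
+ *        For example (a sketch; see the value table below, and note that
+ *        QNN_DSP_GRAPH_OPTIMIZATION_OPTION_INIT is defined further down),
+ *        requesting the more aggressive finalize optimization strategy:
+ *
+ *          QnnDspGraph_OptimizationOption_t opt = QNN_DSP_GRAPH_OPTIMIZATION_OPTION_INIT;
+ *          opt.type       = QNN_DSP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
+ *          opt.floatValue = 2.0f;  // 2 = more optimal graph, longer prepare time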
+ * + * Below is the Map between QnnDspGraph_OptimizationType_t and allowable values: + * + * \verbatim embed:rst:leading-asterisk + * +----+------------------------------------------------------------+-----------------------------------------------------------+ + * | # | OptimizationType option | Allowable values | + * +====+============================================================+===========================================================+ + * | 1 | QNN_DSP_GRAPH_OPTIMIZATION_TYPE_SCHEDULE_THRESHOLD | Reserved | + * +----+------------------------------------------------------------+-----------------------------------------------------------+ + * | 2 | QNN_DSP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_RETRIES | Reserved | + * +----+------------------------------------------------------------+-----------------------------------------------------------+ + * | 3 | QNN_DSP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG | Defines the optimization strategy used by the HTP backend | + * | | | | + * | | | 1 = Faster preparation time, less optimal graph | + * | | | | + * | | | 2 = More optimal graph but may take longer to prepare | + * +----+------------------------------------------------------------+-----------------------------------------------------------+ + * | 4 | QNN_DSP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC | Reserved | + * +----+------------------------------------------------------------+-----------------------------------------------------------+ + * \endverbatim + */ +typedef struct { + QnnDspGraph_OptimizationType_t type; + float floatValue; +} QnnDspGraph_OptimizationOption_t; + +/// QnnDspGraph_OptimizationOption_t initializer macro +#define QNN_DSP_GRAPH_OPTIMIZATION_OPTION_INIT \ + { \ + QNN_DSP_GRAPH_OPTIMIZATION_TYPE_UNKNOWN, /*type*/ \ + 0.0f /*floatValue*/ \ + } +// clang-format on + +/** + * @brief This enum provides different DSP graph configuration + * options associated with QnnGraph + */ +typedef enum { + QNN_DSP_GRAPH_CONFIG_OPTION_OPTIMIZATION = 1, + QNN_DSP_GRAPH_CONFIG_OPTION_ENCODING = 2, + QNN_DSP_GRAPH_CONFIG_OPTION_PRIORITY = 3, + QNN_DSP_GRAPH_CONFIG_OPTION_PRECISION = 4, + QNN_DSP_GRAPH_CONFIG_OPTION_UNKNOWN = 0x7fffffff +} QnnDspGraph_ConfigOption_t; + +typedef enum { + QNN_DSP_GRAPH_ENCODING_DYNAMIC = 1, + /** @deprecated + */ + QNN_DSP_GRAPH_ENCOING_DYNAMIC = QNN_DSP_GRAPH_ENCODING_DYNAMIC, + QNN_DSP_GRAPH_ENCODING_STATIC = 2, + /** @deprecated + */ + QNN_DSP_GRAPH_ENCOING_STATIC = QNN_DSP_GRAPH_ENCODING_STATIC, + QNN_DSP_GRAPH_ENCODING_UNKNOWN = 0x7fffffff, + /** @deprecated + */ + QNN_DSP_GRAPH_ENCOING_UNKNOW = QNN_DSP_GRAPH_ENCODING_UNKNOWN +} QnnDspGraph_Encoding_t; + +//============================================================================= +// Public Functions +//============================================================================= + +//------------------------------------------------------------------------------ +// Implementation Definition +//------------------------------------------------------------------------------ + +// clang-format off + +/** + * @brief Structure describing the set of configurations supported by graph. + * Objects of this type are to be referenced through QnnGraph_CustomConfig_t. + * + * The struct has two fields - option and a union of corresponding config values + * Based on the option corresponding item in the union can be used to specify + * config. 
+ * + * Below is the Map between QnnDspGraph_ConfigOption_t and config value + * + * \verbatim embed:rst:leading-asterisk + * +----+------------------------------------------+------------------------------------+ + * | # | Config Option | Configuration Struct/value | + * +====+==========================================+====================================+ + * | 1 | QNN_DSP_GRAPH_CONFIG_OPTION_OPTIMIZATION | QnnDspGraph_OptimizationOption_t | + * +----+------------------------------------------+------------------------------------+ + * | 2 | QNN_DSP_GRAPH_CONFIG_OPTION_ENCODING | QnnDspGraph_Encoding_t | + * +----+------------------------------------------+------------------------------------+ + * | 3 | QNN_DSP_GRAPH_CONFIG_OPTION_PRECISION | Qnn_Precision_t | + * +----+------------------------------------------+------------------------------------+ + * | 4 | QNN_DSP_GRAPH_CONFIG_OPTION_PRIORITY | Qnn_Priority_t | + * +----+------------------------------------------+------------------------------------+ + * \endverbatim + */ +typedef struct { + QnnDspGraph_ConfigOption_t option; + union { + QnnDspGraph_OptimizationOption_t optimizationOption; + QnnDspGraph_Encoding_t encoding; + Qnn_Priority_t priority; + Qnn_Precision_t precision; + }; +} QnnDspGraph_CustomConfig_t; + +// clang-format on +/// QnnDspGraph_CustomConfig_t initializer macro +#define QNN_DSP_GRAPH_CUSTOM_CONFIG_INIT \ + { \ + QNN_DSP_GRAPH_CONFIG_OPTION_UNKNOWN, /*option*/ \ + { \ + QNN_DSP_GRAPH_OPTIMIZATION_OPTION_INIT /*optimizationOption*/ \ + } \ + } + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspOpPackage.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspOpPackage.h new file mode 100755 index 0000000000000..c8760ecb6b798 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspOpPackage.h @@ -0,0 +1,42 @@ +//============================================================================== +// +// Copyright (c) 2022-2023 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef QNN_DSP_OP_PACKAGE_HPP +#define QNN_DSP_OP_PACKAGE_HPP + +#include "QnnOpPackage.h" +#include "QnnTypes.h" +#include "Udo/UdoImplDsp.h" + +/** + * @brief A struct which defines the Global infrastructure. + */ +typedef struct _QnnOpPackage_GlobalInfrastructure_t { + /// include the UdoMalloc, UdoFree and so on + Udo_DspGlobalInfrastructure_t* dspGlobalInfra; +} QnnDspOpPackage_GlobalInfrastructure_t; + +/** + * @brief A struct which defines the operation info. 
+ */ +typedef struct _QnnOpPackage_OperationInfo_t { + char* opType; + uint32_t numOfStaticParams; + uint32_t numOfInputs; + uint32_t numOfOutputs; + + Udo_CreateOpFactoryFunction_t createOpFactory; + Udo_CreateOperationFunction_t createOperation; + Udo_ExecuteOpFunction_t executeOp; + Udo_ReleaseOpFunction_t releaseOp; + Udo_ReleaseOpFactoryFunction_t releaseOpFactory; + Udo_ValidateOperationFunction_t validateOp; + Udo_QueryOperationFunction_t queryOp; +} QnnDspOpPackage_OperationInfo_t; + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspPerfInfrastructure.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspPerfInfrastructure.h new file mode 100755 index 0000000000000..c9b1aa3020b9e --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspPerfInfrastructure.h @@ -0,0 +1,448 @@ +//============================================================================== +// +// Copyright (c) 2022-2023 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +/** @file + * @brief QNN DSP component Performance Infrastructure API + * + * Provides interface to the client to control performance and system + * settings of the QNN DSP Accelerator + */ + +#ifndef QNN_DSP_PERF_INFRASTRUCTURE_H +#define QNN_DSP_PERF_INFRASTRUCTURE_H + +#include "QnnCommon.h" +#include "QnnTypes.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// max rpc polling time allowed - 9999 us +#define QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIG_MAX_RPC_POLLING_TIME 9999 + +//============================================================================= +// Data Types +//============================================================================= + +/** + * @brief QNN DSP PerfInfrastructure API result / error codes. + * + */ +typedef enum { + QNN_DSP_PERF_INFRASTRUCTURE_MIN_ERROR = QNN_MIN_ERROR_PERF_INFRASTRUCTURE, + //////////////////////////////////////////////////////////////////////// + + QNN_DSP_PERF_INFRASTRUCTURE_NO_ERROR = QNN_SUCCESS, + QNN_DSP_PERF_INFRASTRUCTURE_ERROR_INVALID_HANDLE_PTR = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 0, + QNN_DSP_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 1, + QNN_DSP_PERF_INFRASTRUCTURE_ERROR_UNSUPPORTED_CONFIG = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 2, + QNN_DSP_PERF_INFRASTRUCTURE_ERROR_TRANSPORT = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 3, + QNN_DSP_PERF_INFRASTRUCTURE_ERROR_UNSUPPORTED = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 4, + QNN_DSP_PERF_INFRASTRUCTURE_ERROR_FAILED = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 5, + + //////////////////////////////////////////////////////////////////////// + QNN_DSP_PERF_INFRASTRUCTURE_MAX_ERROR = QNN_MAX_ERROR_PERF_INFRASTRUCTURE, + /// UNDEFINED value that must not be used by client + QNN_DSP_PERF_INFRASTRUCTURE_ERROR_UNDEFINED = 0x7fffffff +} QnnDspPerfInfrastructure_Error_t; + +/** + * @brief Used to allow client start (non-zero value) or stop participating + * (zero value) in DCVS + * + */ +typedef uint32_t QnnDspPerfInfrastructure_DcvsEnable_t; + +/** + * @brief Allows client to set up the sleep latency in microseconds + * + */ +typedef uint32_t QnnDspPerfInfrastructure_SleepLatency_t; + +/** + * @brief Allows client to disable sleep or low power modes. 
+ * Pass a non-zero value to disable sleep in DSP + * + */ +typedef uint32_t QnnDspPerfInfrastructure_SleepDisable_t; + +/** + * @brief sets the minimum size by which user heap should grow + * when heap is exhausted. This API is expected to be + * called only once per backend and has a process wide impact + * + * Grow size provided in bytes and defaults to 16MB + */ +typedef uint32_t QnnDspPerfInfrastructure_MemGrowSize_t; + +/** + * @brief sets the vtcm size to use for graphs that + * are prepared offline. This API should be set up + * before users can finalize a graph offline. It allows + * the QNN DSP backend to configure the serialized + * context for the available vtcm on target + * + * VTCM size provided in MB and does not have a default + */ +typedef uint32_t QnnDspPerfInfrastructure_VtcmSize_t; + +/** + * @brief sets the number of HVX threads for QNN DSP + */ +typedef uint32_t QnnDspPerfInfrastructure_HvxThreadNumber_t; + +/** + * @brief These are the different voltage corners that can + * be requested by the client to influence the voting scheme + * for DCVS + * + */ +typedef enum { + /// Maps to HAP_DCVS_VCORNER_DISABLE. + /// Disable setting up voltage corner + DCVS_VOLTAGE_CORNER_DISABLE = 0x10, + /// Maps to HAP_DCVS_VCORNER_SVS2. + /// Set voltage corner to minimum value supported on platform + DCVS_VOLTAGE_VCORNER_MIN_VOLTAGE_CORNER = 0x20, + /// Maps to HAP_DCVS_VCORNER_SVS2. + /// Set voltage corner to SVS2 value for the platform + DCVS_VOLTAGE_VCORNER_SVS2 = 0x30, + /// Maps to HAP_DCVS_VCORNER_SVS. + /// Set voltage corner to SVS value for the platform + DCVS_VOLTAGE_VCORNER_SVS = 0x40, + /// Maps to HAP_DCVS_VCORNER_SVS_PLUS. + /// Set voltage corner to SVS_PLUS value for the platform + DCVS_VOLTAGE_VCORNER_SVS_PLUS = 0x50, + /// Maps to HAP_DCVS_VCORNER_NOM. + /// Set voltage corner to NOMINAL value for the platform + DCVS_VOLTAGE_VCORNER_NOM = 0x60, + /// Maps to HAP_DCVS_VCORNER_NOM_PLUS. + /// Set voltage corner to NOMINAL_PLUS value for the platform + DCVS_VOLTAGE_VCORNER_NOM_PLUS = 0x70, + /// Maps to HAP_DCVS_VCORNER_TURBO. + /// Set voltage corner to TURBO value for the platform + DCVS_VOLTAGE_VCORNER_TURBO = 0x80, + /// Maps to HAP_DCVS_VCORNER_TURBO_PLUS. + /// Set voltage corner to TURBO_PLUS value for the platform + DCVS_VOLTAGE_VCORNER_TURBO_PLUS = 0x90, + /// Maps to HAP_DCVS_VCORNER_MAX. + /// Set voltage corner to maximum value supported on the platform + DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER = 0xA0, + /// UNKNOWN value that must not be used by client + DCVS_VOLTAGE_VCORNER_UNKNOWN = 0x7fffffff +} QnnDspPerfInfrastructure_VoltageCorner_t; + +/** + * @brief This enum defines all the possible power mode + * that a client can set to influence DCVS mode + */ +typedef enum { + /// Maps to HAP_DCVS_V2_ADJUST_UP_DOWN. + /// Allows for DCVS to adjust up and down + QNN_DSP_PERF_INFRASTRUCTURE_POWERMODE_ADJUST_UP_DOWN = 0x1, + /// Maps to HAP_DCVS_V2_ADJUST_ONLY_UP. + /// Allows for DCVS to adjust up only + QNN_DSP_PERF_INFRASTRUCTURE_POWERMODE_ADJUST_ONLY_UP = 0x2, + /// Maps to HAP_DCVS_V2_POWER_SAVER_MODE. + /// Higher thresholds for power efficiency + QNN_DSP_PERF_INFRASTRUCTURE_POWERMODE_POWER_SAVER_MODE = 0x4, + /// Maps to HAP_DCVS_V2_POWER_SAVER_AGGRESSIVE_MODE. + /// Higher thresholds for power efficiency with faster ramp down + QNN_DSP_PERF_INFRASTRUCTURE_POWERMODE_POWER_SAVER_AGGRESSIVE_MODE = 0x8, + /// Maps to HAP_DCVS_V2_PERFORMANCE_MODE. 
+ /// Lower thresholds for maximum performance
+ QNN_DSP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE = 0x10,
+ /// Maps to HAP_DCVS_V2_DUTY_CYCLE_MODE.
+ /// The value below applies only to HVX clients:
+ /// - For streaming class clients:
+ /// - detects periodicity based on HVX usage
+ /// - lowers clocks in the no HVX activity region of each period.
+ /// - For compute class clients:
+ /// - Lowers clocks when no HVX activity is detected and brings clocks up when HVX activity
+ /// is detected again.
+ /// - Latency involved in bringing up the clock will be at most 1 to 2 ms.
+ QNN_DSP_PERF_INFRASTRUCTURE_POWERMODE_DUTY_CYCLE_MODE = 0x20,
+ /// UNKNOWN value that must not be used by client
+ QNN_DSP_PERF_INFRASTRUCTURE_POWERMODE_UNKNOWN = 0x7fffffff
+} QnnDspPerfInfrastructure_PowerMode_t;
+
+/**
+ * @brief This enum defines all the possible performance
+ * options in Dsp Performance Infrastructure that
+ * relate to setting up power levels
+ */
+typedef enum {
+ /// config enum implies the usage of dcvsEnableConfig struct. For DCVS v2, if not provided,
+ /// this will default to false
+ QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_ENABLE = 1,
+ /// config enum implies the usage of sleepLatencyConfig struct
+ QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_SLEEP_LATENCY = 2,
+ /// config enum implies the usage of sleepDisableConfig struct
+ QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_SLEEP_DISABLE = 3,
+ /// config enum implies the usage of dcvsPowerModeConfig struct. If not provided, power save mode
+ /// will be used
+ QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_POWER_MODE = 4,
+ /// config enum implies the usage of dcvsVoltageCornerConfig struct
+ QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_VOLTAGE_CORNER = 5,
+ /// config enum implies the usage of busVoltageCornerConfig struct
+ QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_BUS_VOLTAGE_CORNER = 6,
+ /// config enum implies the usage of coreVoltageCornerConfig struct
+ QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_CORE_VOLTAGE_CORNER = 7,
+ /// config enum implies the usage of rpcControlLatencyConfig struct
+ QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY = 9,
+ /// config enum implies the usage of rpcPollingTimeConfig struct
+ /// this config is only supported on V69 and later
+ /// if enabled, this config is applied to the entire process
+ /// max allowed is QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIG_MAX_RPC_POLLING_TIME us
+ QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME = 10,
+ /// config HMX timeout interval in us. The HMX is turned off after the set interval
+ /// time if there has been no interaction with it after an inference finishes.
+ QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_HMX_TIMEOUT_INTERVAL_US = 11, + /// UNKNOWN config option which must not be used + QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_UNKNOWN = 0x7fffffff +} QnnDspPerfInfrastructure_PowerConfigOption_t; + +/** + * @brief Allows client to set up the RPC control latency in microseconds + * + */ +typedef uint32_t QnnDspPerfInfrastructure_RpcControlLatency_t; + +/** + * @brief Allows client to set up the RPC polling time in microseconds + */ +typedef uint32_t QnnDspPerfInfrastructure_RpcPollingTime_t; + +/** + * @brief Allows client to set up the HMX timeout interval in microseconds + */ +typedef uint32_t QnnDspPerfInfrastructure_HmxTimeoutIntervalUs_t; + +/** + * @brief This struct provides performance infrastructure configuration + * associated with setting up of power levels + */ +typedef struct { + QnnDspPerfInfrastructure_PowerConfigOption_t config; + union { + QnnDspPerfInfrastructure_DcvsEnable_t dcvsEnableConfig; + QnnDspPerfInfrastructure_SleepLatency_t sleepLatencyConfig; + QnnDspPerfInfrastructure_SleepDisable_t sleepDisableConfig; + QnnDspPerfInfrastructure_PowerMode_t dcvsPowerModeConfig; + QnnDspPerfInfrastructure_VoltageCorner_t dcvsVoltageCornerMinConfig; + QnnDspPerfInfrastructure_VoltageCorner_t dcvsVoltageCornerTargetConfig; + QnnDspPerfInfrastructure_VoltageCorner_t dcvsVoltageCornerMaxConfig; + QnnDspPerfInfrastructure_VoltageCorner_t busVoltageCornerMinConfig; + QnnDspPerfInfrastructure_VoltageCorner_t busVoltageCornerTargetConfig; + QnnDspPerfInfrastructure_VoltageCorner_t busVoltageCornerMaxConfig; + QnnDspPerfInfrastructure_VoltageCorner_t coreVoltageCornerMinConfig; + QnnDspPerfInfrastructure_VoltageCorner_t coreVoltageCornerTargetConfig; + QnnDspPerfInfrastructure_VoltageCorner_t coreVoltageCornerMaxConfig; + QnnDspPerfInfrastructure_RpcControlLatency_t rpcControlLatencyConfig; + QnnDspPerfInfrastructure_RpcPollingTime_t rpcPollingTimeConfig; + QnnDspPerfInfrastructure_HmxTimeoutIntervalUs_t hmxTimeoutIntervalUsConfig; + }; +} QnnDspPerfInfrastructure_PowerConfig_t; + +/// QnnDspPerfInfrastructure_PowerConfig_t initializer macro +#define QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIG_INIT \ + { \ + QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_UNKNOWN, /*config*/ \ + { \ + 0 /*dcvsEnableConfig*/ \ + } \ + } + +/** + * @brief This enum defines all the possible performance + * options in Dsp Performance Infrastructure that + * relate to system memory settings + */ +typedef enum { + /// sets memory grow size + QNN_DSP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_GROW_SIZE = 1, + /// set the size of VTCM configuration (in MB) to use + /// This setting is applicable only for off target usage. 
+ /// For on-target usage, refer QNN_DSP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_VTCM_USAGE_FACTOR + QNN_DSP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_VTCM_SIZE = 2, + /// set the vtcm usage factor on-target + QNN_DSP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_VTCM_USAGE_FACTOR = 3, + /// UNKNOWN config option that must not be used + QNN_DSP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_UNKNOWN = 0x7fffffff +} QnnDspPerfInfrastructure_MemoryConfigOption_t; + +/** + * @brief This enum defines all the possible performance + * options in Dsp Performance Infrastructure that + * relate to thread settings + */ +typedef enum { + /// sets number of HVX threads + QNN_DSP_PERF_INFRASTRUCTURE_THREAD_CONFIGOPTION_NUMBER_OF_HVX_THREADS = 1, + /// UNKNOWN config option that must not be used + QNN_DSP_PERF_INFRASTRUCTURE_THREAD_CONFIGOPTION_UNKNOWN = 0x7fffffff +} QnnDspPerfInfrastructure_ThreadConfigOption_t; + +/** + * @brief This enum defines all the possible vtcm + * usage configuration. These settings apply only + * for on-target libraries + * + */ +typedef enum { + /// use all the vtcm available on target + QNN_DSP_PERF_INFRASTRUCTURE_VTCM_USE_FULL = 1, + /// use bare minimal vtcm available on target. This is + /// not supported in the current release. + QNN_DSP_PERF_INFRASTRUCTURE_VTCM_USE_MIN = 2, + QNN_DSP_PERF_INFRASTRUCTURE_VTCM_USE_UNKNOWN = 0x7fffffff +} QnnDspPerfInfrastructure_VtcmUsageFactor_t; + +/** + * @brief Provides performance infrastructure configuration + * options that are memory specific + */ +typedef struct { + QnnDspPerfInfrastructure_MemoryConfigOption_t config; + union { + QnnDspPerfInfrastructure_MemGrowSize_t memGrowSizeConfig; + QnnDspPerfInfrastructure_VtcmSize_t vtcmSizeInMB; + QnnDspPerfInfrastructure_VtcmUsageFactor_t vtcmUsageConfig; + }; +} QnnDspPerfInfrastructure_MemoryConfig_t; + +/// QnnDspPerfInfrastructure_MemoryConfig_t initializer macro +#define QNN_DSP_PERF_INFRASTRUCTURE_MEMORY_CONFIG_INIT \ + { \ + QNN_DSP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_UNKNOWN, /*config*/ \ + { \ + 0 /*memGrowSizeConfig*/ \ + } \ + } + +/** + * @brief Provides performance infrastructure configuration + * options that are thread specific + */ +typedef struct { + QnnDspPerfInfrastructure_ThreadConfigOption_t config; + union { + QnnDspPerfInfrastructure_HvxThreadNumber_t numHvxThreads; + }; +} QnnDspPerfInfrastructure_ThreadConfig_t; + +/// QnnDspPerfInfrastructure_ThreadConfig_t initializer macro +#define QNN_DSP_PERF_INFRASTRUCTURE_THREAD_CONFIG_INIT \ + { \ + QNN_DSP_PERF_INFRASTRUCTURE_THREAD_CONFIGOPTION_UNKNOWN, /*config*/ \ + { \ + 0 /*numHvxThreads*/ \ + } \ + } + +//============================================================================= +// API Methods +//============================================================================= + +/** + * @brief This API allows client to create power configuration id that + * has to be used to set different performance modes. + * Power configuration id has to be destroyed by client when not needed. + * + * @param[out] powerConfigId Pointer to power configuration id to be created. + * + * + * @return Error code + * \n QNN_SUCCESS: No error encountered + * \n QNN_DSP_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT if power configuration + * id is NULL + */ +typedef Qnn_ErrorHandle_t (*QnnDspPerfInfrastructure_CreatePowerConfigIdFn_t)( + uint32_t* powerConfigId); + +/** + * @brief This API allows client to destroy power configuration id. + * + * @param[in] powerConfigId A power configuration id to be destroyed. 
+ *
+ *
+ * @return Error code
+ * \n QNN_SUCCESS: No error encountered
+ * \n QNN_DSP_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT if power configuration
+ * id does not exist
+ */
+typedef Qnn_ErrorHandle_t (*QnnDspPerfInfrastructure_DestroyPowerConfigIdFn_t)(
+    uint32_t powerConfigId);
+
+/**
+ * @brief This API allows client to set up system power configuration that
+ * will enable different performance modes. This API uses the
+ * HAP_power_dcvs_v3_payload struct to configure HAP power parameters.
+ * For a detailed description of the HAP power parameters, refer to the
+ * Hexagon SDK HAP_power_dcvs_v3_payload documentation.
+ *
+ * @param[in] powerConfigId A power client id to associate calls to system
+ * power settings. A value of 0 implies NULL power client id
+ * and can override every other setting in the user process. To
+ * enable power settings for multiple clients in the same
+ * process, use a non-zero power client id.
+ *
+ *
+ * @param[in] config Pointer to a NULL terminated array
+ * of config options for performance configuration.
+ * NULL is allowed and indicates no config options are provided.
+ *
+ * @return Error code
+ * \n QNN_SUCCESS: No error encountered
+ * \n QNN_DSP_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT if power configuration
+ * does not exist
+ */
+typedef Qnn_ErrorHandle_t (*QnnDspPerfInfrastructure_SetPowerConfigFn_t)(
+    uint32_t powerConfigId, const QnnDspPerfInfrastructure_PowerConfig_t** config);
+
+/**
+ * @brief This API allows clients to set up configuration associated with
+ * system memory
+ *
+ * @param[in] config Pointer to a NULL terminated array
+ * of config options for system memory configuration.
+ * NULL is allowed and indicates no config options are provided.
+ *
+ * @return Error code
+ * \n QNN_SUCCESS: No error encountered
+ */
+typedef Qnn_ErrorHandle_t (*QnnDspPerfInfrastructure_SetMemoryConfigFn_t)(
+    const QnnDspPerfInfrastructure_MemoryConfig_t** config);
+
+/**
+ * @brief This API allows clients to set up configuration for threads
+ *
+ * @param[in] config Pointer to a NULL terminated array
+ * of config options for thread configuration.
+ * NULL is allowed and indicates no config options are provided.
+ *
+ * @note This function should be called after QnnBackend_initialize and
+ * before Context and Graph calls
+ *
+ * @return Error code
+ * \n QNN_SUCCESS: No error encountered
+ * \n QNN_DSP_PERF_INFRASTRUCTURE_ERROR_UNSUPPORTED_CONFIG if invalid
+ * config or value passed
+ * \n QNN_DSP_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT if config is NULL
+ * \n QNN_DSP_PERF_INFRASTRUCTURE_ERROR_TRANSPORT if unable to set the
+ * settings in DSP
+ */
+typedef Qnn_ErrorHandle_t (*QnnDspPerfInfrastructure_SetThreadConfigFn_t)(
+    const QnnDspPerfInfrastructure_ThreadConfig_t** config);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif // QNN_DSP_PERF_INFRASTRUCTURE_H
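The function-pointer typedefs above make up the whole perf-infrastructure surface: create a power config id, vote with it via NULL-terminated config arrays, and destroy it when done. A minimal sketch of the intended call flow, assuming the three pointers have already been resolved from the DSP backend's device infrastructure (resolution is backend-specific and not shown):

```c
// Hedged sketch: function pointers are assumed to be resolved elsewhere.
static Qnn_ErrorHandle_t vote_stable_clocks(
    QnnDspPerfInfrastructure_CreatePowerConfigIdFn_t createId,
    QnnDspPerfInfrastructure_SetPowerConfigFn_t setPowerConfig,
    QnnDspPerfInfrastructure_DestroyPowerConfigIdFn_t destroyId) {
  uint32_t powerConfigId = 0;
  Qnn_ErrorHandle_t err = createId(&powerConfigId);  // non-zero id per the docs above
  if (err != QNN_SUCCESS) return err;

  // Opt out of DCVS so the corner voted below is not adjusted behind our back.
  QnnDspPerfInfrastructure_PowerConfig_t dcvs = QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIG_INIT;
  dcvs.config = QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_ENABLE;
  dcvs.dcvsEnableConfig = 0;  // 0 = do not participate in DCVS

  // Request the TURBO voltage corner as the DCVS target.
  QnnDspPerfInfrastructure_PowerConfig_t corner = QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIG_INIT;
  corner.config = QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_VOLTAGE_CORNER;
  corner.dcvsVoltageCornerTargetConfig = DCVS_VOLTAGE_VCORNER_TURBO;

  // Both Set*Config entry points take NULL-terminated arrays of pointers.
  const QnnDspPerfInfrastructure_PowerConfig_t* configs[] = {&dcvs, &corner, NULL};
  err = setPowerConfig(powerConfigId, configs);

  destroyId(powerConfigId);  // a real client would keep the id alive while the vote matters
  return err;
}
```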
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspProfile.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspProfile.h
new file mode 100755
index 0000000000000..04c1897aa7e18
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspProfile.h
@@ -0,0 +1,244 @@
+//==============================================================================
+//
+// Copyright (c) 2022-2023 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+/**
+ * @file
+ * @brief QNN DSP Profile component API.
+ *
+ * Requires DSP backend to be initialized.
+ * Should be used with the QnnProfile API but has DSP-backend-specific
+ * definitions for different QnnProfile data structures
+ *
+ */
+
+#ifndef QNN_DSP_PROFILE_H
+#define QNN_DSP_PROFILE_H
+
+#include "QnnProfile.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//=============================================================================
+// Macros
+//=============================================================================
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to the remote procedure call on the ARM processor
+ * when client invokes QnnContext_createFromBinary. The value
+ * returned is time in microseconds.
+ *
+ * @note context load binary host rpc time may be available on both
+ * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_CONTEXT_LOAD_BIN_HOST_RPC_TIME_MICROSEC 1002
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to the remote procedure call on the DSP processor
+ * when client invokes QnnContext_createFromBinary. The value
+ * returned is time in microseconds.
+ *
+ * @note context load binary dsp rpc time may be available on both
+ * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_CONTEXT_LOAD_BIN_DSP_RPC_TIME_MICROSEC 1003
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to the time taken to create the context on the
+ * accelerator when client invokes QnnContext_createFromBinary.
+ * The value returned is time in microseconds.
+ *
+ * @note context load binary accelerator time may be available on both
+ * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_CONTEXT_LOAD_BIN_ACCEL_TIME_MICROSEC 1004
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to the remote procedure call on the ARM processor
+ * when client invokes QnnGraph_finalize.
+ * The value returned is time in microseconds.
+ *
+ * @note graph finalize host rpc time may be available on both
+ * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_HOST_RPC_TIME_MICROSEC 2001
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to the remote procedure call on the DSP processor
+ * when client invokes QnnGraph_finalize.
+ * The value returned is time in microseconds.
+ *
+ * @note graph finalize dsp rpc time may be available on both
+ * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_DSP_RPC_TIME_MICROSEC 2002
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to finalizing the graph on the accelerator
+ * when client invokes QnnGraph_finalize.
+ * The value returned is time in microseconds.
+ *
+ * @note graph finalize accelerator time may be available on both
+ * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_ACCEL_TIME_MICROSEC 2003
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to the remote procedure call on the ARM processor
+ * when client invokes QnnGraph_execute or QnnGraph_executeAsync.
+ * The value returned is time in microseconds.
+ *
+ * @note graph execute host rpc time may be available on both
+ * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_HOST_RPC_TIME_MICROSEC 3001
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to the remote procedure call on the DSP processor
+ * when client invokes QnnGraph_execute or QnnGraph_executeAsync.
+ * The value returned is time in microseconds.
+ *
+ * @note graph execute dsp rpc time may be available on both
+ * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_DSP_RPC_TIME_MICROSEC 3002
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to executing the graph on the accelerator
+ * when client invokes QnnGraph_execute or QnnGraph_executeAsync.
+ * The value returned is number of processor cycles taken.
+ *
+ * @note graph execute accelerator time may be available only on
+ * QNN_PROFILE_LEVEL_DETAILED levels
+ *
+ * @note When QNN_PROFILE_LEVEL_DETAILED is used, this event can have
+ * multiple sub-events of type QNN_PROFILE_EVENTTYPE_NODE.
+ * There will be a sub-event for each node that was added to the graph
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_ACCEL_TIME_CYCLE 3003
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to executing the graph on the accelerator
+ * when client invokes QnnGraph_execute or QnnGraph_executeAsync.
+ * The value returned is time taken in microseconds
+ *
+ * @note graph execute accelerator time may be available on both
+ * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ *
+ * @note When QNN_PROFILE_LEVEL_DETAILED is used, this event can have
+ * multiple sub-events of type QNN_PROFILE_EVENTTYPE_NODE / QNN_PROFILE_EVENTUNIT_MICROSEC
+ * There will be a sub-event for each node that was added to the graph
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_ACCEL_TIME_MICROSEC 3004
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to time taken for miscellaneous work, i.e. time
+ * that cannot be attributed to a node but is still needed to
+ * execute the graph on the accelerator. This occurs when client invokes
+ * QnnGraph_execute or QnnGraph_executeAsync.
+ * The value returned is time taken in microseconds
+ *
+ * @note graph execute misc accelerator time is available only on
+ * QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_MISC_ACCEL_TIME_MICROSEC 3005
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to time taken for a graph yield instance to
+ * release all its resources to the other graph.
+ * The value returned is time taken in microseconds.
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_YIELD_INSTANCE_RELEASE_TIME 3006
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to time a graph spends waiting for a higher
+ * priority graph to finish execution.
+ * The value returned is time taken in microseconds
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_YIELD_INSTANCE_WAIT_TIME 3007
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to time a graph spends re-acquiring resources
+ * and restoring vtcm.
+ * The value returned is time taken in microseconds
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_YIELD_INSTANCE_RESTORE_TIME 3008
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to the number of times that a yield occurred
+ * during execution
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_YIELD_COUNT 3009
+
+/**
+ * @brief QnnProfile_EventType_t definition for time a graph waits to get
+ * VTCM. This should be constant UNLESS we need another graph to yield.
+ * The value returned is time taken in microseconds.
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_VTCM_ACQUIRE_TIME 3010
+
+/**
+ * @brief QnnProfile_EventType_t definition for time a graph waits to get
+ * HMX + HVX, and turn them all on.
+ * The value returned is time taken in microseconds.
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_RESOURCE_POWER_UP_TIME 3011
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to the remote procedure call on the ARM processor
+ * when client invokes QnnContext_free, which consequently deinitializes the graph.
+ * The value returned is time in microseconds.
+ *
+ * @note graph deinit host rpc time may be available on both
+ * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_DEINIT_HOST_RPC_TIME_MICROSEC 4001
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to the remote procedure call on the DSP processor
+ * when client invokes QnnContext_free, which consequently deinitializes the graph.
+ * The value returned is time in microseconds.
+ *
+ * @note graph deinit dsp rpc time may be available on both
+ * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_DEINIT_DSP_RPC_TIME_MICROSEC 4002
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to the time taken to deinitialize the graph on the
+ * accelerator when client invokes QnnContext_free, which consequently
+ * deinitializes the graph. The value returned is time in microseconds.
+ *
+ * @note graph deinit accelerator time may be available on both
+ * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_DEINIT_ACCEL_TIME_MICROSEC 4003
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // QNN_DSP_PROFILE_H
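These event types arrive as plain QnnProfile_EventType_t codes when a client walks profile data, so a small translation helper keeps logs readable. A sketch using only the codes defined in this header; the event iteration itself goes through the generic QnnProfile API and is not shown:

```c
// Hedged sketch: map the DSP-specific execute-phase event codes to labels.
static const char* dspExecuteEventLabel(QnnProfile_EventType_t type) {
  switch (type) {
    case QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_HOST_RPC_TIME_MICROSEC:
      return "execute: host RPC (us)";
    case QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_DSP_RPC_TIME_MICROSEC:
      return "execute: DSP RPC (us)";
    case QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_ACCEL_TIME_CYCLE:
      return "execute: accelerator (cycles)";
    case QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_ACCEL_TIME_MICROSEC:
      return "execute: accelerator (us)";
    case QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_YIELD_COUNT:
      return "execute: yield count";
    default:
      return "other/unknown DSP event";
  }
}
```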
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspProperty.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspProperty.h
new file mode 100755
index 0000000000000..39669338e35f8
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspProperty.h
@@ -0,0 +1,30 @@
+//==============================================================================
+//
+// Copyright (c) 2022-2023 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef QNN_DSP_PROPERTY_H
+#define QNN_DSP_PROPERTY_H
+
+#include "QnnProperty.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//=============================================================================
+// Macros
+//=============================================================================
+/**
+ * @brief Property key for determining whether a backend supports unsigned pd.
+ */
+#define QNN_PROPERTY_CUSTOM_DSP_UNSIGNED_PD_SUPPORT QNN_PROPERTY_GROUP_CUSTOM + 1
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // QNN_DSP_PROPERTY_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoBase.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoBase.h
new file mode 100755
index 0000000000000..942e5997ab5ff
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoBase.h
@@ -0,0 +1,509 @@
+//==============================================================================
+//
+// Copyright (c) 2019-2021 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef SNPE_UDO_BASE_H
+#define SNPE_UDO_BASE_H
+
+#include <stdint.h>
+
+// Provide values to use for API version.
+#define API_VERSION_MAJOR 1
+#define API_VERSION_MINOR 6
+#define API_VERSION_TEENY 0
+
+/** @addtogroup c_plus_plus_apis C++
+@{ */
+
+// Defines a bitmask of enum values.
+typedef uint32_t SnpeUdo_Bitmask_t;
+typedef SnpeUdo_Bitmask_t Udo_Bitmask_t;
+
+// A string of characters, rather than an array of bytes.
+// Assumed to be UTF-8.
+typedef char* SnpeUdo_String_t;
+typedef SnpeUdo_String_t Udo_String_t;
+
+// The maximum allowable length of a SnpeUdo_String_t in bytes,
+// including null terminator. SNPE will truncate strings longer
+// than this.
+#define SNPE_UDO_MAX_STRING_SIZE 1024
+
+/**
+ * An enum which holds the various error types.
+ * The error types are divided into classes:
+ * 0 - 99 : generic errors
+ * 100 - 200 : errors related to configuration
+ *
+ */
+typedef enum
+{
+ /// No Error
+ SNPE_UDO_NO_ERROR = 0, UDO_NO_ERROR = 0,
+ /// Unsupported value for core type
+ SNPE_UDO_WRONG_CORE = 1, UDO_WRONG_CORE = 1,
+ /// Invalid attribute/argument passed into UDO API
+ SNPE_UDO_INVALID_ARGUMENT = 2, UDO_INVALID_ARGUMENT = 2,
+ /// Unsupported feature error
+ SNPE_UDO_UNSUPPORTED_FEATURE = 3, UDO_UNSUPPORTED_FEATURE = 3,
+ /// Error relating to memory allocation
+ SNPE_UDO_MEM_ALLOC_ERROR = 4, UDO_MEM_ALLOC_ERROR = 4,
+ /* Configuration Specific errors */
+ /// No op with given attributes available in library
+ SNPE_UDO_WRONG_OPERATION = 100, UDO_WRONG_OPERATION = 100,
+ /// Unsupported value for core type in UDO configuration
+ SNPE_UDO_WRONG_CORE_TYPE = 101, UDO_WRONG_CORE_TYPE = 101,
+ /// Wrong number of params in UDO definition
+ SNPE_UDO_WRONG_NUM_OF_PARAMS = 102, UDO_WRONG_NUM_OF_PARAMS = 102,
+ /// Wrong number of dimensions for tensor(s) in UDO definition
+ SNPE_UDO_WRONG_NUM_OF_DIMENSIONS = 103, UDO_WRONG_NUM_OF_DIMENSIONS = 103,
+ /// Wrong number of input tensors in UDO definition
+ SNPE_UDO_WRONG_NUM_OF_INPUTS = 104, UDO_WRONG_NUM_OF_INPUTS = 104,
+ /// Wrong number of output tensors in UDO definition
+ SNPE_UDO_WRONG_NUM_OF_OUTPUTS = 105, UDO_WRONG_NUM_OF_OUTPUTS = 105,
+ SNPE_UDO_PROGRAM_CACHE_NOT_FOUND = 106, UDO_PROGRAM_CACHE_NOT_FOUND = 106,
+ SNPE_UDO_UNKNOWN_ERROR = 0xFFFFFFFF, UDO_UNKNOWN_ERROR = 0xFFFFFFFF
+} SnpeUdo_ErrorType_t;
+
+typedef SnpeUdo_ErrorType_t Udo_ErrorType_t;
+
+/**
+ * An enum which holds the various data types.
+ * Designed to be used as single values or combined into a bitfield parameter
+ * (0x1, 0x2, 0x4, etc)
+ * \n FIXED_XX types are targeted for data in tensors.
+ * \n UINT / INT types are targeted for scalar params + */ +typedef enum +{ + /// data type: 16-bit floating point + SNPE_UDO_DATATYPE_FLOAT_16 = 0x01, UDO_DATATYPE_FLOAT_16 = 0x01, + /// data type: 32-bit floating point + SNPE_UDO_DATATYPE_FLOAT_32 = 0x02, UDO_DATATYPE_FLOAT_32 = 0x02, + /// data type: 4-bit fixed point + SNPE_UDO_DATATYPE_FIXED_4 = 0x04, UDO_DATATYPE_FIXED_4 = 0x04, + /// data type: 8-bit fixed point + SNPE_UDO_DATATYPE_FIXED_8 = 0x08, UDO_DATATYPE_FIXED_8 = 0x08, + /// data type: 16-bit fixed point + SNPE_UDO_DATATYPE_FIXED_16 = 0x10, UDO_DATATYPE_FIXED_16 = 0x10, + /// data type: 32-bit fixed point + SNPE_UDO_DATATYPE_FIXED_32 = 0x20, UDO_DATATYPE_FIXED_32 = 0x20, + /// data type: 8-bit unsigned integer + SNPE_UDO_DATATYPE_UINT_8 = 0x100, UDO_DATATYPE_UINT_8 = 0x100, + /// data type: 16-bit unsigned integer + SNPE_UDO_DATATYPE_UINT_16 = 0x200, UDO_DATATYPE_UINT_16 = 0x200, + /// data type: 32-bit unsigned integer + SNPE_UDO_DATATYPE_UINT_32 = 0x400, UDO_DATATYPE_UINT_32 = 0x400, + /// data type: 8-bit signed integer + SNPE_UDO_DATATYPE_INT_8 = 0x1000, UDO_DATATYPE_INT_8 = 0x1000, + /// data type: 16-bit signed integer + SNPE_UDO_DATATYPE_INT_16 = 0x2000, UDO_DATATYPE_INT_16 = 0x2000, + /// data type: 32-bit signed integer + SNPE_UDO_DATATYPE_INT_32 = 0x4000, UDO_DATATYPE_INT_32 = 0x4000, + SNPE_UDO_DATATYPE_LAST = 0xFFFFFFFF, UDO_DATATYPE_LAST = 0xFFFFFFFF +} SnpeUdo_DataType_t; + +typedef SnpeUdo_DataType_t Udo_DataType_t; + +/** + * An enum which holds the various layouts. + * Designed to be used as single values or combined into a bitfield parameter + * (0x1, 0x2, 0x4, etc) + */ +typedef enum +{ + /// data layout (4D): NHWC (batch-height-width-channel) + SNPE_UDO_LAYOUT_NHWC = 0x01, UDO_LAYOUT_NHWC = 0x01, + /// data layout (4D): NCHW (batch-channel-height-width) + SNPE_UDO_LAYOUT_NCHW = 0x02, UDO_LAYOUT_NCHW = 0x02, + /// data layout (5D): NDHWC (batch-dimension-height-width-channel) + SNPE_UDO_LAYOUT_NDHWC = 0x04, UDO_LAYOUT_NDHWC = 0x04, + SNPE_UDO_LAYOUT_GPU_OPTIMAL1 = 0x08, UDO_LAYOUT_GPU_OPTIMAL1 = 0x08, + SNPE_UDO_LAYOUT_GPU_OPTIMAL2 = 0x10, UDO_LAYOUT_GPU_OPTIMAL2 = 0x10, + SNPE_UDO_LAYOUT_DSP_OPTIMAL1 = 0x11, UDO_LAYOUT_DSP_OPTIMAL1 = 0x11, + SNPE_UDO_LAYOUT_DSP_OPTIMAL2 = 0x12, UDO_LAYOUT_DSP_OPTIMAL2 = 0x12, + // Indicates no data will be allocated for this tensor. + // Used to specify optional inputs/outputs positionally. + SNPE_UDO_LAYOUT_NULL = 0x13, UDO_LAYOUT_NULL = 0x13, + SNPE_UDO_LAYOUT_LAST = 0xFFFFFFFF, UDO_LAYOUT_LAST = 0xFFFFFFFF +} SnpeUdo_TensorLayout_t; + +typedef SnpeUdo_TensorLayout_t Udo_TensorLayout_t; + +/** + * An enum which holds the UDO library Core type . 
+ * Designed to be used as single values or combined into a bitfield parameter + * (0x1, 0x2, 0x4, etc) + */ +typedef enum +{ + /// Library target IP Core is undefined + SNPE_UDO_CORETYPE_UNDEFINED = 0x00, UDO_CORETYPE_UNDEFINED = 0x00, + /// Library target IP Core is CPU + SNPE_UDO_CORETYPE_CPU = 0x01, UDO_CORETYPE_CPU = 0x01, + /// Library target IP Core is GPU + SNPE_UDO_CORETYPE_GPU = 0x02, UDO_CORETYPE_GPU = 0x02, + /// Library target IP Core is DSP + SNPE_UDO_CORETYPE_DSP = 0x04, UDO_CORETYPE_DSP = 0x04, + SNPE_UDO_CORETYPE_LAST = 0xFFFFFFFF, UDO_CORETYPE_LAST = 0xFFFFFFFF +} SnpeUdo_CoreType_t; + +typedef SnpeUdo_CoreType_t Udo_CoreType_t; + +/** + * An enum to specify the parameter type : Scalar or Tensor + */ +typedef enum +{ + /// UDO static param type: scalar + SNPE_UDO_PARAMTYPE_SCALAR = 0x00, UDO_PARAMTYPE_SCALAR = 0x00, + /// UDO static param type: string + SNPE_UDO_PARAMTYPE_STRING = 0x01, UDO_PARAMTYPE_STRING = 0x01, + /// UDO static param type: tensor + SNPE_UDO_PARAMTYPE_TENSOR = 0x02, UDO_PARAMTYPE_TENSOR = 0x02, + SNPE_UDO_PARAMTYPE_LAST = 0xFFFFFFFF, UDO_PARAMTYPE_LAST = 0xFFFFFFFF +} SnpeUdo_ParamType_t; + +typedef SnpeUdo_ParamType_t Udo_ParamType_t; + +/** + * An enum to specify quantization type + */ +typedef enum +{ + /// Tensor Quantization type: NONE. Signifies unquantized tensor data + SNPE_UDO_QUANTIZATION_NONE = 0x00, UDO_QUANTIZATION_NONE = 0x00, + /// Tensor Quantization type: Tensorflow-style + SNPE_UDO_QUANTIZATION_TF = 0x01, UDO_QUANTIZATION_TF = 0x01, + SNPE_UDO_QUANTIZATION_QMN = 0x02, UDO_QUANTIZATION_QMN = 0x02, + SNPE_UDO_QUANTIZATION_LAST = 0xFFFFFFFF, UDO_QUANTIZATION_LAST = 0xFFFFFFFF +} SnpeUdo_QuantizationType_t; + +typedef SnpeUdo_QuantizationType_t Udo_QuantizationType_t; + +/** + * @brief A struct which is used to provide a version number using 3 values : major, minor, teeny + * + */ +typedef struct +{ + /// version field: major - for backward-incompatible changes + uint32_t major; + /// version field: minor - for backward-compatible feature updates + uint32_t minor; + /// version field: teeny - for minor bug-fixes and clean-up + uint32_t teeny; +} SnpeUdo_Version_t; + +typedef SnpeUdo_Version_t Udo_Version_t; + +/** + * @brief A struct returned from version query, contains the Library version and API version + * + */ +typedef struct +{ + /// Version of UDO library. Controlled by users + SnpeUdo_Version_t libVersion; + /// Version of SNPE UDO API used in compiling library. Determined by SNPE + SnpeUdo_Version_t apiVersion; +} SnpeUdo_LibVersion_t; + +/** + * @brief A struct returned from version query, contains the package version + * + */ +typedef struct +{ + /// Version of UDO API used in package. + Udo_Version_t apiVersion; +} Udo_PkgVersion_t; + +/** + * @brief A union to hold the value of a generic type. Allows defining a parameter struct + * in a generic way, with a "value" location that holds the data regardless of the type. 
+ * + */ +typedef union +{ + /// value type: float + float floatValue; + /// value type: unsigned 32-bit integer + uint32_t uint32Value; + /// value type: signed 32-bit integer + int32_t int32Value; + /// value type: unsigned 16-bit integer + uint16_t uint16Value; + /// value type: signed 16-bit integer + int16_t int16Value; + /// value type: unsigned 8-bit integer + uint8_t uint8Value; + /// value type: signed 8-bit integer + int8_t int8Value; +} SnpeUdo_Value_t; + +typedef SnpeUdo_Value_t Udo_Value_t; + +/** + * @brief A struct which defines a scalar parameter : name, data type, and union of values + * + */ +typedef struct +{ + /// The parameter data type : float, int, etc. + SnpeUdo_DataType_t dataType; + /// a union of specified type which holds the data + SnpeUdo_Value_t dataValue; +} SnpeUdo_ScalarParam_t; + +typedef SnpeUdo_ScalarParam_t Udo_ScalarParam_t; + +/** + * @brief A struct which defines the quantization parameters in case of Tensorflow style quantization + * + */ +typedef struct +{ + /// minimum value of the quantization range of data + float minValue; + /// maximum value of the quantization range of data + float maxValue; +} SnpeUdo_TFQuantize_t; + +typedef SnpeUdo_TFQuantize_t Udo_TFQuantize_t; + +/** + * @brief A struct which defines the quantization type, and union of supported quantization structs + * + */ +typedef struct +{ + /// quantization type (only TF-style currently supported) + SnpeUdo_QuantizationType_t quantizeType; + union + { + /// TF-style min-max quantization ranges + SnpeUdo_TFQuantize_t TFParams; + }; +} SnpeUdo_QuantizeParams_t; + +typedef SnpeUdo_QuantizeParams_t Udo_QuantizeParams_t; + +/** + * @brief A struct which defines the datatype associated with a specified core-type + * This should be used to denote the datatypes for a single tensor info, depending + * on the intended execution core. + * + */ +typedef struct +{ + /// The IP Core + SnpeUdo_CoreType_t coreType; + /// The associated datatype for this coreType + SnpeUdo_DataType_t dataType; +} SnpeUdo_PerCoreDatatype_t; + +typedef SnpeUdo_PerCoreDatatype_t Udo_PerCoreDatatype_t; + +/** + * @brief A struct which defines a tensor parameter : name, data type, layout, quantization, more. + * Also holds a pointer to the tensor data. + * + */ +typedef struct +{ + /// The maximum allowable dimensions of the tensor. The memory held in + /// _tensorData_ is guaranteed to be large enough for this. + uint32_t* maxDimensions; + /// The current dimensions of the tensor. An operation may modify the current + /// dimensions of its output, to indicate cases where the output has been + /// "resized". + /// Note that for static parameters, the current and max dimensions must + /// match. + uint32_t* currDimensions; + /// Quantization params applicable to the tensor. Currently only supports + /// Tensorflow quantization style. + SnpeUdo_QuantizeParams_t quantizeParams; + /// Number of dimensions to the tensor: 3D, 4D, etc. + uint32_t tensorRank; + /// The parameter data type: float, int, etc. + SnpeUdo_DataType_t dataType; + /// The tensor layout type: NCHW, NHWC, etc. + SnpeUdo_TensorLayout_t layout; + /// Opaque pointer to tensor data. User may be required to re-interpret the pointer + /// based on core-specific definitions. 
+ void* tensorData; +} SnpeUdo_TensorParam_t; + +typedef SnpeUdo_TensorParam_t Udo_TensorParam_t; + +/** + * @brief struct which defines a UDO parameter - a union of scalar, tensor and string parameters + * + */ +typedef struct +{ + /// Type is scalar or tensor + SnpeUdo_ParamType_t paramType; + /// The param name, for example : "offset", "activation_type" + SnpeUdo_String_t paramName; + union + { + /// scalar param value + SnpeUdo_ScalarParam_t scalarParam; + /// tensor param value + SnpeUdo_TensorParam_t tensorParam; + /// string param value + SnpeUdo_String_t stringParam; + }; +} SnpeUdo_Param_t; + +typedef SnpeUdo_Param_t Udo_Param_t; + +/** + * @brief A struct which defines Operation information which is specific for IP core (CPU, GPU, DSP ...) + * + */ +typedef struct +{ + /// The IP Core + SnpeUdo_CoreType_t udoCoreType; + /// Bitmask, defines supported internal calculation types (like FLOAT_32, etc) + /// Based on SnpeUdo_DataType + SnpeUdo_Bitmask_t operationCalculationTypes; +} SnpeUdo_OpCoreInfo_t; + +typedef SnpeUdo_OpCoreInfo_t Udo_OpCoreInfo_t; + +/** + * @brief A struct which defines the common and core-specific Operation information + * + */ +typedef struct +{ + /// Operation type + SnpeUdo_String_t operationType; + /// A bitmask describing which IP Cores (CPU, GPU, DSP ...) support this operation + /// Translated based on SnpeUdo_CoreType + SnpeUdo_Bitmask_t supportedByCores; + /// Number of static parameters defined by the op + uint32_t numOfStaticParams; + /// Array of static parameters. Can be scalar or tensor params + SnpeUdo_Param_t* staticParams; + /// Number of input tensors this op receives + uint32_t numOfInputs; + /// Array of input tensor names to this operation + SnpeUdo_String_t* inputNames; + /// Number of output tensors this op receives + uint32_t numOfOutputs; + /// Array of output tensor names to this operation + SnpeUdo_String_t* outputNames; + /// Number of cores that the op can execute on + uint32_t numOfCoreInfo; + /// Array of per-core information entries + SnpeUdo_OpCoreInfo_t* opPerCoreInfo; +} SnpeUdo_OperationInfo_t; + +typedef SnpeUdo_OperationInfo_t Udo_OperationInfo_t; + +/** + * @brief A struct which provides the implementation library info : type, name + * + */ +typedef struct +{ + /// Defines the IP Core that this implementation library is targeting + SnpeUdo_CoreType_t udoCoreType; + /// library name. will be looked at in the standard library path + SnpeUdo_String_t libraryName; +} SnpeUdo_LibraryInfo_t; + +typedef SnpeUdo_LibraryInfo_t Udo_LibraryInfo_t; + +/** + * @brief A struct returned by the registration library and contains information on the UDO package : + * name, operations, libraries, etc. + * + */ +typedef struct +{ + /// A string containing the package name + SnpeUdo_String_t packageName; + /// A bitmask describing supported IP cores (CPU, GPU, DSP ...) + /// Translated based on SnpeUdo_CoreType + SnpeUdo_Bitmask_t supportedCoreTypes; + /// The number of implementation libraries in the package + uint32_t numOfImplementationLib; + /// Array of implementation libraries names/types + SnpeUdo_LibraryInfo_t* implementationLib; + /// A string containing all operation types separated by space + SnpeUdo_String_t operationsString; + /// Number of supported operations + uint32_t numOfOperations; + /// Array of Operation info structs. 
Each entry describes one
+ /// Operation (name, params, inputs, outputs)
+ SnpeUdo_OperationInfo_t* operationsInfo;
+} SnpeUdo_RegInfo_t;
+
+typedef SnpeUdo_RegInfo_t Udo_RegInfo_t;
+
+/**
+* @brief A struct returned by the implementation library which contains information on the
+* specific library: name, IP Core, operations, etc.
+*
+*/
+typedef struct
+{
+ /// Defines the IP Core that this implementation library is targeting
+ SnpeUdo_CoreType_t udoCoreType;
+ /// A string containing the package name
+ SnpeUdo_String_t packageName;
+ /// A string containing all operation types separated by space
+ SnpeUdo_String_t operationsString;
+ /// Number of supported operations
+ uint32_t numOfOperations;
+} SnpeUdo_ImpInfo_t;
+
+typedef SnpeUdo_ImpInfo_t Udo_ImpInfo_t;
+
+/**
+ * @brief This struct defines an operation. It is used for validation
+ * or creation of an operation.
+ * In case of using it for creation, the static params which are tensors
+ * contain pointers to the real data (weights, for example), and input/output
+ * tensors also include pointers to the buffers used.
+ */
+typedef struct
+{
+ /// The IP Core that the operation is defined for - CPU, GPU, DSP...
+ SnpeUdo_CoreType_t udoCoreType;
+ /// Operation type
+ SnpeUdo_String_t operationType;
+ /// The number of static parameters provided in the staticParams array.
+ /// This number has to match the number provided by the UDO Registration library information
+ uint32_t numOfStaticParams;
+ /// Array of static parameters
+ SnpeUdo_Param_t* staticParams;
+ /// The number of input parameters provided in the inputs array.
+ /// This number has to match the number provided by the UDO Registration library information
+ uint32_t numOfInputs;
+ /// Array of input tensors, providing layout, data type, sizes, etc
+ /// When used to create an operation, also contains the initial location of the data
+ SnpeUdo_TensorParam_t* inputs;
+ /// The number of output parameters provided in the outputs array.
+ /// This number has to match the number provided by the UDO Registration library information
+ uint32_t numOfOutputs;
+ /// Array of output tensors, providing layout, data type, sizes, etc
+ /// When used to create an operation, also contains the initial location of the data
+ SnpeUdo_TensorParam_t* outputs;
+} SnpeUdo_OpDefinition_t;
+
+typedef SnpeUdo_OpDefinition_t Udo_OpDefinition_t;
+
+/** @} */ /* end_addtogroup c_plus_plus_apis C++ */
+
+#endif //SNPE_UDO_BASE_H
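The structs above compose naturally: a scalar SnpeUdo_Param_t wraps a SnpeUdo_ScalarParam_t, which wraps the value union, and the *_Bitmask_t fields are tested with bitwise AND against the enum flags. A small illustrative sketch; the parameter name and values are made up:

```c
// Hedged sketch: build a scalar static param and probe a core-type bitmask.
SnpeUdo_Param_t axisParam;
axisParam.paramType = SNPE_UDO_PARAMTYPE_SCALAR;
axisParam.paramName = (SnpeUdo_String_t) "axis";   // example name, not from the SDK
axisParam.scalarParam.dataType = SNPE_UDO_DATATYPE_INT_32;
axisParam.scalarParam.dataValue.int32Value = 1;

// supportedByCores/supportedCoreTypes are bitmasks built from SnpeUdo_CoreType_t.
SnpeUdo_Bitmask_t cores = SNPE_UDO_CORETYPE_CPU | SNPE_UDO_CORETYPE_DSP;
int runsOnDsp = (cores & SNPE_UDO_CORETYPE_DSP) != 0;  // evaluates to 1
```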
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoFlatten.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoFlatten.h
new file mode 100755
index 0000000000000..84a8fe310908e
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoFlatten.h
@@ -0,0 +1,78 @@
+//==============================================================================
+//
+// Copyright (c) 2019 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#include "DSP/Udo/UdoBase.h"
+
+#define HVX_ALIGNMENT 128
+#define DSP_STRUCT_ALIGNMENT 8
+#define DSP_ALIGN(X, ALIGNMENT) (((X) + ALIGNMENT - 1) & (~((ALIGNMENT)-1)))
+
+typedef struct dspStaticParamsMeta {
+ uint32_t size;
+ uint32_t numParams;
+} dspStaticParamsMeta_t;
+
+typedef struct tensorParamInfo {
+ SnpeUdo_TensorLayout_t layout;
+ SnpeUdo_QuantizeParams_t quantizeInfo;
+ SnpeUdo_DataType_t dataType;
+ uint32_t paddingFor8byteAlignment;
+} tensorParamInfo_t;
+
+typedef struct udoString {
+ uint32_t sizeStruct; // aligned
+ uint32_t lengthString; // does not include null character
+ // followed by a string
+} udoString_t; // allocate mem for string for 8 byte alignment
+
+typedef struct dims {
+ uint32_t size;
+ uint32_t rank;
+ uint32_t ds; // rank # of max dimensions followed by rank # of current dimensions for tensors
+} dims_t;
+
+typedef struct tensorData {
+ uint32_t structSize;
+ uint32_t dataSize;
+ // followed by actual tensor data
+} tensorData_t;
+
+typedef struct dspStaticParamDescriptor {
+ uint32_t size; // including size of descriptor (including dims + data for tensors) (or including string for strings)
+ SnpeUdo_ParamType_t paramType;
+ union { // not used for string data
+ SnpeUdo_ScalarParam_t scalarInfo;
+ tensorParamInfo_t tensorInfo;
+ };
+ udoString_t name;
+ // followed by char*
+ // in case of tensor, followed by dim_stride and tensor_data
+ // in case of string, followed by udo_string and char*
+} dspStaticParamDescriptor_t;
+
+typedef struct paramSizes {
+ uint32_t descriptorSize;
+ uint32_t nameStructSize;
+ uint32_t dimsSize;
+ uint32_t dataStructSize;
+ uint32_t dataSize;
+ uint32_t stringDataStructSize;
+} paramSizes_t;
+
+typedef struct dspStaticParams {
+ dspStaticParamsMeta_t meta;
+ dspStaticParamDescriptor_t paramDesc;
+} dspStaticParams_t;
+
+
+int
+SnpeUdo_flattenStaticParams (SnpeUdo_Param_t** paramList, uint32_t numParams, uint32_t* flattenedSize, void** flattened);
+
+void
+SnpeUdo_freeFlattenedStaticParams (void** flattened);
+
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoImpl.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoImpl.h
new file mode 100755
index 0000000000000..bcc767a3c4a0f
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoImpl.h
@@ -0,0 +1,343 @@
+//==============================================================================
+//
+// Copyright (c) 2019-2021 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef SNPE_UDO_IMPL_H
+#define SNPE_UDO_IMPL_H
+
+#include <stdbool.h>
+
+#include "DSP/Udo/UdoShared.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/** @addtogroup c_plus_plus_apis C++
+@{ */
+
+typedef struct _SnpeUdo_OpFactory_t* SnpeUdo_OpFactory_t;
+typedef struct _SnpeUdo_Operation_t* SnpeUdo_Operation_t;
+
+typedef SnpeUdo_OpFactory_t Udo_OpFactory_t;
+typedef SnpeUdo_Operation_t Udo_Operation_t;
+
+/**
+ * @brief Initialize the shared library's data structures. Calling any other
+ * library function before this one will result in an error.
+ *
+ * @param[in] globalInfrastructure Global core-specific infrastructure to be
+ * used by operations created in this library. The definition and
+ * semantics of this object will be defined in the corresponding
+ * implementation header for the core type.
+ * @return Error code
+ */
+SnpeUdo_ErrorType_t
+SnpeUdo_initImplLibrary(void* globalInfrastructure);
+
+typedef SnpeUdo_ErrorType_t
+(*SnpeUdo_InitImplLibraryFunction_t)(void*);
+
+/**
+ * @brief A function to query the API version of the UDO implementation library.
+ * The function populates a SnpeUdo_LibVersion_t struct, which contains a SnpeUdo_Version_t
+ * struct for API version and library version.
+ *
+ * @param[in, out] version A pointer to struct which contains major, minor, teeny information for
+ * library and api versions.
+ *
+ * @return Error code
+ */
+SnpeUdo_ErrorType_t
+SnpeUdo_getImplVersion(SnpeUdo_LibVersion_t** version);
+
+typedef SnpeUdo_ErrorType_t
+(*SnpeUdo_getImplVersion_t)(SnpeUdo_LibVersion_t** version);
+
+/**
+ * @brief Release the shared library's data structures, and invalidate any
+ * handles returned by the library. The behavior of any outstanding
+ * asynchronous calls made to this library when this function is called
+ * is undefined. All library functions (except SnpeUdo_initImplLibrary) will
+ * return an error after this function has been successfully called.
+ *
+ * It should be possible to call SnpeUdo_initImplLibrary after calling this
+ * function, and re-initialize the library.
+ *
+ * @return Error code
+ */
+SnpeUdo_ErrorType_t
+SnpeUdo_terminateImplLibrary(void);
+
+typedef SnpeUdo_ErrorType_t
+(*SnpeUdo_TerminateImplLibraryFunction_t)(void);
+
+
+/**
+ * @brief A function to query info on the UDO implementation library.
+ * The function populates a structure which contains information about
+ * operations that are part of this library
+ *
+ * @param[in, out] implementationInfo A pointer to struct which contains information
+ * on the operations
+ *
+ * @return Error code
+ *
+ */
+SnpeUdo_ErrorType_t
+SnpeUdo_getImpInfo(SnpeUdo_ImpInfo_t** implementationInfo);
+
+typedef SnpeUdo_ErrorType_t
+(*SnpeUdo_GetImpInfoFunction_t)(SnpeUdo_ImpInfo_t** implementationInfo);
+
+typedef SnpeUdo_GetImpInfoFunction_t Udo_GetImpInfoFunction_t;
+
+/**
+ * @brief A function to create an operation factory.
+ * The function receives the operation type, and an array of static parameters,
+ * and returns an operation factory handle
+ *
+ * @param[in] udoCoreType The Core type to create the operation on. An error will
+ * be returned if this does not match the core type of the library.
+ *
+ * @param[in] perFactoryInfrastructure CreateOpFactory infrastructure appropriate to this
+ * core type. The definition and semantics of this object will be defined
+ * in the corresponding implementation header for the core type.
+ *
+ * @param[in] operationType A string containing the Operation type, for example "MY_CONV"
+ *
+ * @param[in] numOfStaticParams The number of static parameters.
+ *
+ * @param[in] staticParams Array of static parameters
+ *
+ * @param[in,out] opFactory Handle to Operation Factory, to be used when creating operations
+ *
+ * @return Error Code
+ */
+SnpeUdo_ErrorType_t
+SnpeUdo_createOpFactory(SnpeUdo_CoreType_t udoCoreType,
+ void* perFactoryInfrastructure,
+ SnpeUdo_String_t operationType,
+ uint32_t numOfStaticParams,
+ SnpeUdo_Param_t* staticParams,
+ SnpeUdo_OpFactory_t* opFactory);
+
+typedef SnpeUdo_ErrorType_t
+(*SnpeUdo_CreateOpFactoryFunction_t)(SnpeUdo_CoreType_t,
+ void*,
+ SnpeUdo_String_t,
+ uint32_t,
+ SnpeUdo_Param_t*,
+ SnpeUdo_OpFactory_t*);
+
+typedef SnpeUdo_CreateOpFactoryFunction_t Udo_CreateOpFactoryFunction_t;
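Each public entry point is paired with a function-pointer typedef because the implementation library is loaded dynamically by the runtime. A hedged sketch of how such resolution might look with dlopen/dlsym; the library path is a placeholder and error handling is abbreviated:

```c
#include <dlfcn.h>
#include <stddef.h>

// Hedged sketch: resolve and call the UDO impl init entry point at runtime.
// globalInfra comes from the core-specific runtime (see the header text above).
static SnpeUdo_ErrorType_t load_udo_impl(const char* path, void* globalInfra) {
  void* handle = dlopen(path, RTLD_NOW | RTLD_LOCAL);
  if (handle == NULL) return SNPE_UDO_UNKNOWN_ERROR;
  SnpeUdo_InitImplLibraryFunction_t initFn =
      (SnpeUdo_InitImplLibraryFunction_t)dlsym(handle, "SnpeUdo_initImplLibrary");
  if (initFn == NULL) {
    dlclose(handle);
    return SNPE_UDO_UNKNOWN_ERROR;
  }
  return initFn(globalInfra);
}
```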
+
+/**
+ * @brief A function to release the resources allocated for an operation factory
+ * created by this library.
+ *
+ * @param[in] opFactory The operation factory to release. Upon success this handle will be invalidated.
+ *
+ * @return Error Code
+ */
+SnpeUdo_ErrorType_t
+SnpeUdo_releaseOpFactory(SnpeUdo_OpFactory_t opFactory);
+
+typedef SnpeUdo_ErrorType_t
+(*SnpeUdo_ReleaseOpFactoryFunction_t)(SnpeUdo_OpFactory_t);
+
+typedef SnpeUdo_ReleaseOpFactoryFunction_t Udo_ReleaseOpFactoryFunction_t;
+
+/**
+ * @brief A function to create an operation from the factory.
+ * The function receives an array of inputs and an array of outputs, and creates an operation
+ * instance, returning the operation instance handle.
+ *
+ * @param[in] opFactory OpFactory instance containing the parameters for this operation.
+ *
+ * @param[in] perOpInfrastructure Per-Op infrastructure for this operation. The definition
+ * and semantics of this object will be defined in the implementation header
+ * appropriate to this core type.
+ *
+ * @param[in] numOfInputs The number of input tensors this operation will receive.
+ *
+ * @param[in] inputs Array of input tensors, providing both the sizes and initial
+ * location of the data.
+ *
+ * @param[in] numOfOutputs Number of output tensors this operation will produce.
+ *
+ * @param[in] outputs Array of output tensors, providing both the sizes and
+ * initial location of the data.
+ *
+ * @param[in,out] operation Handle for newly created operation instance.
+ *
+ * @return Error Code
+ */
+SnpeUdo_ErrorType_t
+SnpeUdo_createOperation(SnpeUdo_OpFactory_t opFactory,
+ void* perOpInfrastructure,
+ uint32_t numOfInputs,
+ SnpeUdo_TensorParam_t* inputs,
+ uint32_t numOfOutputs,
+ SnpeUdo_TensorParam_t* outputs,
+ SnpeUdo_Operation_t* operation);
+
+typedef SnpeUdo_ErrorType_t
+(*SnpeUdo_CreateOperationFunction_t)(SnpeUdo_OpFactory_t,
+ void*,
+ uint32_t,
+ SnpeUdo_TensorParam_t*,
+ uint32_t,
+ SnpeUdo_TensorParam_t*,
+ SnpeUdo_Operation_t*);
+
+typedef SnpeUdo_CreateOperationFunction_t Udo_CreateOperationFunction_t;
+
+/**
+ * @brief A pointer to a notification function.
+ *
+ * The notification function supports the non-blocking (e.g. asynchronous) execution use-case.
+ * In case an "executeUdoOp" function is called with "blocking" set to zero, and a
+ * notify function, this function will be called by the implementation library at the
+ * end of execution. The implementation library will pass the notify function the ID
+ * that was provided to it when "executeUdoOp" was called.
+ *
+ * @param[in] ID 32-bit value, that was provided to executeUdoOp by the calling entity.
+ * Can be used to track the notifications, in case of multiple execute calls issued.
+ *
+ * @return Error code
+ *
+ */
+typedef SnpeUdo_ErrorType_t
+(*SnpeUdo_ExternalNotify_t)(const uint32_t ID);
+
+typedef SnpeUdo_ExternalNotify_t Udo_ExternalNotify_t;
+
+/**
+ * @brief Operation execution function.
+ *
+ * Calling this function will run the operation on a set of inputs, generating a set of outputs.
+ * The call can be blocking (synchronous) or non-blocking (asynchronous). To support the
+ * non-blocking mode, the calling entity can pass an ID and a notification function.
+ * At the end of the execution this notification function would be called, passing it the ID.
+ * NOTE: Asynchronous execution mode not supported in this release.
+ *
+ * @param[in] operation handle to the operation on which execute is invoked
+ * @param[in] blocking flag to indicate execution mode.
+ * If set, execution is blocking,
+ * e.g. SnpeUdo_executeOp call does not return until execution is done.
+ * If not set, SnpeUdo_executeOp returns immediately, and the
+ * library will call the notification function (if set) when execution is done.
+ *
+ * @param[in] ID 32-bit number that can be used by the calling entity to track execution
+ * in case of non-blocking execution.
+ * For example, it can be a sequence number, increased by one on each call.
+ *
+ * @param[in] notifyFunc Pointer to notification function. If the pointer is set, and execution is
+ * non-blocking, the library will call this function at end of execution,
+ * passing the number provided as ID.
+ *
+ * @return Error code
+ *
+ */
+SnpeUdo_ErrorType_t
+SnpeUdo_executeOp(SnpeUdo_Operation_t operation,
+                  bool blocking,
+                  const uint32_t ID,
+                  SnpeUdo_ExternalNotify_t notifyFunc);
+
+typedef SnpeUdo_ErrorType_t
+(*SnpeUdo_ExecuteOpFunction_t)(SnpeUdo_Operation_t,
+                               bool,
+                               const uint32_t,
+                               SnpeUdo_ExternalNotify_t);
+
+typedef SnpeUdo_ExecuteOpFunction_t Udo_ExecuteOpFunction_t;
+
+/**
+ * @brief A function to set the inputs & outputs. Part of the SnpeUdo_Operation struct,
+ * returned from creation of a new operation instance.
+ * Not supported in this release.
+ *
+ * This function allows the calling entity to change some of the inputs and outputs
+ * between calls to execute.
+ * Note that the change is limited to changing the pointer to the tensor data only.
+ * Any other change may be rejected by the implementation library, causing
+ * immediate invalidation of the operation instance.
+ *
+ * @param[in] operation Operation on which IO tensors are set
+ *
+ * @param[in] inputs Array of tensor parameters. The calling entity may provide a subset of the
+ * operation inputs, providing only those that it wants to change.
+ *
+ * @param[in] outputs Array of tensor parameters. The calling entity may provide a subset of the
+ * operation outputs, providing only those that it wants to change.
+ *
+ * @return Error code
+ *
+ */
+SnpeUdo_ErrorType_t
+SnpeUdo_setOpIO(SnpeUdo_Operation_t operation,
+                SnpeUdo_TensorParam_t* inputs,
+                SnpeUdo_TensorParam_t* outputs);
+
+typedef SnpeUdo_ErrorType_t
+(*SnpeUdo_SetOpIOFunction_t)(SnpeUdo_Operation_t,
+                             SnpeUdo_TensorParam_t*,
+                             SnpeUdo_TensorParam_t*);
+
+typedef SnpeUdo_SetOpIOFunction_t Udo_SetOpIOFunction_t;
+
+/**
+ * @brief A function to return execution times.
+ *
+ * This function can be called to query the operation execution times on the IP core
+ * on which the operation is run. The time is provided in microseconds.
+ *
+ * @param[in] operation Handle to operation whose execution time is being profiled
+ *
+ * @param[in,out] executionTime Pointer to a uint32 value. This function writes the operation
+ * execution time in usec into this value.
+ *
+ * @return Error code
+ *
+ */
+SnpeUdo_ErrorType_t
+SnpeUdo_profileOp(SnpeUdo_Operation_t operation, uint32_t *executionTime);
+
+typedef SnpeUdo_ErrorType_t
+(*SnpeUdo_ProfileOpFunction_t)(SnpeUdo_Operation_t, uint32_t*);
+
+typedef SnpeUdo_ProfileOpFunction_t Udo_ProfileOpFunction_t;
+
+/**
+ * @brief A function to release the operation instance.
+ * \n When it is called, the implementation library needs to release all resources
+ * allocated for this operation instance.
+ * \n Note that all function pointers which are part of SnpeUdo_Operation become
+ * invalid once the SnpeUdo_releaseOp call returns.
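+ *
+ * End-of-lifecycle sketch (editorial illustration; creation of "op" via
+ * SnpeUdo_createOperation and error checks are elided):
+ * @code
+ *   SnpeUdo_executeOp(op, true /*blocking*/, 0 /*ID*/, NULL /*notifyFunc*/);
+ *   SnpeUdo_releaseOp(op);  // op and its function pointers are now invalid
+ * @endcode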
+ *
+ * @param[in] operation Handle to operation to be released
+ * @return Error code
+ *
+ */
+SnpeUdo_ErrorType_t
+SnpeUdo_releaseOp(SnpeUdo_Operation_t operation);
+
+typedef SnpeUdo_ErrorType_t
+(*SnpeUdo_ReleaseOpFunction_t)(SnpeUdo_Operation_t);
+
+typedef SnpeUdo_ReleaseOpFunction_t Udo_ReleaseOpFunction_t;
+
+/** @} */ /* end_addtogroup c_plus_plus_apis C++ */
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif //SNPE_UDO_IMPL_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoImplDsp.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoImplDsp.h
new file mode 100755
index 0000000000000..522c6050a402d
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoImplDsp.h
@@ -0,0 +1,199 @@
+//==============================================================================
+//
+// Copyright (c) 2019-2021 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+// Header to be used by a DSP Hexnn UDO Implementation library
+
+#ifndef SNPE_UDO_IMPL_DSP_H
+#define SNPE_UDO_IMPL_DSP_H
+#include
+#include "DSP/Udo/UdoImpl.h"
+
+/** @addtogroup c_plus_plus_apis C++
+@{ */
+
+/**
+ * @brief A function to validate that a set of params is supported by an operation.
+ * This function is HexNN specific; its use case is when the registration library is not in use.
+ * Optional function.
+ *
+ * @param[in] operationType Operation type
+ * @param[in] numOfStaticParams Number of static params defined by the op
+ * @param[in] staticParams Array of static params to the op
+ * @return Error code, indicating if the operation can be created on this set of configurations or not.
+ *
+ */
+
+SnpeUdo_ErrorType_t
+SnpeUdo_validateOperation (SnpeUdo_String_t operationType,
+                           uint32_t numOfStaticParams,
+                           const SnpeUdo_Param_t* staticParams);
+
+typedef SnpeUdo_ErrorType_t (*SnpeUdo_ValidateOperationFunction_t) (SnpeUdo_String_t,
+                                                                    uint32_t,
+                                                                    const SnpeUdo_Param_t*);
+
+typedef SnpeUdo_ValidateOperationFunction_t Udo_ValidateOperationFunction_t;
+
+// enum used for indicating input/output tensor data layouts on DSP, plain vs d32
+typedef enum {
+  SNPE_UDO_DSP_TENSOR_LAYOUT_PLAIN = 0x00, UDO_DSP_TENSOR_LAYOUT_PLAIN = 0x00,
+  SNPE_UDO_DSP_TENSOR_LAYOUT_D32 = 0x01, UDO_DSP_TENSOR_LAYOUT_D32 = 0x01
+} SnpeUdo_HexNNTensorLayout_t;
+
+typedef SnpeUdo_HexNNTensorLayout_t Udo_HexNNTensorLayout_t;
+
+/**
+ * @brief A function to query numbers of inputs and outputs,
+ * quantization type of each input and each output as arrays,
+ * and data layout (plain vs d32) of each input and each output as arrays
+ * of an operation.
+ * inputsQuantTypes and inputsLayouts should point to arrays of size numOfInputs
+ * outputsQuantTypes and outputsLayouts should point to arrays of size numOfOutputs
+ *
+ * Note: inputsLayouts and outputsLayouts can point to NULL, in this case, it is
+ * assumed all inputs and/or outputs have plain data layouts, i.e.
no D32 + * + * @param[in] operationType Operation type + * @param[in] numOfStaticParams Number of static params defined by the op + * @param[in] staticParams Array of static params to the op + * @param[in,out] numOfInputs Number of input tensors to the op + * @param[in,out] inputsQuantTypes Array of Quantization info for each input tensor + * @param[in,out] inputsLayouts Array of layout type for each input tensor + * @param[in,out] numOfOutputs Number of output tensors to the op + * @param[in,out] outputsQuantTypes Array of Quantization info for each output tensor + * @param[in,out] outputsLayouts Array of layout type for each output tensor + * @return error code, indicating status of query + */ + +SnpeUdo_ErrorType_t +SnpeUdo_queryOperation (SnpeUdo_String_t operationType, + uint32_t numOfStaticParams, + const SnpeUdo_Param_t* staticParams, + uint32_t* numOfInputs, + SnpeUdo_QuantizationType_t** inputsQuantTypes, + SnpeUdo_HexNNTensorLayout_t** inputsLayouts, + uint32_t* numOfOutputs, + SnpeUdo_QuantizationType_t** outputsQuantTypes, + SnpeUdo_HexNNTensorLayout_t** outputsLayouts); + +typedef SnpeUdo_ErrorType_t (*SnpeUdo_QueryOperationFunction_t) (SnpeUdo_String_t, + uint32_t, + const SnpeUdo_Param_t*, + uint32_t*, + SnpeUdo_QuantizationType_t**, + SnpeUdo_HexNNTensorLayout_t**, + uint32_t*, + SnpeUdo_QuantizationType_t**, + SnpeUdo_HexNNTensorLayout_t**); + +typedef SnpeUdo_QueryOperationFunction_t Udo_QueryOperationFunction_t; + +// Global infrastructure functions supported by Hexagon-NN v2 +typedef void (*workerThread_t) (void* perOpInfrastructure, void* userData); +typedef int (*udoSetOutputTensorSize_t) (void* perOpInfrastructure, uint32_t outIdx, uint32_t size); +typedef int (*udoGetInputD32Paddings_t) (void* perOpInfrastructure, uint32_t inIdx, + uint32_t* heightPadBefore, uint32_t* heightPadAfter, + uint32_t* widthPadBefore, uint32_t* widthPadAfter, + uint32_t* depthPadBefore, uint32_t* depthPadAfter); +typedef int (*udoSetOutputD32ShapeSizePaddings_t) (void* perOpInfrastructure, uint32_t outIdx, + uint32_t batch, + uint32_t height, uint32_t heightPadBefore, uint32_t heightPadAfter, + uint32_t width, uint32_t widthPadBefore, uint32_t widthPadAfter, + uint32_t depth, uint32_t depthPadBefore, uint32_t depthPadAfter, + SnpeUdo_DataType_t dataType); +typedef void* (*udoMemalign_t) (size_t n, size_t size); +typedef void* (*udoMalloc_t) (size_t size); +typedef void* (*udoCalloc_t) (size_t n, size_t size); +typedef void (*udoFree_t) (void* ptr); +typedef uint32_t (*udoGetVtcmSize_t) (void* perOpInfrastructure); +typedef void* (*udoGetVtcmPtr_t) (void* perOpInfrastructure); +typedef uint32_t (*udoVtcmIsReal_t) (void* perOpInfrastructure); +typedef void (*udoRunWorkerThreads_t) (void* perOpInfrastructure, uint32_t nThreads, workerThread_t w, void* userData); + +typedef struct hexNNv2GlobalInfra { + udoSetOutputTensorSize_t udoSetOutputTensorSize; + udoGetInputD32Paddings_t udoGetInputD32Paddings; + udoSetOutputD32ShapeSizePaddings_t udoSetOutputD32ShapeSizePaddings; + udoMemalign_t udoMemalign; + udoMalloc_t udoMalloc; + udoCalloc_t udoCalloc; + udoFree_t udoFree; + udoGetVtcmSize_t udoGetVtcmSize; + udoGetVtcmPtr_t udoGetVtcmPtr; + udoVtcmIsReal_t udoVtcmIsReal; + udoRunWorkerThreads_t udoRunWorkerThreads; +} SnpeUdo_HexNNv2GlobalInfra_t; + +typedef SnpeUdo_HexNNv2GlobalInfra_t Udo_HexNNv2GlobalInfra_t; + +// hexnn types +typedef enum hexnnInfraType { + UDO_INFRA_HEXNN_V2, + UDO_INFRA_HEXNN_V3 // reserved, do not use +} SnpeUdo_HexNNInfraType_t; + +typedef SnpeUdo_HexNNInfraType_t 
Udo_HexNNInfraType_t; + +typedef struct { + Udo_CreateOpFactoryFunction_t create_op_factory; + Udo_CreateOperationFunction_t create_operation; + Udo_ExecuteOpFunction_t execute_op; + Udo_ReleaseOpFunction_t release_op; + Udo_ReleaseOpFactoryFunction_t release_op_factory; + Udo_ValidateOperationFunction_t validate_op; + Udo_QueryOperationFunction_t query_op; +} udo_func_package_t; + +/** + * @brief Infrastructures needed by a developer of DSP Hexnn UDO Implementation library. + * + * The framework/runtime which loads the Hexnn UDO implementation library provides + * this infrastructure to the loaded library by calling "SnpeUdo_initImplLibrary" + * function, and passing it (cast to void*). The Hexnn UDO library is expected + * to cast it back to this structure. + * + */ +typedef struct dspGlobalInfrastructure { + SnpeUdo_Version_t dspInfraVersion; // api version + SnpeUdo_HexNNInfraType_t infraType; + SnpeUdo_HexNNv2GlobalInfra_t hexNNv2Infra; +} SnpeUdo_DspGlobalInfrastructure_t; + +typedef SnpeUdo_DspGlobalInfrastructure_t Udo_DspGlobalInfrastructure_t; + +/** + * hexnn v2 per op factory infrastructure + * + * The framework/runtime passes per op factory infrastructure as a void pointer + * to HexNN UDO implementation library by calling function "SnpeUdo_createOpFactory". + * UDO implementation library is expected to cast it back to this following struct. + * + */ +typedef struct hexnnv2OpFactoryInfra { + unsigned long graphId; +} SnpeUdo_HexNNv2OpFactoryInfra_t; + +typedef SnpeUdo_HexNNv2OpFactoryInfra_t Udo_HexNNv2OpFactoryInfra_t; + +/** + * hexnn v2 per operation infrastructure + * + * The framework/runtime passes per operation infrastructure as a void pointer + * to HexNN UDO implementation library by calling function "SnpeUdo_createOperation". + * UDO implementation library is expected to cast it to the following type and save it. + * + * This is needed to be passed back into some functions from global infrastructure. + * + */ +typedef void* SnpeUdo_HexNNv2OpInfra_t; + +typedef SnpeUdo_HexNNv2OpInfra_t Udo_HexNNv2OpInfra_t; + +/** @} */ /* end_addtogroup c_plus_plus_apis C++ */ + +#endif // SNPE_UDO_IMPL_DSP_H diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoShared.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoShared.h new file mode 100755 index 0000000000000..8c17c1d5b35f1 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoShared.h @@ -0,0 +1,48 @@ +//============================================================================== +// +// Copyright (c) 2019-2021 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef SNPE_UDO_SHARED_H +#define SNPE_UDO_SHARED_H + +#include "DSP/Udo/UdoBase.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** @addtogroup c_plus_plus_apis C++ +@{ */ + +/** + * @brief A function to return the various versions as they relate to the UDO + * The function returns a struct containing the the following: + * libVersion: the version of the implementation library compiled for the UDO. Set by user + * apiVersion: the version of the UDO API used in compiling the implementation library. 
+ * Set by SNPE + * + * @param[in, out] version A pointer to Version struct of type SnpeUdo_LibVersion_t + * + * @return Error code + * + */ +SnpeUdo_ErrorType_t +SnpeUdo_getVersion (SnpeUdo_LibVersion_t** version); + +typedef SnpeUdo_ErrorType_t +(*SnpeUdo_GetVersionFunction_t) (SnpeUdo_LibVersion_t** version); + +typedef SnpeUdo_GetVersionFunction_t Udo_GetVersionFunction_t; + +#ifdef __cplusplus +} // extern "C" +#endif + +/** @} */ /* end_addtogroup c_plus_plus_apis C++ */ + +#endif // SNPE_UDO_SHARED_H diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuBackend.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuBackend.h new file mode 100755 index 0000000000000..d7050c875f6db --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuBackend.h @@ -0,0 +1,71 @@ +//============================================================================== +// +// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +/** + * @file + * @brief A header which defines the QNN GPU specialization of the QnnBackend.h interface. + */ + +#ifndef QNN_GPU_BACKEND_H +#define QNN_GPU_BACKEND_H + +#ifdef __cplusplus +#include +#else +#include +#endif + +#include "QnnBackend.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** +* @brief This enum defines QNN GPU custom Backend config options. +*/ +typedef enum { + /// If non-zero, tuning mode will be enabled + QNN_GPU_BACKEND_CONFIG_OPTION_ENABLE_TUNING_MODE = 0, + /// The Performance cache directory. Must be non-null + QNN_GPU_BACKEND_CONFIG_OPTION_PERFORMANCE_CACHE_DIR = 1, + /// If non-zero, the performance cache will be ignored when initializing + QNN_GPU_BACKEND_CONFIG_OPTION_INVALIDATE_PERFORMANCE_CACHE = 2, + /// Unused, present to ensure 32 bits. + QNN_GPU_BACKEND_CONFIG_OPTION_UNDEFINED = 0x7FFFFFFF, +} QnnGpuBackend_ConfigOption_t; + +/** + * @brief A struct which defines the QNN GPU Backend custom configuration options. + * Objects of this type are to be referenced through QnnBackend_CustomConfig_t. + */ +typedef struct { + QnnGpuBackend_ConfigOption_t option; + union UNNAMED { + uint8_t enableTuningMode; + const char* performanceCacheDir; + uint8_t invalidatePerformanceCache; + }; +} QnnGpuBackend_CustomConfig_t; + +// clang-format off +/// QnnGpuBackend_CustomConfig_t initializer macro +#define QNN_GPU_BACKEND_CUSTOM_CONFIG_INIT \ + { \ + QNN_GPU_BACKEND_CONFIG_OPTION_UNDEFINED, /*option*/ \ + { \ + false /*enableTuningMode*/ \ + } \ + } +// clang-format on + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuCommon.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuCommon.h new file mode 100755 index 0000000000000..8fd9c18afb46b --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuCommon.h @@ -0,0 +1,49 @@ +//============================================================================== +// +// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +/** + * @file + * @brief A header which defines common QNN GPU macros. 
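+ *
+ * For illustration (editorial note, not SDK sample code): the version macros
+ * below can populate a Qnn_ApiVersion_t for comparison against the version
+ * reported by the backend, e.g.
+ * @code
+ *   Qnn_ApiVersion_t expectedVersion = QNN_GPU_API_VERSION_INIT;
+ * @endcode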
+ */ + +#ifndef QNN_GPU_COMMON_H +#define QNN_GPU_COMMON_H + +#include "QnnCommon.h" + +/// GPU Backend identifier +#define QNN_BACKEND_ID_GPU 4 + +/// GPU interface provider +#define QNN_GPU_INTERFACE_PROVIDER_NAME "GPU_QTI_AISW" + +// GPU API Version values +#define QNN_GPU_API_VERSION_MAJOR 3 +#define QNN_GPU_API_VERSION_MINOR 7 +#define QNN_GPU_API_VERSION_PATCH 0 + +// clang-format off + +/// Macro to set Qnn_ApiVersion_t for GPU backend +#define QNN_GPU_API_VERSION_INIT \ + { \ + { \ + QNN_API_VERSION_MAJOR, /*coreApiVersion.major*/ \ + QNN_API_VERSION_MINOR, /*coreApiVersion.major*/ \ + QNN_API_VERSION_PATCH /*coreApiVersion.major*/ \ + }, \ + { \ + QNN_GPU_API_VERSION_MAJOR, /*backendApiVersion.major*/ \ + QNN_GPU_API_VERSION_MINOR, /*backendApiVersion.minor*/ \ + QNN_GPU_API_VERSION_PATCH /*backendApiVersion.patch*/ \ + } \ + } + +// clang-format on + +#endif // QNN_GPU_COMMON_H diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuContext.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuContext.h new file mode 100755 index 0000000000000..42599e4280971 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuContext.h @@ -0,0 +1,78 @@ +//============================================================================== +// +// Copyright (c) 2021-2023 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +/** + * @file + * @brief A header which defines the QNN GPU specialization of the QnnContext.h interface. + */ + +#ifndef QNN_GPU_CONTEXT_H +#define QNN_GPU_CONTEXT_H + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief This enum defines QNN GPU custom context config options. + */ +typedef enum { + /// Sets performance hint options via QnnGpuContext_PerfHint_t + QNN_GPU_CONTEXT_CONFIG_OPTION_PERF_HINT = 0, + /// If non-zero, OpenGL buffers will be used + QNN_GPU_CONTEXT_CONFIG_OPTION_USE_GL_BUFFERS = 1, + /// The kernel disk cache directory. Must be non-null + QNN_GPU_CONTEXT_CONFIG_OPTION_KERNEL_REPO_DIR = 2, + /// If non-zero, the kernel disk cache will be ignored when initializing + QNN_GPU_CONTEXT_CONFIG_OPTION_INVALIDATE_KERNEL_REPO = 3, + /// Unused, present to ensure 32 bits. + QNN_GPU_CONTEXT_CONFIG_OPTION_UNDEFINED = 0x7FFFFFFF +} QnnGpuContext_ConfigOption_t; + +/** + * @brief An enum which defines the different GPU performance hint options. + */ +typedef enum { + /// Sets the GPU performance hint to high performance, this is the default + QNN_GPU_CONTEXT_PERF_HINT_HIGH = 0, + /// Sets the GPU performance hint to normal performance + QNN_GPU_CONTEXT_PERF_HINT_NORMAL = 1, + /// Sets the GPU performance hint to low performance + QNN_GPU_CONTEXT_PERF_HINT_LOW = 2 +} QnnGpuContext_PerfHint_t; + +/** + * @brief A struct which defines the QNN GPU context custom configuration options. + * Objects of this type are to be referenced through QnnContext_CustomConfig_t. 
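+ *
+ * A configuration sketch (editorial illustration; passing the struct to the
+ * context through a QnnContext_Config_t list is assumed):
+ * @code
+ *   QnnGpuContext_CustomConfig_t customConfig = QNN_GPU_CONTEXT_CUSTOM_CONFIG_INIT;
+ *   customConfig.option   = QNN_GPU_CONTEXT_CONFIG_OPTION_PERF_HINT;
+ *   customConfig.perfHint = QNN_GPU_CONTEXT_PERF_HINT_NORMAL;
+ * @endcode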
+ */ +typedef struct { + QnnGpuContext_ConfigOption_t option; + union UNNAMED { + QnnGpuContext_PerfHint_t perfHint; + uint8_t useGLBuffers; + const char* kernelRepoDir; + uint8_t invalidateKernelRepo; + }; +} QnnGpuContext_CustomConfig_t; + +// clang-format off +/// QnnGpuContext_CustomConfig_t initializer macro +#define QNN_GPU_CONTEXT_CUSTOM_CONFIG_INIT \ + { \ + QNN_GPU_CONTEXT_CONFIG_OPTION_UNDEFINED, /*option*/ \ + { \ + QNN_GPU_CONTEXT_PERF_HINT_HIGH /*perfHint*/ \ + } \ + } +// clang-format on + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuGraph.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuGraph.h new file mode 100755 index 0000000000000..e0652d44883ef --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuGraph.h @@ -0,0 +1,72 @@ +//============================================================================== +// +// Copyright (c) 2020-2021 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +/** + * @file + * @brief A header which defines the QNN GPU specialization of the QnnGraph.h interface. + */ + +#ifndef QNN_GPU_GRAPH_H +#define QNN_GPU_GRAPH_H + +#ifdef __cplusplus +#include +#else +#include +#endif + +#include "QnnGraph.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief An enum which defines the different tensor optimization options. A + * tensor may be optimized to the specified QnnGpu_Precision_t when it + * is a graph tensor that is not a graph input or a graph output and + * does not connect two operations from different op packages. + */ +typedef enum { + /// Sets the precision mode to floating point 32-bit (FP32) + QNN_GPU_PRECISION_FP32 = 0, + /// Sets the precision mode to floating point 16-bit (FP16) + QNN_GPU_PRECISION_FP16 = 1, + /// Sets the precision mode to FP16 for storage and FP32 for calculations + QNN_GPU_PRECISION_HYBRID = 2, + /// Uses the tensor data type provided by the user (default) + QNN_GPU_PRECISION_USER_PROVIDED = 3, +} QnnGpu_Precision_t; + +/** + * @brief A struct which defines the QNN GPU graph custom configuration options. + * Objects of this type are to be referenced through QnnGraph_CustomConfig_t. + */ +typedef struct { + QnnGpu_Precision_t precision; + uint8_t disableMemoryOptimizations; + uint8_t disableNodeOptimizations; + uint8_t disableQueueRecording; +} QnnGpuGraph_CustomConfig_t; + +// clang-format off +/// QnnGpuGraph_CustomConfig_t initializer macro +#define QNN_GPU_GRAPH_CUSTOM_CONFIG_INIT \ + { \ + QNN_GPU_PRECISION_USER_PROVIDED, /*precision*/ \ + 0u, /*disableMemoryOptimizations*/ \ + 0u, /*disableNodeOptimizations*/ \ + 0u /*disableQueueRecording*/ \ + } +// clang-format on + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuMem.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuMem.h new file mode 100755 index 0000000000000..1c6cd5c3e032a --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuMem.h @@ -0,0 +1,52 @@ +//============================================================================== +// +// Copyright (c) 2024 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. 
+// +//============================================================================== + +/** + * @file + * @brief A header which defines the QNN GPU specialization of the QnnMem.h interface. + */ + +#ifndef QNN_GPU_MEM_H +#define QNN_GPU_MEM_H + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void* QnnGpuMem_Buffer_t; + +/** + * @brief This enum defines QNN GPU memory type + */ +typedef enum { QNN_GPU_MEM_OPENCL = 0, QNN_GPU_MEM_UNDEFINED = 0x7FFFFFF } QnnGpu_MemType_t; + +/** + * @brief A struct which defines the QNN GPU memory preallocated by the client. + * Objects of this type are to be referenced through Qnn_MemInfoCustom_t. + */ +typedef struct { + QnnGpu_MemType_t memType; + union { + QnnGpuMem_Buffer_t buffer; + }; +} QnnGpu_MemInfoCustom_t; + +// clang-format off +/// QnnGpu_MemInfoCustom_t initializer macro +#define QNN_GPU_MEMINFO_CUSTOM_INIT \ + { \ + QNN_GPU_MEM_UNDEFINED, /*memType*/ \ + NULL /* buffer*/ \ + } +// clang-format on + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuOpPackage.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuOpPackage.h new file mode 100755 index 0000000000000..5413f50ba2267 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuOpPackage.h @@ -0,0 +1,682 @@ +//============================================================================== +// +// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +/** + * @file + * @brief A header which defines the QNN GPU specialization of the QnnOpPackage.h interface. + */ + +#ifndef QNN_GPU_OP_PACKAGE_H +#define QNN_GPU_OP_PACKAGE_H + +#ifdef __cplusplus +#include +#else +#include +#endif + +#include "GPU/QnnGpuCommon.h" +#include "GPU/QnnGpuGraph.h" +#include "QnnOpPackage.h" +#include "QnnTypes.h" + +#ifdef __cplusplus +extern "C" { +#endif + +//============================================================================= +// QnnOpPackage_GlobalInfrastructure_t specialization. 
+//============================================================================= + +/** + * @brief A struct which is used to communicate device constant properties + */ +typedef struct { + /// GPU device version string + char deviceVersion[128]; + /// GPU driver interface version {major, minor} + uint32_t interfaceVersion[2]; + /// GPU Adreno(TM) tier string + char tierName[8]; + /// GPU driver version {product, major, minor, patch} + uint32_t compilerVersion[4]; + /// GPU device max work group size + size_t maxWorkGroupSize; + /// GPU device image 2D max width + size_t image2dMaxWidth; + /// GPU device image 2D max height + size_t image2dMaxHeight; + /// GPU device max memory allocation size + size_t maxBufferAllocSize; + /// GPU device addr alignment in bits + uint32_t baseAddrAlignment; + /// GPU device image 2D Array max width + size_t image2dArrayMaxWidth; + /// GPU device image 2D Array max height + size_t image2dArrayMaxHeight; + /// GPU device image 2D Array max depth + size_t image2dArrayMaxDepth; +} QnnGpu_DeviceProperties_t; + +/** + * @brief A QNN GPU struct specializing QnnOpPackage_GlobalInfrastructure_t + */ +typedef struct _QnnOpPackage_GlobalInfrastructure_t { + /// GPU backend version (as returned by QnnBackend_getApiVersion()) + const Qnn_ApiVersion_t* sdkApiVersion; + /// GPU device properties + const QnnGpu_DeviceProperties_t* deviceProperties; + /// Null terminated path to the OpenCL driver used by the backend + const char* driverPath; +} QnnGpuOpPackage_GlobalInfrastructure_t; + +//============================================================================= +// QnnOpPackage_PackageInfo_t specialization. +//============================================================================= + +/** + * @brief A struct having op package specific information + */ +typedef struct _QnnOpPackage_PackageInfo_t { + /// Null terminated hash key string of all kernel sources + const char* kernelRepoHash; +} QnnGpuOpPackage_PackageInfo_t; + +//============================================================================= +// QnnOpPackage_Optimization_t specialization. +//============================================================================= + +/** + * @brief An enum to specify the QNN GPU optimization type + * + */ +typedef enum { + /// Undefined option only used for QNN_GPU_OP_PACKAGE_OPTIMIZATION_INIT + QNN_GPU_OPTIMIZATION_TYPE_UNDEFINED = 0, + /// Super node optimization + QNN_GPU_OPTIMIZATION_TYPE_SUPER_NODE = 2, +} QnnGpuOpPackage_OptimizationType_t; + +/** + * @brief A struct representing a super node connection constraint. + */ +typedef struct { + /// Producer node corresponding to QnnGpuOpPackage_SuperNodeOptimization_t::operations + uint32_t producer; + /// Output tensor index corresponding to the producer node + uint32_t producerOutputIndex; + /// Consumer node corresponding to QnnGpuOpPackage_SuperNodeOptimization_t::operations + uint32_t consumer; + /// Output tensor index corresponding to the consumer node + uint32_t consumerInputIndex; +} QnnGpuOpPackage_SuperNodeConnectionConstraint_t; + +/** + * @brief An enum to specify the source of a tensor in an op def for a tensor constraint. + * + */ +typedef enum { + /// Tensor is an op def output + QNN_GPU_OPTIMIZATION_SUPER_NODE_TENSOR_SOURCE_OUTPUT = 1, + QNN_GPU_OPTIMIZATION_SUPER_NODE_TENSOR_SOURCE_INPUT = 2, +} QnnGpuOpPackage_TensorConstraintSource_t; + +/** + * @brief An enum to specify the tensor constraint type. + * + */ +typedef enum { + /// Add a Qnn_DataType_t to the whitelist of allowable types. 
+ /// If no data type constraint is present for a tensor, all data types are allowed. + QNN_GPU_OPTIMIZATION_SUPER_NODE_TENSOR_CONSTRAINT_DATA_TYPE = 1, + /// Tensor must match it's rank + QNN_GPU_OPTIMIZATION_SUPER_NODE_TENSOR_CONSTRAINT_RANK = 2, + /// Tensor must match one of it's dimensions + QNN_GPU_OPTIMIZATION_SUPER_NODE_TENSOR_CONSTRAINT_DIMENSION = 3, + /// Add a Qnn_TensorType_t to the whitelist of allowable tensor types. + /// If no tensor type constraint is present for a tensor, all types are allowed. + QNN_GPU_OPTIMIZATION_SUPER_NODE_TENSOR_CONSTRAINT_TENSOR_TYPE = 4, +} QnnGpuOpPackage_TensorConstraintType_t; + +/** + * @brief A struct representing a tensor constraint. + */ +typedef struct { + /// Operation corresponding to QnnGpuOpPackage_SuperNodeOptimization_t::operations + uint32_t operationIndex; + /// Source of the tensor in the Qnn_OpConfig_t + QnnGpuOpPackage_TensorConstraintSource_t source; + union { + /// Tensor index in the Qnn_OpConfig_t, used only for inputs and outputs + uint32_t index; + /// Tensor parameter name in the Qnn_OpConfig_t, used only for parameters + const char* name; + }; + /// Type of tensor constraint + QnnGpuOpPackage_TensorConstraintType_t type; + union { + /// Tensor data type for Qnn_DataType_t constraints + Qnn_DataType_t dataType; + /// Tensor type for Qnn_TensorType_t constraints + Qnn_TensorType_t tensorType; + /// Tensor rank for rank constraints + uint32_t rank; + struct { + /// Tensor dimension index for dimension constraints + uint32_t index; + /// Tensor dimension size for dimension constraints + uint32_t size; + } dimension; + }; +} QnnGpuOpPackage_TensorConstraint_t; + +typedef struct { + /// Null-terminated array of comma separated lists of operations used for matching super node ops. + /// An asterisk (*) may be used to represent any operation type. + const char** operations; + /// Null-terminated array of pointers to super node connection constraints + QnnGpuOpPackage_SuperNodeConnectionConstraint_t** connectionConstraints; + /// Null-terminated array of pointers to super node tensor constraints + QnnGpuOpPackage_TensorConstraint_t** tensorConstraints; +} QnnGpuOpPackage_SuperNodeOptimization_t; + +// clang-format off +/// QnnGpuOpPackage_SuperNodeOptimization_t initializer macro +#define QNN_GPU_OP_PACKAGE_SUPER_NODE_OPTIMIZATION_INIT \ + { \ + NULL, /*operations*/ \ + NULL, /*connectionConstraints*/ \ + NULL, /*tensorConstraints*/ \ + } +// clang-format on + +/** + * @brief A struct representing a QNN GPU optimization. + */ +typedef struct _QnnOpPackage_Optimization_t { + /// Type of optimization + QnnGpuOpPackage_OptimizationType_t type; + /// Op package assigned name of the optimization + const char* name; + union { + /// Super node optimization, used when type is QNN_GPU_OPTIMIZATION_TYPE_SUPER_NODE + const QnnGpuOpPackage_SuperNodeOptimization_t* superNode; + }; +} QnnGpuOpPackage_Optimization_t; + +/// QnnGpuOpPackage_Optimization_t initializer macro +#define QNN_GPU_OP_PACKAGE_OPTIMIZATION_INIT \ + { \ + QNN_GPU_OPTIMIZATION_TYPE_UNDEFINED, NULL, { NULL } \ + } + +//============================================================================= +// QnnOpPackage_GraphInfrastructure_t specialization. 
+//============================================================================= + +/** + * @brief A QNN GPU struct specializing QnnOpPackage_GraphInfrastructure_t + */ +typedef struct _QnnOpPackage_GraphInfrastructure_t { + /// GPU precision mode, user-supplied hint used for optimal kernel selection + QnnGpu_Precision_t precisionMode; +} QnnGpuOpPackage_GraphInfrastructure_t; + +//============================================================================= +// QNN GPU Memory Object +//============================================================================= + +/** + * @brief An enum to specify the QNN GPU memory object type + * + */ +typedef enum { + /// Host memory, only used for Qnn_Param_t tensors + QNN_GPU_MEM_OBJ_TYPE_HOST = 0, + /// GPU driver buffer memory object + QNN_GPU_MEM_OBJ_TYPE_BUFFER = 1, + /// GPU driver image 2D memory object + QNN_GPU_MEM_OBJ_TYPE_IMAGE2D = 2, + /// GPU driver image 2D array memory object + QNN_GPU_MEM_OBJ_TYPE_IMAGE2D_ARRAY = 3, + /// Aggregation of GPU driver image 2D memory objects + QNN_GPU_MEM_OBJ_TYPE_AGGREGATED_IMAGE2D = 4, + /// Aggregation of GPU driver image 2D array memory objects + QNN_GPU_MEM_OBJ_TYPE_AGGREGATED_IMAGE2D_ARRAY = 5, + /// Memory type is unclaimed and can be specified by the op package via the \n + /// QnnGpu_OutputClaim_t struct + QNN_GPU_MEM_OBJ_TYPE_UNCLAIMED = 6, +} QnnGpu_MemoryObjectType_t; + +/** + * @brief An enum to specify the QNN GPU memory layout + * + */ +typedef enum { + /// HWC layout + QNN_GPU_MEM_LAYOUT_HWC = 0, + /// HCW layout + QNN_GPU_MEM_LAYOUT_HCW = 1, + /// CHW layout + QNN_GPU_MEM_LAYOUT_CHW = 2, + /// Undefined + QNN_GPU_MEM_LAYOUT_UNDEFINED = 0x7FFFFFFF, +} QnnGpu_MemoryLayout_t; + +/** + * @brief A struct to specify blockSize for weight Tensor and tensorId for weight Param tensor + */ +typedef struct { + // Block Quantization, block Sizes + uint32_t* bqBlockSize; + /// Tensor Id for Quantization encodings + uint32_t bqEncodingTensorId; +} QnnGpu_BlockEncodingInfo_t; + +// clang-format off +/// QnnGpu_MemoryObject_t initializer macro +#define QNN_GPU_BLOCK_ENCODING_INFO_INIT \ + { \ + NULL, /*bqBlockSize*/ \ + 0u /*bqEncodingTensorId*/ \ + } +// clang-format on + +/** + * @brief A QNN GPU struct specifying a memory object + * This struct is used with the following kernel argument types: + * - QNN_GPU_KERNEL_ARG_TYPE_OP_INPUT_READ + * - QNN_GPU_KERNEL_ARG_TYPE_OP_INPUT_READWRITE + * - QNN_GPU_KERNEL_ARG_TYPE_OP_OUTPUT_WRITE + * - QNN_GPU_KERNEL_ARG_TYPE_INTERNAL_READ + * - QNN_GPU_KERNEL_ARG_TYPE_INTERNAL_READWRITE + * - QNN_GPU_KERNEL_ARG_TYPE_INTERNAL_WRITE + */ +typedef struct { + /// Type of memory object + QnnGpu_MemoryObjectType_t type; + /// Data type of the memory object + Qnn_DataType_t dataType; + /// Memory object dimensions \n + /// Size is numDimensions. Uses the following type dependent format: \n + /// QNN_GPU_MEM_OBJ_TYPE_BUFFER -> {numElements} \n + /// QNN_GPU_MEM_OBJ_TYPE_IMAGE2D -> {height,width} \n + /// QNN_GPU_MEM_OBJ_TYPE_IMAGE2D_ARRAY -> {height,width,array_size} \n + /// QNN_GPU_MEM_OBJ_TYPE_AGGREGATED_IMAGE2D -> {num_batches,height,width} \n + /// QNN_GPU_MEM_OBJ_TYPE_AGGREGATED_IMAGE2D_ARRAY -> {num_batches,height,width,array_size} + uint32_t* dimensions; + /// Memory object offsets \n + /// Size is numDimensions. \n + /// Indicates where the data store starts in the memory object. \n + uint32_t* offsets; + /// Number of dimensions in memory object \n + /// Size is numDimensions. 
Has the following type dependent size: \n + /// QNN_GPU_MEM_OBJ_TYPE_BUFFER -> 1 \n + /// QNN_GPU_MEM_OBJ_TYPE_IMAGE2D -> 2 \n + /// QNN_GPU_MEM_OBJ_TYPE_IMAGE2D_ARRAY -> 3 \n + /// QNN_GPU_MEM_OBJ_TYPE_AGGREGATED_IMAGE2D -> 3 \n + /// QNN_GPU_MEM_OBJ_TYPE_AGGREGATED_IMAGE2D_ARRAY -> 4 + uint32_t numDimensions; + /// Memory object layout \n + /// Op package specific layout identifier \n + /// Default is QNN_GPU_MEM_LAYOUT_UNDEFINED if not already specified by a prior operation + QnnGpu_MemoryLayout_t layout; + /// Block Quantization Tensor Information + QnnGpu_BlockEncodingInfo_t blockEncodingInfo; +} QnnGpu_MemoryObject_t; + +// clang-format off +/// QnnGpu_MemoryObject_t initializer macro +#define QNN_GPU_MEMORY_OBJECT_INIT \ + { \ + QNN_GPU_MEM_OBJ_TYPE_UNCLAIMED, /*type*/ \ + QNN_DATATYPE_UNDEFINED, /*dataType*/ \ + NULL, /*dimensions*/ \ + NULL, /*offsets*/ \ + 0u, /*numDimensions*/ \ + QNN_GPU_MEM_LAYOUT_UNDEFINED, /*layout*/ \ + QNN_GPU_BLOCK_ENCODING_INFO_INIT /*blockEncodingInfo*/ \ + } +// clang-format on + +//============================================================================= +// QnnOpPackage_Node_t specialization. +//============================================================================= + +/** + * @brief A QNN GPU struct specifying a storage tensor + */ +typedef struct { + /// Tensor ID + uint32_t id; + /// Tensor's associated memory object + const QnnGpu_MemoryObject_t* memoryObject; +} QnnGpu_TensorStorageType_t; + +// clang-format off +/// QnnGpu_TensorStorageType_t initializer macro +#define QNN_GPU_TENSOR_STORAGE_TYPE_INIT \ + { \ + 0u, /*id*/ \ + NULL /*memoryObject*/ \ + } +// clang-format on + +/** + * @brief A QNN GPU struct specializing QnnOpPackage_Node_t + */ +typedef struct _QnnOpPackage_Node_t { + /// Optimization index, see QnnOpPackage_Info_t, ignore when only one op config provided + uint32_t optimization; + /// Null-terminated array of operation config pointers + /// Only one pointer provided when no optimizations performed + const Qnn_OpConfig_t** configs; + /// Null-terminated array of tensor storage type pointers called out in the config + const QnnGpu_TensorStorageType_t** storageTypes; + /// Kernel variant index, if set then used by OpPackage to determine kernel selection + int32_t kernelVariant; +} QnnGpuOpPackage_Node_t; + +//============================================================================= +// QnnOpPackage_OpImpl_t specialization. +//============================================================================= + +/** + * @brief A QNN GPU struct specifying an output tensor claim. Using the principle + * of least work, operations must output a memory object type that is most + * convenient for itself. Only QNN_TENSOR_TYPE_NATIVE tensor types may + * be claimed. + */ +typedef struct { + /// Index into the Qnn_OpConfig_t provided in QnnGpuOpPackage_Node_t + uint32_t opConfigIndex; + /// Index into the operation outputs to identify the tensor + uint32_t outputIndex; + /// Specification of the claimed memory object + const QnnGpu_MemoryObject_t* memoryObject; +} QnnGpu_OutputClaim_t; + +// clang-format off +/// QnnGpu_OutputClaim_t initializer macro +#define QNN_GPU_OUTPUT_CLAIM_INIT \ + { \ + 0u, /*opConfigIndex*/ \ + 0u, /*outputIndex*/ \ + NULL /*memoryObject*/ \ + } +// clang-format on + +/** + * @brief An enum to specify the kernel argument type. 
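+ *
+ * For instance (editorial sketch), an argument that reads the first input of
+ * the first op config could be described with the QnnGpu_KernelArg_t struct
+ * defined below:
+ * @code
+ *   QnnGpu_KernelArg_t arg = QNN_GPU_KERNEL_ARG_INIT;
+ *   arg.type = QNN_GPU_KERNEL_ARG_TYPE_OP_INPUT_READ;
+ *   arg.tensor.opConfigIndex = 0;  // first Qnn_OpConfig_t in the node
+ *   arg.tensor.tensorIndex   = 0;  // first operation input
+ * @endcode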
+ *
+ */
+typedef enum {
+  /// Operation input tensor used as kernel input
+  QNN_GPU_KERNEL_ARG_TYPE_OP_INPUT_READ = 0,
+  /// Operation input tensor used as kernel output
+  QNN_GPU_KERNEL_ARG_TYPE_OP_INPUT_READWRITE = 1,
+  /// Operation output tensor used as kernel output
+  QNN_GPU_KERNEL_ARG_TYPE_OP_OUTPUT_WRITE = 2,
+  /// Operation internal tensor used as kernel input
+  QNN_GPU_KERNEL_ARG_TYPE_INTERNAL_READ = 3,
+  /// Operation internal tensor used as kernel input/output
+  QNN_GPU_KERNEL_ARG_TYPE_INTERNAL_READWRITE = 4,
+  /// Operation internal tensor used as kernel output
+  QNN_GPU_KERNEL_ARG_TYPE_INTERNAL_WRITE = 5,
+  /// Plain old data kernel argument
+  QNN_GPU_KERNEL_ARG_TYPE_DATA = 6,
+  /// Local memory kernel argument
+  QNN_GPU_KERNEL_ARG_TYPE_LOCAL = 7,
+  /// Null pointer kernel argument
+  QNN_GPU_KERNEL_ARG_TYPE_NULL_PTR = 8,
+  /// Operation tensor parameter used as kernel input
+  QNN_GPU_KERNEL_ARG_TYPE_OP_TENSOR_PARAM = 9,
+} QnnGpu_KernelArgType_t;
+
+/**
+ * @brief A QNN GPU struct specifying a kernel argument corresponding to a tensor.
+ *        This struct is used with the following kernel argument types:
+ *        - QNN_GPU_KERNEL_ARG_TYPE_OP_INPUT_READ
+ *        - QNN_GPU_KERNEL_ARG_TYPE_OP_INPUT_READWRITE
+ *        - QNN_GPU_KERNEL_ARG_TYPE_OP_OUTPUT_WRITE
+ *        - QNN_GPU_KERNEL_ARG_TYPE_INTERNAL_READ
+ *        - QNN_GPU_KERNEL_ARG_TYPE_INTERNAL_READWRITE
+ *        - QNN_GPU_KERNEL_ARG_TYPE_INTERNAL_WRITE
+ */
+typedef struct {
+  /// Index into the Qnn_OpConfig_t provided in QnnGpuOpPackage_Node_t, ignored for INTERNAL types
+  uint32_t opConfigIndex;
+  /// Index into the operation input or output list, or the internal tensor list
+  uint32_t tensorIndex;
+  /// Batch element index for aggregated tensor types
+  uint32_t element;
+} QnnGpu_TensorKernelArg_t;
+
+// clang-format off
+/// QnnGpu_TensorKernelArg_t initializer macro
+#define QNN_GPU_TENSOR_KERNEL_ARG_INIT \
+  {                                    \
+    0u, /*opConfigIndex*/              \
+    0u, /*tensorIndex*/                \
+    0u  /*element*/                    \
+  }
+// clang-format on
+
+/**
+ * @brief An enum to specify the kernel data argument type.
+ *
+ */
+typedef enum {
+  QNN_GPU_KERNEL_ARG_CL_TYPE_CHAR = 0,
+  QNN_GPU_KERNEL_ARG_CL_TYPE_UCHAR = 1,
+  QNN_GPU_KERNEL_ARG_CL_TYPE_SHORT = 2,
+  QNN_GPU_KERNEL_ARG_CL_TYPE_USHORT = 3,
+  QNN_GPU_KERNEL_ARG_CL_TYPE_INT = 4,
+  QNN_GPU_KERNEL_ARG_CL_TYPE_UINT = 5,
+  QNN_GPU_KERNEL_ARG_CL_TYPE_LONG = 6,
+  QNN_GPU_KERNEL_ARG_CL_TYPE_ULONG = 7,
+  QNN_GPU_KERNEL_ARG_CL_TYPE_FLOAT = 8,
+  QNN_GPU_KERNEL_ARG_CL_TYPE_DOUBLE = 9,
+} QnnGpu_DataKernelArgType_t;
+
+/**
+ * @brief A QNN GPU struct specifying a kernel argument corresponding to plain old data.
+ *        This struct is used only with the QNN_GPU_KERNEL_ARG_TYPE_DATA arg type.
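+ *
+ *        Illustrative initialization (editorial sketch, using the initializer
+ *        macro defined below):
+ * @code
+ *   QnnGpu_DataKernelArg_t scaleArg = QNN_GPU_DATA_KERNEL_ARG_INIT;
+ *   scaleArg.type     = QNN_GPU_KERNEL_ARG_CL_TYPE_FLOAT;
+ *   scaleArg.qnnFloat = 0.125f;  // assumed scalar passed to the kernel
+ * @endcode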
+ */ +typedef struct { + /// Data type of the data + QnnGpu_DataKernelArgType_t type; + union { + /// Used with QNN_GPU_KERNEL_ARG_CL_TYPE_CHAR + int8_t qnnChar; + /// Used with QNN_GPU_KERNEL_ARG_CL_TYPE_UCHAR + uint8_t qnnUChar; + /// Used with QNN_GPU_KERNEL_ARG_CL_TYPE_SHORT + int16_t qnnShort; + /// Used with QNN_GPU_KERNEL_ARG_CL_TYPE_USHORT + uint16_t qnnUShort; + /// Used with QNN_GPU_KERNEL_ARG_CL_TYPE_INT + int32_t qnnInt; + /// Used with QNN_GPU_KERNEL_ARG_CL_TYPE_UINT + uint32_t qnnUInt; + /// Used with QNN_GPU_KERNEL_ARG_CL_TYPE_LONG + int64_t qnnLong; + /// Used with QNN_GPU_KERNEL_ARG_CL_TYPE_ULONG + uint64_t qnnULong; + /// Used with QNN_GPU_KERNEL_ARG_CL_TYPE_FLOAT + float qnnFloat; + /// Used with QNN_GPU_KERNEL_ARG_CL_TYPE_DOUBLE + double qnnDouble; + }; +} QnnGpu_DataKernelArg_t; + +/// QnnGpu_DataKernelArg_t initializer macro +#define QNN_GPU_DATA_KERNEL_ARG_INIT \ + { \ + QNN_GPU_KERNEL_ARG_CL_TYPE_CHAR, /*type*/ \ + { \ + 0 /*qnnChar*/ \ + } \ + } + +/** + * @brief A QNN GPU struct specifying a kernel argument corresponding to a local memory type. + * This struct is used only with the QNN_GPU_KERNEL_ARG_TYPE_LOCAL arg type. + */ +typedef struct { + /// Size of the memory requested in bytes + uint32_t size; +} QnnGpu_LocalKernelArg_t; + +/// QnnGpu_LocalKernelArg_t initializer macro +#define QNN_GPU_LOCAL_KERNEL_ARG_INIT \ + { 0u /*size*/ } + +/** + * @brief A QNN GPU struct specifying a kernel argument. + * Note that the QNN_GPU_KERNEL_ARG_TYPE_NULL_PTR type does not have an entry in + * the union. + */ +typedef struct { + /// Type of kernel argument + QnnGpu_KernelArgType_t type; + union { + /// Tensor type argument + QnnGpu_TensorKernelArg_t tensor; + /// Plain old data argument + QnnGpu_DataKernelArg_t data; + /// Local memory argument + QnnGpu_LocalKernelArg_t local; + }; +} QnnGpu_KernelArg_t; + +/// QnnGpu_KernelArg_t initializer macro +#define QNN_GPU_KERNEL_ARG_INIT \ + { \ + QNN_GPU_KERNEL_ARG_TYPE_NULL_PTR, /*type*/ \ + { \ + QNN_GPU_TENSOR_KERNEL_ARG_INIT /*tensor*/ \ + } \ + } + +/** + * @brief An enum to specify the kernel source type. + * + */ +typedef enum { + QNN_GPU_KERNEL_SOURCE_TYPE_TEXT = 0, + QNN_GPU_KERNEL_SOURCE_TYPE_BINARY = 1, +} QnnGpu_KernelSourceType_t; + +/** + * @brief This enum defines QNN GPU kernel tuning options. + */ +typedef enum { + /// local work size tuning + QNN_GPU_KERNEL_TUNING_LOCAL_WORK_SIZE = 0, + QNN_GPU_KERNEL_TUNING_UNDEFINED = 0x7FFFFFFF +} QnnGpu_KernelTuningOption_t; + +/** + * @brief This struct provides local-work-size tuning configuration. + */ +typedef struct { + uint32_t minValue[3]; + uint32_t maxValue[3]; + uint32_t stepSize[3]; +} QnnGpu_KernelLocalWorkSizeTuning_t; + +/** + * @brief This struct provides QNN GPU kernel tuning configuration. + */ +typedef struct { + QnnGpu_KernelTuningOption_t option; + union UNNAMED { + QnnGpu_KernelLocalWorkSizeTuning_t lws; + }; +} QnnGpu_KernelTuningConfig_t; + +/** + * @brief A QNN GPU struct specifying a kernel. 
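+ *
+ *        A partial population sketch (editorial illustration; "sourceStr" and
+ *        the work sizes are placeholders):
+ * @code
+ *   QnnGpu_Kernel_t kernel = QNN_GPU_KERNEL_INIT;
+ *   kernel.kernelSource       = sourceStr;          // OpenCL C source text
+ *   kernel.sourceLength       = strlen(sourceStr);
+ *   kernel.sourceType         = QNN_GPU_KERNEL_SOURCE_TYPE_TEXT;
+ *   kernel.name               = "my_elementwise_kernel";
+ *   kernel.globalWorkDim      = 1;
+ *   kernel.globalWorkSizes[0] = totalElements;      // assumed element count
+ * @endcode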
+ */ +typedef struct { + /// Kernel source code or binary + const void* kernelSource; + /// Length of kernel source/binary in bytes + size_t sourceLength; + /// Type of kernel source + QnnGpu_KernelSourceType_t sourceType; + /// Null terminated build options string used for kernel compilation + const char* buildOptions; + /// Rank of the globalWorkSizes + size_t globalWorkDim; + /// Global work sizes used by enqueuing the kernel + size_t globalWorkSizes[3]; + /// Rank of the localWorkSizes + size_t localWorkDim; + /// Local work sizes used by enqueuing the kernel + size_t localWorkSizes[3]; + /// Null-terminated array of kernel arguments in the order they appear in the kernel function + QnnGpu_KernelArg_t** args; + /// Null terminated name of the kernel + const char* name; + /// If non-zero, kernel will be enqueued during execute even if it is static + uint32_t isDynamic; + /// Null-terminated array to provide kernel tuning configurations. + QnnGpu_KernelTuningConfig_t** tuningConfigs; + /// Reserved field, must be null + void* reserved; +} QnnGpu_Kernel_t; + +// clang-format off +/// QnnGpu_Kernel_t initializer macro +#define QNN_GPU_KERNEL_INIT \ + { \ + NULL, /*kernelSource*/ \ + 0u, /*sourceLength*/ \ + QNN_GPU_KERNEL_SOURCE_TYPE_TEXT, /*sourceType*/ \ + NULL, /*buildOptions*/ \ + 0u, /*globalWorkDim*/ \ + {0u}, /*globalWorkSizes*/ \ + 0u, /*localWorkDim*/ \ + {0u}, /*localWorkSizes*/ \ + NULL, /*args*/ \ + NULL, /*name*/ \ + 0u, /*isDynamic*/ \ + NULL, /*tuningConfigs*/ \ + NULL /*reserved*/ \ + } +// clang-format on + +/** + * @brief A QNN GPU struct specifying an operation. + */ +typedef struct _QnnOpPackage_OpImpl_t { + /// Null-terminated array of output claims + QnnGpu_OutputClaim_t** outputClaims; + /// Null-terminated array of tensor requests + QnnGpu_MemoryObject_t** memoryObjects; + /// Null-terminated array of kernels + QnnGpu_Kernel_t** kernels; +} QnnGpu_Operation_t; + +// clang-format off +/// QnnGpu_Operation_t initializer macro +#define QNN_GPU_OPERATION_INIT \ + { \ + NULL, /*outputClaims*/ \ + NULL, /*memoryObjects*/ \ + NULL, /*kernels*/ \ + } +// clang-format on + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GenAiTransformer/QnnGenAiTransformerCommon.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GenAiTransformer/QnnGenAiTransformerCommon.h new file mode 100755 index 0000000000000..3adb43819b8b3 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GenAiTransformer/QnnGenAiTransformerCommon.h @@ -0,0 +1,50 @@ +//============================================================================= +// +// Copyright (c) 2024 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. 
+// +//============================================================================= + +/** @file + * @brief QNN GenAiTransformer Common components + * + * This file defines versioning and other identification details + * and supplements QnnCommon.h for GenAiTransformer backend + */ + +#ifndef QNN_GENAI_TRANSFORMER_COMMON_H +#define QNN_GENAI_TRANSFORMER_COMMON_H + +#include "QnnCommon.h" + +/// GenAiTransformer Backend identifier +#define QNN_BACKEND_ID_GENAI_TRANSFORMER 14 + +/// GenAiTransformer interface provider +#define QNN_GENAI_TRANSFORMER_INTERFACE_PROVIDER_NAME "GENAI_TRANSFORMER_QTI_AISW" + +// GenAiTransformer API Version values +#define QNN_GENAI_TRANSFORMER_API_VERSION_MAJOR 1 +#define QNN_GENAI_TRANSFORMER_API_VERSION_MINOR 0 +#define QNN_GENAI_TRANSFORMER_API_VERSION_PATCH 0 + +// clang-format off +/// Macro to set Qnn_ApiVersion_t for GENAI_TRANSFORMER backend +#define QNN_GENAI_TRANSFORMER_API_VERSION_INIT \ + { \ + { \ + QNN_API_VERSION_MAJOR, /*coreApiVersion.major*/ \ + QNN_API_VERSION_MINOR, /*coreApiVersion.major*/ \ + QNN_API_VERSION_PATCH /*coreApiVersion.major*/ \ + }, \ + { \ + QNN_GENAI_TRANSFORMER_API_VERSION_MAJOR, /*backendApiVersion.major*/ \ + QNN_GENAI_TRANSFORMER_API_VERSION_MINOR, /*backendApiVersion.minor*/ \ + QNN_GENAI_TRANSFORMER_API_VERSION_PATCH /*backendApiVersion.patch*/ \ + } \ + } + +// clang-format on + +#endif // QNN_GENAI_TRANSFORMER_COMMON_H \ No newline at end of file diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaBackend.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaBackend.h new file mode 100755 index 0000000000000..e756b8042ec09 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaBackend.h @@ -0,0 +1,76 @@ +//============================================================================= +// +// Copyright (c) 2022 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================= + +/** @file + * @brief QNN HTA component Backend API. + * + * The interfaces in this file work with the top level QNN + * API and supplements QnnBackend.h for HTA backend + */ + +#ifndef QNN_HTA_BACKEND_H +#define QNN_HTA_BACKEND_H + +#include "QnnBackend.h" + +#ifdef __cplusplus +extern "C" { +#endif + +//============================================================================= +// Macros +//============================================================================= + +//============================================================================= +// Data Types +//============================================================================= + +/* @brief Enum describing the set of features supported by HTA backend. + This is used as a bitmask, so assign unique bits to each entries. +*/ +typedef enum { + /// The accelerator will always attempt to fold relu activation + /// into the immediate preceding convolution operation. This optimization + /// is correct when quantization ranges for convolution are equal or + /// subset of the Relu operation. 
For graphs, where this cannot be + /// guranteed, the client should set this flag + QNN_HTA_FOLD_RELU_ACTIVATION_INTO_CONV_OFF = 1 << 0, + /// UNKNOWN enum event that must not be used + QNN_HTA_BACKEND_FEATURES_UNKNOWN = 0x7fffffff +} QnnHtaBackend_Features_t; + +//============================================================================= +// Public Functions +//============================================================================= + +//------------------------------------------------------------------------------ +// Implementation Definition +//------------------------------------------------------------------------------ + +// clang-format off + +/** + * @brief Structure describing the set of configurations supported by the backend. + * Objects of this type are to be referenced through QnnBackend_CustomConfig_t. + */ +typedef struct { + /// field to save the features that are passed + /// via QnnHtaBackend_Features_t + uint32_t bitmaskFeatures; +} QnnHtaBackend_CustomConfig_t ; + +/// QnnHtaBackend_CustomConfig_t initializer macro +#define QNN_HTA_BACKEND_CUSTOM_CONFIG_INIT \ + { 0 /*bitmaskFeatures*/ } + +// clang-format on +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaCommon.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaCommon.h new file mode 100755 index 0000000000000..1eb8e1f0a99a4 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaCommon.h @@ -0,0 +1,62 @@ +//============================================================================= +// +// Copyright (c) 2022 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================= + +/** @file + * @brief QNN HTA Common components + * + * This file defines versioning and other identification details + * and supplements QnnCommon.h for HTA backend + */ + +#ifndef QNN_HTA_COMMON_H +#define QNN_HTA_COMMON_H + +#include "QnnCommon.h" + +/// HTA Backend identifier +#define QNN_BACKEND_ID_HTA 7 + +/// HTA interface provider +#define QNN_HTA_INTERFACE_PROVIDER_NAME "HTA_QTI_AISW" + +// HTA API Version values + +#define QNN_HTA_API_VERSION_MAJOR 2 +#define QNN_HTA_API_VERSION_MINOR 0 +#define QNN_HTA_API_VERSION_PATCH 0 + +// clang-format off + +/// Macro to set Qnn_ApiVersion_t for HTA backend +#define QNN_HTA_API_VERSION_INIT \ + { \ + { \ + QNN_API_VERSION_MAJOR, /*coreApiVersion.major*/ \ + QNN_API_VERSION_MINOR, /*coreApiVersion.major*/ \ + QNN_API_VERSION_PATCH /*coreApiVersion.major*/ \ + }, \ + { \ + QNN_HTA_API_VERSION_MAJOR, /*backendApiVersion.major*/ \ + QNN_HTA_API_VERSION_MINOR, /*backendApiVersion.minor*/ \ + QNN_HTA_API_VERSION_PATCH /*backendApiVersion.patch*/ \ + } \ + } + +// clang-format on + +// HTA Binary Version values +#define QNN_HTA_BINARY_VERSION_MAJOR 2 +#define QNN_HTA_BINARY_VERSION_MINOR 0 +#define QNN_HTA_BINARY_VERSION_PATCH 0 + +// HTA Context blob Version values +#define QNN_HTA_CONTEXT_BLOB_VERSION_MAJOR 1 +#define QNN_HTA_CONTEXT_BLOB_VERSION_MINOR 1 +#define QNN_HTA_CONTEXT_BLOB_VERSION_PATCH 0 + +#endif // QNN_HTA_COMMON_H diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaDevice.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaDevice.h new file mode 100755 index 0000000000000..d31f5232e21f3 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaDevice.h @@ -0,0 +1,41 @@ 
+//============================================================================= +// +// Copyright (c) 2022 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================= + +/** @file + * @brief QNN HTA component Device API. + * + * The interfaces in this file work with the top level QNN + * API and supplements QnnDevice.h for HTA backend + */ +#ifndef QNN_HTA_DEVICE_H +#define QNN_HTA_DEVICE_H + +#include "QnnDevice.h" +#include "QnnHtaPerfInfrastructure.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _QnnDevice_Infrastructure_t { + QnnHtaPerfInfrastructure_SetPowerConfigFn_t setPowerConfig; +} QnnHtaDevice_Infrastructure_t; + +// clang-format off +/// QnnHtaDevice_Infrastructure_t initializer macro +#define QNN_HTA_DEVICE_INFRASTRUCTURE_INIT \ + { \ + NULL, /*setPowerConfig*/ \ + } +// clang-format on + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif \ No newline at end of file diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaGraph.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaGraph.h new file mode 100755 index 0000000000000..0abbb9bc5114d --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaGraph.h @@ -0,0 +1,123 @@ +//============================================================================= +// +// Copyright (c) 2022 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================= + +/** @file + * @brief QNN HTA component Graph API. + * + * The interfaces in this file work with the top level QNN + * API and supplements QnnGraph.h for HTA backend + */ + +#ifndef QNN_HTA_GRAPH_H +#define QNN_HTA_GRAPH_H + +#include "QnnGraph.h" + +#ifdef __cplusplus +extern "C" { +#endif + +//============================================================================= +// Macros +//============================================================================= + +//============================================================================= +// Data Types +//============================================================================= + +/** + * @brief This enum provides different HTA graph optimization + * options that can be used to finalize the graph + * for optimum performance + */ +typedef enum QnnHtaGraph_OptimizationType { + QNN_HTA_GRAPH_OPTIMIZATION_TYPE_SCHEDULE_THRESHOLD = 1, + QNN_HTA_GRAPH_OPTIMIZATION_TYPE_FINALIZE_RETRIES = 2, + QNN_HTA_GRAPH_OPTIMIZATION_TYPE_UNKNOWN = 0x7fffffff +} QnnHtaGraph_OptimizationType_t; + +/* @brief Struct describing the set of optimization type + * and the value associated with the optimization + */ +typedef struct QnnHtaGraph_OptimizationOption { + QnnHtaGraph_OptimizationType_t type; + float floatValue; +} QnnHtaGraph_OptimizationOption_t; + +// clang-format off +/// QnnHtaGraph_OptimizationOption_t initializer macro +#define QNN_HTA_GRAPH_OPTIMIZATION_OPTION_INIT \ + { \ + QNN_HTA_GRAPH_OPTIMIZATION_TYPE_UNKNOWN, /*type*/ \ + 0.0f /*floatValue*/ \ + } +// clang-format on + +/** + * @brief This enum provides different HTA graph configuration + * options associated with QnnGraph + */ +typedef enum QnnHtaGraph_ConfigOption { + QNN_HTA_GRAPH_CONFIG_OPTION_OPTIMIZATION = 1, + QNN_HTA_GRAPH_CONFIG_OPTION_PRIORITY = 2, + QNN_HTA_GRAPH_CONFIG_OPTION_UNKNOWN = 0x7fffffff +} QnnHtaGraph_ConfigOption_t; 
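+
+// Illustrative pairing of option and value (editorial sketch, not SDK code);
+// the QnnHtaGraph_CustomConfig_t struct carrying these fields is defined below:
+//   QnnHtaGraph_CustomConfig_t cfg = QNN_HTA_GRAPH_CUSTOM_CONFIG_INIT;
+//   cfg.option   = QNN_HTA_GRAPH_CONFIG_OPTION_PRIORITY;
+//   cfg.priority = QNN_PRIORITY_HIGH;  // assumed Qnn_Priority_t value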
+
+//=============================================================================
+// Public Functions
+//=============================================================================
+
+//------------------------------------------------------------------------------
+// Implementation Definition
+//------------------------------------------------------------------------------
+
+// clang-format off
+
+/**
+ * @brief Structure describing the set of configurations supported by graph.
+ *        Objects of this type are to be referenced through QnnGraph_CustomConfig_t.
+ *
+ *        The struct has two fields: option and a union of corresponding config values.
+ *        Based on the option, the corresponding item in the union can be used to specify
+ *        the config.
+ *        Below is the map between QnnHtaGraph_ConfigOption_t and config value.
+ *
+ * \verbatim embed:rst:leading-asterisk
+ *  +----+------------------------------------------+------------------------------------+
+ *  | #  | Config Option                            | Configuration Struct/value         |
+ *  +====+==========================================+====================================+
+ *  | 1  | QNN_HTA_GRAPH_CONFIG_OPTION_OPTIMIZATION | QnnHtaGraph_OptimizationOption_t   |
+ *  +----+------------------------------------------+------------------------------------+
+ *  | 2  | QNN_HTA_GRAPH_CONFIG_OPTION_PRIORITY     | Qnn_Priority_t                     |
+ *  +----+------------------------------------------+------------------------------------+
+ * \endverbatim
+ */
+typedef struct {
+  QnnHtaGraph_ConfigOption_t option;
+  union {
+    QnnHtaGraph_OptimizationOption_t optimizationOption;
+    Qnn_Priority_t priority;
+  };
+} QnnHtaGraph_CustomConfig_t;
+
+/// QnnHtaGraph_CustomConfig_t initializer macro
+#define QNN_HTA_GRAPH_CUSTOM_CONFIG_INIT \
+  { \
+    QNN_HTA_GRAPH_CONFIG_OPTION_UNKNOWN, /*option*/ \
+    { \
+      QNN_HTA_GRAPH_OPTIMIZATION_OPTION_INIT /*optimizationOption*/ \
+    } \
+  }
+
+// clang-format on
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaPerfInfrastructure.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaPerfInfrastructure.h
new file mode 100755
index 0000000000000..4f6e0c22c274b
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaPerfInfrastructure.h
@@ -0,0 +1,134 @@
+//==============================================================================
+//
+// Copyright (c) 2022 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+/** @file
+ * @brief QNN HTA component Performance Infrastructure API
+ *
+ *        Provides an interface for the client to control performance and system
+ *        settings of the QNN HTA Accelerator
+ */
+
+#ifndef QNN_HTA_PERF_INFRASTRUCTURE_H
+#define QNN_HTA_PERF_INFRASTRUCTURE_H
+
+#include "QnnCommon.h"
+#include "QnnTypes.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//=============================================================================
+// Data Types
+//=============================================================================
+
+/**
+ * @brief QNN HTA PerfInfrastructure API result / error codes.
+ *
+ */
+typedef enum {
+  QNN_HTA_PERF_INFRASTRUCTURE_MIN_ERROR = QNN_MIN_ERROR_PERF_INFRASTRUCTURE,
+  ////////////////////////////////////////////////////////////////////////
+
+  QNN_HTA_PERF_INFRASTRUCTURE_NO_ERROR = QNN_SUCCESS,
+  QNN_HTA_PERF_INFRASTRUCTURE_ERROR_INVALID_HANDLE_PTR = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 0,
+  QNN_HTA_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 1,
+  QNN_HTA_PERF_INFRASTRUCTURE_ERROR_UNSUPPORTED_CONFIG = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 2,
+  QNN_HTA_PERF_INFRASTRUCTURE_ERROR_TRANSPORT = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 3,
+
+  ////////////////////////////////////////////////////////////////////////
+  QNN_HTA_PERF_INFRASTRUCTURE_MAX_ERROR = QNN_MAX_ERROR_PERF_INFRASTRUCTURE
+} QnnHtaPerfInfrastructure_Error_t;
+
+/**
+ * @brief This enum defines all the possible performance
+ *        options in the HTA Performance Infrastructure that
+ *        relate to setting up power levels
+ */
+typedef enum {
+  /// This config enum implies the usage of the powerModeConfig struct. If not
+  /// provided, it is used as a type identifier
+  QNN_HTA_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_POWER_MODE = 1,
+  /// UNKNOWN config option which must not be used
+  QNN_HTA_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_UNKNOWN = 0x7fffffff
+} QnnHtaPerfInfrastructure_PowerConfigOption_t;
+
+/**
+ * @brief This enum defines all the possible power modes
+ *        that a client can set
+ */
+typedef enum {
+  /// default mode
+  QNN_HTA_PERF_INFRASTRUCTURE_POWERMODE_DEFAULT = 0,
+  /// low power saver mode
+  QNN_HTA_PERF_INFRASTRUCTURE_POWERMODE_LOW_POWER_SAVER = 1,
+  /// power saver mode
+  QNN_HTA_PERF_INFRASTRUCTURE_POWERMODE_POWER_SAVER = 2,
+  /// high power saver mode
+  QNN_HTA_PERF_INFRASTRUCTURE_POWERMODE_HIGH_POWER_SAVER = 3,
+  /// balanced mode
+  QNN_HTA_PERF_INFRASTRUCTURE_POWERMODE_BALANCED = 4,
+  /// high performance mode
+  QNN_HTA_PERF_INFRASTRUCTURE_POWERMODE_HIGH_PERFORMANCE = 5,
+  /// burst mode
+  QNN_HTA_PERF_INFRASTRUCTURE_POWERMODE_BURST = 6,
+  /// UNKNOWN value that must not be used by client
+  QNN_HTA_PERF_INFRASTRUCTURE_POWERMODE_UNKNOWN = 0x7fffffff
+} QnnHtaPerfInfrastructure_PowerMode_t;
+
+/**
+ * @brief This struct provides performance infrastructure configuration
+ *        associated with setting up power levels
+ */
+typedef struct {
+  QnnHtaPerfInfrastructure_PowerConfigOption_t config;
+  // Organized as a union for future expansion flexibility, as defined by PowerConfigOption_t
+  union {
+    QnnHtaPerfInfrastructure_PowerMode_t powerModeConfig;
+  };
+} QnnHtaPerfInfrastructure_PowerConfig_t;
+
+/// QnnHtaPerfInfrastructure_PowerConfig_t initializer macro
+#define QNN_HTA_PERF_INFRASTRUCTURE_POWER_CONFIG_INIT \
+  { \
+    QNN_HTA_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_UNKNOWN, /*config*/ \
+    { \
+      QNN_HTA_PERF_INFRASTRUCTURE_POWERMODE_UNKNOWN /*powerModeConfig*/ \
+    } \
+  }
+
+//=============================================================================
+// API Methods
+//=============================================================================
+
+/**
+ * @brief This API allows the client to set up a system power configuration that
+ *        will enable different performance modes.
+ *
+ * @param[in] clientId A power client id to associate calls to system
+ *            power settings. A value of 0 implies a NULL power client id
+ *            and can override every other setting in the user process. To
+ *            enable power settings for multiple clients in the same
+ *            process, use a non-zero power client id.
+ *
+ *
+ * @param[in] config Pointer to a NULL-terminated array
+ *            of config options for performance configuration.
+ *            NULL is allowed and indicates no config options are provided.
+ *
+ * @return Error code
+ *         \n QNN_SUCCESS: No error encountered
+ */
+typedef Qnn_ErrorHandle_t (*QnnHtaPerfInfrastructure_SetPowerConfigFn_t)(
+    uint32_t clientId, const QnnHtaPerfInfrastructure_PowerConfig_t** config);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // QNN_HTA_PERF_INFRASTRUCTURE_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaProfile.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaProfile.h
new file mode 100755
index 0000000000000..f069dbbedf6b7
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaProfile.h
@@ -0,0 +1,199 @@
+//==============================================================================
+//
+// Copyright (c) 2022 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+/**
+ * @file
+ * @brief QNN HTA Profile component API.
+ *
+ *        Requires the HTA backend to be initialized.
+ *        Should be used with the QnnProfile API, but has HTA backend
+ *        specific definitions for the different QnnProfile data structures
+ *
+ */
+
+#ifndef QNN_HTA_PROFILE_H
+#define QNN_HTA_PROFILE_H
+
+#include "QnnProfile.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//=============================================================================
+// Macros
+//=============================================================================
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the ARM processor
+ *        when the client invokes QnnContext_createFromBinary. The value
+ *        returned is time in microseconds.
+ *
+ * @note context load binary host time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_CONTEXT_LOAD_BIN_HOST_TIME_MICROSEC 1002
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the HTA processor
+ *        when the client invokes QnnContext_createFromBinary. The value
+ *        returned is time in microseconds.
+ *
+ * @note context load binary HTA time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_CONTEXT_LOAD_BIN_HTA_TIME_MICROSEC 1003
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the time taken to create the context on the
+ *        accelerator when the client invokes QnnContext_createFromBinary.
+ *        The value returned is time in microseconds.
+ *
+ * @note context load binary accelerator time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_CONTEXT_LOAD_BIN_ACCEL_TIME_MICROSEC 1004
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the ARM processor
+ *        when the client invokes QnnGraph_finalize.
+ *        The value returned is time in microseconds.
+ *
+ * @note graph finalize host time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_FINALIZE_HOST_TIME_MICROSEC 2001
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the HTA processor
+ *        when the client invokes QnnGraph_finalize.
+ *        The value returned is time in microseconds.
+ *
+ * @note graph finalize HTA time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_FINALIZE_HTA_TIME_MICROSEC 2002
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to finalizing the graph on the accelerator
+ *        when the client invokes QnnGraph_finalize.
+ *        The value returned is time in microseconds.
+ *
+ * @note graph finalize accelerator time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_FINALIZE_ACCEL_TIME_MICROSEC 2003
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the ARM processor
+ *        when the client invokes QnnGraph_execute or QnnGraph_executeAsync.
+ *        The value returned is time in microseconds.
+ *
+ * @note graph execute host time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_EXECUTE_HOST_TIME_MICROSEC 3001
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the HTA processor
+ *        when the client invokes QnnGraph_execute or QnnGraph_executeAsync.
+ *        The value returned is time in microseconds.
+ *
+ * @note graph execute HTA time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_EXECUTE_HTA_TIME_MICROSEC 3002
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to executing the graph on the accelerator
+ *        when the client invokes QnnGraph_execute or QnnGraph_executeAsync.
+ *        The value returned is the number of processor cycles taken.
+ *
+ * @note graph execute accelerator time may be available only on the
+ *       QNN_PROFILE_LEVEL_DETAILED level
+ *
+ * @note When QNN_PROFILE_LEVEL_DETAILED is used, this event can have
+ *       multiple sub-events of type QNN_PROFILE_EVENTTYPE_NODE.
+ *       There will be a sub-event for each node that was added to the graph
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_EXECUTE_ACCEL_TIME_CYCLE 3003
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to executing the graph on the accelerator
+ *        when the client invokes QnnGraph_execute or QnnGraph_executeAsync.
+ *        The value returned is the time taken in microseconds
+ *
+ * @note graph execute accelerator time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ *
+ * @note When QNN_PROFILE_LEVEL_DETAILED is used, this event can have
+ *       multiple sub-events of type QNN_PROFILE_EVENTTYPE_NODE / QNN_PROFILE_EVENTUNIT_MICROSEC.
+ *       There will be a sub-event for each node that was added to the graph
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_EXECUTE_ACCEL_TIME_MICROSEC 3004
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the time taken for miscellaneous work, i.e. time
+ *        that cannot be attributed to a node but is still needed to
+ *        execute the graph on the accelerator. This occurs when the client invokes
+ *        QnnGraph_execute or QnnGraph_executeAsync.
+ *        The value returned is the time taken in microseconds
+ *
+ * @note graph execute misc accelerator time is available only on the
+ *       QNN_PROFILE_LEVEL_DETAILED level
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_EXECUTE_MISC_ACCEL_TIME_MICROSEC 3005
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the ARM processor
+ *        when the client invokes QnnContext_free, which consequently deinitializes the graph.
+ *        The value returned is time in microseconds.
+ *
+ * @note graph deinit host time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_DEINIT_HOST_TIME_MICROSEC 4001
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the HTA processor
+ *        when the client invokes QnnContext_free, which consequently deinitializes the graph.
+ *        The value returned is time in microseconds.
+ *
+ * @note graph deinit HTA time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_DEINIT_HTA_TIME_MICROSEC 4002
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the time taken to deinitialize the graph on the
+ *        accelerator when the client invokes QnnContext_free, which consequently
+ *        deinitializes the graph. The value returned is time in microseconds.
+ *
+ * @note graph deinit accelerator time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_DEINIT_ACCEL_TIME_MICROSEC 4003
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // QNN_HTA_PROFILE_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpCommon.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpCommon.h
new file mode 100755
index 0000000000000..8b1d458a04b8e
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpCommon.h
@@ -0,0 +1,98 @@
+//=============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//=============================================================================
+
+/** @file
+ * @brief QNN HTP Common components
+ *
+ *        This file defines versioning and other identification details
+ *        and supplements QnnCommon.h for the HTP backend
+ */
+
+#ifndef QNN_HTP_COMMON_H
+#define QNN_HTP_COMMON_H
+
+#include "QnnCommon.h"
+
+/// HTP Backend identifier
+#define QNN_BACKEND_ID_HTP 6
+
+/// HTP interface provider
+#define QNN_HTP_INTERFACE_PROVIDER_NAME "HTP_QTI_AISW"
+
+// HTP API Version values
+#define QNN_HTP_API_VERSION_MAJOR 5
+#define QNN_HTP_API_VERSION_MINOR 34
+#define QNN_HTP_API_VERSION_PATCH 0
+
+// clang-format off
+
+/// Macro to set Qnn_ApiVersion_t for HTP backend
+#define QNN_HTP_API_VERSION_INIT \
+  { \
+    { \
+      QNN_API_VERSION_MAJOR, /*coreApiVersion.major*/ \
+      QNN_API_VERSION_MINOR, /*coreApiVersion.minor*/ \
+      QNN_API_VERSION_PATCH /*coreApiVersion.patch*/ \
+    }, \
+    { \
+      QNN_HTP_API_VERSION_MAJOR, /*backendApiVersion.major*/ \
+      QNN_HTP_API_VERSION_MINOR, /*backendApiVersion.minor*/ \
+      QNN_HTP_API_VERSION_PATCH /*backendApiVersion.patch*/ \
+    } \
+  }
+
+// clang-format on
+
+// DSP Context blob Version values
+#define QNN_HTP_CONTEXT_BLOB_VERSION_MAJOR 3
+#define QNN_HTP_CONTEXT_BLOB_VERSION_MINOR 2
+#define QNN_HTP_CONTEXT_BLOB_VERSION_PATCH 3
+
+/* ==== CDSP Security Library Versioning ==== */
+/* ==== This information is only intended for OEMs ==== */
+
+/* Security versioning for DSP libraries is supported from V73 onwards */
+#define QNN_HTP_NATIVE_LIB_SECURITY_VERSIONING_MIN_ARCH 73
+
+/* Here we define the CDSP library versions for different targets.
+ * The version is increased whenever there is a security fix from CDSP.
+ * The versioning starts from 1.0.0 for each new target.
+ * */
+
+/* V73 Security Issues:
+ * List of security issues fixed for V73 and the fixed version
+ * */
+#define QNN_HTP_V73_NATIVE_LIB_SECURITY_VERSION_MAJOR 1
+#define QNN_HTP_V73_NATIVE_LIB_SECURITY_VERSION_MINOR 0
+#define QNN_HTP_V73_NATIVE_LIB_SECURITY_VERSION_PATCH 0
+
+/* V75 Security Issues:
+ * List of security issues fixed for V75 and the fixed version
+ * */
+// HTP Native library version values for V75
+#define QNN_HTP_V75_NATIVE_LIB_SECURITY_VERSION_MAJOR 1
+#define QNN_HTP_V75_NATIVE_LIB_SECURITY_VERSION_MINOR 0
+#define QNN_HTP_V75_NATIVE_LIB_SECURITY_VERSION_PATCH 0
+
+/* V79 Security Issues:
+ * List of security issues fixed for V79 and the fixed version
+ * */
+// HTP Native library version values for V79
+#define QNN_HTP_V79_NATIVE_LIB_SECURITY_VERSION_MAJOR 1
+#define QNN_HTP_V79_NATIVE_LIB_SECURITY_VERSION_MINOR 0
+#define QNN_HTP_V79_NATIVE_LIB_SECURITY_VERSION_PATCH 0
+
+/* V81 Security Issues:
+ * List of security issues fixed for V81 and the fixed version
+ * */
+// HTP Native library version values for V81
+#define QNN_HTP_V81_NATIVE_LIB_SECURITY_VERSION_MAJOR 1
+#define QNN_HTP_V81_NATIVE_LIB_SECURITY_VERSION_MINOR 0
+#define QNN_HTP_V81_NATIVE_LIB_SECURITY_VERSION_PATCH 0
+
+#endif // QNN_HTP_COMMON_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpContext.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpContext.h
new file mode 100755
index 0000000000000..8266817e2dc41
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpContext.h
@@ -0,0 +1,164 @@
+//==============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+// All rights reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+/**
+ * @file
+ * @brief QNN HTP component Context API.
+ *
+ *        The interfaces in this file work with the top level QNN
+ *        API and supplement QnnContext.h for the HTP backend
+ */
+
+#ifndef QNN_HTP_CONTEXT_H
+#define QNN_HTP_CONTEXT_H
+
+#include "QnnContext.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//=============================================================================
+// Macros
+//=============================================================================
+
+//=============================================================================
+// Data Types
+//=============================================================================
+
+/**
+ * @brief This enum provides different HTP context configuration
+ *        options associated with QnnContext
+ */
+typedef enum {
+  QNN_HTP_CONTEXT_CONFIG_OPTION_WEIGHT_SHARING_ENABLED = 1,
+  QNN_HTP_CONTEXT_CONFIG_OPTION_REGISTER_MULTI_CONTEXTS = 2,
+  QNN_HTP_CONTEXT_CONFIG_OPTION_FILE_READ_MEMORY_BUDGET = 3,
+  QNN_HTP_CONTEXT_CONFIG_OPTION_DSP_MEMORY_PROFILING_ENABLED = 4,
+  QNN_HTP_CONTEXT_CONFIG_OPTION_SHARE_RESOURCES = 5,
+  QNN_HTP_CONTEXT_CONFIG_OPTION_IO_MEM_ESTIMATION = 6,
+  QNN_HTP_CONTEXT_CONFIG_OPTION_PREPARE_ONLY = 7,
+  QNN_HTP_CONTEXT_CONFIG_OPTION_INIT_ACCELERATION = 8,
+  QNN_HTP_CONTEXT_CONFIG_OPTION_SKIP_VALIDATION_ON_BINARY_SECTION = 9,
+  QNN_HTP_CONTEXT_CONFIG_OPTION_UNKNOWN = 0x7fffffff
+} QnnHtpContext_ConfigOption_t;
+
+typedef struct {
+  // Handle referring to the first context associated with a group. When a new
+  // group is to be registered, the following value must be 0.
+  Qnn_ContextHandle_t firstGroupHandle;
+  // Max spill-fill buffer to be allocated for the group of contexts, in bytes.
+  // The value that is passed during the registration of the first context to
+  // a group is taken. Subsequent configuration of this value is disregarded.
+  uint64_t maxSpillFillBuffer;
+} QnnHtpContext_GroupRegistration_t;
+
+//=============================================================================
+// Public Functions
+//=============================================================================
+
+//------------------------------------------------------------------------------
+// Implementation Definition
+//------------------------------------------------------------------------------
+
+// clang-format off
+
+/**
+ * @brief Structure describing the set of configurations supported by context.
+ *        Objects of this type are to be referenced through QnnContext_CustomConfig_t.
+ *
+ *        The struct has two fields: option and a union of config values.
+ *        Based on the option, the corresponding item in the union can be used to specify
+ *        the config.
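+ *
+ *        For example, enabling weight sharing might look like the following
+ *        (an illustrative sketch, assuming the QnnContext_Config_t /
+ *        QNN_CONTEXT_CONFIG_OPTION_CUSTOM wrapper from QnnContext.h; error
+ *        handling omitted):
+ *
+ * @code
+ *   QnnHtpContext_CustomConfig_t customConfig = QNN_HTP_CONTEXT_CUSTOM_CONFIG_INIT;
+ *   customConfig.option               = QNN_HTP_CONTEXT_CONFIG_OPTION_WEIGHT_SHARING_ENABLED;
+ *   customConfig.weightSharingEnabled = true;
+ *
+ *   QnnContext_Config_t contextConfig;
+ *   contextConfig.option       = QNN_CONTEXT_CONFIG_OPTION_CUSTOM;
+ *   contextConfig.customConfig = &customConfig;
+ *   const QnnContext_Config_t* contextConfigs[] = {&contextConfig, NULL};
+ *   // contextConfigs is then passed to QnnContext_create(...).
+ * @endcode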
+ *
+ *        Below is the map between QnnHtpContext_ConfigOption_t and config value.
+ *
+ * \verbatim embed:rst:leading-asterisk
+ *  +----+---------------------------------------------------------------------+---------------------------------------+
+ *  | #  | Config Option                                                       | Configuration Struct/value            |
+ *  +====+=====================================================================+=======================================+
+ *  | 1  | QNN_HTP_CONTEXT_CONFIG_OPTION_WEIGHT_SHARING_ENABLED                | bool                                  |
+ *  +----+---------------------------------------------------------------------+---------------------------------------+
+ *  | 2  | QNN_HTP_CONTEXT_CONFIG_OPTION_REGISTER_MULTI_CONTEXTS               | QnnHtpContext_GroupRegistration_t     |
+ *  +----+---------------------------------------------------------------------+---------------------------------------+
+ *  | 3  | QNN_HTP_CONTEXT_CONFIG_OPTION_FILE_READ_MEMORY_BUDGET               | uint64_t                              |
+ *  +----+---------------------------------------------------------------------+---------------------------------------+
+ *  | 4  | QNN_HTP_CONTEXT_CONFIG_OPTION_DSP_MEMORY_PROFILING_ENABLED          | bool                                  |
+ *  +----+---------------------------------------------------------------------+---------------------------------------+
+ *  | 5  | QNN_HTP_CONTEXT_CONFIG_OPTION_SHARE_RESOURCES                       | bool                                  |
+ *  +----+---------------------------------------------------------------------+---------------------------------------+
+ *  | 6  | QNN_HTP_CONTEXT_CONFIG_OPTION_IO_MEM_ESTIMATION                     | bool                                  |
+ *  +----+---------------------------------------------------------------------+---------------------------------------+
+ *  | 7  | QNN_HTP_CONTEXT_CONFIG_OPTION_PREPARE_ONLY                          | bool                                  |
+ *  +----+---------------------------------------------------------------------+---------------------------------------+
+ *  | 8  | QNN_HTP_CONTEXT_CONFIG_OPTION_INIT_ACCELERATION                     | bool                                  |
+ *  +----+---------------------------------------------------------------------+---------------------------------------+
+ *  | 9  | QNN_HTP_CONTEXT_CONFIG_OPTION_SKIP_VALIDATION_ON_BINARY_SECTION     | bool                                  |
+ *  +----+---------------------------------------------------------------------+---------------------------------------+
+ * \endverbatim
+ */
+typedef struct QnnHtpContext_CustomConfig {
+  QnnHtpContext_ConfigOption_t option;
+  union UNNAMED {
+    // This field sets weight sharing, which is false by default
+    bool weightSharingEnabled;
+    QnnHtpContext_GroupRegistration_t groupRegistration;
+    // - Init time may be impacted depending on the value set below
+    // - The value should be greater than 0 and less than or equal to the file size
+    // - If set to 0, the feature is not utilized
+    // - If set to greater than the file size, min(fileSize, fileReadMemoryBudgetInMb) is used
+    // - As an example, if the value 2 is passed, it would translate to (2 * 1024 * 1024) bytes
+    uint64_t fileReadMemoryBudgetInMb;
+    bool dspMemoryProfilingEnabled;
+    // This field enables resource sharing across different contexts, enhancing RAM and virtual
+    // address (VA) space utilization. When this flag is activated, graphs are expected to execute
+    // sequentially. Note that this configuration option is only supported when using the
+    // QnnContext_createFromBinaryListAsync API.
+    bool shareResources;
+    // This field enables I/O memory estimation during the QnnContext_createFromBinary API when multiple
+    // PDs are available. When enabled, it estimates the total size of the I/O tensors required by
+    // the context to ensure sufficient space on the PD before deserialization. This feature helps
+    // with memory registration failures in large models.
+    // Note that enabling this feature increases peak RAM usage during the context initialization phase
+    // in QnnContext_createFromBinary, but sustained RAM remains unaffected.
+    bool ioMemEstimation;
+    // This field enables model preparation without mapping its content on the DSP side. It is
+    // useful when a model needs to be prepared on the device but executed through a serialized
+    // binary method. This prevents extra mapping onto the DSP VA space. Set this flag only when
+    // creating the context.
+    bool isPrepareOnly;
+    // This field enables initialization acceleration, which is disabled by default.
+    // If set to true, the DSP will utilize all hardware threads to accelerate deserialization.
+    // It is not recommended to execute graphs simultaneously, as this will significantly degrade
+    // performance.
+    // Note that this feature may not be effective for small graphs with a small number of ops.
+    bool initAcceleration;
+    // This field enables skipping the crc32 check during LoRA super adapter apply, which is
+    // disabled by default. If set to true, the crc32 check for non-base adapters in the super
+    // adapter apply use case will be skipped to improve time cost.
+    // Note that the base adapter in a super adapter never does a crc32 check; therefore, its
+    // apply time cost won't improve by turning this config option on.
+    bool skipValidationOnBinarySection;
+  };
+} QnnHtpContext_CustomConfig_t;
+
+/// QnnHtpContext_CustomConfig_t initializer macro
+#define QNN_HTP_CONTEXT_CUSTOM_CONFIG_INIT \
+  { \
+    QNN_HTP_CONTEXT_CONFIG_OPTION_UNKNOWN, /*option*/ \
+    { \
+      false /*weightSharingEnabled*/ \
+    } \
+  }
+
+// clang-format on
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpDevice.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpDevice.h
new file mode 100755
index 0000000000000..e70c23577264b
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpDevice.h
@@ -0,0 +1,178 @@
+//=============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//=============================================================================
+
+/** @file
+ * @brief QNN HTP Device components
+ *
+ *        This file defines structures and supplements QnnDevice.h for the QNN HTP device
+ */
+
+#pragma once
+
+#include "QnnCommon.h"
+#include "QnnDevice.h"
+#include "QnnHtpPerfInfrastructure.h"
+#include "QnnTypes.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * This is used to represent the HTP hardware architecture.
+ * Since QnnDevice only supports V68 or newer, using a legacy ARCH will result in an error.
+ */
+typedef enum {
+  QNN_HTP_DEVICE_ARCH_NONE = 0,
+  QNN_HTP_DEVICE_ARCH_V68 = 68,
+  QNN_HTP_DEVICE_ARCH_V69 = 69,
+  QNN_HTP_DEVICE_ARCH_V73 = 73,
+  QNN_HTP_DEVICE_ARCH_V75 = 75,
+  QNN_HTP_DEVICE_ARCH_V79 = 79,
+  QNN_HTP_DEVICE_ARCH_V81 = 81,
+  QNN_HTP_DEVICE_ARCH_UNKNOWN = 0x7fffffff
+} QnnHtpDevice_Arch_t;
+
+/**
+ * Data structure to configure a device to set the minimum HTP arch;
+ * the driver will use ops that are compatible with this HTP arch.
+ */
+typedef struct {
+  uint32_t deviceId;
+  QnnHtpDevice_Arch_t arch;
+} QnnHtpDevice_Minimum_Arch_t;
+
+/**
+ * Data structure to configure a device to run in the signed/unsigned domain.
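+ *
+ * An illustrative sketch (assumes the QnnHtpDevice_CustomConfig_t wrapper
+ * declared later in this file and the QnnDevice_Config_t /
+ * QNN_DEVICE_CONFIG_OPTION_CUSTOM mechanism from QnnDevice.h):
+ * @code
+ *   QnnHtpDevice_CustomConfig_t customConfig;
+ *   customConfig.option                                        = QNN_HTP_DEVICE_CONFIG_OPTION_SIGNEDPD;
+ *   customConfig.useSignedProcessDomain.deviceId               = 0;     // placeholder device id
+ *   customConfig.useSignedProcessDomain.useSignedProcessDomain = true;  // run in a signed PD
+ *
+ *   QnnDevice_Config_t deviceConfig;
+ *   deviceConfig.option       = QNN_DEVICE_CONFIG_OPTION_CUSTOM;
+ *   deviceConfig.customConfig = &customConfig;
+ *   const QnnDevice_Config_t* deviceConfigs[] = {&deviceConfig, NULL};
+ *   // deviceConfigs is then passed to QnnDevice_create(...).
+ * @endcode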
+ */
+typedef struct {
+  uint32_t deviceId;
+  bool useSignedProcessDomain;
+} QnnHtpDevice_UseSignedProcessDomain_t;
+
+typedef void* QnnHtpDevice_UseCustomSetting_t;
+
+/**
+ * Enum listing the available custom config options.
+ */
+typedef enum {
+  QNN_HTP_DEVICE_CONFIG_OPTION_SOC = 0,
+  QNN_HTP_DEVICE_CONFIG_OPTION_ARCH = 1,
+  QNN_HTP_DEVICE_CONFIG_OPTION_SIGNEDPD = 2,
+  QNN_HTP_DEVICE_CONFIG_OPTION_CUSTOM = 3,
+  QNN_HTP_DEVICE_CONFIG_OPTION_RESERVED = 0x7fff0000,
+  QNN_HTP_DEVICE_CONFIG_OPTION_UNKNOWN = 0x7fffffff
+} QnnHtpDevice_ConfigOption_t;
+
+/**
+ * Data structure for custom config.
+ */
+typedef struct {
+  QnnHtpDevice_ConfigOption_t option;
+  union UNNAMED {
+    // This field sets the SoC model
+    uint32_t socModel;
+    // This field updates the minimum HTP arch
+    QnnHtpDevice_Minimum_Arch_t arch;
+    // This structure is used to enable/disable the signed/unsigned PD
+    QnnHtpDevice_UseSignedProcessDomain_t useSignedProcessDomain;
+    // This structure is used to enable a custom setting
+    QnnHtpDevice_UseCustomSetting_t useCustomSetting;
+    // Reserved for internal purposes
+    void* reserved;
+  };
+} QnnHtpDevice_CustomConfig_t;
+
+// For deviceType in QnnDevice_HardwareDeviceInfoV1_t
+typedef enum {
+  QNN_HTP_DEVICE_TYPE_ON_CHIP = 0,  // HTP cores are inside the SoC
+  QNN_HTP_DEVICE_TYPE_UNKNOWN = 0x7fffffff
+} QnnHtpDevice_DeviceType_t;
+
+/**
+ * @brief QNN HTP Device core type
+ * This enumeration provides information about the core type inside the SOC.
+ *
+ * For online operation, the caller should retrieve this information from
+ * `QnnDevice_getPlatformInfo`. For offline operation, the caller needs to create a
+ * `QnnDevice_CoreInfo_t` with the correct core type, and then use it to create the
+ * `QnnDevice_PlatformInfo_t`.
+ */
+typedef enum {
+  QNN_HTP_CORE_TYPE_NSP = 0,
+  QNN_HTP_CORE_TYPE_HPASS = 1,
+
+  // supported coreType values are < QNN_CORE_TYPE_MAX
+  QNN_HTP_CORE_TYPE_MAX,
+  QNN_HTP_CORE_TYPE_UNKNOWN = 0x7fffffff
+} QnnHtpDevice_CoreType_t;
+
+/**
+ * This structure provides info about the NSP device inside the SoC.
+ * For online operation, the caller should get this info from QnnDevice_getPlatformInfo.
+ * For offline operation, the caller needs to create this structure and fill in the correct
+ * information for QnnDevice_create.
+ */
+typedef struct {
+  size_t vtcmSize;       // The VTCM for this device in megabytes;
+                         // users cannot request a VTCM size exceeding this value
+  uint32_t socModel;     // An enum value defined in the QNN header that represents the SoC model
+  bool signedPdSupport;  // This field is true if the device supports signed PD
+  bool dlbcSupport;      // This field is true if the device supports DLBC
+  QnnHtpDevice_Arch_t arch;  // This field shows the architecture of this device
+} QnnHtpDevice_OnChipDeviceInfoExtension_t;
+
+/**
+ * This structure is used in QnnDevice_HardwareDeviceInfoV1_t.
+ * QnnDevice_getPlatformInfo uses this structure to list the supported device features/info.
+ */
+typedef struct _QnnDevice_DeviceInfoExtension_t {
+  QnnHtpDevice_DeviceType_t devType;
+  union UNNAMED {
+    QnnHtpDevice_OnChipDeviceInfoExtension_t onChipDevice;
+  };
+} QnnHtpDevice_DeviceInfoExtension_t;
+
+/**
+ * @brief QNN HTP Device PerfInfrastructure specialization structure.
+ *        Objects of this type are to be referenced through QnnDevice_getInfrastructure.
+ *
+ *        Contains function pointers for each interface method for the
+ *        HTP PerfInfrastructure.
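+ *
+ *        An illustrative sketch of driving these entry points (assumes
+ *        `qnnInterface` is a resolved QNN interface table and that
+ *        deviceId/coreId 0 are valid; error handling omitted):
+ * @code
+ *   QnnDevice_Infrastructure_t deviceInfra = NULL;
+ *   qnnInterface.deviceGetInfrastructure(&deviceInfra);
+ *   QnnHtpDevice_Infrastructure_t* htpInfra = (QnnHtpDevice_Infrastructure_t*)deviceInfra;
+ *   // Check htpInfra->infraType == QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_PERF before use.
+ *   QnnHtpDevice_PerfInfrastructure_t perfInfra = htpInfra->perfInfra;
+ *
+ *   uint32_t powerConfigId = 0;
+ *   perfInfra.createPowerConfigId(0, 0, &powerConfigId);  // deviceId 0, coreId 0
+ *   // Fill a QnnHtpPerfInfrastructure_PowerConfig_t, then pass it as a
+ *   // NULL-terminated array:
+ *   //   const QnnHtpPerfInfrastructure_PowerConfig_t* configs[] = {&cfg, NULL};
+ *   //   perfInfra.setPowerConfig(powerConfigId, configs);
+ *   perfInfra.destroyPowerConfigId(powerConfigId);
+ * @endcode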
+ */ +typedef struct { + QnnHtpPerfInfrastructure_CreatePowerConfigIdFn_t createPowerConfigId; + QnnHtpPerfInfrastructure_DestroyPowerConfigIdFn_t destroyPowerConfigId; + QnnHtpPerfInfrastructure_SetPowerConfigFn_t setPowerConfig; + QnnHtpPerfInfrastructure_SetMemoryConfigFn_t setMemoryConfig; +} QnnHtpDevice_PerfInfrastructure_t; + +/// QnnHtpDevice_PerfInfrastructure_t initializer macro +#define QNN_HTP_DEVICE_PERF_INFRASTRUCTURE_INIT \ + { \ + NULL, /*createPowerConfigId*/ \ + NULL, /*destroyPowerConfigId*/ \ + NULL, /*setPowerConfig*/ \ + NULL /*setMemoryConfig*/ \ + } + +typedef enum { + QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_PERF = 0, + QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_UNKNOWN = 0x7fffffff +} QnnHtpDevice_InfrastructureType_t; + +typedef struct _QnnDevice_Infrastructure_t { + QnnHtpDevice_InfrastructureType_t infraType; + union UNNAMED { + QnnHtpDevice_PerfInfrastructure_t perfInfra; + }; +} QnnHtpDevice_Infrastructure_t; + +// clang-format on +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpGraph.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpGraph.h new file mode 100755 index 0000000000000..f7e49e9fb8bc3 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpGraph.h @@ -0,0 +1,299 @@ +//============================================================================= +// +// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================= + +/** + * @file + * @brief QNN HTP component Graph API. + * + * The interfaces in this file work with the top level QNN + * API and supplements QnnGraph.h for HTP backend + */ + +#ifndef QNN_HTP_GRAPH_H +#define QNN_HTP_GRAPH_H + +#include "QnnGraph.h" + +#ifdef __cplusplus +extern "C" { +#endif + +//============================================================================= +// Macros +//============================================================================= +/** + * @brief QnnHtpGraph config value macro. Represents to use the maximum + * available number of the resource. + * + * Currently only applicable for QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE. + */ +#define QNN_HTP_GRAPH_CONFIG_OPTION_MAX 0 + +//============================================================================= +// Data Types +//============================================================================= + +/** + * @brief This enum provides different HTP graph optimization + * options that can be used to finalize the graph + * for optimum performance. + */ +typedef enum { + QNN_HTP_GRAPH_OPTIMIZATION_TYPE_SCHEDULE_THRESHOLD = 1, + QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_RETRIES = 2, + QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG = 3, + QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC = 4, + QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC_WEIGHTS = 5, + QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_SPARSE_WEIGHTS_COMPRESSION = 6, + QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_SLC_ALLOCATOR = 7, + QNN_HTP_GRAPH_OPTIMIZATION_TYPE_UNKNOWN = 0x7fffffff +} QnnHtpGraph_OptimizationType_t; + +// clang-format off + +/** + * @brief Struct describing the set of optimization types + * and the values associated with each optimization type. 
+ * + * Below is the Map between QnnHtpGraph_OptimizationType_t and allowable values: + * + * \verbatim embed:rst:leading-asterisk + * +----+--------------------------------------------------------------------+---------------------------------------------------------------------+ + * | # | OptimizationType option | Allowable values | + * +====+====================================================================+=====================================================================+ + * | 1 | QNN_HTP_GRAPH_OPTIMIZATION_TYPE_SCHEDULE_THRESHOLD | Reserved | + * +----+--------------------------------------------------------------------+---------------------------------------------------------------------+ + * | 2 | QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_RETRIES | Reserved | + * +----+--------------------------------------------------------------------+---------------------------------------------------------------------+ + * | 3 | QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG | Defines the optimization strategy used by the HTP backend | + * +----+--------------------------------------------------------------------+---------------------------------------------------------------------+ + * | 4 | QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC | Reserved | + * +----+--------------------------------------------------------------------+---------------------------------------------------------------------+ + * | 5 | QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC_WEIGHTS | Enables DLBC weights compression | + * +----+--------------------------------------------------------------------+---------------------------------------------------------------------+ + * | 6 | QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_SPARSE_WEIGHTS_COMPRESSION | Enables Weight Sparsity Compression | + * +----+--------------------------------------------------------------------+---------------------------------------------------------------------+ + * | 7 | QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_SLC_ALLOCATOR | Enables System Level Cache Allocator usage | + * +----+--------------------------------------------------------------------+---------------------------------------------------------------------+ + * \endverbatim + */ +typedef struct { + QnnHtpGraph_OptimizationType_t type; + float floatValue; +} QnnHtpGraph_OptimizationOption_t; + +/** + * @brief This struct encapsulates all the VTCM configurations for parallel graph execution. + * + * @code + * |<-- (1) 8MB Total Hardware VTCM -->| + * |<-- (2) 7MB Addressable -->| + * +------+------+------+------+------+------+------+------+ + * | CV | | | | | | | | + * +------+------+------+------+------+------+------+------+ + * |<-- (4) Graph A -->|<-- (4) Graph B -->| + * + * A |> 0 MB (3) Graph Offset + * B |-------------------> 3 MB + * @endcode + */ +typedef struct { + /// (4) above, the amount of VTCM used by a graph + uint32_t sizeInBytes; + /// (3) above, where in the addressable region to start VTCM. + /// Note: (3) + (4) <= (2) + uint32_t offsetInBytes; + /// (2) Addressable portion of VTCM. + /// Set to less than hardware size so Graph(s) can coexist with other VTCM clients. + uint32_t sizeTotalInBytes; + + // For ABI compatibility in the future. + // Set to 0 for now. + uint32_t reserved[3]; +} QnnHtpGraph_VtcmConfig_t; + +/** + * @brief This enum defines whether graph concurrency (i.e. multiple graphs running concurrently) + * is possible, and how to behave when circumstances for concurrency aren't possible. 
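+ *
+ *        An illustrative sketch matching the VTCM diagram above (the values are
+ *        placeholders; the parallel-execution struct is declared below):
+ * @code
+ *   QnnHtpGraph_ParallelGraphExecutionConfig_t parallelConfig = {0};
+ *   parallelConfig.concurrency                 = QNN_HTP_GRAPH_CONCURRENCY_OPTION_ALL_SHARED;
+ *   parallelConfig.vtcmConfig.sizeInBytes      = 3 * 1024 * 1024;  // (4) this graph's VTCM share
+ *   parallelConfig.vtcmConfig.offsetInBytes    = 0;                // (3) 0 MB for Graph A
+ *   parallelConfig.vtcmConfig.sizeTotalInBytes = 7 * 1024 * 1024;  // (2) addressable VTCM
+ *   // Graph B would use the same sizeTotalInBytes with offsetInBytes = 3 * 1024 * 1024.
+ * @endcode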
+ */
+typedef enum {
+  /// This graph will not be able to run concurrently with other graphs.
+  QNN_HTP_GRAPH_CONCURRENCY_OPTION_NONE = 0,
+  QNN_HTP_GRAPH_CONCURRENCY_OPTION_DEFAULT = QNN_HTP_GRAPH_CONCURRENCY_OPTION_NONE,
+  /// Graph will try to run concurrently, sharing all resources on the DSP (VTCM, HMX, HVX, etc).
+  QNN_HTP_GRAPH_CONCURRENCY_OPTION_ALL_SHARED = 1,
+  // Unused, present to ensure 32 bits.
+  QNN_HTP_GRAPH_CONCURRENCY_OPTION_UNKNOWN = 0x7fffffff
+} QnnHtpGraph_ConcurrencyOption_t;
+
+/**
+ * @brief This struct encapsulates all the configurations for parallel graph execution.
+ */
+typedef struct {
+  QnnHtpGraph_ConcurrencyOption_t concurrency;
+  QnnHtpGraph_VtcmConfig_t vtcmConfig;
+
+  // For ABI compatibility in the future.
+  // Set to 0 for now.
+  uint32_t reserved[4];
+} QnnHtpGraph_ParallelGraphExecutionConfig_t;
+
+/// The settings in this struct are only applicable
+/// for DSP architectures >= V81.
+/// Use on other SOCs will return an error.
+///
+/// Values will be defaulted to the SOC's TURBO frequency
+/// (SOC as identified by Qnn_DeviceHandle_t).
+///
+/// On automotive SDKs, HMX OP Bounding will be enabled by default.
+///
+/// On non-automotive SDKs, using this setting will enable
+/// HMX OP Bounding. It is off by default.
+typedef struct QnnHtp_HmxBoundingInfo {
+  /// Target HMX freq in Hz.
+  /// Can be derived from sysMonApp (HexagonSDK) or QProfiler.
+  float targetHmxFreqHz;
+  /// Target DSP Core freq in Hz.
+  /// Can be derived from sysMonApp (HexagonSDK) or QProfiler.
+  float targetDspCoreFreq;
+} QnnHtp_HmxBoundingInfo_t;
+
+/// QnnHtpGraph_OptimizationOption_t initializer macro
+#define QNN_HTP_GRAPH_OPTIMIZATION_OPTION_INIT \
+  { \
+    QNN_HTP_GRAPH_OPTIMIZATION_TYPE_UNKNOWN, /*type*/ \
+    0.0f /*floatValue*/ \
+  }
+// clang-format on
+
+/**
+ * @brief This enum provides different HTP graph configuration
+ *        options associated with QnnGraph
+ */
+typedef enum {
+  QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION = 1,
+  QNN_HTP_GRAPH_CONFIG_OPTION_PRECISION = 2,
+  QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE_IN_MB = 3,
+  QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE_IN_MB,
+  QNN_HTP_GRAPH_CONFIG_OPTION_FOLD_RELU_ACTIVATION_INTO_CONV_OFF = 4,
+  QNN_HTP_GRAPH_CONFIG_OPTION_SHORT_DEPTH_CONV_ON_HMX_OFF = 5,
+  QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS = 6,
+  QNN_HTP_GRAPH_CONFIG_OPTION_FINALIZE_CONFIG = 7,
+  QNN_HTP_GRAPH_CONFIG_OPTION_NUM_CORES = 8,
+  QNN_HTP_GRAPH_CONFIG_OPTION_PARALLEL_GRAPH_EXECUTION_CONFIG = 9,
+  QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE_IN_BYTES = 10,
+  QNN_HTP_GRAPH_CONFIG_OPTION_HMX_BOUNDING = 11,
+  QNN_HTP_GRAPH_CONFIG_OPTION_WEIGHTS_PACKING = 12,
+  QNN_HTP_GRAPH_CONFIG_OPTION_ASSUME_SAME_QUANT = 13,
+  QNN_HTP_GRAPH_CONFIG_OPTION_RESERVED = 0x7fff0000,
+  QNN_HTP_GRAPH_CONFIG_OPTION_UNKNOWN = 0x7fffffff
+} QnnHtpGraph_ConfigOption_t;
+
+//=============================================================================
+// Public Functions
+//=============================================================================
+
+//------------------------------------------------------------------------------
+// Implementation Definition
+//------------------------------------------------------------------------------
+
+/**
+ * @brief A struct for different config parameters in a key-value format.
+ */
+typedef struct {
+  const char* key;
+  Qnn_Scalar_t value;
+} QnnHtpGraph_FinalizeConfig_t;
+
+/**
+ * @brief Structure describing the set of configurations supported by graph.
+ *        Objects of this type are to be referenced through QnnGraph_CustomConfig_t.
+ *
+ *        The struct has two fields: option and a union of corresponding config values.
+ *        Based on the option, the corresponding item in the union can be used to specify
+ *        the config.
+ *
+ *        Below is the map between QnnHtpGraph_ConfigOption_t and config value.
+ *
+ * \verbatim embed:rst:leading-asterisk
+ *  +----+-----------------------------------------------------------------------------------+----------------------------------------------+
+ *  | #  | Config Option                                                                     | Configuration Struct/value                   |
+ *  +====+===================================================================================+==============================================+
+ *  | 1  | QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION                                          | QnnHtpGraph_OptimizationOption_t             |
+ *  +----+-----------------------------------------------------------------------------------+----------------------------------------------+
+ *  | 2  | QNN_HTP_GRAPH_CONFIG_OPTION_PRECISION                                             | Qnn_Precision_t                              |
+ *  +----+-----------------------------------------------------------------------------------+----------------------------------------------+
+ *  | 3  | QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE_IN_MB/QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE | uint32_t                                     |
+ *  +----+-----------------------------------------------------------------------------------+----------------------------------------------+
+ *  | 4  | QNN_HTP_GRAPH_CONFIG_OPTION_FOLD_RELU_ACTIVATION_INTO_CONV_OFF                    | bool                                         |
+ *  +----+-----------------------------------------------------------------------------------+----------------------------------------------+
+ *  | 5  | QNN_HTP_GRAPH_CONFIG_OPTION_SHORT_DEPTH_CONV_ON_HMX_OFF                           | bool                                         |
+ *  +----+-----------------------------------------------------------------------------------+----------------------------------------------+
+ *  | 6  | QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS                                       | uint32_t                                     |
+ *  +----+-----------------------------------------------------------------------------------+----------------------------------------------+
+ *  | 7  | QNN_HTP_GRAPH_CONFIG_OPTION_FINALIZE_CONFIG                                       | QnnHtpGraph_FinalizeConfig_t                 |
+ *  +----+-----------------------------------------------------------------------------------+----------------------------------------------+
+ *  | 8  | QNN_HTP_GRAPH_CONFIG_OPTION_NUM_CORES                                             | uint32_t                                     |
+ *  +----+-----------------------------------------------------------------------------------+----------------------------------------------+
+ *  | 9  | QNN_HTP_GRAPH_CONFIG_OPTION_PARALLEL_GRAPH_EXECUTION_CONFIG                       | QnnHtpGraph_ParallelGraphExecutionConfig_t   |
+ *  +----+-----------------------------------------------------------------------------------+----------------------------------------------+
+ *  | 10 | QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE_IN_BYTES                                    | uint32_t                                     |
+ *  +----+-----------------------------------------------------------------------------------+----------------------------------------------+
+ *  | 11 | QNN_HTP_GRAPH_CONFIG_OPTION_HMX_BOUNDING                                          | QnnHtp_HmxBoundingInfo_t                     |
+ *  +----+-----------------------------------------------------------------------------------+----------------------------------------------+
+ *  | 12 | QNN_HTP_GRAPH_CONFIG_OPTION_WEIGHTS_PACKING                                       | bool                                         |
+ *  +----+-----------------------------------------------------------------------------------+----------------------------------------------+
+ *  | 13 | QNN_HTP_GRAPH_CONFIG_OPTION_ASSUME_SAME_QUANT                                     | bool                                         |
+ *  +----+-----------------------------------------------------------------------------------+----------------------------------------------+
+ * \endverbatim
+ *
+ *        Values in the range 0x7fff0000 - 0x7ffffffe (QNN_HTP_GRAPH_CONFIG_OPTION_RESERVED)
+ *        are reserved for internal purposes.
+ *
+ * NOTE: Option #6 (i.e. QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS) can only be
+ *       set prior to the first execution of the graph. Subsequent executions will not use
+ *       an updated value if the user changes it after the first execution.
+ */
+typedef struct {
+  QnnHtpGraph_ConfigOption_t option;
+  union {
+    QnnHtpGraph_OptimizationOption_t optimizationOption;
+    Qnn_Precision_t precision;
+    uint32_t vtcmSizeInMB;
+    bool foldReluActivationIntoConvOff;
+    bool shortDepthConvOnHmxOff;
+    uint64_t numHvxThreads;
+    void* reserved;
+    QnnHtpGraph_FinalizeConfig_t finalizeConfig;
+    uint32_t numCores;
+    QnnHtpGraph_ParallelGraphExecutionConfig_t parallelGraphExecutionConfig;
+    uint32_t vtcmSizeInBytes;
+    QnnHtp_HmxBoundingInfo_t hmxBoundingInfo;
+    bool weightsPacking;
+    bool assumeSameQuant;
+  };
+} QnnHtpGraph_CustomConfig_t;
+
+// clang-format on
+/// QnnHtpGraph_CustomConfig_t initializer macro
+#define QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT \
+  { \
+    QNN_HTP_GRAPH_CONFIG_OPTION_UNKNOWN, /*option*/ \
+    { \
+      QNN_HTP_GRAPH_OPTIMIZATION_OPTION_INIT /*optimizationOption*/ \
+    } \
+  }
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpMem.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpMem.h
new file mode 100755
index 0000000000000..adc9ef2c52504
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpMem.h
@@ -0,0 +1,85 @@
+//==============================================================================
+//
+// Copyright (c) 2022-2023 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef QNN_HTP_MEMORY_INFRASTRUCTURE_2_H
+#define QNN_HTP_MEMORY_INFRASTRUCTURE_2_H
+
+#include "QnnCommon.h"
+
+/**
+ * @file
+ * @brief QNN HTP Memory Infrastructure component API.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//=============================================================================
+// VTCM
+//=============================================================================
+
+// clang-format off
+
+/**
+ * @brief Raw memory address that exists ONLY on the QURT
+ *        side.
+ */
+typedef uint32_t QnnHtpMem_QurtAddress_t;
+
+/**
+ * @brief Configuration for the custom shared buffer memory type.
+ *        This shared buffer is a contiguous chunk of memory identified
+ *        by a single file descriptor which will be used by multiple tensors
+ *        based on the offset provided.
+ *        Each QnnMem_register call with a different offset will return a
+ *        unique memory handle
+ */
+typedef struct {
+  // File descriptor for memory, must be set to QNN_MEM_INVALID_FD if not applicable
+  int32_t fd;
+  // Offset to be used in the contiguous shared buffer
+  uint64_t offset;
+} QnnHtpMem_SharedBufferConfig_t;
+
+// clang-format off
+
+/**
+ * @brief QNN Memory Type
+ */
+typedef enum {
+  QNN_HTP_MEM_QURT = 0,
+  QNN_HTP_MEM_SHARED_BUFFER = 1,
+  QNN_HTP_MEM_UNDEFINED = 0x7FFFFFFF
+} QnnHtpMem_Type_t;
+
+// clang-format off
+
+/**
+ * @brief Descriptor used for the QNN API
+ */
+typedef struct {
+  // Memory type identified by QnnHtpMem_Type_t
+  QnnHtpMem_Type_t type;
+  // Total size of the buffer.
+  // For memory type QURT, it is the size of a tensor.
+  // For memory type SHARED BUFFER, it is the total size of the buffer.
+  uint64_t size;
+
+  union {
+    QnnHtpMem_QurtAddress_t qurtAddress;
+    QnnHtpMem_SharedBufferConfig_t sharedBufferConfig;
+  };
+} QnnMemHtp_Descriptor_t;
+
+// clang-format on
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpPerfInfrastructure.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpPerfInfrastructure.h
new file mode 100755
index 0000000000000..f92317ac94bf2
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpPerfInfrastructure.h
@@ -0,0 +1,511 @@
+//==============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+/** @file
+ * @brief QNN HTP component Performance Infrastructure API
+ *
+ *        Provides an interface for the client to control performance and system
+ *        settings of the QNN HTP Accelerator
+ */
+
+#ifndef QNN_HTP_PERF_INFRASTRUCTURE_H
+#define QNN_HTP_PERF_INFRASTRUCTURE_H
+
+#include "QnnCommon.h"
+#include "QnnTypes.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// max rpc polling time allowed - 9999 us
+#define QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIG_MAX_RPC_POLLING_TIME 9999
+
+//=============================================================================
+// Data Types
+//=============================================================================
+
+/**
+ * @brief QNN HTP PerfInfrastructure API result / error codes.
+ *
+ */
+typedef enum {
+  QNN_HTP_PERF_INFRASTRUCTURE_MIN_ERROR = QNN_MIN_ERROR_PERF_INFRASTRUCTURE,
+  ////////////////////////////////////////////////////////////////////////
+
+  QNN_HTP_PERF_INFRASTRUCTURE_NO_ERROR = QNN_SUCCESS,
+  QNN_HTP_PERF_INFRASTRUCTURE_ERROR_INVALID_HANDLE_PTR = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 0,
+  QNN_HTP_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 1,
+  QNN_HTP_PERF_INFRASTRUCTURE_ERROR_UNSUPPORTED_CONFIG = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 2,
+  QNN_HTP_PERF_INFRASTRUCTURE_ERROR_TRANSPORT = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 3,
+  QNN_HTP_PERF_INFRASTRUCTURE_ERROR_UNSUPPORTED = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 4,
+  QNN_HTP_PERF_INFRASTRUCTURE_ERROR_MEM_ALLOC = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 5,
+  QNN_HTP_PERF_INFRASTRUCTURE_ERROR_FAILED = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 6,
+
+  ////////////////////////////////////////////////////////////////////////
+  QNN_HTP_PERF_INFRASTRUCTURE_MAX_ERROR = QNN_MAX_ERROR_PERF_INFRASTRUCTURE,
+  /// UNDEFINED value that must not be used by client
+  QNN_HTP_PERF_INFRASTRUCTURE_ERROR_UNDEFINED = 0x7fffffff
+} QnnHtpPerfInfrastructure_Error_t;
+
+/**
+ * @brief Allows client to consider (non-zero value) DCVS enable/disable
+ *        and option parameters, otherwise (zero value)
+ *
+ */
+typedef uint32_t QnnHtpPerfInfrastructure_SetDcvsEnable_t;
+
+/**
+ * @brief Allows client to start (non-zero value) or stop (zero value)
+ *        participating in DCVS
+ *
+ */
+typedef uint32_t QnnHtpPerfInfrastructure_DcvsEnable_t;
+
+/**
+ * @brief Allows client to consider (non-zero value) latency parameter,
+ *        otherwise (zero value)
+ *
+ */
+typedef uint32_t QnnHtpPerfInfrastructure_SetSleepLatency_t;
+
+/**
+ * @brief Allows client to set up the sleep latency in microseconds
+ *
+ */
+typedef uint32_t QnnHtpPerfInfrastructure_SleepLatency_t;
+
+/**
+ * @brief Allows client to consider (non-zero value) sleep disable
+ *        parameter, otherwise (zero value)
+ *
+ */
+typedef uint32_t QnnHtpPerfInfrastructure_SetSleepDisable_t;
+
+/**
+ * @brief Allows client to disable sleep or low power modes.
+ *        Pass a non-zero value to disable sleep in HTP
+ *
+ */
+typedef uint32_t QnnHtpPerfInfrastructure_SleepDisable_t;
+
+/**
+ * @brief Allows client to consider (non-zero value) bus clock
+ *        params, otherwise (zero value)
+ *
+ */
+typedef uint32_t QnnHtpPerfInfrastructure_SetBusParams_t;
+
+/**
+ * @brief Allows client to consider (non-zero value) core clock
+ *        params, otherwise (zero value)
+ *
+ */
+typedef uint32_t QnnHtpPerfInfrastructure_SetCoreParams_t;
+
+/**
+ * @brief Allows client to set up the RPC control latency in microseconds
+ *
+ */
+typedef uint32_t QnnHtpPerfInfrastructure_RpcControlLatency_t;
+
+/**
+ * @brief Allows client to set up the RPC polling time in microseconds
+ */
+typedef uint32_t QnnHtpPerfInfrastructure_RpcPollingTime_t;
+
+/**
+ * @brief Allows client to set up the adaptive polling time in microseconds
+ */
+typedef uint32_t QnnHtpPerfInfrastructure_AdaptivePollingTime_t;
+
+/**
+ * @brief Allows client to set up the HMX timeout interval in microseconds
+ */
+typedef uint32_t QnnHtpPerfInfrastructure_HmxTimeoutIntervalUs_t;
+
+/**
+ * @brief Sets the minimum size by which the user heap should grow
+ *        when the heap is exhausted. This API is expected to be
+ *        called only once per backend and has a process-wide impact.
+ *
+ *        The grow size is provided in bytes and defaults to 16 MB
+ */
+typedef uint32_t QnnHtpPerfInfrastructure_MemGrowSize_t;
+
+/**
+ * @brief Allows client to set default values for the HMX frequency.
+ *        If set to 1, the HMX vote will scale with the DCVS corner; if 0,
+ *        the HMX vote needs to be specified manually.
+ *
+ */
+typedef uint32_t QnnHtpPerfInfrastructure_HmxDefault_Vote_t;
+
+/**
+ * @brief Perf modes to specify the clock frequency level within the
+ *        target voltage corner; currently applies only to the HMX config.
+ */
+typedef enum {
+  // To select max frequency at target voltage corner.
+  QNN_HTP_PERF_INFRASTRUCTURE_CLK_PERF_HIGH = 0,
+  // To select min frequency at target voltage corner.
+  QNN_HTP_PERF_INFRASTRUCTURE_CLK_PERF_LOW,
+  /// UNKNOWN value that must not be used by client
+  QNN_HTP_PERF_INFRASTRUCTURE_CLK_PERF_UNKNOWN = 0x7fffffff
+} QnnHtpPerfInfrastructure_ClkPerfMode_t;
+
+/**
+ * @brief These are the different voltage corners that can
+ *        be requested by the client to influence the voting scheme
+ *        for DCVS
+ *
+ */
+typedef enum {
+  /// Maps to HAP_DCVS_VCORNER_DISABLE.
+  /// Disable setting up voltage corner
+  DCVS_VOLTAGE_CORNER_DISABLE = 0x10,
+  /// Maps to HAP_DCVS_VCORNER_SVS2.
+  /// Set voltage corner to minimum value supported on platform
+  DCVS_VOLTAGE_VCORNER_MIN_VOLTAGE_CORNER = 0x20,
+  /// Maps to HAP_DCVS_VCORNER_SVS2.
+  /// Set voltage corner to SVS2 value for the platform
+  DCVS_VOLTAGE_VCORNER_SVS2 = 0x30,
+  /// Maps to HAP_DCVS_VCORNER_SVS.
+  /// Set voltage corner to SVS value for the platform
+  DCVS_VOLTAGE_VCORNER_SVS = 0x40,
+  /// Maps to HAP_DCVS_VCORNER_SVS_PLUS.
+  /// Set voltage corner to SVS_PLUS value for the platform
+  DCVS_VOLTAGE_VCORNER_SVS_PLUS = 0x50,
+  /// Maps to HAP_DCVS_VCORNER_NOM.
+  /// Set voltage corner to NOMINAL value for the platform
+  DCVS_VOLTAGE_VCORNER_NOM = 0x60,
+  /// Maps to HAP_DCVS_VCORNER_NOM_PLUS.
+  /// Set voltage corner to NOMINAL_PLUS value for the platform
+  DCVS_VOLTAGE_VCORNER_NOM_PLUS = 0x70,
+  /// Maps to HAP_DCVS_VCORNER_TURBO.
+  /// Set voltage corner to TURBO value for the platform
+  DCVS_VOLTAGE_VCORNER_TURBO = 0x80,
+  /// Maps to HAP_DCVS_VCORNER_TURBO_PLUS.
+  /// Set voltage corner to TURBO_PLUS value for the platform
+  DCVS_VOLTAGE_VCORNER_TURBO_PLUS = 0x90,
+  /// Maps to HAP_DCVS_VCORNER_TURBO_L2.
+  /// Set voltage corner to TURBO_L2 value for the platform
+  DCVS_VOLTAGE_VCORNER_TURBO_L2 = 0x92,
+  /// Maps to HAP_DCVS_VCORNER_TURBO_L3.
+  /// Set voltage corner to TURBO_L3 value for the platform
+  DCVS_VOLTAGE_VCORNER_TURBO_L3 = 0x93,
+  /// Maps to HAP_DCVS_VCORNER_MAX.
+  /// Set voltage corner to maximum value supported on the platform
+  DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER = 0xA0,
+  /// UNKNOWN value that must not be used by client
+  DCVS_VOLTAGE_VCORNER_UNKNOWN = 0x7fffffff
+} QnnHtpPerfInfrastructure_VoltageCorner_t;
+
+/**
+ * @brief These are the expanded voltage corners that can
+ *        be requested by the client to influence the voting scheme
+ *        for DCVS
+ *
+ */
+typedef enum {
+  /// Maps to HAP_DCVS_EXP_VCORNER_DISABLE.
+  /// Disable setting up voltage corner
+  DCVS_EXP_VCORNER_DISABLE = 0,
+  /// Maps to HAP_DCVS_EXP_VCORNER_MIN.
+  /// Set voltage corner to minimum value supported on platform
+  DCVS_EXP_VCORNER_MIN = 0x100,
+  /// Maps to HAP_DCVS_EXP_VCORNER_LOW_SVS_D2.
+  /// Set voltage corner to LOWSVS_D2 value for the platform
+  DCVS_EXP_VCORNER_LOW_SVS_D2 = 0x134,
+  /// Maps to HAP_DCVS_EXP_VCORNER_LOW_SVS_D1.
+ /// Set voltage corner to LOWSVS_D1 value for the platform + DCVS_EXP_VCORNER_LOW_SVS_D1 = 0x138, + /// Maps to HAP_DCVS_EXP_VCORNER_LOW_SVS. + /// Set voltage corner to LOWSVS value for the platform + DCVS_EXP_VCORNER_LOW_SVS = 0x140, + /// Maps to HAP_DCVS_EXP_VCORNER_SVS. + /// Set voltage corner to SVS value for the platform + DCVS_EXP_VCORNER_SVS = 0x180, + /// Maps to HAP_DCVS_EXP_VCORNER_SVS_L1. + /// Set voltage corner to SVS_L1 value for the platform + DCVS_EXP_VCORNER_SVS_L1 = 0x1C0, + /// Maps to HAP_DCVS_EXP_VCORNER_NOM. + /// Set voltage corner to NOM value for the platform + DCVS_EXP_VCORNER_NOM = 0x200, + /// Maps to HAP_DCVS_EXP_VCORNER_NOM_L1. + /// Set voltage corner to NOM_L1 value for the platform + DCVS_EXP_VCORNER_NOM_L1 = 0x240, + /// Maps to HAP_DCVS_EXP_VCORNER_TUR. + /// Set voltage corner to TURBO value for the platform + DCVS_EXP_VCORNER_TUR = 0x280, + /// Maps to HAP_DCVS_EXP_VCORNER_TUR_L1. + /// Set voltage corner to TURBO_L1 value for the platform + DCVS_EXP_VCORNER_TUR_L1 = 0x2A0, + /// Maps to HAP_DCVS_EXP_VCORNER_TUR_L2. + /// Set voltage corner to TURBO_L2 value for the platform + DCVS_EXP_VCORNER_TUR_L2 = 0x2B0, + /// Maps to HAP_DCVS_EXP_VCORNER_TUR_L3. + /// Set voltage corner to TURBO_L3 value for the platform + DCVS_EXP_VCORNER_TUR_L3 = 0x2C0, + /// Maps to HAP_DCVS_EXP_VCORNER_MAX. + /// Selects the maximum voltage corner defined for the chipset + DCVS_EXP_VCORNER_MAX = 0xFFFF, + /// UNKNOWN value that must not be used by client + DCVS_EXP_VCORNER_UNKNOWN = 0x7fffffff +} QnnHtpPerfInfrastructure_ExpVoltageCorner_t; + +/** + * @brief This enum defines all the possible power mode + * that a client can set to influence DCVS mode + */ +typedef enum { + /// Maps to HAP_DCVS_V2_ADJUST_UP_DOWN. + /// Allows for DCVS to adjust up and down + QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_ADJUST_UP_DOWN = 0x1, + /// Maps to HAP_DCVS_V2_ADJUST_ONLY_UP. + /// Allows for DCVS to adjust up only + QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_ADJUST_ONLY_UP = 0x2, + /// Maps to HAP_DCVS_V2_POWER_SAVER_MODE. + /// Higher thresholds for power efficiency + QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_POWER_SAVER_MODE = 0x4, + /// Maps to HAP_DCVS_V2_POWER_SAVER_AGGRESSIVE_MODE. + /// Higher thresholds for power efficiency with faster ramp down + QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_POWER_SAVER_AGGRESSIVE_MODE = 0x8, + /// Maps to HAP_DCVS_V2_PERFORMANCE_MODE. + /// Lower thresholds for maximum performance + QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE = 0x10, + /// Maps to HAP_DCVS_V2_DUTY_CYCLE_MODE. + /// The below value applies only for HVX clients: + /// - For streaming class clients: + /// - detects periodicity based on HVX usage + /// - lowers clocks in the no HVX activity region of each period. + /// - For compute class clients: + /// - Lowers clocks on no HVX activity detects and brings clocks up on detecting HVX activity + /// again. + /// - Latency involved in bringing up the clock will be at max 1 to 2 ms. 
+  QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_DUTY_CYCLE_MODE = 0x20,
+  /// UNKNOWN value that must not be used by client
+  QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_UNKNOWN = 0x7fffffff
+} QnnHtpPerfInfrastructure_PowerMode_t;
+
+/**
+ * @brief This struct provides the performance infrastructure configuration
+ * associated with setting up DCVS v3, which allows the bus and core
+ * operating corners to be selected separately
+ */
+typedef struct {
+  uint32_t contextId;
+  QnnHtpPerfInfrastructure_SetDcvsEnable_t setDcvsEnable;
+  QnnHtpPerfInfrastructure_DcvsEnable_t dcvsEnable;
+  QnnHtpPerfInfrastructure_PowerMode_t powerMode;
+  QnnHtpPerfInfrastructure_SetSleepLatency_t setSleepLatency;
+  QnnHtpPerfInfrastructure_SleepLatency_t sleepLatency;
+  QnnHtpPerfInfrastructure_SetSleepDisable_t setSleepDisable;
+  QnnHtpPerfInfrastructure_SleepDisable_t sleepDisable;
+  QnnHtpPerfInfrastructure_SetBusParams_t setBusParams;
+  QnnHtpPerfInfrastructure_VoltageCorner_t busVoltageCornerMin;
+  QnnHtpPerfInfrastructure_VoltageCorner_t busVoltageCornerTarget;
+  QnnHtpPerfInfrastructure_VoltageCorner_t busVoltageCornerMax;
+  QnnHtpPerfInfrastructure_SetCoreParams_t setCoreParams;
+  QnnHtpPerfInfrastructure_VoltageCorner_t coreVoltageCornerMin;
+  QnnHtpPerfInfrastructure_VoltageCorner_t coreVoltageCornerTarget;
+  QnnHtpPerfInfrastructure_VoltageCorner_t coreVoltageCornerMax;
+} QnnHtpPerfInfrastructure_DcvsV3_t;
+
+/**
+ * @brief This struct provides the performance infrastructure configuration
+ * associated with setting up HMX v2, which allows the HMX corner to be
+ * selected separately. If hmxPickDefault is 1, all voltage corner
+ * params will be ignored. Ensure the same contextId is used as for the
+ * DCVS vote.
+ */
+typedef struct {
+  QnnHtpPerfInfrastructure_HmxDefault_Vote_t hmxPickDefault;
+  QnnHtpPerfInfrastructure_ExpVoltageCorner_t hmxVoltageCornerMin;
+  QnnHtpPerfInfrastructure_ExpVoltageCorner_t hmxVoltageCornerTarget;
+  QnnHtpPerfInfrastructure_ExpVoltageCorner_t hmxVoltageCornerMax;
+  QnnHtpPerfInfrastructure_ClkPerfMode_t hmxPerfMode;
+} QnnHtpPerfInfrastructure_HmxV2_t;
+
+/**
+ * @brief This enum defines all the possible performance
+ * options in the HTP Performance Infrastructure that
+ * relate to setting up power levels
+ */
+typedef enum {
+  /// config enum implies the usage of Dcvs v3
+  QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3 = 1,
+  /// config enum implies the usage of rpcControlLatencyConfig struct
+  QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY = 2,
+  /// config enum implies the usage of rpcPollingTimeConfig struct
+  /// this config is only supported on V69 and later
+  /// if enabled, this config is applied to the entire process
+  /// max allowed is QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIG_MAX_RPC_POLLING_TIME us
+  QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME = 3,
+  /// config HMX timeout interval in us. The HMX is turned off after the set
+  /// interval if there has been no interaction with it once an inference finishes.
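+  /// The interval value is supplied via the hmxTimeoutIntervalUsConfig union member below.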
+ QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_HMX_TIMEOUT_INTERVAL_US = 4, + /// config HMX V2 voting parameters only on supported chips + QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_HMX_V2 = 5, + /// config enum implies the usage of adaptivePollingTime struct + /// this config can only be enabled in the RPC polling mode + /// if enabled, this config is applied to the entire process + QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_ADAPTIVE_POLLING_TIME = 6, + /// UNKNOWN config option which must not be used + QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_UNKNOWN = 0x7fffffff +} QnnHtpPerfInfrastructure_PowerConfigOption_t; + +/** + * @brief This struct provides performance infrastructure configuration + * associated with setting up of power levels + */ +typedef struct { + QnnHtpPerfInfrastructure_PowerConfigOption_t option; + union UNNAMED { + QnnHtpPerfInfrastructure_DcvsV3_t dcvsV3Config; + QnnHtpPerfInfrastructure_RpcControlLatency_t rpcControlLatencyConfig; + QnnHtpPerfInfrastructure_RpcPollingTime_t rpcPollingTimeConfig; + QnnHtpPerfInfrastructure_HmxTimeoutIntervalUs_t hmxTimeoutIntervalUsConfig; + QnnHtpPerfInfrastructure_HmxV2_t hmxV2Config; + QnnHtpPerfInfrastructure_AdaptivePollingTime_t adaptivePollingTimeConfig; + }; +} QnnHtpPerfInfrastructure_PowerConfig_t; + +/// QnnHtpPerfInfrastructure_PowerConfig_t initializer macro +#define QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIG_INIT \ + { \ + QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_UNKNOWN, /*config*/ \ + { \ + 0 /*dcvsV3Config*/ \ + } \ + } + +/** + * @brief This enum defines all the possible performance + * options in Htp Performance Infrastructure that + * relate to system memory settings + */ +typedef enum { + /// sets memory grow size + QNN_HTP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_GROW_SIZE = 1, + /// UNKNOWN config option that must not be used + QNN_HTP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_UNKNOWN = 0x7fffffff +} QnnHtpPerfInfrastructure_MemoryConfigOption_t; + +/** + * @brief Provides performance infrastructure configuration + * options that are memory specific + */ +typedef struct { + QnnHtpPerfInfrastructure_MemoryConfigOption_t option; + union UNNAMED { + QnnHtpPerfInfrastructure_MemGrowSize_t memGrowSizeConfig; + }; +} QnnHtpPerfInfrastructure_MemoryConfig_t; + +/// QnnHtpPerfInfrastructure_MemoryConfig_t initializer macro +#define QNN_HTP_PERF_INFRASTRUCTURE_MEMORY_CONFIG_INIT \ + { \ + QNN_HTP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_UNKNOWN, /*config*/ \ + { \ + 0 /*memGrowSizeConfig*/ \ + } \ + } + +//============================================================================= +// API Methods +//============================================================================= + +/** + * @brief This API allows client to create power configuration id that + * has to be used to set different performance modes. + * Power configuration id has to be destroyed by client when not needed. + * + * @param[in] deviceId Hardware Device on which this config id needs to be created. + * + * @param[in] coreId Core/NSP on which this config id needs to be created. + * + * @param[out] powerConfigId Pointer to power configuration id to be created. 
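+ *
+ * A minimal usage sketch (hedged): createPowerConfigIdFn is an illustrative
+ * variable assumed to hold this function pointer, and device/core ids of 0
+ * are placeholders:
+ * @code
+ * uint32_t powerConfigId = 0;
+ * Qnn_ErrorHandle_t err = createPowerConfigIdFn(0, 0, &powerConfigId);
+ * if (err == QNN_SUCCESS) {
+ *     // powerConfigId can now be passed to the set-power-config call below,
+ *     // and must eventually be released via the destroy call.
+ * }
+ * @endcode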
+ *
+ * @return Error code
+ * \n QNN_SUCCESS: No error encountered
+ * \n QNN_HTP_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT if deviceId/coreId
+ * or power configuration id is NULL
+ */
+typedef Qnn_ErrorHandle_t (*QnnHtpPerfInfrastructure_CreatePowerConfigIdFn_t)(
+    uint32_t deviceId, uint32_t coreId, uint32_t* powerConfigId);
+
+/**
+ * @brief This API allows the client to destroy a power configuration id.
+ *
+ * @param[in] powerConfigId A power configuration id to be destroyed.
+ *
+ * @return Error code
+ * \n QNN_SUCCESS: No error encountered
+ * \n QNN_HTP_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT if power configuration
+ * id does not exist
+ * \n QNN_COMMON_ERROR_SYSTEM_COMMUNICATION: SSR occurrence (successful recovery)
+ * \n QNN_COMMON_ERROR_SYSTEM_COMMUNICATION_FATAL: SSR occurrence (unsuccessful recovery)
+ */
+typedef Qnn_ErrorHandle_t (*QnnHtpPerfInfrastructure_DestroyPowerConfigIdFn_t)(
+    uint32_t powerConfigId);
+
+/**
+ * @brief This API allows the client to set up a system power configuration that
+ * will enable different performance modes. This API uses the
+ * HAP_power_dcvs_v3_payload struct to configure HAP power parameters.
+ * For a detailed description of the HAP power parameters, refer to the
+ * Hexagon SDK HAP_power_dcvs_v3_payload documentation.
+ *
+ * @param[in] powerConfigId A power client id to associate calls to system
+ * power settings. A value of 0 implies a NULL power client id
+ * and can override every other setting in the user process. To
+ * enable power settings for multiple clients in the same
+ * process, use a non-zero power client id.
+ *
+ * @param[in] config Pointer to a NULL terminated array
+ * of config options for performance configuration.
+ * NULL is allowed and indicates no config options are provided.
+ *
+ * @return Error code
+ * \n QNN_SUCCESS: No error encountered
+ * \n QNN_HTP_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT if power configuration
+ * does not exist
+ * \n QNN_COMMON_ERROR_SYSTEM_COMMUNICATION: SSR occurrence (successful recovery)
+ * \n QNN_COMMON_ERROR_SYSTEM_COMMUNICATION_FATAL: SSR occurrence (unsuccessful recovery)
+ */
+typedef Qnn_ErrorHandle_t (*QnnHtpPerfInfrastructure_SetPowerConfigFn_t)(
+    uint32_t powerConfigId, const QnnHtpPerfInfrastructure_PowerConfig_t** config);
+
+/**
+ * @brief This API allows clients to set up a configuration associated with
+ * system memory on a specific device.
+ *
+ * @param[in] deviceId Hardware Device on which this config needs to be applied.
+ *
+ * @param[in] coreId Core/NSP on which this config needs to be applied.
+ *
+ * @param[in] config Pointer to a NULL terminated array
+ * of config options for system memory configuration.
+ * NULL is allowed and indicates no config options are provided.
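+ *
+ * A hedged usage sketch: setMemoryConfigFn is an illustrative variable assumed
+ * to hold this function pointer, and the 32 MB grow size is an arbitrary example:
+ * @code
+ * QnnHtpPerfInfrastructure_MemoryConfig_t memCfg = QNN_HTP_PERF_INFRASTRUCTURE_MEMORY_CONFIG_INIT;
+ * memCfg.option = QNN_HTP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_GROW_SIZE;
+ * memCfg.memGrowSizeConfig = 32u * 1024u * 1024u; // grow size in bytes
+ * const QnnHtpPerfInfrastructure_MemoryConfig_t* memConfigs[] = {&memCfg, NULL};
+ * Qnn_ErrorHandle_t err = setMemoryConfigFn(deviceId, coreId, memConfigs);
+ * @endcode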
+ *
+ * @return Error code
+ * \n QNN_SUCCESS: No error encountered
+ * \n QNN_HTP_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT if deviceId/coreId
+ * or memory configuration does not exist
+ * \n QNN_COMMON_ERROR_SYSTEM_COMMUNICATION: SSR occurrence (successful recovery)
+ * \n QNN_COMMON_ERROR_SYSTEM_COMMUNICATION_FATAL: SSR occurrence (unsuccessful recovery)
+ */
+typedef Qnn_ErrorHandle_t (*QnnHtpPerfInfrastructure_SetMemoryConfigFn_t)(
+    uint32_t deviceId, uint32_t coreId, const QnnHtpPerfInfrastructure_MemoryConfig_t** config);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // QNN_HTP_PERF_INFRASTRUCTURE_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpProfile.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpProfile.h
new file mode 100755
index 0000000000000..92381d17b0440
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpProfile.h
@@ -0,0 +1,567 @@
+//==============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+/**
+ * @file
+ * @brief QNN HTP Profile component API.
+ *
+ * Requires HTP backend to be initialized.
+ * Should be used with the QnnProfile API but has HTP backend
+ * specific definitions for different QnnProfile data structures
+ *
+ */
+
+#ifndef QNN_HTP_PROFILE_H
+#define QNN_HTP_PROFILE_H
+
+#include "QnnProfile.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//=============================================================================
+// Macros
+//=============================================================================
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to the remote procedure call on the ARM processor
+ * when client invokes QnnContext_createFromBinary. The value
+ * returned is time in microseconds.
+ *
+ * @note context load binary host rpc time maybe available on both
+ * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTP_PROFILE_EVENTTYPE_CONTEXT_LOAD_BIN_HOST_RPC_TIME_MICROSEC 1002
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to the remote procedure call on the HTP processor
+ * when client invokes QnnContext_createFromBinary. The value
+ * returned is time in microseconds.
+ *
+ * @note context load binary htp rpc time maybe available on both
+ * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTP_PROFILE_EVENTTYPE_CONTEXT_LOAD_BIN_HTP_RPC_TIME_MICROSEC 1003
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to the time taken to create the context on the
+ * accelerator when client invokes QnnContext_createFromBinary.
+ * The value returned is time in microseconds.
+ *
+ * @note context load binary accelerator time maybe available on both
+ * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTP_PROFILE_EVENTTYPE_CONTEXT_LOAD_BIN_ACCEL_TIME_MICROSEC 1004
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to the remote procedure call on the ARM processor
+ * when client invokes QnnGraph_finalize.
+ * The value returned is time in microseconds.
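+ * (Its HTP-side counterpart, event 2002, is defined below.)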
+ * + * @note graph finalize host rpc time maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_HOST_RPC_TIME_MICROSEC 2001 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the remote procedure call on the HTP processor + * when client invokes QnnGraph_finalize. + * The value returned is time in microseconds. + * + * @note graph finalize htp rpc time maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_HTP_RPC_TIME_MICROSEC 2002 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to finalize the graph on the accelerator + * when client invokes QnnGraph_finalize. + * The value returned is time in microseconds. + * + * @note graph finalize accelerator time maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_ACCEL_TIME_MICROSEC 2003 + +/* Graph Performance Estimate Support + * + **/ +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to Performance Estimates for the graph + * when client invokes QnnGraph_finalize. + * This is just a dummy event which will print only the heading + * with no value or unit. + * @note HTP Performance Estimates maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE 2004 + +/** + * @brief QnnProfile_EventType_t definition to get perf mode at which + * the perf estimates are collected during QnnGraph_finalize. + * The value returned is the perf mode in string with no unit. + * + * @note Perf mode maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_MODE 2005 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to simulated execution cycles during + * QnnGraph_finalize. + * The value returned is number of cycles. + * + * @note Simulated execution cycles maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_SIM_EXEC_CYCLES 2006 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to a lower estimate of simulated execution + * cycles during QnnGraph_finalize. + * The value returned is number of cycles. + * + * @note Simulated execution cycles lower estimate maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_SIM_EXEC_LOWER_CYCLES 2007 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to a upper estimate of simulated execution + * cycles during QnnGraph_finalize. + * The value returned is number of cycles. 
+ * + * @note Simulated execution cycles upper estimate maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_SIM_EXEC_UPPER_CYCLES 2008 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to DDR information for each HTP during + * QnnGraph_finalize. + * This is just a dummy event which will print only the heading + * with no value or unit. + * + * @note DDR Information for each HTP maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_BANDWIDTH_STATS 2009 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the HTP ID on chip during QnnGraph_finalize. + * The value returned is the HTP ID with no unit. + * + * @note HTP ID's maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_BANDWIDTH_STATS_HTP_ID 2010 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the Graph defined inputs or the total reads + * (in bytes) from DDR for graph input related tensors (weights, + * bias, activations) which do not have predecessors. + * The value returned is the num of blocks in bytes. + * + * @note Graph defined inputs for each HTP maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_INPUT_FILL 2011 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the total reads (in bytes) from DDR for + * compiler generated fill operators which have predecessors and + * successors and originate on the same HTP. + * The value returned is the num of blocks in bytes. + * + * @note Intermediate Fill Information for each HTP maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_INTERMEDIATE_FILL 2012 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the total writes (in bytes) from DDR for + * compiler generated fill operators which have predecessors and + * successors and originate on the same HTP. + * The value returned is the num of blocks in bytes. + * + * @note Intermediate Spill Information for each HTP maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_INTERMEDIATE_SPILL 2013 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the total reads (in bytes) from DDR for + * fills which were generated by a different HTP core and do not + * have a predecessor, but have a successor. + * The value returned is the num of blocks in bytes. 
+ * + * @note Inter HTP Fill Information for each HTP maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_INTER_HTP_FILL 2014 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the total writes (in bytes) from DDR for + * fills which were generated by a different HTP core and do not + * have a successor, but have a predecessor. + * The value returned is the num of blocks in bytes. + * + * @note Inter HTP Spill Information for each HTP maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_INTER_HTP_SPILL 2015 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the total writes (in bytes) to DDR for + * graph output related tensors which do not have successors. + * The value returned is the num of blocks in bytes. + * + * @note Graph output related tensors for each HTP maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_OUTPUT_SPILL 2016 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the total number of missing ops which do + * not have any cost associated with them while getting the graph + * performance estimates. + * The value returned is the num of missing ops with no unit. + * + * @note Number of missing cost ops maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_MISSING_COST_OPS 2017 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the op ids of the missing ops which do + * not have any cost associated with them while getting the graph + * performance estimates. + * The value returned is the opname along with the op id (decimal + * format) of the ops which does not have any costs associated + * with them. + * + * @note Opname and Op ids of missing cost ops are available only with + * QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_MISSING_COST_OPID 2018 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the remote procedure call on the ARM processor + * when client invokes QnnGraph_execute or QnnGraph_executeAsync. + * The value returned is time in microseconds. + * + * @note graph execute host rpc time maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_HOST_RPC_TIME_MICROSEC 3001 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the remote procedure call on the HTP processor + * when client invokes QnnGraph_execute or QnnGraph_executeAsync. + * The value returned is time in microseconds. + * + * @note graph execute htp rpc time maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_HTP_RPC_TIME_MICROSEC 3002 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to execute the graph on the accelerator + * when client invokes QnnGraph_execute or QnnGraph_executeAsync. 
+ * The value returned is number of processor cycles taken. + * + * @note graph execute accelerator time maybe available only on + * QNN_PROFILE_LEVEL_DETAILED levels + * + * @note When QNN_PROFILE_LEVEL_DETAILED is used, this event can have + * multiple sub-events of type QNN_PROFILE_EVENTTYPE_NODE. + * There will be a sub-event for each node that was added to the graph + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_ACCEL_TIME_CYCLE 3003 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to execute the graph on the accelerator + * when client invokes QnnGraph_execute or QnnGraph_executeAsync. + * The value indicates execute including wait/resource acquisition + * time on the accelerator, if applicable in multi-threaded scenarios. + * The value returned is time taken in microseconds + * + * @note graph execute accelerator time maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + * + * @note When QNN_PROFILE_LEVEL_DETAILED is used, this event can have + * multiple sub-events of type QNN_PROFILE_EVENTTYPE_NODE / QNN_PROFILE_EVENTUNIT_MICROSEC + * There will be a sub-event for each node that was added to the graph + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_ACCEL_TIME_MICROSEC 3004 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to time taken for miscellaneous work i.e. time + * that cannot be attributed to a node but are still needed to + * execute the graph on the accelerator. This occurs when client invokes + * QnnGraph_execute or QnnGraph_executeAsync. + * The value returned is time taken in microseconds + * + * @note graph execute misc accelerator time is available only on + * QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_MISC_ACCEL_TIME_MICROSEC 3005 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to time taken for a graph yield instance to + * release all its resources to the other graph. + * The value returned is time taken in microseconds. + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_YIELD_INSTANCE_RELEASE_TIME 3006 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to time a graph spends waiting for a higher + * priority graph to finish execution. + * The value returned is time taken in microseconds + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_YIELD_INSTANCE_WAIT_TIME 3007 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to time a graph spends re-acquiring resources + * and restoring vtcm. + * The value returned is time taken in microseconds + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_YIELD_INSTANCE_RESTORE_TIME 3008 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the number of times that a yield occured + * during execution + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_YIELD_COUNT 3009 + +/** + * @brief QnnProfile_EventType_t definition for time a graph waits to get + * VTCM. This should be constant UNLESS we need another graph to yield. + * The value returned is time taken in microseconds. + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_VTCM_ACQUIRE_TIME 3010 + +/** + * @brief QnnProfile_EventType_t definition for time a graph waits to get + * HMX + HVX, and turn them all on. + * The value returned is time taken in microseconds. 
+ */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_RESOURCE_POWER_UP_TIME 3011 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to execute the graph on the accelerator + * when client invokes QnnGraph_execute or QnnGraph_executeAsync. + * The value indicates execute excluding wait/resource acquisition + * time on the accelerator, if applicable in multi-threaded scenarios. + * The value returned is time taken in microseconds + * + * @note graph execute accelerator time maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + * + * @note When QNN_PROFILE_LEVEL_DETAILED is used, this event can have + * multiple sub-events of type QNN_PROFILE_EVENTTYPE_NODE / QNN_PROFILE_EVENTUNIT_MICROSEC + * There will be a sub-event for each node that was added to the graph + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_ACCEL_EXCL_WAIT_TIME_MICROSEC 3012 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the remote procedure call on the ARM processor + * when client invokes QnnContext_free which in consequence deinit graph. + * The value returned is time in microseconds. + * + * @note graph deinit host rpc time maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_DEINIT_HOST_RPC_TIME_MICROSEC 4001 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the remote procedure call on the HTP processor + * when client invokes QnnContext_free which in consequence deinit graph. + * The value returned is time in microseconds. + * + * @note graph deinit htp rpc time maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_DEINIT_HTP_RPC_TIME_MICROSEC 4002 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the time taken to deinit graph on the + * accelerator when client invokes QnnContext_free which in consequence + * deinit graph. The value returned is time in microseconds. + * + * @note graph deinit accelerator time maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_DEINIT_ACCEL_TIME_MICROSEC 4003 + +/** + * @brief QnnProfile_EventType_t definition to get data related to execution of + * an operation. This value represents the amount of time an op spends + * waiting for execution on the main thread since the last op on the main + * thread due to scheduling and can be interpreted appropriately in + * conjunction with the unit. + * + * @note node wait information is available on QNN_HTP_PROFILE_LEVEL_LINTING level + */ +#define QNN_HTP_PROFILE_EVENTTYPE_NODE_WAIT 5001 + +/** + * @brief QnnProfile_EventType_t definition to get data related to execution of + * an operation. This value represents the amount of time at least one + * background op is running during the execution of an op on the main thread + * and can be interpreted appropriately in conjunction with the unit. + * + * @note node overlap information is available on QNN_HTP_PROFILE_LEVEL_LINTING level + */ +#define QNN_HTP_PROFILE_EVENTTYPE_NODE_OVERLAP 5002 + +/** + * @brief QnnProfile_EventType_t definition to get data related to execution of + * an operation. 
This value represents the amount of time at least one + * background op that is not being waited upon to finish is running during + * the wait period of an op on the main thread and can be interpreted + * appropriately in conjunction with the unit. + * + * @note node wait overlap information is available on QNN_HTP_PROFILE_LEVEL_LINTING + * level + */ +#define QNN_HTP_PROFILE_EVENTTYPE_NODE_WAIT_OVERLAP 5003 + +/** + * @brief QnnProfile_EventType_t definition to get data related to execution of + * an operation. This value represents a bitmask denoting the resources + * an op uses. + * + * @note node specific information is available on QNN_HTP_PROFILE_LEVEL_LINTING level + */ +#define QNN_HTP_PROFILE_EVENTTYPE_NODE_RESOURCEMASK 5004 + +/** + * @brief QnnProfile_EventType_t definition to get data related to execution of + * an operation. This value represents the ID of an op running in parallel to + * an op running on the main thread or on HMX. + * + * @note node specific information is available on QNN_HTP_PROFILE_LEVEL_LINTING level + */ +#define QNN_HTP_PROFILE_EVENTTYPE_NODE_CRITICAL_BG_OP_ID 5005 + +/** + * @brief QnnProfile_EventType_t definition to get data related to execution of + * an operation. This value represents the ID of an op running on threads other + * than the main or the HMX thread when the main and the HMX threads are not + * executing any op. + * + * @note node specific information is available on QNN_HTP_PROFILE_LEVEL_LINTING level + */ +#define QNN_HTP_PROFILE_EVENTTYPE_NODE_WAIT_BG_OP_ID 5006 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to execute the graph's critical path on the accelerator + * when client invokes QnnGraph_execute or QnnGraph_executeAsync. + * The value returned is number of processor cycles taken. + * + * @note graph execute accelerator time maybe available only on + * QNN_HTP_PROFILE_LEVEL_LINTING levels + * + * @note When QNN_HTP_PROFILE_LEVEL_LINTING is used, this event can have + * multiple sub-events of type QNN_PROFILE_EVENTTYPE_NODE. + * There will be a sub-event for each node that was added to the graph + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_CRITICAL_ACCEL_TIME_CYCLE 6001 + +/** + * @brief Linting QnnProfile_Level_t definition that allows collecting in-depth + * performance metrics for each op in the graph including main thread + * execution time and time spent on parallel background ops. + */ +#define QNN_HTP_PROFILE_LEVEL_LINTING 7001 + +/** + * @brief QnnProfile_EventType_t definition to get number of HVX threads + * configured by a graph. Different graphs can have a different + * value. + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_NUMBER_OF_HVX_THREADS 8001 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to applying binary section for updatable tensors + * when client invokes QnnContext_ApplyBinarySection. + * It refers to the total time the entire API takes. + * The value returned is time taken in microseconds. + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_APPLY_BINARY_SECTION_QNN 9001 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to applying binary section for updatable tensors + * when client invokes QnnContext_ApplyBinarySection. + * It refers to the time of callTransport. + * The value returned is time taken in microseconds. 
+ */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_APPLY_BINARY_SECTION_RPC 9002 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to applying binary section for updatable tensors + * when client invokes QnnContext_ApplyBinarySection. + * It refers to the remote procedure call on the HTP processor. + * The value returned is time taken in microseconds. + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_APPLY_BINARY_SECTION_QNN_ACC 9003 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to applying binary section for updatable tensors + * when client invokes QnnContext_ApplyBinarySection. + * It refers to the Hexnn call + * The value returned is time taken in microseconds. + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_APPLY_BINARY_SECTION_ACC 9004 + + + +#ifdef __cplusplus +} +#endif + +#endif // QNN_HTP_PROFILE_H diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpProperty.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpProperty.h new file mode 100755 index 0000000000000..51440061dc611 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpProperty.h @@ -0,0 +1,30 @@ +//============================================================================== +// +// Copyright (c) 2022 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef QNN_HTP_PROPERTY_H +#define QNN_HTP_PROPERTY_H + +#include "QnnProperty.h" + +#ifdef __cplusplus +extern "C" { +#endif + +//============================================================================= +// Macros +//============================================================================= +/** + * @brief Property key for determining whether a backend supports unsigned pd. + */ +#define QNN_PROPERTY_CUSTOM_HTP_UNSIGNED_PD_SUPPORT QNN_PROPERTY_GROUP_CUSTOM + 1 + +#ifdef __cplusplus +} +#endif + +#endif // QNN_HTP_PROPERTY_H diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpSystemContext.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpSystemContext.h new file mode 100755 index 0000000000000..dcfedcb3f6450 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpSystemContext.h @@ -0,0 +1,119 @@ +//============================================================================== +// +// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +// All rights reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +/** + * @file + * @brief QNN HTP component System Context API. + * + * The interfaces in this file work with the top level QNN + * API and supplements QnnSystemContext.h for HTP backend + */ + +#ifndef QNN_HTP_SYSTEM_CONTEXT_H +#define QNN_HTP_SYSTEM_CONTEXT_H + +#ifdef __cplusplus +extern "C" { +#endif + +//============================================================================= +// Macros +//============================================================================= +typedef enum { + // Following version with hwInfoBlobVersion as: + // - Major 0, Minor: 0, Patch: 1 + QNN_SYSTEM_CONTEXT_HTP_HW_INFO_BLOB_VERSION_V1 = 0x01, + // Unused, present to ensure 32 bits. 
+  QNN_SYSTEM_CONTEXT_HTP_HW_INFO_BLOB_UNDEFINED = 0x7FFFFFFF
+} QnnHtpSystemContext_HwInfoBlobVersion_t;
+
+// This struct gets populated within a binary blob as part of hwInfoBlob in
+// QnnSystemContext_BinaryInfoV#_t struct in QnnSystemContext.h
+typedef struct QnnHtpSystemContext_HwBlobInfoV1 {
+  // This value represents the index of the list of graphs registered
+  // to this context as specified in QnnSystemContext_GraphInfo_t*
+  uint32_t graphListIndex;
+  // Stores the spill-fill buffer size used by each of the graphs
+  uint64_t spillFillBufferSize;
+} QnnHtpSystemContext_HwBlobInfoV1_t;
+
+typedef struct {
+  QnnHtpSystemContext_HwInfoBlobVersion_t version;
+  union UNNAMED {
+    QnnHtpSystemContext_HwBlobInfoV1_t contextBinaryHwInfoBlobV1_t;
+  };
+} QnnHtpSystemContext_HwBlobInfo_t;
+
+typedef enum {
+  // Following version with GraphInfoBlobVersion as:
+  // - Major 0, Minor: 0, Patch: 1
+  QNN_SYSTEM_CONTEXT_HTP_GRAPH_INFO_BLOB_VERSION_V1 = 0x01,
+  // Unused, present to ensure 32 bits.
+  QNN_SYSTEM_CONTEXT_HTP_GRAPH_INFO_BLOB_UNDEFINED = 0x7FFFFFFF
+} QnnHtpSystemContext_GraphInfoBlobVersion_t;
+
+// This struct gets populated within a binary blob as part of GraphInfoBlob in
+// QnnSystemContext_BinaryInfoV#_t struct in QnnSystemContext.h
+typedef struct {
+  // Stores the spill-fill buffer size used by each of the graphs
+  uint64_t spillFillBufferSize;
+  // HTP vtcm size (MB)
+  uint32_t vtcmSize;
+  // Optimization level
+  uint32_t optimizationLevel;
+  // Htp Dlbc
+  uint8_t htpDlbc;
+  // Number of HVX threads to reserve
+  uint64_t numHvxThreads;
+} QnnHtpSystemContext_GraphBlobInfoV1_t;
+
+typedef struct {
+  QnnHtpSystemContext_GraphInfoBlobVersion_t version;
+  union UNNAMED {
+    QnnHtpSystemContext_GraphBlobInfoV1_t contextBinaryGraphBlobInfoV1;
+  };
+} QnnHtpSystemContext_GraphBlobInfo_t;
+
+typedef enum {
+  // Following version with ContextInfoBlobVersion as:
+  // - Major 0, Minor: 0, Patch: 1
+  QNN_SYSTEM_CONTEXT_HTP_CONTEXT_INFO_BLOB_VERSION_V1 = 0x01,
+  // Unused, present to ensure 32 bits.
+  QNN_SYSTEM_CONTEXT_HTP_CONTEXT_INFO_BLOB_UNDEFINED = 0x7FFFFFFF
+} QnnHtpSystemContext_ContextInfoBlobVersion_t;
+
+typedef struct {
+  /// An integer representation of SocUtility::DspArch
+  uint32_t dspArch;
+} QnnHtpSystemContext_ContextBlobInfoV1_t;
+
+typedef struct {
+  QnnHtpSystemContext_ContextInfoBlobVersion_t version;
+  union UNNAMED {
+    QnnHtpSystemContext_ContextBlobInfoV1_t contextBinaryContextBlobInfoV1;
+  };
+} QnnHtpSystemContext_ContextBlobInfo_t;
+
+//=============================================================================
+// Data Types
+//=============================================================================
+
+//=============================================================================
+// Public Functions
+//=============================================================================
+
+//=============================================================================
+// Implementation Definition
+//=============================================================================
+
+// clang-format on
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
\ No newline at end of file
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/afuncs.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/afuncs.h
new file mode 100755
index 0000000000000..28b5685f29750
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/afuncs.h
@@ -0,0 +1,338 @@
+//==============================================================================
+//
+// Copyright (c) 2018, 2023 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef AFUNCS_H
+#define AFUNCS_H 1
+
+#include <stdint.h>
+#include <math.h>
+#include "dtype.h"
+#ifndef __hexagon__
+#include <string.h> // for memcpy etc
+#endif
+// #include "asm_define.h"
+#include "builtin_intrinsics.h"
+#include "macros_attribute.h"
+
+struct tile_data {
+    uint8_t **addr;
+    uint32_t offset_t_col;
+    uint32_t offset_t_row;
+    uint32_t width;
+    uint32_t height;
+    uint32_t depth;
+};
+
+// Define order: .addr, .offset_t_col, .offset_t_row, .width, .height, .depth
+#define TILEDATA(adrtab, next_tab_col, next_tab_row, h, w, d) \
+    { \
+        (uint8_t **)(adrtab), static_cast<uint32_t>(next_tab_col), static_cast<uint32_t>(next_tab_row), \
+                static_cast<uint32_t>(w), static_cast<uint32_t>(h), static_cast<uint32_t>(d) \
+    }
+
+/*=======================================*/
+/* Auxiliary functions                   */
+/*=======================================*/
+#if defined(__hexagon__)
+inline int32_t max_i32(int32_t a, int32_t b)
+{
+    return Q6_R_max_RR(a, b);
+}
+inline int32_t min_i32(int32_t a, int32_t b)
+{
+    return Q6_R_min_RR(a, b);
+}
+inline uint32_t max_u32(uint32_t a, uint32_t b)
+{
+    return Q6_R_maxu_RR(a, b);
+}
+inline uint32_t min_u32(uint32_t a, uint32_t b)
+{
+    return Q6_R_minu_RR(a, b);
+}
+#else
+inline int32_t max_i32(int32_t a, int32_t b)
+{
+    return (a < b) ? b : a;
+}
+inline int32_t min_i32(int32_t a, int32_t b)
+{
+    return (a < b) ? a : b;
+}
+inline uint32_t max_u32(uint32_t a, uint32_t b)
+{
+    return (a < b) ? b : a;
+}
+inline uint32_t min_u32(uint32_t a, uint32_t b)
+{
+    return (a < b) ? a : b;
+}
+#endif
+
+[[maybe_unused]] inline ALWAYSINLINE int64_t roundf_i64(float val)
+{
+    // add 0.5 (with same sign as val) and then conversion to int truncates toward 0.
+    // values exactly halfway will round away from 0 (like roundf).
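+    // For example, roundf_i64(2.5f) == 3 and roundf_i64(-2.5f) == -3.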
+ + return (int64_t)(val + copysignf(0.5f, val)); +} + +[[maybe_unused]] inline ALWAYSINLINE NN_INT32_T roundf_i32(float val) +{ + // add 0.5 (with same sign as val) and then conversion to int truncates toward 0. + // values exactly halfway will round away from 0 (like roundf). + + return (int)(val + copysignf(0.5f, val)); +} +// same thing for rounding to unsigned range; -ve inputs will give 0. +// +[[maybe_unused]] inline ALWAYSINLINE uint32_t roundf_u32(float val) +{ + // add 0.5f and then convert to uint (trunc towards 0; -ve values are clipped to 0). +#ifdef __hexagon__ + // use intrinsic since conv of -ve float to unsigned is 'undefined behaviour' in C. + return Q6_R_convert_sf2uw_R_chop(val + 0.5f); +#else + return (val < 0.5f) ? 0 : (uint32_t)(val + 0.5f); +#endif +} + +[[maybe_unused]] inline ALWAYSINLINE NN_INT32_T roundd_i32(double val) +{ + // add 0.5 (with same sign as val) and then conversion to int truncates toward 0. + // values exactly halfway will round away from 0 (like round). + + return (int)(val + copysign(0.5, val)); +} + +[[maybe_unused]] inline ALWAYSINLINE NN_INT32_T saturate_u8(NN_INT32_T val) +{ +#ifdef __hexagon__ + return Q6_R_satub_R(val); +#else + return (val < 0) ? 0 : ((val > 255) ? 255 : val); +#endif +} + +[[maybe_unused]] inline ALWAYSINLINE NN_INT32_T saturate_u16(NN_INT32_T val) +{ +#ifdef __hexagon__ + return Q6_R_satuh_R(val); +#else + return (val < 0) ? 0 : ((val > 65535) ? 65535 : val); +#endif +} + +[[maybe_unused]] static inline ALWAYSINLINE NN_INT32_T saturate_i16(NN_INT32_T val) +{ +#ifdef __hexagon__ + return Q6_R_sath_R(val); +#else + return (val < -32768) ? -32768 : ((val > 32767) ? 32767 : val); +#endif +} + +/** + * @brief low-cost frexpf (but only the exponent result); + * Generates only a few instructions on hexagon. + * + * Input must not be inf,nan, zero, or denormal. + * + * returns: + * -1 if abs(x) is in range 0.25 ... 0.249999 + * 0 if abs(x) is in range 0.5 ... 0.99999 + * 1 if abs(x) is in range 1.0 .. 1.9999 + * etc + * + * If the value -126 is returned, x is a zero or denormal; + * 129 is returned for inf or NaN. for other cases the value is the same + * as what frexpf (in math.h) generates for the exponent. + */ +[[maybe_unused]] inline ALWAYSINLINE constexpr int flt_getexp(float x) +{ + union { + float f; + uint32_t u32; + } const uu = {x}; + return ((uu.u32 >> 23u) & 0xFFu) - 126; +} +/** + * @brief low-cost frexpf (but only the 'fraction' result); + * Generates only a few instructions on hexagon. + * + * Input must not be inf,nan, zero, or denormal. + * + * returns a value in the range [0.5, 1.0) (or in (-1.0,-0.5] when x < 0) + * such that x = flt_getmant(x) * powf2(2.0, flt_getexp(x)) + * + */ +[[maybe_unused]] inline ALWAYSINLINE constexpr float flt_getmant(float x) +{ + union { + float f; + uint32_t u32; + } uu = {x}; + uu.u32 = (uu.u32 & 0x807fffffu) | (uint32_t(126) << 23u); // force exponent = 126 + return uu.f; +} + +/** + * @brief returns the mantissa of x, as a 24-bit number + * in the range 0x800000 .. 0xFFFFFF + * + * Input must not be inf,nan, zero, or denormal. + * + * Sign is discarded. same as powf(2,24) * flt_getmant(fabsf(x)). + */ +[[maybe_unused]] inline ALWAYSINLINE constexpr int32_t flt_getfrac(float x) +{ + union { + float f; + uint32_t u32; + } const uu = {x}; + int32_t const m = (uu.u32 & 0x007fffffu) | (uint32_t(1) << 23u); + return m; +} + +// +// This 'normalizes' a float to 0.5 .. 
0.9999 (sign is retained)
+// Same result as the return value from frexpf, without using a function call
+// Results are not valid if x is 0, denormal, or inf/nan
+//
+[[maybe_unused]] inline ALWAYSINLINE float flt_getfrac_norm(float x)
+{
+    union {
+        float f;
+        uint32_t u32;
+    } uu = {x};
+    uu.u32 = (uu.u32 & 0x807fffffu) | (uint32_t(126) << 23u); // force exponent = 126
+    return uu.f;
+}
+/**
+ * @brief low-cost 2.0**n for integer n.
+ * Same as powf(2.0f, iexpo) without a function call;
+ *
+ * Constraint: iexpo must be in range -126..127
+ */
+[[maybe_unused]] inline ALWAYSINLINE constexpr float flt_power2(uint32_t const iexpo)
+{
+    uint32_t const a = (iexpo + 127) & 0xFFu;
+    union {
+        uint32_t u32;
+        float f;
+    } const uu = {a << 23u};
+    return uu.f;
+}
+/**
+ * @brief low-cost ldexpf
+ * Same as ldexpf(val, iexpo) without a function call;
+ *
+ * Constraint: iexpo must be in range -126..127
+ */
+[[maybe_unused]] inline ALWAYSINLINE constexpr float flt_ldexp(float val, int iexpo)
+{
+    return val * flt_power2(iexpo);
+}
+/**
+ * @brief low-cost 2.0**n for integer n.
+ * Same as pow(2.0d, iexpo) without a function call;
+ *
+ * Constraint: iexpo must be in range -1022..1023
+ */
+[[maybe_unused]] inline ALWAYSINLINE constexpr double double_power2(uint32_t const iexpo)
+{
+    uint64_t const a = (iexpo + 1023) & 0x7FFu;
+    union {
+        uint64_t u64;
+        double d;
+    } const uu = {a << 52u};
+    return uu.d;
+}
+/**
+ * @brief low-cost ldexp
+ * Same as ldexp(val, iexpo) without a function call;
+ *
+ * Constraint: iexpo must be in range -1022..1023
+ */
+[[maybe_unused]] inline ALWAYSINLINE constexpr double double_ldexp(double val, int iexpo)
+{
+    return val * double_power2(iexpo);
+}
+
+/**
+ * @brief returns the exponent and mantissa of x, as a n-bit number
+ *
+ * Constraint: iexpo must be in range -126..127
+ * Input must not be negative, inf,nan, zero, or denormal.
+ */
+template <unsigned MBITS> inline constexpr std::pair<int32_t, uint32_t> get_scalefactor(float x)
+{
+    union {
+        float f;
+        uint32_t u32;
+    } const uu = {x};
+
+    uint32_t inval = uu.u32;
+    uint32_t const mask = hnnx::safe_lshift(1, MBITS) - 1;
+    inval = hnnx::safe_rshift(inval + hnnx::safe_lshift(1, (24 - MBITS - 1)),
+                              (24 - MBITS)); // possibly overflows into exponent, but that's OK.
+    uint32_t const m = ((inval & mask) | hnnx::safe_lshift(1u, (MBITS - 1)));
+    int32_t const e = int32_t(hnnx::safe_rshift(inval, (MBITS - 1)) & 0xFFu) - 126;
+    return {e, m};
+}
+
+/**
+ * @brief returns the parameters for scaling.
+ * bit 31-24: left shift amount
+ * bit 23-16: right shift amount
+ * bit 15- 0: scale factor
+ *
+ * Input must not be inf,nan, zero, negative or denormal.
+ *
+ */
+[[maybe_unused]] inline ALWAYSINLINE constexpr uint32_t get_scaling_params(float x, int max_sl, int max_sr)
+{
+    auto [e, m] = get_scalefactor<15>(x);
+    // Set a sl or sr amount to perform a multiply of 2^exponent by mantissa.
+    int sl = (e > 0) ? e : 0;
+    int sr = (e > 0) ? 0 : -e;
+    // The max_sl allows the addition of extra left shifts when working with small numbers having negative exponents.
+    // For every extra left shift, there is an offsetting right shift added so that the net right shift amount
+    // required from the exponent stays the same. The max_sr parameter provides a ceiling to the required offsetting
+    // right shifts, preventing the total right shift requirement from being large enough to erase data through shifting.
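+    // Illustration (values assumed): for e = -3 (so sl = 0, sr = 3), a max_sl of 3
+    // or more and a max_sr of 6 would yield sl = 3, sr = 6; the net shift sr - sl
+    // still equals 3.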
+ if (sl == 0 && sr > 0) { + sl = min_i32(max_sl, max_i32(max_sr - sr, 0)); + sr = sr + sl; + } + return ((uint32_t(sl) & 0x0FFu) << 24u) | ((uint32_t(sr) & 0x0FFu) << 16u) | uint32_t(m); +} + +/** + * @brief given a scale in float and a recip shift amount + * return a quantized scale multiplier and change recip shamt inplace + * + */ +inline uint32_t get_quantized_multipiler(const float scale_f, int &recip_shamt) +{ + recip_shamt = (scale_f <= 1.0f) ? 0 : flt_getexp(scale_f); + uint32_t scale = static_cast(roundf(flt_ldexp(scale_f, (31 - recip_shamt)))); + scale = (scale < 0x7fffffffu) ? scale : 0x7FFFFFFFu; + return scale; +} + +/** + * @brief given a scale in float and a recip shift amount + * return a quantized scale multiplier and change recip shamt inplace + * + */ +//Now with corrected spelling +inline uint32_t get_quantized_multiplier(const float scale_f, int &recip_shamt) +{ + return get_quantized_multipiler(scale_f, recip_shamt); +} +#endif /*AFUNCS_H*/ diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/allocator.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/allocator.h new file mode 100755 index 0000000000000..844bcf4c7ec50 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/allocator.h @@ -0,0 +1,236 @@ +//============================================================================== +// +// Copyright (c) 2020 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef ALLOCATOR_H +#define ALLOCATOR_H 1 + +#include +#include +#include +#include "dtype_enum.h" +#include "weak_linkage.h" +#include "macros_attribute.h" +#include "forward_classes.h" +#include "hexagon_nn_types.h" + +enum class MemoryClass { + Plain, + TCM, + UnCached, // for spill/fill DDR + XXX_LAST_MEMORY_TYPE, + Default = Plain +}; + +PUSH_VISIBILITY(default) + +extern bool TrackedAllocError; + +class Graph; +class HexagonNNEnv; +namespace fa { +struct PoolDesc; +struct BigBuff; +struct RuntimeAllocator; +} // namespace fa +namespace hnnx { + +class Serializer; +class Deserializer; + +// some options flags (powers of 2) for calls to Tensor::allocate +enum AllocOptions { + uncached_int8 = 0x1, // override MemoryClass to UnCached. + uncached_int16 = 0x2, + uncached_fp16 = 0x4 +}; + +/* + * Maybe FIXME: It seems like FancyAllocator has just about all the same interfaces as Allocator, + * is all this pimpl stuff needed, or could we just inherit Allocator and have a unique_ptr + * in our graph? + */ + +class Allocator { + public: + // MIN_ALIGN, MAX_ALIGN: + // - both must be powers of 2 + // - 8 <= MIN_ALIGN <= MAX_ALIGN + // All allocations will be aligned to at least MIN_ALIGN, both start and end of each region. + // This includes sub-allocations in memory pools. + // Alignment requests > MAX_ALIGN may be treated as MAX_ALIGN if allocated in DDR. + // + static constexpr unsigned MIN_ALIGN = 256; + static constexpr unsigned MAX_ALIGN = 256; + + // The alignment used by TCM allocation; >= MIN_ALIGN + static constexpr unsigned TCM_ALLOC_ALIGN = 2048; + + static void *vacant() { return (void *)2; } // special value for 'vacant' slot. 
+    enum Mode { AllocVirtual, AllocPhysical, AllocTemp, AllocTempEnd, AllocComplete, LastMode = AllocComplete };
+
+    // AllocTemp/AllocTempEnd are used in Virtual mode, to set a 'Temp Physical' mode
+    // where allocation is done to physical memory, but into memory blocks which
+    // are discarded when we return via AllocTempEnd (so, AllocTempEnd is not possible as an actual
+    // current mode).
+    // This is intended to support nesting (multiple levels of AllocTemp), where each
+    // AllocTempEnd discards all allocs since the matching AllocTemp; but
+    // currently nesting is not supported, so AllocTemp must be followed by AllocTempEnd,
+    // which actually takes you back to AllocVirtual.
+    // AllocComplete allows no further allocations. A deserialized allocator
+    // is in this state.
+
+    API_EXPORT Allocator(Mode mode_in, Graph &graph_in) : graph(graph_in), mode(mode_in){};
+    API_EXPORT virtual ~Allocator() = 0;
+
+    Graph &graph;
+
+    // Either allocates enough, or dips into a buffer (and changes the buffer pointer and size parameter accordingly).
+    // al is an alignment parameter; it must be a power of 2 or the code below won't work.
+    API_EXPORT void *tracked_aligned_alloc(size_t al, size_t bytes, fa::BigBuff *const bb = nullptr);
+    API_EXPORT void tracked_free(void *aligned_ptr) noexcept;
+
+    API_EXPORT virtual void allocate_n(void **arrp, size_t n, size_t block_size, size_t alignment, MemoryClass memclass,
+                                       unsigned options, DType dtype);
+
+    // options for allocate_persistent_blocks.
+    // if 'allnew' is *not* present, it is assumed that all of the pointers
+    // are either null, or point to existing persistent blocks. The 'null' ones
+    // are replaced with new allocations, and the ref counts are increased in both cases.
+    // with 'allnew': pointers are assumed to contain garbage. Equivalent to zeroing the
+    // pointer table first.
+    //
+    // zoneB: with this, ref counts are updated in the 'B' zone instead of A.
+    //
+    // incref: overrides 'allnew'; all of the existing pointers are required to be valid persistent
+    // blocks; the ref counts are increased by 1
+    // decref: overrides 'incref' and 'allnew'; all of the pointers are required to be valid persistent
+    // blocks; the ref counts are reduced by 1. If total refs are zero, block is freed.
+    // the pointer table is not updated.
+    //
+    // infinite: newly alloc'd blocks get refcount set to a huge number, instead of 1.
+    // Currently this is used when deserializing, since we can't free things immediately when in Crate.
+    //
+    enum persistent_options {
+        allnew = 1u, // assume existing pointers are garbage, allocate them all.
+        zoneB = 2u, // reference count in zone B instead of A.
+        incref = 4u, // enforce that all existing are persistent; incref them.
+        decref = 8u,
+        infinite = 16u, // refcounts on new blocks, set to a huge # instead of 1.
+    };
+
+    // allocate n 'persistent' blocks of the given size/alignment, and update the table.
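+    // 'options' is a bitwise OR of the persistent_options flags above.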
+    // allocate n 'persistent' blocks of the given size/alignment, and update the table.
+    API_EXPORT virtual void allocate_persistent_blocks(void **table, size_t nblocks, size_t block_size,
+                                                       size_t alignment, unsigned options);
+
+    API_EXPORT inline void *allocate(const void *oldval, size_t block_size, size_t alignment, MemoryClass memclass,
+                                     unsigned options, DType dtype)
+    {
+        PUSH_WARNING()
+        DISABLE_WARNING("-Wcast-qual", MSVC_NO_EQUIV)
+        void *tmp = const_cast<void *>(oldval);
+        POP_WARNING()
+        allocate_n(&tmp, 1, block_size, alignment, memclass, options, dtype);
+        return tmp;
+    }
+
+    API_EXPORT Mode get_mode() const { return mode; }
+    API_EXPORT virtual void set_mode(Mode new_mode);
+
+    API_EXPORT virtual void set_tcm_pool(void *base, size_t size);
+
+    API_EXPORT virtual void set_largest_memory_alloc_size(size_t size);
+
+    /*
+     * Serialize all the internal data for the allocator.
+     * Memory regions / pools, etc.
+     */
+    API_EXPORT virtual void serialize(Serializer &) const;
+    /*
+     * Deserialize the allocator, restore internal data from buffer.
+     */
+    API_EXPORT virtual void deserialize(HexagonNNEnv &env, Deserializer &dctx,
+                                        hexagon_nn_wide_address_const_t params_weights = 0U,
+                                        const size_t params_weights_length = 0,
+                                        hexagon_nn_wide_iovec_t const &weights = NULL_IOVEC);
+
+    API_EXPORT virtual int find_replaceable_mempool(unsigned const replaceable_pool_seq,
+                                                    fa::PoolDesc &found_pool) const;
+
+    // LCOV_EXCL_START [SAFTYSWCCB-1542]
+    API_EXPORT static inline constexpr size_t fixup_alignment(size_t align)
+    {
+        static_assert(MIN_ALIGN >= 8 && (MIN_ALIGN & (MIN_ALIGN - 1)) == 0, "bad MIN_ALIGN");
+        static_assert(MAX_ALIGN >= MIN_ALIGN && (MAX_ALIGN & (MAX_ALIGN - 1)) == 0, "bad MAX_ALIGN");
+        if (MIN_ALIGN < MAX_ALIGN) {
+            return std::max<size_t>(MIN_ALIGN, std::min<size_t>(MAX_ALIGN, align));
+        } else {
+            return MIN_ALIGN;
+        }
+    }
+    // LCOV_EXCL_STOP
+
+    API_EXPORT static inline constexpr size_t round_up_align(size_t n, size_t align)
+    {
+        return (n + (align - 1)) & ~(align - 1);
+    }
+    template <typename T> API_EXPORT static inline T *round_up_align(T *p, size_t align)
+    {
+        return (T *)round_up_align((size_t)p, align);
+    }
+
+  protected:
+    Mode mode = AllocVirtual;
+};
+
+//
+// this is a 'shim' class to help in making dummy allocators. It defines overrides
+// for all of the pure-virtual methods, so you don't need to define them yourself.
+//
+class FakeAllocator : public Allocator {
+  public:
+    API_EXPORT FakeAllocator(Allocator::Mode mode_in, Graph &graph_in) : Allocator(mode_in, graph_in){};
+    API_EXPORT virtual ~FakeAllocator();
+};
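`fixup_alignment` and `round_up_align` rely on the standard power-of-two trick: add `align - 1`, then mask off the low bits. A self-contained illustration of the same identity:

```cpp
#include <cassert>
#include <cstddef>

// Mirrors Allocator::round_up_align(): valid only for power-of-two 'align',
// which is why fixup_alignment() clamps requests into [MIN_ALIGN, MAX_ALIGN].
constexpr size_t round_up_align(size_t n, size_t align)
{
    return (n + (align - 1)) & ~(align - 1);
}

static_assert(round_up_align(1, 256) == 256);
static_assert(round_up_align(256, 256) == 256); // already aligned: unchanged
static_assert(round_up_align(257, 256) == 512);

int main()
{
    // With a non-power-of-two align (e.g. 3) the mask trick would break,
    // since ~(align - 1) is no longer a contiguous high-bit mask.
    assert(round_up_align(1000, 64) == 1024);
}
```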
+
+// this is an accessor which is used by the Dma 'Fill' operation
+// to get a source pointer for reading const, based on (pool_id, offset).
+// It also holds the base pointer for ddr spill area.
+// Maybe other things could be added later.
+class MemPoolRunTimeAccessor {
+    hexagon_nn_wide_address_t spill_area;
+    fa::PoolDesc const *pool_table; // pool_table[0] is for poolid=1
+    unsigned max_pool_id;
+
+  public:
+    API_EXPORT MemPoolRunTimeAccessor(hexagon_nn_wide_address_const_t spill_area_in, fa::PoolDesc const *const pt,
+                                      unsigned const pt_size)
+        : spill_area(spill_area_in), pool_table(pt), max_pool_id(pt_size)
+    {
+    }
+    API_EXPORT MemPoolRunTimeAccessor() : spill_area(0), pool_table(nullptr), max_pool_id(0) {}
+    API_EXPORT MemPoolRunTimeAccessor(MemPoolRunTimeAccessor const &) = default;
+    API_EXPORT MemPoolRunTimeAccessor &operator=(MemPoolRunTimeAccessor const &) = default;
+
+    // pool ids are >= 1, <= num_pools
+    API_EXPORT constexpr unsigned num_pools() const { return max_pool_id; } //LCOV_EXCL_LINE [SAFTYSWCCB-1542]
+    // map pool_id to base address of the data, for persistent pool; also get 'is_weights' flag.
+    // implementation in runtime_alloc.h
+    std::pair get_persistent_pool_base_iswts(unsigned pool_id) const;
+    API_EXPORT hexagon_nn_wide_address_t get_spill_area() const { return spill_area; }
+
+    // used to construct the ConstExtentDescriptor during prep
+    // implementation in fa_alloc.h
+    API_EXPORT fa::PoolDesc const *get_descriptor(unsigned pool_id) const;
+};
+
+} // namespace hnnx
+
+POP_VISIBILITY()
+
+#endif
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/bake_defs.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/bake_defs.h
new file mode 100755
index 0000000000000..11d01bcb31b95
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/bake_defs.h
@@ -0,0 +1,244 @@
+//==============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef BAKE_DEFS
+#define BAKE_DEFS 1
+#include
+#include
+#include
+#include
+
+#include "executable.h"
+
+// Contains defs for host-side and target side, so try not
+// to add too many 'host only' things.
+
+#ifdef __hexagon__
+#define HNNX_ARCH_CAN_RUN_BAKED 1
+#endif
+
+namespace hnnx {
+
+namespace bake {
+
+using tgt_ptr_word = unsigned;
+using tgt_sizet_word = unsigned;
+static constexpr unsigned tgt_ptr_bytes = sizeof(tgt_ptr_word);
+static constexpr unsigned tgt_sizet_bytes = sizeof(tgt_sizet_word);
+static constexpr bool op_has_graphp = false;
+static constexpr unsigned tensor_uptr_ptrs = 2;
+static constexpr unsigned max_opaquet_align = 1024; // must be power of 2
+
+// This should be OK as a first approx: includes hexagon and x86-32
+static constexpr bool host_can_run_baked = sizeof(void *) == tgt_ptr_bytes;
+
+inline unsigned constexpr round_up(unsigned x, unsigned m)
+{
+    return ((x + (m - 1)) / m) * m;
+}
+
+// functions to calculate size, align of various things. They
+// are included in target build so we can static_assert that sizes are what we think they are.
+// (all must be constexpr).
+
+// {size, alignment} of typical_op
+inline constexpr std::pair<unsigned, unsigned> typical_op_tgt_size_align(unsigned n_in, unsigned n_out)
+{
+    // 1 pointer per input, plus tensor_uptr_ptrs per output; but if n_in = n_out == 0, it's 1 pointer.
+    // (for a 'fill' byte).
+    unsigned num_io_ptrs = n_in + n_out * tensor_uptr_ptrs;
+    if (num_io_ptrs == 0) num_io_ptrs = 1; // n_in = n_out = 0 case
+    return {tgt_ptr_bytes * ((op_has_graphp ? 
2 : 1) // vptr, and maybe Graph * + + num_io_ptrs), // inputs and outputs + tgt_ptr_bytes}; // align +} + +// 'tensor_op_tgt_size_align is used for crate accounting of ShapeWrapperOp, ConstWrapperOp, DummyOp +// In a proper 'baked graph' we don't need to insert these, just the tensors... + +inline constexpr std::pair tensor_op_tgt_size_align(unsigned n_out) +{ + // happens to be the same as TypicalOp with no inputs... + return typical_op_tgt_size_align(0, n_out); +} + +// {size, alignment, extra} of typical_op_with_compiler +// extra_len is the len of the extra data +// extra_align is its alignment. +// The 3rd return value is the offset of the 'extra' within the image. +// +inline constexpr std::tuple +typical_op_extra_tgt_size_align(unsigned n_in, unsigned n_out, unsigned extra_len, unsigned extra_align) +{ + std::pair base_size = typical_op_tgt_size_align(n_in, n_out); + unsigned extra_offs = base_size.first; + if (extra_len > 0) { + extra_align = std::max(extra_align, base_size.second); + extra_len = round_up(extra_len, extra_align); + extra_offs = round_up(extra_offs, extra_align); + base_size.first = extra_offs + extra_len; + base_size.second = extra_align; + } + return {base_size.first, base_size.second, extra_offs}; +} + +// {size, alignment} of variadic op (without the in, out array contents)! +constexpr std::pair variadic_op_tgt_size_align(unsigned n_in, unsigned n_out) +{ + const unsigned cratevec_words = 2; + return {tgt_ptr_bytes * (1 // vptr + + (op_has_graphp ? 1 : 0) // Graph * + + 2 * cratevec_words), // two cratevecs + tgt_ptr_bytes}; // align +} +// {size, alignment} of simple_op_wrapper (without the in, out array contents)! +constexpr std::pair simplewrap_op_tgt_size_align(unsigned n_in, unsigned n_out) +{ + // this is just one more pointer than a variadic op... + const auto var_result = variadic_op_tgt_size_align(n_in, n_out); + return {var_result.first + tgt_ptr_bytes, var_result.second}; +} + +// {size, alignment} of a ChunkPreloadOp +constexpr std::pair chunk_preload_op_tgt_size_align() +{ + return {tgt_ptr_bytes * (1 // vptr + + (op_has_graphp ? 1 : 0) // Graph * + + 2), // ptr, len; + tgt_ptr_bytes}; // align +} + +// +// {size_align} of Shape object +// +constexpr std::pair shape_tgt_size_align(unsigned rank) +{ + // tgt_sizet_bytes * (1 + 1 + 2 * rank) = + // vtable ptr + // shapeflag flags + padding[] + // std::array dims + // std::array max_dims + // + rank = std::array pad + return {round_up(tgt_sizet_bytes * (1 + 1 + 1 + 2 * rank) + rank, tgt_sizet_bytes), tgt_sizet_bytes}; +} + +// +// {size_align} of DynamicShape object +// +constexpr std::pair dynamic_shape_tgt_size_align(const unsigned rank) +{ + // std::array dims == tgt_sizet_bytes * rank + // (shapeflag flags + padding[]) + vtable ptr + dynamic_state = (3 * tgt_sizet_bytes) + return {round_up(tgt_sizet_bytes * rank + (4 * tgt_sizet_bytes), tgt_sizet_bytes), tgt_sizet_bytes}; +} + +// +// {size_align} of interface object (may or may not be quantized) +// +constexpr std::pair interface_tgt_size_align(bool is_quantized) +{ + return {tgt_sizet_bytes + (is_quantized ? round_up(3 * 4, tgt_sizet_bytes) : 0), tgt_sizet_bytes}; +} + +// {size_align} of Tensors, of three different forms: +// +// 'general' tensor +// +constexpr std::pair tensor_general_tgt_size_align() +{ + return {tgt_sizet_bytes * 4 + 2 * tgt_ptr_bytes, tgt_sizet_bytes}; +} + +// 'shape' tensor, of given rank. +// +constexpr std::pair tensor_shape_tgt_size_align(unsigned rank) +{ + return {tgt_sizet_bytes * ((rank == 0 ? 
1 : rank) + 1), tgt_sizet_bytes};
+}
+
+// 'scalar' tensor, need to know if the interface is 'quantized' or not
+// Note, this assumes all values are <= size_t bytes.
+//
+constexpr std::pair<unsigned, unsigned> tensor_scalar_tgt_size_align(bool is_quantized)
+{
+    const unsigned ifc_size = interface_tgt_size_align(is_quantized).first;
+    return {tgt_sizet_bytes * 2 + ifc_size, tgt_sizet_bytes};
+}
+// sizeof OpExtraInfo on target: {long long, 2 * unsigned, char *, 4 * padbyte}
+constexpr std::pair<unsigned, unsigned> OpExtraInfo_size_align = {24, 8};
+
+// The size of a SliceDispatchOp for the given number of slices.
+// Currently it's always the same regardless of 'nslices'; we may introduce a 'right-sized'
+// variant, in which case 'exact=true' will get the 'real' size; but exact = false will always
+// give the full size.
+constexpr std::pair<unsigned, unsigned> slice_dispatch_op_size_align(unsigned const nslices, bool const exact = false)
+{
+    return {tgt_sizet_bytes * ((op_has_graphp ? 5 : 4) + 3 * Executable::MAX_OP_SLICES), tgt_sizet_bytes};
+}
+
+// The size of a Predicated Op
+constexpr std::pair<unsigned, unsigned> pred_op_size_align()
+{
+    return {tgt_sizet_bytes * ((op_has_graphp ? 5 : 4) + 3), tgt_sizet_bytes};
+}
+
+// this is used in e.g.
+//   if constexpr(host_can_run_baked) static_assert(size_align_matches<T>(N_IN, N_OUT));
+
+template <typename T, typename SZAL> constexpr bool size_align_matches(SZAL sz)
+{
+    return sizeof(T) == std::get<0>(sz) && alignof(T) == std::get<1>(sz);
+}
+
+// This is a utility to check that a type T has a given size and alignment, using static_assert;
+// Just need to include a call to 'do-nothing' bake::check_size_align<T>::template check<...>();
+// The static assert is *disabled* unless compiling on hexagon (or compatible host).
+//
+// It's more complex than it needs to be, since it's designed to make sure the type and
+// numbers wind up in the error message, e.g. you could end up with
+//   error: static_assert failed due to requirement 'claimed(40) == actual(48)' "size not as claimed"
+//        static_assert(claimed(CLAIMED_SIZE) == actual(ACTUAL_SIZE), "size not as claimed");
+//   ... note: in instantiation of function template specialization 'check_szal::check_size_align<..., ...>'
+//
+template <typename T> struct check_size_align {
+    static constexpr int claimed(int K) { return K; }
+    static constexpr int actual(int K) { return K; }
+    template <int CLAIMED_SIZE, int ACTUAL_SIZE> static constexpr bool check_size()
+    {
+        static_assert(claimed(CLAIMED_SIZE) == actual(ACTUAL_SIZE), "size not as claimed");
+        return CLAIMED_SIZE == ACTUAL_SIZE;
+    }
+    template <int CLAIMED_ALIGN, int ACTUAL_ALIGN> static constexpr bool check_align()
+    {
+        static_assert(claimed(CLAIMED_ALIGN) == actual(ACTUAL_ALIGN), "align not as claimed");
+        return CLAIMED_ALIGN == ACTUAL_ALIGN;
+    }
+
+    template <int CLAIMED_SIZE, int ACTUAL_SIZE, int CLAIMED_ALIGN, int ACTUAL_ALIGN> static constexpr bool check()
+    {
+        bool result = true;
+        if constexpr (host_can_run_baked) {
+            result = check_size<CLAIMED_SIZE, ACTUAL_SIZE>() && check_align<CLAIMED_ALIGN, ACTUAL_ALIGN>();
+        }
+        return result;
+    }
+};
+
+} // namespace bake
+
+//
+// op_opaque_tgt_info must be specialized for each OpaqueT used in TypicalOpWithCompiler
+//
+template <typename OpaqueT> struct op_opaque_tgt_info {
+    // static constexpr unsigned length = ..;    // length of the struct on target CPU
+    // static constexpr unsigned alignment = ..; // alignment on target CPU
+};
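As a sanity check of the accounting above, the arithmetic can be reproduced standalone with the constants defined at the top of bake_defs.h (32-bit target words, two pointers per tensor unique_ptr, `op_has_graphp == false`):

```cpp
#include <cstdio>
#include <utility>

// Reproduces the typical-op crate accounting with the stated constants.
constexpr unsigned tgt_ptr_bytes = 4;
constexpr unsigned tensor_uptr_ptrs = 2;

constexpr std::pair<unsigned, unsigned> typical_op_size_align(unsigned n_in, unsigned n_out)
{
    unsigned num_io_ptrs = n_in + n_out * tensor_uptr_ptrs;
    if (num_io_ptrs == 0) num_io_ptrs = 1; // degenerate 0-in/0-out op keeps a fill byte
    return {tgt_ptr_bytes * (1 /*vptr*/ + num_io_ptrs), tgt_ptr_bytes};
}

// A 2-input, 1-output op: vptr + 2 input ptrs + 2 ptrs for the output's
// unique_ptr -> 5 words -> 20 bytes, aligned to a 4-byte word.
static_assert(typical_op_size_align(2, 1).first == 20);
static_assert(typical_op_size_align(2, 1).second == 4);

int main() { std::printf("%u bytes\n", typical_op_size_align(2, 1).first); }
```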
+
+} // namespace hnnx
+
+#endif // BAKE_DEFS
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/builtin_intrinsics.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/builtin_intrinsics.h
new file mode 100755
index 0000000000000..3496b792f25aa
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/builtin_intrinsics.h
@@ -0,0 +1,247 @@
+//==============================================================================
+//
+// Copyright (c) 2023 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+// Compiler builtin intrinsic functions should be specified in this file
+
+#ifndef BUILTIN_INTRINSICS_H_
+#define BUILTIN_INTRINSICS_H_
+
+#include
+#include
+#include
+#include
+
+// Branch prediction
+#if defined(__clang__)
+
+#define HEX_LIKELY(x) __builtin_expect(!!(x), 1)
+#define HEX_UNLIKELY(x) __builtin_expect(!!(x), 0)
+
+#define HEX_ASSUME __builtin_assume
+#define HEX_UNREACHABLE __builtin_unreachable
+
+#elif defined(_MSC_VER)
+
+#define HEX_LIKELY(x) (x)
+#define HEX_UNLIKELY(x) (x)
+
+#define HEX_ASSUME __assume
+#define HEX_UNREACHABLE() __assume(0)
+
+#elif defined(__GNUC__)
+// No equivalent of __builtin_assume in GNUC, hence leaving it empty.
+#define HEX_ASSUME(cond)
+
+#define HEX_LIKELY(x) __builtin_expect(!!(x), 1)
+#define HEX_UNLIKELY(x) __builtin_expect(!!(x), 0)
+#define HEX_UNREACHABLE __builtin_unreachable
+
+#endif // defined(__clang__)
+
+// Overflow detection
+#if defined(__clang__) || defined(__GNUC__)
+
+#define HEX_ADD_OVERFLOW __builtin_add_overflow
+#define HEX_MUL_OVERFLOW __builtin_mul_overflow
+
+#elif defined(_MSC_VER)
+
+#include
+
+template <typename _T> static inline bool HEX_ADD_OVERFLOW(_T a, _T b, _T *out)
+{
+    *out = a + b;
+    return ((b > 0) && (a > std::numeric_limits<_T>::max() - b)) ||
+           ((b < 0) && (a < std::numeric_limits<_T>::min() - b));
+}
+
+template <typename _T> static inline bool HEX_MUL_OVERFLOW(_T a, _T b, _T *out)
+{
+    *out = a * b;
+    return ((b > 0) && (a > std::numeric_limits<_T>::max() / b || a < std::numeric_limits<_T>::min() / b)) ||
+           ((b < 0) && (a > std::numeric_limits<_T>::min() / b || a < std::numeric_limits<_T>::max() / b));
+}
+
+#endif // __clang__
+
+// Count bits
+
+#include
+
+template <typename _T> static inline int HEX_COUNT_ONE_BIT(_T x)
+{
+    return std::bitset<sizeof(_T) * 8>(x).count();
+}
+
+#define HEX_COUNT_ONE_BIT_ULL HEX_COUNT_ONE_BIT
+#define HEX_COUNT_ONE_BIT_UL HEX_COUNT_ONE_BIT
+
+#if defined(__clang__) || defined(__GNUC__)
+
+#define HEX_COUNT_LEADING_ZERO __builtin_clz
+#define HEX_COUNT_LEADING_ZERO_UL __builtin_clzl
+#define HEX_COUNT_LEADING_ZERO_ULL __builtin_clzll
+
+#define HEX_COUNT_TRAILING_ZERO __builtin_ctz
+#define HEX_COUNT_TRAILING_ZERO_UL __builtin_ctzl
+#define HEX_COUNT_TRAILING_ZERO_ULL __builtin_ctzll
+
+#elif defined(_MSC_VER)
+
+#include
+
+// Returns the number of leading 0-bits in x, starting at the most significant
+// bit position. If x is 0, the result is undefined.
+static inline int HEX_COUNT_LEADING_ZERO_ULL(unsigned long long x)
+{
+    unsigned long where;
+    if (_BitScanReverse64(&where, x)) return static_cast<int>(63 - where);
+    return 64; // Undefined behavior
+}
+
+static inline int HEX_COUNT_LEADING_ZERO(unsigned int x)
+{
+    unsigned long where;
+    if (_BitScanReverse(&where, x)) return static_cast<int>(31 - where);
+    return 32; // Undefined Behavior.
+} + +static inline int HEX_COUNT_LEADING_ZERO_UL(unsigned long x) +{ + return sizeof(x) == 8 ? HEX_COUNT_LEADING_ZERO_ULL(x) : HEX_COUNT_LEADING_ZERO(static_cast(x)); +} + +// Returns the number of trailing 0-bits in x, starting at the least significant +// bit position. If x is 0, the result is undefined. +static inline int HEX_COUNT_TRAILING_ZERO_ULL(unsigned long long x) +{ + unsigned long where; + if (_BitScanForward64(&where, x)) return static_cast(where); + return 64; // Undefined Behavior. +} + +static inline int HEX_COUNT_TRAILING_ZERO(unsigned int x) +{ + unsigned long where; + if (_BitScanForward(&where, x)) return static_cast(where); + return 32; // Undefined Behavior. +} + +static inline int HEX_COUNT_TRAILING_ZERO_UL(unsigned long x) +{ + return sizeof(x) == 8 ? HEX_COUNT_TRAILING_ZERO_ULL(x) : HEX_COUNT_TRAILING_ZERO(static_cast(x)); +} + +#endif // defined(__clang__) + +// Atomic operation + +#if defined(__clang__) || defined(__GNUC__) + +#define HEX_ATOMIC_FETCH_AND_ADD __sync_fetch_and_add + +#define HEX_ATOMIC_FETCH_AND_AND __sync_fetch_and_and +#define HEX_ATOMIC_FETCH_AND_OR __sync_fetch_and_or + +#define HEX_ATOMIC_VAL_COMPARE_AND_SWAP __sync_val_compare_and_swap +#define HEX_ATOMIC_BOOL_COMPARE_AND_SWAP __sync_bool_compare_and_swap + +#elif defined(_MSC_VER) + +#include + +#define HEX_ATOMIC_FETCH_AND_ADD(_p, _v) \ + (sizeof *(_p) == sizeof(__int64) ? _InterlockedExchangeAdd64((__int64 *)(_p), (__int64)(_v)) \ + : _InterlockedExchangeAdd((long *)(_p), (long)(_v))) + +template static inline _T HEX_ATOMIC_FETCH_AND_AND(_T volatile *_p, _T _v) +{ + _InterlockedAnd((long *)_p, (long)_v); + return static_cast<_T>(*_p); +} + +template static inline _T HEX_ATOMIC_FETCH_AND_OR(_T volatile *_p, _T _v) +{ + _InterlockedOr((long *)_p, (long)_v); + return static_cast<_T>(*_p); +} + +#define HEX_ATOMIC_VAL_COMPARE_AND_SWAP(_p, _old, _new) \ + (sizeof *(_p) == sizeof(__int64) \ + ? _InterlockedCompareExchange64((__int64 *)(_p), (__int64)(_new), (__int64)(_old)) \ + : _InterlockedCompareExchange((long *)(_p), (long)(_new), (long)(_old))) + +#define HEX_ATOMIC_BOOL_COMPARE_AND_SWAP(_p, _old, _new) (HEX_ATOMIC_VAL_COMPARE_AND_SWAP(_p, _old, _new) == (_old)) + +#endif // defined(__clang__) + +namespace hnnx { + +/** + * @brief promote_shift_operand reflects the integral promotions for small integer types. + * safe_lshift/safe_rshift must be aware of these promotions, since the C++ standard only + * defines the behavior for shift operations where the RHS is between 0 and + * 1 less than the bit-width of the *promoted* type of the LHS. + */ +template struct promote_shift_operand { + typedef T type; +}; + +template <> struct promote_shift_operand { + using type = int; +}; +template <> struct promote_shift_operand { + using type = int; +}; +template <> struct promote_shift_operand { + using type = int; +}; +template <> struct promote_shift_operand { + using type = int; +}; +template <> struct promote_shift_operand { + using type = int; +}; + +template using promote_shift_operand_t = typename promote_shift_operand::type; + +// The following portable template functions are replacements for the +// built-in shift operations, << and >>, that provide the following guarantees: +// +// 1. Both the left and right operands of the shift will be treated as unsigned. +// This, by construction, prevents any undefined or implementation-defined +// behavior that may arise when shifting negative-valued expressions. +// 2. 
The right operand will be bit-masked in a way that guarantees +// that its value is in the range [0, bitwidth(promoted_left_operand) - 1] + +template constexpr unsigned get_safe_shift_mask() +{ + return unsigned(CHAR_BIT * sizeof(promote_shift_operand_t>>) - 1); +} + +template ()> +constexpr auto safe_lshift(T const value, S const shift_amount) +{ + static_assert(std::is_integral::value && std::is_integral::value, + "safe_lshift only makes sense for integral parameters"); + assert((static_cast(shift_amount) & ~mask) == 0 && "shift_amount is out of range"); + return value << shift_amount; +} + +template ()> +constexpr auto safe_rshift(T const value, S const shift_amount) +{ + static_assert(std::is_integral::value && std::is_integral::value, + "safe_rshift only makes sense for integral parameters"); + assert((static_cast(shift_amount) & ~mask) == 0 && "shift_amount is out of range"); + return value >> shift_amount; +} + +} // namespace hnnx + +#endif /* BUILTIN_INTRINSICS_H_ */ diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/c_tricks.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/c_tricks.h new file mode 100755 index 0000000000000..0531625039312 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/c_tricks.h @@ -0,0 +1,21 @@ +//============================================================================== +// +// Copyright (c) 2020 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef C_TRICKS_H +#define C_TRICKS_H 1 + +#define CTRICKS_PASTER2(A, B) A##B +#define CTRICKS_PASTER(A, B) CTRICKS_PASTER2(A, B) + +#define STRINGIFY(x) #x +#define TOSTRING(x) STRINGIFY(x) + +#define PROBABLY(x) __builtin_expect(!(!(x)), 1) +#define YEAHRIGHT(x) __builtin_expect(!(!(x)), 1) + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/cc_pp.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/cc_pp.h new file mode 100755 index 0000000000000..c4363d8cb3e6f --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/cc_pp.h @@ -0,0 +1,26 @@ +//============================================================================== +// +// Copyright (c) 2020 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef CC_PP_H +#define CC_PP_H 1 + +/* + * C++ Preprocessor Definitions + */ + +#ifdef __cplusplus +#define EXTERN_C_BEGIN extern "C" { +#define EXTERN_C_END \ + } \ + ; +#else +#define EXTERN_C_BEGIN /* NOTHING */ +#define EXTERN_C_END /* NOTHING */ +#endif + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/check_hvx.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/check_hvx.h new file mode 100755 index 0000000000000..bd12354b0a314 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/check_hvx.h @@ -0,0 +1,35 @@ +//============================================================================== +// +// Copyright (c) 2022-2023 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. 
+// +//============================================================================== + +#include "cc_pp.h" +#include "macros_attribute.h" +#include "weak_linkage.h" + +#ifndef CHECK_HVX_H +#define CHECK_HVX_H 1 + +// +// This makes sure that we have an HVX context (or not). Does nothing on H2 or +// QuRT, but on x86, makes use of a TLS variable to do the check. +// + +#ifdef __hexagon__ + +static inline void check_hvx() {} +static inline void check_not_hvx() {} + +#else + +PUSH_VISIBILITY(default) +API_EXPORT void check_hvx(); +API_EXPORT void check_not_hvx(); +POP_VISIBILITY() + +#endif + +#endif // CHECK_HVX_H diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/const_extent_descriptor.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/const_extent_descriptor.h new file mode 100755 index 0000000000000..a7f50569eb471 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/const_extent_descriptor.h @@ -0,0 +1,207 @@ +//============================================================================== +// +// Copyright (c) 2023 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef CONST_EXTENT_DESCRIPTOR_H +#define CONST_EXTENT_DESCRIPTOR_H 1 + +#include +#include +#include +#include +#include "forward_classes.h" +#include "serialize_defs.h" +#include "pickle_header_tags.h" +#include "const_extent_shared.h" + +namespace hnnx { + +// This class is used, on both encoder and decoder, to contain a 'const extent descriptor' in its raw form, (just an array of uint32) +// and provide higher-level access to the contents. + +class ConstExtentDesc { + protected: + using table_t = std::vector; + // The 'table' may or may not contain the 'padding' section at the end; this is not accessed, + // and the serialize method will always generate the required padding. + table_t table; + // some values broken out from the header... + unsigned extab_n = 0, extab_idx = 0; // number of extents, and word index where they start + unsigned mptab_n = 0, mptab_idx = 0; // number of memory pools, and word index where they start. + unsigned desc_len = 0; // length of the entire descriptor in bytes (0 if invalid descriptor) + + bool scan_table(); // sanity check, and unpacks the above; returns true if OK. + + public: + static uint8_t constexpr EXTENT_FLAGS_BITFIELD_LSB = 8; + static uint8_t constexpr EXTENT_FLAGS_BITFIELD_WIDTH = 8; + + /// + /// @brief Values for 8b flags in extent record + /// + static uint8_t constexpr EXTENT_FLAG_RESERVED_0 = (1 << 0); + static uint8_t constexpr EXTENT_FLAG_RESERVED_1 = (1 << 1); + static uint8_t constexpr EXTENT_FLAG_RESERVED_2 = (1 << 2); + static uint8_t constexpr EXTENT_FLAG_RESERVED_3 = (1 << 3); + static uint8_t constexpr EXTENT_FLAG_IS_FAR_HINT = (1 << 4); ///< Contents maybe far + static uint8_t constexpr EXTENT_FLAG_RESERVED_5 = (1 << 5); + static uint8_t constexpr EXTENT_FLAG_RESERVED_6 = (1 << 6); + static uint8_t constexpr EXTENT_FLAG_RESERVED_7 = (1 << 7); + + // Return from 'extent_info'. + struct extab_entry { + uint32_t extent_flags; + uint32_t align; // a power of 2, >= 64 + uint64_t offset; // offset, in bytes, from the start of the descriptor, to where the data is. + uint64_t length; // length of the data in bytes. + }; + // Return from 'mempool_info'. 
+    // Note: if 'adjust_offset' is true, the 'offset' field from the containing extent will be added to offset,
+    // so that the offset is from the start of the descriptor, instead of the start of the containing extent.
+    struct mempool_entry {
+        uint32_t mempool_id; // a mempool id >=2 indicating a const mempool
+        uint32_t extent_id; // an extent_id, >=1
+        uint64_t offset; // offset in bytes of the data from the start of the extent (see note above)
+        uint64_t length; // length in bytes of the data
+    };
+    // optional name of the const_extent this descriptor corresponds to. Used for matching in weight_sharing.
+    std::string name = std::string{};
+
+    ConstExtentDesc() {}
+    ConstExtentDesc(table_t &&table_in);
+    void serialize(Serializer &) const;
+    inline bool load_table(table_t &&table_in)
+    {
+        table = std::move(table_in);
+        return scan_table();
+    }
+
+    constexpr bool is_valid() const { return desc_len != 0; }
+
+    constexpr unsigned descriptor_length() const { return desc_len; }
+
+    constexpr unsigned num_extents() const { return extab_n; }
+    constexpr unsigned num_mempools() const { return mptab_n; }
+
+    // unpack a row of the extent table
+    // NOTE: extent_id is 1-based, must be 1 .. num_extents()
+    extab_entry extent_info(unsigned extent_id) const;
+
+    // unpack a row of the mempool table.
+    // note: idx is not a mempool idx, it is a 1-based row in range 1...num_mempools();
+    // if adjust_offset, the offset of the containing extent is added to the offset
+    // of the mempool in the returned value.
+    mempool_entry mempool_info(unsigned idx, bool adjust_offset = false) const;
+
+    // The ordering of the data and the descriptors is such that:
+    //
+    //  (1) extent_info(1).offset >= descriptor_length()
+    //      mempool_info(1,true).offset >= descriptor_length()
+    //  (2) for i >= 2,
+    //      extent_info(i).offset >= extent_info(i-1).offset + extent_info(i-1).length
+    //      mempool_info(i,true).offset >= mempool_info(i-1,true).offset + mempool_info(i-1).length
+    //
+
+#if !defined(PREPARE_DISABLED)
+    ///
+    /// @brief Memory pool record iterator
+    /// @details Use to iterate over records in the memory pool table in a constant
+    /// extent descriptor
+    ///
+    class mempool_iterator {
+      public:
+        using iterator_category = std::input_iterator_tag;
+        using value_type = ConstExtentDesc::mempool_entry;
+        using difference_type = std::ptrdiff_t;
+        using pointer = value_type *;
+        using reference = value_type &;
+
+        ///
+        /// @brief Constructor
+        /// @param [in] cedesc A valid constant extent descriptor instance
+        /// @param [in] index Record index (zero-based!)
+        ///
+        explicit mempool_iterator(ConstExtentDesc const &cedesc, uint32_t index) : _cedesc(cedesc), _index(index) {}
+
+        ///
+        /// @brief Increment record
+        /// @return Iterator
+        ///
+        mempool_iterator &operator++()
+        {
+            // Increment IFF valid constant extent descriptor and mempool record
+            // index within range
+            _index += (_cedesc.is_valid() && (_index < _cedesc.mptab_n)) ? 
1 : 0; + return *this; + } + + /// + /// @brief Equality operator + /// @return true if iterators are equal + /// + bool operator==(mempool_iterator const &other) const { return _index == other._index; } + + /// + /// @brief Inequality operator + /// @return true if iterators are not equal + /// + bool operator!=(mempool_iterator const &other) const { return !(*this == other); } + + /// + /// @brief Dereference iterator + /// + reference operator*(); + + private: + /// + /// @brief Reference to a constant extent descriptor instance + /// @details It contains the blob representing constant extent segment + /// + ConstExtentDesc const &_cedesc; + + /// + /// @brief Current index + /// + uint32_t _index; + + /// + /// @brief Mempool record entry + /// @details It is assigned when on iterator dereference + /// + value_type _entry; + }; + + /// + /// @brief Return mempool iterator initialized to the first record + /// @return Mempool iterator + /// + mempool_iterator begin() { return mempool_iterator(*this, 0); } + + /// + /// @brief Return mempool iterator beyond the last record + /// @warning Intended to be used as a sentinel + /// @return Mempool iterator + /// + mempool_iterator end() { return mempool_iterator(*this, mptab_n); } +#endif +}; +#ifndef PREPARE_DISABLED +// Called at the end of serializing a graph, if 'const extent' mode is enabled. +// See comment in const_extent_descriptor.cc for full details. +// LCOV_EXCL_START [SAFTYSWCCB-1542] +size_t write_aligned_const_info(Graph const &gr, Serializer &sctx, unsigned buried_aux_n_words = 0); +#else +inline constexpr size_t write_aligned_const_info(Graph const &gr, Serializer const &sctx, unsigned = 0) +{ + return 0; +} +// LCOV_EXCL_STOP +#endif + +} // namespace hnnx + +#endif // CONST_EXTENT_DESCRIPTOR_H diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/const_extent_shared.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/const_extent_shared.h new file mode 100755 index 0000000000000..39c95e26ed561 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/const_extent_shared.h @@ -0,0 +1,81 @@ +//============================================================================== +// +// Copyright (c) 2024 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef CONST_EXTENT_SHARED_H_ +#define CONST_EXTENT_SHARED_H_ + +namespace hnnx { +// definitions pertaining to the 'const extent descriptor'. + +constexpr unsigned CONST_EXTENT_DESC_MAGIC = 0x71c43c9b; +// if a const extent descriptor has a 'cbname' in it, the last 32-bit slot +// is this value. The 0x3e, 0x00 is the ">\0" at the end of the cbname +constexpr unsigned CONST_EXTENT_CBNAME_TAG = 0xebbe003e; + +// This must be a power of 2, and >= 64. +// This is effectively a 'quiet' minimum on options.serialize_const_alignment, which sets +// the actual alignment. +// It is not necessary for the decoder to know what value of alignment was used in the encoder. 
+constexpr unsigned CONST_EXTENT_MIN_ALIGN = 256; +// +// this is a (non-quiet) maximum on options.serialize_const_alignment +constexpr unsigned CONST_EXTENT_MAX_ALIGN = 1024 * 1024; + +/// +/// @brief Size of const extent descriptor header +/// +constexpr unsigned CONST_EXTENT_HEADER_SIZE_WORDS = 4u; +constexpr unsigned CONST_EXTENT_HEADER_SIZE_BYTES = CONST_EXTENT_HEADER_SIZE_WORDS * 4u; + +/// +/// @brief Size of an extent record +/// @details Const extent descriptor contains a table of such records +/// +constexpr unsigned CONST_EXTENT_RECORD_SIZE_WORDS = 4u; +constexpr unsigned CONST_EXTENT_RECORD_SIZE_BYTES = CONST_EXTENT_RECORD_SIZE_WORDS * 4u; + +/// +/// @brief Offset of extent record table relative to const extent descriptor +/// @details Both byte and words offsets are listed +/// +constexpr unsigned CONST_EXTENT_RECORD_TAB_OFFSET_WORDS = 4u; +constexpr unsigned CONST_EXTENT_RECORD_TAB_OFFSET_BYTES = CONST_EXTENT_RECORD_TAB_OFFSET_WORDS * 4u; + +/// +/// @brief Size of mempool record in a const extent descriptor +/// @details Both byte and word sizes are provided +/// +constexpr unsigned CONST_EXTENT_MEMPOOL_RECORD_SIZE_WORDS = 4u; +constexpr unsigned CONST_EXTENT_MEMPOOL_RECORD_SIZE_BYTES = CONST_EXTENT_MEMPOOL_RECORD_SIZE_WORDS * 4u; + +// This function is used by deserializer to help it extract the extent-desc table (as a vector) from some +// arbitrary point down the pickle. Parameter is a pointer to the first 4 words; the return value is +// 0 if the first two words do not look like CEDesc header; +// n otherwise (where 'n' is the number of 32-bit words to extract). +// +inline unsigned const_extent_hdr_check(uint32_t const *const hdrp) +{ + if (hdrp[0] != CONST_EXTENT_DESC_MAGIC) return 0; + const unsigned word0 = hdrp[1]; + const unsigned hdr_len16 = word0 >> 24u; // units of 16 bytes + const unsigned desc_len64 = word0 & 0xFFFFFFu; // units of 64 bytes + const unsigned n_extent = hdrp[2] & 0xFFFFFFu; + const unsigned n_mempool = hdrp[3] & 0xFFFFFFu; + // no. of words actually needed + const unsigned desc_words = 4 * (hdr_len16 + n_extent + n_mempool); + + // note, n_extent == n_mempool == 0 is allowed. + if (hdr_len16 == 0 || desc_len64 == 0 || n_extent > n_mempool || desc_words > desc_len64 * 16) { + return -1; + } + return desc_words; +} + +} // namespace hnnx + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/constraints.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/constraints.h new file mode 100755 index 0000000000000..b30f7b8f5c871 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/constraints.h @@ -0,0 +1,121 @@ +//============================================================================== +// +// Copyright (c) 2020 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef CONSTRAINTS_H +#define CONSTRAINTS_H + +#include "interface_defs.h" +#include "op_def.h" + +#include +#include + +namespace constraint_lib { + +/** \defgroup OptConstraint Constraint Expressions for Optimization Rules + * \ingroup OptimizationFuncs + * + * @{ + */ +//! Find the chunksize of a given tensor type in a given dimension (a constant). +/// For instance, LAYOUT_CHUNKSIZE(QUint8CroutonTensor,3) gives size_t(32) +/// +#define LAYOUT_CHUNKSIZE(TYPENAME, IDX) (TYPENAME::layout.ChunkSizes[(IDX)]) + +// some convenience wrappers... + +//! 
IS_FLOAT16("operand") -> bool (true if operand has Float16 output) +#define IS_FLOAT16(X) EQ(DTYPE_OF(X), DType::Float16) + +//! IS_FLOAT32("operand") -> bool (true if operand has float output) +#define IS_FLOAT32(X) EQ(DTYPE_OF(X), DType::Float32) + +//! IS_FLOAT("operand") -> bool (alias of IS_FLOAT32) +#define IS_FLOAT(X) IS_FLOAT32(X) + +//! IS_QUINT8("operand") -> bool (true if operand has 'QUInt8' output) +#define IS_QUINT8(X) EQ(DTYPE_OF(X), DType::QUInt8) + +//! IS_QINT8("operand") -> bool (true if operand has 'QInt8' output) +#define IS_QINT8(X) EQ(DTYPE_OF(X), DType::QInt8) + +//! IS_QINT16("operand") -> bool (true if operand has 'QInt16' output) +#define IS_QINT16(X) EQ(DTYPE_OF(X), DType::QInt16) + +//! IS_QUINT16("operand") -> bool (true if operand has 'QUInt16' output) +#define IS_QUINT16(X) EQ(DTYPE_OF(X), DType::QUInt16) + +//! IS_QINT32("operand") -> bool (true if operand has 'QInt32' output) +#define IS_QINT32(X) EQ(DTYPE_OF(X), DType::QInt32) +//! IS_INT32("operand") -> bool (true if operand has 'Int32' output) +#define IS_INT32(X) EQ(DTYPE_OF(X), DType::Int32) + +//! IS_INT64("operand") -> bool (true if operand has 'Int64' output) +#define IS_INT64(X) EQ(DTYPE_OF(X), DType::Int64) + +//! IS_QUANT_TYPE("operand") -> bool (true if operand has 'Quantized' output) +#define IS_QUANT_TYPE(X) OR(IS_QUINT8(X), IS_QINT8(X), IS_QINT16(X), IS_QUINT16(X), IS_QINT32(X)) +//! IS_QUANT_SIGNED("operand") -> bool (true if operand has 'Signed Quantized' output) +#define IS_QUANT_SIGNED(X) OR(IS_QINT32(X), IS_QINT16(X), IS_QINT8(X)) +//! IS_SIGNED_SYMM("operand") -> bool (true if operand has 'Signed Quantized' output with offset == 0) +#define IS_SIGNED_SYMM(X) AND(IS_QUANT_SIGNED(X), EQ(ZERO_OFFSET_OF(X), 0)) + +// The problem with IS_SIGNED_SYMM is that it tends to get used as +// AND( IS_QINT8(X), IS_SIGNED_SYMM(X)) +// which expands to X.dtype==qint8 && ( (X.dtype ==qint32 || X.dtype == .. ) && X.zero_offs == 0) +// So, use IS_QINT8_SYMM(X) etc instead. + +//! IS_QINT8_SYMM("operand") -> bool (true if operand has QINT8 output with offset == 0) +#define IS_QINT8_SYMM(X) AND(IS_QINT8(X), EQ(ZERO_OFFSET_OF(X), 0)) +//! IS_QINT16_SYMM("operand") -> bool (true if operand has QINT16 output with offset == 0) +#define IS_QINT16_SYMM(X) AND(IS_QINT16(X), EQ(ZERO_OFFSET_OF(X), 0)) +//! IS_QINT32_SYMM("operand") -> bool (true if operand has QINT32 output with offset == 0) +#define IS_QINT32_SYMM(X) AND(IS_QINT32(X), EQ(ZERO_OFFSET_OF(X), 0)) + +//! IS_FULLY_CONNECT_WEIGHT("operand") -> bool (true if operand is QUInt8 or (QInt8 and symmetrically quantized)) +#define IS_FULLY_CONNECT_WEIGHT(X) OR(IS_QUINT8(X), IS_QINT8_SYMM(X)) + +//! IS_FLOAT16_BOTH("operand", "operand") -> bool (true if both operands are FP16 type) +#define IS_FLOAT16_BOTH(X, Y) AND(IS_FLOAT16(X), IS_FLOAT16(Y)) +//! IS_FLOAT16_ALL("operand", ...) -> bool (true if all operands are FP16 type) +#define IS_FLOAT16_ALL(...) IS_DTYPE_ALL(DType::Float16, __VA_ARGS__) +//! IS_FLOAT32_ALL("operand", ...) -> bool (true if all operands are FP32 type) +#define IS_FLOAT32_ALL(...) IS_DTYPE_ALL(DType::Float32, __VA_ARGS__) + +//! DIM_CHANNEL("operand") -> unsigned (extract depth dimension, #4) +#define DIM_CHANNEL(X) DIM_OF(X, 4) +//! DIM_DEPTH("operand") -> unsigned (extract depth dimension, #3) +#define DIM_DEPTH(X) DIM_OF(X, 3) +//! DIM_WIDTH("operand") -> unsigned (extract width dimension, #2) +#define DIM_WIDTH(X) DIM_OF(X, 2) +//! DIM_HEIGHT("operand") -> unsigned (extract height dimension, #1) +#define DIM_HEIGHT(X) DIM_OF(X, 1) +//! 
DIM_BATCHES("operand") -> unsigned (extract batches dimension, #0)
+#define DIM_BATCHES(X) DIM_OF(X, 0)
+
+//! DIM_NFILTS("operand") -> unsigned (extract 'output depth' dimension from filter weights, #3)
+#define DIM_NFILTS(X) DIM_OF(X, 3)
+//! DIM_FILTDEPTH("operand") -> unsigned (extract 'input depth' dimension from filter weights, #2)
+#define DIM_FILTDEPTH(X) DIM_OF(X, 2)
+//! DIM_FILTWIDTH("operand") -> unsigned (extract 'filter width' dimension from filter weights, #1)
+#define DIM_FILTWIDTH(X) DIM_OF(X, 1)
+//! DIM_FILTHEIGHT("operand") -> unsigned (extract 'filter height' dimension from filter weights, #0)
+#define DIM_FILTHEIGHT(X) DIM_OF(X, 0)
+
+#define MAX_SPARSE_ELEMENTS(X) DIM_OF(X, (MAX_DIMENSIONS - 1))
+
+//! IS_EMPTY_DIM("operand", dim) -> bool (true if size of dim is 0)
+#define IS_EMPTY_DIM(X, DIM) EQ(DIM_OF(X, DIM), 0)
+
+//! IS_EMPTY("operand") -> bool (true if size of all dims is 0)
+#define IS_EMPTY(X) AND(IS_EMPTY_DIM(X, 0), IS_EMPTY_DIM(X, 1), IS_EMPTY_DIM(X, 2), IS_EMPTY_DIM(X, 3))
+
+} // namespace constraint_lib
+/** @} */
+
+#endif
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/conversions.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/conversions.h
new file mode 100755
index 0000000000000..4cb348c637953
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/conversions.h
@@ -0,0 +1,609 @@
+//==============================================================================
+//
+// Copyright (c) 2018 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef CONVERSIONS_H
+#define CONVERSIONS_H
+
+#include
+#include
+#include
+#include
+#include
+
+#include "builtin_intrinsics.h"
+
+#ifdef __hexagon__
+#include "hexagon_protos.h"
+#endif
+
+#include "float16.h"
+
+#if defined(__clang__)
+#define ATTR_NO_SANITIZE(CATEGORY) __attribute__((no_sanitize(CATEGORY)))
+#else
+#define ATTR_NO_SANITIZE(CATEGORY) /*empty */
+#endif
+
+namespace hnnx {
+
+namespace scast {
+
+// for a given floating type F, and an integer type TI,
+//    intrange_within_float<F, TI>::max()
+// generates the largest value representable in type F which will fit into TI without overflow.
+// in many cases this is F(std::numeric_limits<TI>::max()),
+// but there are exceptions when the mantissa of F is narrower than TI; in those cases we
+// want the representable value which is smaller than the integer's max value, not the nearest:
+//      F        TI
+//      Float16  int16     32752.0                 (0x7ff0)
+//      Float16  uint16    65504.0                 (0xffe0)
+//      float    int32     2147483520.0            (0x7fffff80)
+//      float    uint32    4294967040.0            (0xFFFFFF00)
+//      float    int64     9.223371487e18          (0x7fff_ff80_0000_0000)
+//      float    uint64    1.844674297e+19         (0xFFFF_FF00__0000_0000)
+//      double   int64     9223372036854774784.0   (0x7FFF_FFFF_FFFF_FC00)
+//      double   uint64    18446744073709549568.0  (0xFFFF_FFFF_FFFF_F800)
+//
+// All of the 'min' limits are zero or powers of 2, so those can be converted
+// directly from std::numeric_limits<TI>::min()
+//
+//
+template <typename TF, typename TI> struct intrange_within_float {
+};
+
+// LCOV_EXCL_START [SAFTYSWCCB-1736] constexprs resolved during compile time
+template <typename TI> struct intrange_within_float<Float16, TI> {
+    static_assert(std::numeric_limits<TI>::is_integer);
+    static inline constexpr Float16 max()
+    {
+        if constexpr (sizeof(TI) < 2) {
+            return Float16(std::numeric_limits<TI>::max());
+        } else if constexpr (sizeof(TI) == 2) {
+            return std::numeric_limits<TI>::is_signed ? 
Float16(32752.0f) : Float16(65504.0f); + } else { + return std::numeric_limits::is_signed ? Float16(-65504.0f) : Float16(65504.0f); + } + } + // 'min' value of integer range is always exactly representable + static inline constexpr Float16 min() { return Float16(std::numeric_limits::min()); } +}; + +template struct intrange_within_float { + static_assert(std::numeric_limits::is_integer); + static inline constexpr float max() + { + if constexpr (sizeof(TI) < 4) { + return float(std::numeric_limits::max()); + } else if constexpr (sizeof(TI) == 4) { + return std::numeric_limits::is_signed ? 2147483520.0f : 4294967040.0f; + } else { + static_assert(sizeof(TI) == 8); + return std::numeric_limits::is_signed ? 9.223371487e18f : 1.844674297e+19f; + } + } + // 'min' value of integer range is always exactly representable + static inline constexpr float min() { return float(std::numeric_limits::min()); } +}; + +template struct intrange_within_float { + static_assert(std::numeric_limits::is_integer); + static inline constexpr double max() + { + if constexpr (sizeof(TI) < 8) { + return double(std::numeric_limits::max()); + } else { + static_assert(sizeof(TI) == 8); + return std::numeric_limits::is_signed ? 9223372036854774784.0 : 18446744073709549568.0; + } + } + // 'min' value of integer range is always exactly representable + static inline constexpr float min() { return double(std::numeric_limits::min()); } +}; +// LCOV_EXCL_STOP + +template struct satcast_helper { + static_assert(std::numeric_limits::is_specialized && std::numeric_limits::is_specialized); + static inline TOUT constexpr op(TIN val) + { + if constexpr (!std::numeric_limits::is_integer) { // convert to a float + return TOUT(val); + } else { + constexpr bool OUTS = std::numeric_limits::is_signed; + if constexpr (std::numeric_limits::is_integer) { + // integer to integer. + // widening? or same width, same signedness? + constexpr bool INS = std::numeric_limits::is_signed; + if (sizeof(TOUT) > sizeof(TIN) || (sizeof(TOUT) == sizeof(TIN) && OUTS == INS)) { + // if the output is unsigned and the input < 0, return 0 + // otherwise it's a normal cast. + return (!OUTS && INS && val < 0) ? TOUT(0) : TOUT(val); + } else if (sizeof(TOUT) == sizeof(TIN)) { + if (!OUTS) { // same size, different signs + return (val < 0) ? (TOUT)0 : (TOUT)val; // signed->unsigned + } else { + constexpr TIN lim = std::numeric_limits::max(); + return (val > lim) ? (TOUT)lim : (TOUT)val; + } + } else { + // narrowing conversion + if (!OUTS) { + constexpr TIN m = std::numeric_limits::max(); + return (val < 0) ? TOUT(0) : (val > m) ? TOUT(m) : TOUT(val); + } else { + constexpr TIN mn = INS ? std::numeric_limits::min() : 0; + constexpr TIN mx = std::numeric_limits::max(); + return (val < mn) ? TOUT(mn) : (val > mx) ? 
TOUT(mx) : TOUT(val); + } + } + } else { // float to integer + if constexpr (sizeof(TOUT) <= sizeof(int32_t)) { + if constexpr (OUTS) { + constexpr TIN loval = intrange_within_float::min(); + constexpr TIN hival = intrange_within_float::max(); + int32_t const tmp = (int32_t)std::max(loval, std::min(hival, val)); + return satcast_helper::op(tmp); + } else { + constexpr TIN loval = 0.0; + constexpr TIN hival = intrange_within_float::max(); + uint32_t const tmp = (uint32_t)std::max(loval, std::min(hival, val)); + return satcast_helper::op(tmp); + } + } else { // 64-bit output assumed + constexpr TIN loval = intrange_within_float::min(); + constexpr TIN hival = intrange_within_float::max(); + return (TOUT)std::max(loval, std::min(hival, val)); + } + } + } + } +}; +// specialize for conversion to same +template struct satcast_helper { + static_assert(std::numeric_limits::is_specialized); + static inline TT constexpr op(TT val) { return val; } +}; + +#ifdef __hexagon__ + +// saturate to types <= int. +template struct q6_sat_int { +}; +template <> struct q6_sat_int { + static inline int op(int x) { return Q6_R_satb_R(x); } +}; +template <> struct q6_sat_int { + static inline int op(int x) { return Q6_R_satub_R(x); } +}; +template <> struct q6_sat_int { + static inline int op(int x) { return Q6_R_sath_R(x); } +}; +template <> struct q6_sat_int { + static inline int op(int x) { return Q6_R_satuh_R(x); } +}; + +// TODO: these should be done again for 'long' if long is also 32 bits. +#if 0 // NOTE: we can't really do this unless intrinsics are constexpr +template <> struct satcast_helper { + static inline uint8_t /*constexpr*/ op(int val) + { + return Q6_R_satub_R(val); + } +}; +template <> struct satcast_helper { + static inline int8_t /*constexpr*/ op(int val) { return Q6_R_satb_R(val); } +}; +template <> struct satcast_helper { + static inline uint16_t /*constexpr*/ op(int val) + { + return Q6_R_satuh_R(val); + } +}; +template <> struct satcast_helper { + static inline int16_t /*constexpr*/ op(int val) { return Q6_R_sath_R(val); } +}; +#endif + +#endif +} // end namespace scast + +} // namespace hnnx + +/** + * @brief saturate_cast( TIN val ) will work on any two numeric types; + * if the input is outside the numeric range of the output type, it + * will be range-limited. + * + * it works as follows: + * * if TOUT is a floating type, the operation is the same as the C++ cast. + * * if TOUT is integer and TIN is float, the input is first converted + * to one of int32,uint32, int64, uint64 ensuring that out-of-range values + * are clipped; and then converted to the output type as below (if it is smaller + * than 32 bits) (The 2-step conversion is intended to work well when things + * are specialized to support native hexagon ops). + * * Otherwise they are both integers. + * - If the output width is larger than the input (or if they are the same size + * and of the same signedness): + * * if the output is unsigned, and the input is < 0, the result is zero + * * otherwise the result is the same as a C++ cast (all values representable) + * - Otherwise, it is a saturating cast; values are limited to the range of TOUT. + */ +template inline constexpr TOUT saturate_cast(TIN val) +{ + return hnnx::scast::satcast_helper::op(val); +} + +/** + * @brief T saturate_round( float val ) + * round val to nearest int, and saturate to range of T. + * + * T must be an integer type, at most 32 bits. 
+ */ +// For general C platform, we need to clip the range before converting to int; +// for hexagon the conversions saturate. +// +#ifndef __hexagon__ +template inline TOUT saturate_round(float val) +{ + static_assert(sizeof(TOUT) <= 8 && std::numeric_limits::is_integer); + return saturate_cast(std::nearbyintf(val)); +} + +#else +template inline TOUT saturate_round(float val) +{ + static_assert(sizeof(TOUT) <= 8 && std::numeric_limits::is_integer); + if constexpr ((sizeof(TOUT) == 8) && !std::numeric_limits::is_signed) { + // convert to unsigned u64, rounding, saturating + return Q6_P_convert_sf2ud_R(val); + } else if constexpr ((sizeof(TOUT) == 8) && std::numeric_limits::is_signed) { + // convert to int64, rounding + return Q6_P_convert_sf2d_R(val); + } else if constexpr ((sizeof(TOUT) == 4) && !std::numeric_limits::is_signed) { + // convert to unsigned u32, rounding, saturating + return Q6_R_convert_sf2uw_R(val); + } else { + // convert to int32,rounding; + int const r = Q6_R_convert_sf2w_R(val); + if constexpr (sizeof(TOUT) < 4) return static_cast(hnnx::scast::q6_sat_int::op(r)); + return static_cast(r); // LCOV_EXCL_LINE [SAFTYSWCCB-1736] + } +} +#endif + +namespace hnnx { + +/** + * @brief 'proper' compare of any two integer types + * proper_gt( a, b) => a > b; + * E.g. if a is unsigned and b is signed, the operation checks to see if b is < 0; + * if so, the result is true; otherwise an unsigned compare is done: a > (unsigned)b + * + */ +namespace prpercmp { + +/** + * @brief if both A and B are either *int*, or smaller than int, + * then promote them both to int and compare them. + * + * otherwise, if TA is wider than TB, (or the same, with TA unsigned): + * promote b to TA, and then compare them. + * Exception, if TA is unsigned and TB is signed and b < 0; then a struct proper_cmp_helper { + static_assert(std::numeric_limits::is_integer && std::numeric_limits::is_integer); + static const bool ASIGNED = std::numeric_limits::is_signed; + static const bool BSIGNED = std::numeric_limits::is_signed; + + // compare by promoting both to int, when... + static const bool CMP_AS_INT = (sizeof(TA) < sizeof(int) || (sizeof(TA) == sizeof(int) && ASIGNED)) && + (sizeof(TB) < sizeof(int) || (sizeof(TB) == sizeof(int) && BSIGNED)); + // otherwise, compare by promoting B to A when ... + static const bool B_TO_A = sizeof(TA) > sizeof(TB) || (sizeof(TA) == sizeof(TB) && !ASIGNED); + // otherwise, compare by promoting A to B + + static inline bool constexpr eq(TA a, TB b) + { + if (CMP_AS_INT) { + return (int)a == (int)b; + } else if (B_TO_A) { + if (!ASIGNED && BSIGNED && b < 0) return false; + return a == (TA)b; + } else { + if (!BSIGNED && ASIGNED && a < 0) return false; + return (TB)a == b; + } + } + static inline bool constexpr lt(TA a, TB b) + { + if (CMP_AS_INT) { + return (int)a < (int)b; + } else if (B_TO_A) { + if (!ASIGNED && BSIGNED && b < 0) return false; // a < b always false if b<0 + return a < (TA)b; + } else { + if (!BSIGNED && ASIGNED && a < 0) return true; // a < b always true if a<0 + return (TB)a < b; + } + } +}; +/** + * @brief specialize for comparison to same type + */ +template struct proper_cmp_helper { + static_assert(std::numeric_limits::is_integer); + static inline bool constexpr eq(T a, T b) { return a == b; } + static inline bool constexpr lt(T a, T b) { return a < b; } +}; + +} // end namespace prpercmp + +} // namespace hnnx + +/** + * @brief 'proper' compare of any two integer types, respecting signedness and actual numeric value. 
+ * proper_eq(a,b) => a == b; + * + * E.g. if a is signed and <0, and b is unsigned, result will always be false. + * + */ + +template inline bool constexpr proper_eq(TA a, TB b) +{ + return hnnx::prpercmp::proper_cmp_helper::eq(a, b); +} +/** + * @brief 'proper' compare of any two integer types, respecting signedness and actual numeric value + * proper_ne(a,b) => !proper_eq(a,b); + */ +template inline bool constexpr proper_ne(TA a, TB b) +{ + return !hnnx::prpercmp::proper_cmp_helper::eq(a, b); +} +/** + * @brief 'proper' compare of any two integer types, respecting signedness and actual numeric value + * proper_lt(a,b) => a inline bool constexpr proper_lt(TA a, TB b) +{ + return hnnx::prpercmp::proper_cmp_helper::lt(a, b); +} +/** + * @brief 'proper' compare of any two integer types, respecting signedness and actual numeric value + * proper_ge(a,b) => a>=b; + */ +template inline bool constexpr proper_ge(TA a, TB b) +{ + return !hnnx::prpercmp::proper_cmp_helper::lt(a, b); +} +/** + * @brief 'proper' compare of any two integer types, respecting signedness and actual numeric value + * proper_gt(a,b) => a>b; + */ +template inline bool constexpr proper_gt(TA a, TB b) +{ + return hnnx::prpercmp::proper_cmp_helper::lt(b, a); +} +/** + * @brief 'proper' compare of any two integer types, respecting signedness and actual numeric value + * proper_le(a,b) => a<=b; + */ +template inline bool constexpr proper_le(TA a, TB b) +{ + return !hnnx::prpercmp::proper_cmp_helper::lt(b, a); +} +/** + * @brief x >= lo && x < limit, using proper compares + */ +template inline bool constexpr proper_inrange(TA x, TB lo, TC limit) +{ + return proper_ge(x, lo) && proper_lt(x, limit); +} + +/** + * @brief x >= lo && x <= hi, using proper compares + */ +template inline bool constexpr proper_inrange_closed(TA x, TB lo, TC hi) +{ + return proper_ge(x, lo) && proper_le(x, hi); +} + +/** + * @brief find the 'width' of an unsigned value (# of bits needed to contain it) + * this is floor( log2(x))+1 + * (and 0 when x = 0) + * + */ +inline int constexpr binary_bitwidth(unsigned x) +{ + return (x == 0) ? 0 : (sizeof(unsigned) * 8 - HEX_COUNT_LEADING_ZERO(x)); +} +/** + * @brief find the 'width' of an unsigned long value (# of bits needed to contain it) + * this is floor( log2(x))+1 + * (and 0 when x = 0) + * + */ +inline int constexpr binary_bitwidth(unsigned long x) +{ + return (x == 0) ? 0 : (sizeof(unsigned long) * 8 - HEX_COUNT_LEADING_ZERO_UL(x)); +} +/** + * @brief find the 'width' of an unsigned long long value (# of bits needed to contain it) + * this is floor( log2(x))+1 + * (and 0 when x = 0) + * + */ +inline int constexpr binary_bitwidth(unsigned long long x) +{ + return (x == 0) ? 
0 : (sizeof(unsigned long long) * 8 - HEX_COUNT_LEADING_ZERO_ULL(x)); +} +/** + * @brief saturating u32+u32 add + */ +inline uint32_t /*constexpr*/ addu32_sat(uint32_t a, uint32_t b) +{ + uint64_t const prod = (uint64_t)a + b; + return saturate_cast(prod); +} + +/** + * @brief saturating i32+i32 add + */ +inline int32_t /*constexpr*/ addi32_sat(int32_t a, int32_t b) +{ +#ifdef __hexagon__ + return Q6_R_add_RR_sat(a, b); +#else + int64_t prod = (int64_t)a + b; + return saturate_cast(prod); +#endif +} + +/** + * @brief saturating u32xu32 multiply + */ +inline uint32_t constexpr mulu32_sat(uint32_t a, uint32_t b) +{ + uint64_t const prod = (uint64_t)a * b; + return saturate_cast(prod); +} + +/** + * @brief saturating i32xi32 multiply + */ +inline int32_t constexpr muli32_sat(int32_t a, int32_t b) +{ + int64_t const prod = (int64_t)a * b; + return saturate_cast(prod); +} + +/** + * @brief saturating u64xu64 multiply + */ +inline uint64_t /*constexpr*/ mulu64_sat(uint64_t a, uint64_t b) +{ + uint64_t prod = 0; + if (HEX_MUL_OVERFLOW(a, b, &prod)) { + prod = std::numeric_limits::max(); + } + return prod; +} + +/** + * @brief saturating i64xi64 multiply + */ +inline int64_t /*constexpr*/ muli64_sat(int64_t a, int64_t b) +{ + int64_t prod = 0; + if (HEX_MUL_OVERFLOW(a, b, &prod)) { + prod = (int64_t(uint64_t(a) ^ uint64_t(b)) >= 0) ? std::numeric_limits::max() + : std::numeric_limits::min(); + } + return prod; +} +/** + * @brief add unsigned+unsigned->unsigned, escaping 'unsigned overflow' checks + */ +ATTR_NO_SANITIZE("unsigned-integer-overflow") +inline unsigned constexpr addu32_modular(unsigned a, unsigned b) +{ + return a + b; +} +/** + * @brief subtract unsigned-unsigned->unsigned, escaping 'unsigned overflow' checks + * For '-unsigned_var', use subu32_modular(0,unsigned_var) + */ +ATTR_NO_SANITIZE("unsigned-integer-overflow") +inline unsigned constexpr subu32_modular(unsigned a, unsigned b) +{ + return a - b; +} +/** + * @brief multiply unsigned*unsigned->unsigned, escaping 'unsigned overflow' checks + */ +ATTR_NO_SANITIZE("unsigned-integer-overflow") +inline unsigned constexpr mulu32_modular(unsigned a, unsigned b) +{ + return a * b; +} +/** + * @brief mul-add u32*u32+u32->u32, escaping 'unsigned overflow' checks + */ +ATTR_NO_SANITIZE("unsigned-integer-overflow") +inline unsigned constexpr muladdu32_modular(unsigned a, unsigned b, unsigned c) +{ + return a * b + c; +} + +/** + * @brief add u64+u64->u64, escaping 'unsigned overflow' checks + */ +ATTR_NO_SANITIZE("unsigned-integer-overflow") +inline uint64_t constexpr addu64_modular(uint64_t a, uint64_t b) +{ + return a + b; +} + +/** + * @brief subtract u64-u64->u64, escaping 'unsigned overflow' checks + */ +ATTR_NO_SANITIZE("unsigned-integer-overflow") +inline uint64_t constexpr subu64_modular(uint64_t a, uint64_t b) +{ + return a - b; +} +/** + * @brief mul u64*u64->u64, escaping 'unsigned overflow' checks + */ +ATTR_NO_SANITIZE("unsigned-integer-overflow") +inline uint64_t constexpr mulu64_modular(uint64_t a, uint64_t b) +{ + return a * b; +} + +/** + * @brief 'image' conversion from TIN to TOUT (which must be the same size) + * e.g. image_convert( 1.25f) -> 0x3fa00000 + */ + +template inline constexpr TOUT image_convert(TIN x) +{ + static_assert(sizeof(TOUT) == sizeof(TIN)); + static_assert(std::is_trivially_copyable_v); + static_assert(std::is_trivially_copyable_v); + static_assert(std::is_trivially_constructible_v); + TOUT out; + std::memcpy(&out, &x, sizeof(TOUT)); + return out; +} + +// round up A to a multiple of B. 
+/**
+ * @brief 'image' conversion from TIN to TOUT (which must be the same size)
+ * e.g. image_convert<unsigned>( 1.25f) -> 0x3fa00000
+ */
+
+template <typename TOUT, typename TIN> inline constexpr TOUT image_convert(TIN x)
+{
+    static_assert(sizeof(TOUT) == sizeof(TIN));
+    static_assert(std::is_trivially_copyable_v<TIN>);
+    static_assert(std::is_trivially_copyable_v<TOUT>);
+    static_assert(std::is_trivially_constructible_v<TOUT>);
+    TOUT out;
+    std::memcpy(&out, &x, sizeof(TOUT));
+    return out;
+}
+
+// round up A to a multiple of B.
+// b is expected to be > 0 even if signed.
+
+template <typename TD> inline constexpr size_t round_up(size_t a, TD b)
+{
+    static_assert(std::is_integral_v<TD>, "round_up can only apply to integer types");
+    // for b being a power of 2, this should compile as (a+(b-1)) &~(b-1)
+    return b * ((a + (b - 1)) / b);
+}
+// for int, b is expected to be > 0;
+// this will work for negative a, e.g. round_up(-53,10) -> -50
+template <typename TD> inline constexpr size_t round_up(int a, TD b)
+{
+    static_assert(std::is_integral_v<TD>, "round_up can only apply to integer types");
+    int const bi = b;
+    int const tmp = a + ((a > 0) ? (bi - 1) : 0);
+    return bi * (tmp / bi);
+}
+
+#endif /*CONVERSIONS_H*/
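A sketch of the two helpers above, under the same include assumption. Note the `int` overload of `round_up` declares a `size_t` return type, so a negative result must be converted back to `int` by the caller:

```cpp
#include <cassert>
#include "conversions.h" // assumption: the header under review

int main()
{
    // image_convert reinterprets the bit pattern: IEEE-754 1.25f is 0x3FA00000.
    assert(image_convert<unsigned>(1.25f) == 0x3fa00000u);
    // Round up to a multiple: 13 -> 16 for b = 8 (power of two folds to a mask).
    assert(round_up(size_t(13), 8) == 16);
    // The int overload truncates toward zero, so round_up(-53,10) yields -50,
    // delivered through the size_t return type.
    assert(int(round_up(-53, 10)) == -50);
    return 0;
}
```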
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/cost.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/cost.h
new file mode 100755
index 0000000000000..8f0b21ccb86e5
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/cost.h
@@ -0,0 +1,38 @@
+//==============================================================================
+//
+// Copyright (c) 2020 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef COST_H
+#define COST_H 1
+
+// NOTE: WHATCOST may be something like SNAIL/128
+#define COST_OF(FUNC, WHATCOST) COST_OF_OP(typename DerivedType<(FUNC)>::type, WHATCOST)
+#define COST_OF_F(FUNC, WHATCOSTFN) COST_OF_OP_F(typename DerivedType<(FUNC)>::type, WHATCOSTFN)
+
+#ifdef PREPARE_DISABLED
+#define COST_OF_OP(OP, WHATCOST)
+#define COST_OF_OP_F(OP, WHATCOSTFN)
+#else
+#define COST_OF_OP(OP, WHATCOST)                                                                   \
+    template <> [[maybe_unused]] constexpr hnnx::cost_function_t hnnx::get_costf<OP>()             \
+    {                                                                                              \
+        return hnnx::cost_function_t(float(StandardCosts::WHATCOST));                              \
+    }
+
+#define COST_OF_OP_F(OP, WHATCOSTFN)                                                               \
+    template <>                                                                                    \
+    float hnnx::cost_function_t::cfunc<OP>(hnnx::cost_function_t const &, const Graph &graph_in,   \
+                                           const Op *op)                                           \
+    {                                                                                              \
+        return WHATCOSTFN(graph_in, op);                                                           \
+    }                                                                                              \
+    template <> [[maybe_unused]] constexpr hnnx::cost_function_t hnnx::get_costf<OP>()             \
+    {                                                                                              \
+        return hnnx::cost_function_t(hnnx::cost_function_t::cfunc<OP>, 1.0);                       \
+    }
+#endif
+
+#endif
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/cost_funcs.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/cost_funcs.h
new file mode 100755
index 0000000000000..286945b9b34b8
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/cost_funcs.h
@@ -0,0 +1,56 @@
+//=============================================================================
+//
+// Copyright (c) 2020 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//============================================================================
+
+#ifndef COST_FUNCS_H
+#define COST_FUNCS_H
+#include <string_view>
+#include <utility>
+#include "weak_linkage.h"
+#include "macros_attribute.h"
+PUSH_VISIBILITY(default)
+
+class Graph;
+class Op;
+
+namespace hnnx {
+
+class API_EXPORT cost_function_t {
+    using inner_func_t = float (*)(cost_function_t const &, const Graph &, Op const *);
+    inner_func_t funcp;
+    float val;
+
+  public:
+    cost_function_t(cost_function_t const &) = default;
+    cost_function_t &operator=(cost_function_t const &) = default;
+    constexpr explicit cost_function_t(float val_in) : funcp(simple_cost_function), val(val_in) {}
+    constexpr cost_function_t(inner_func_t f, float val_in) : funcp(f), val(val_in) {}
+    constexpr cost_function_t() noexcept : funcp(simple_cost_function), val(0.0f) {}
+
+    inline float operator()(const Graph &graph_in, Op const *op) const { return (*funcp)(*this, graph_in, op); }
+    static float simple_cost_function(cost_function_t const &self, const Graph &, Op const *)
+    {
+        return self.val;
+    } // just returns val;
+
+    float get_val() const { return val; }
+
+    // unreliable compare for two cost func: returns -1,0,1 if this cost
+    // is <,=,> than rhs cost, with the second result being true; or <0,false>
+    // if it can't tell.
+    std::pair<int, bool> compare(cost_function_t const &rhs) const;
+
+    template <typename OP> static float cfunc(cost_function_t const &, const Graph &, Op const *);
+};
+
+API_EXPORT cost_function_t cost_func_from_str(std::string_view);
+
+} // namespace hnnx
+
+POP_VISIBILITY()
+
+#endif
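A hedged usage sketch for `cost_function_t`: a constant-valued cost just stores a float, and `operator()` dispatches through `funcp` (here to `simple_cost_function`). `Graph` and `Op` are the SDK's types; only a reference and a pointer are needed:

```cpp
#include "cost_funcs.h" // assumption: the header under review

float query_cost(const Graph &g, const Op *op)
{
    // A constant-valued cost function: operator() just returns the stored value.
    hnnx::cost_function_t const fixed_cost(16.0f);
    return fixed_cost(g, op); // == 16.0f, via simple_cost_function
}
```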
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/crate.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/crate.h
new file mode 100755
index 0000000000000..494f51e40fa0f
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/crate.h
@@ -0,0 +1,471 @@
+//==============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+/*
+ * crate.h
+ *
+ *  Created on: Aug 1, 2019
+ *      Author: smithg
+ */
+
+#ifndef CRATE_H_
+#define CRATE_H_
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <new>
+#include <stdexcept>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "is_detected.h"
+#include "forward_classes.h"
+#include "macros_attribute.h"
+#include "weak_linkage.h"
+#include "size_align_code.h"
+
+PUSH_VISIBILITY(default)
+
+class Graph;
+class Tensor;
+
+/// @brief A 'Crate' allows construction of some number of different data types,
+/// contiguously packed into a few large memory blocks.
+///
+/// Example:
+///
+///    Crate crt;
+///    Thing *tp = crt.emplace<Thing>( ... ctor parms for Thing ... );
+///    AnotherThing *tp2 = crt.emplace<AnotherThing>( ... ctor parms for AnotherThing ... );
+///
+/// When the crate is cleared, all of the contained objects are destroyed in the reverse
+/// order. You cannot 'remove' a single entry:
+///
+///    crt.erase has been deprecated
+///
+/// (Even when it was available, erase would likely not free any memory; it would just call the dtor
+/// of the object, and make sure it doesn't get called later, when the Crate is cleared or destroyed.)
+///
+/// You can also emplace variable-sized arrays of trivially-destructible objects.
+///
+/// alloc_array does not initialize:
+///
+///    float *farr = crt.alloc_array<float>(n);
+///
+/// alloc_array_zero does zero-initializing:
+///
+///    int *iarr = crt.alloc_array_zero<int>(n);
+///
+/// If an allocation needs space larger than CHUNKBYTES, it will get its own chunk.
+///
+// Each record containing an object has a non-null 'dtor' field; if the object is trivially destructible,
+// this will be (dtor_funcp)1, and the object is not on the linked-list.
+//
+// note:
+// A constructor may emplace additional records in the crate recursively. Likewise,
+// it's OK if the dtors call erase() on other objects. If this happens during a 'clear',
+// the erase calls are ignored since the other objects are going to get dtor'd anyhow (if they have not
+// been already).
+// Important: if object A's constructor places B into the crate, then B will very likely get destroyed
+// first when the crate is cleared. Thus, A's destructor can't look at B (it can erase B, which is ignored
+// as described above).
+
+//
+// new 'raw' mode:
+//  - when the crate is in 'raw' mode, no destructors are registered. inserting an object
+//    increases 'alloc_count' in the chunk header, but does not increment 'nrec', nor
+//    does it increase Crate::m_records.
+//  - raw mode is entered by enable_raw_mode(size_needed):
+//    which does this in addition to enabling raw mode:
+//      - if there is no current chunk, or if the current chunk doesn't have room for 'size_needed' bytes,
+//        a new chunk is added which does.
+//      - enable_raw_mode(size_needed) returns a chunk handle.
+//
+// Internally, raw_mode causes add_record_slot() to do the same thing, but it only moves alloc_count, it does
+// not assign a slot index, and 'idx' is -1 in the returned struct.
+// All callers of add_record_slot() *must* check for raw mode (can be done by checking idx < 0), and then avoid
+// adding a dtor or doing '++m_records'.
+//
+// it's also possible to call .enable_raw_mode(), disable_raw_mode()
+// but .enable_raw_mode() does nothing if there isn't at least one chunk allocated.
+//
+
+namespace hnnx {
+
+//
+// This is used to statically determine whether a type T has a clear(Graph *)
+// method. This is used as an additional destructor which takes the Graph.
+//
+
+template <typename T> using clear_t = decltype(std::declval<T>().clear(std::declval<Graph *>()));
+
+template <typename T> constexpr bool has_clear = is_detected_v<clear_t, T>;
+
+class Deserz;
+class DCrate;
+
+class Crate {
+    API_EXPORT static constexpr size_t CHUNKBYTES = (1 << 16);
+    static_assert(CHUNKBYTES % 8 == 0 && CHUNKBYTES >= 128);
+    typedef void (*dtor_funcp)(Graph *graph_in, void *);
+    API_EXPORT static dtor_funcp DTOR_TRIVIAL() { return (dtor_funcp)1; }
+    API_EXPORT static dtor_funcp DTOR_IN_PROCESS() { return (dtor_funcp)2; }
+
+    //! A record in the index of a chunk
+    struct index_rec {
+        unsigned loc; ///< offset in bytes to the object
+        dtor_funcp
+                dtor; ///< pointer to dtor function (null if empty record; DTOR_TRIVIAL if the object has trivial dtor)
+    };
+    //! A chunk record in the crate.
+    ///
+    /// Each chunk is created as an array of uint64_t, via make_unique
+    /// The memory in a chunk has a chunkhdr, which is followed by:
+    ///
+    ///    [Objects][Objects][Objects]-->   free space   <--[Index records]
+    ///
+    /// 'alloc_count' is the next offset available to be allocated.
+    /// index records are entered in reverse order from the end. So, the last nrec*sizeof(index_rec)
+    /// bytes of the area, are the index.
+    ///
+    typedef std::unique_ptr<uint64_t[]> uptr_chunk_t;
+    struct chunkhdr;
+    API_EXPORT static chunkhdr *hdr_of(uptr_chunk_t &p) { return reinterpret_cast<chunkhdr *>(p.get()); }
+    API_EXPORT static chunkhdr const *hdr_of(uptr_chunk_t const &p)
+    {
+        return reinterpret_cast<chunkhdr const *>(p.get());
+    }
+    /// The chunkhdr is the first portion of the chunk, and is immediately followed
+    /// by data_len bytes, which is a multiple of 8.
+    struct API_EXPORT alignas(8) chunkhdr {
+        unsigned data_len;    ///< length of the data area following header, bytes (>=CHUNKBYTES).
+        unsigned nrec;        ///< records in use (including deleted ones)
+        unsigned alloc_count; ///< offset of first byte in 'free space'
+        // init to a given length (header not included)
+        void init(unsigned length)
+        {
+            data_len = length;
+            nrec = 0;
+            alloc_count = 0;
+        }
+        // reset (preserve data_len)
+        void init()
+        {
+            nrec = 0;
+            alloc_count = 0;
+        }
+        // pointer to 'offs' within data area
+        inline uint8_t *get_ptr(unsigned offs) { return (uint8_t *)(this + 1) + offs; }
+        // pointer to end of the allocation
+        inline uint8_t *get_end_ptr() { return (uint8_t *)(this + 1) + data_len; }
+        // amount of space remaining
+        inline size_t space_avail() const { return data_len - alloc_count - nrec * sizeof(index_rec); }
+        // get pointer to an index record.
+        // record 0 is the last (oldest) one.
+        index_rec *index_p(int idx) { return (index_rec *)get_end_ptr() - (idx + 1); }
+        static uptr_chunk_t allocate(unsigned len);
+    };
+    std::vector<uptr_chunk_t> m_chunks; ///< chunks with data
+    std::vector<uptr_chunk_t> m_free;   ///< chunks without
+    typedef std::vector<uptr_chunk_t>::iterator chunk_iter;
+
+    bool m_rawmode = false;
+    bool m_clearing = false; ///< set while clearing.
+    size_t m_allrecords = 0; ///< includes removed and 'padding' records
+    size_t m_records = 0;    ///< only actual, non-erased records.
+
+    //! Returned from add_record_slot (which is used to create a new record)
+    struct recposn {
+        chunkhdr *chunkp; ///< the chunk in which it was found
+        void *objp;       ///< pointer to the object
+        int idx;          ///< index within the chunk (= -1 if insert was done in raw mode)
+    };
+    API_EXPORT recposn add_record_slot(size_t bytes, size_t align);
+    API_EXPORT void recover_ctor_throw(recposn const &) noexcept;
+    API_EXPORT void install_dtor(recposn const &, dtor_funcp dtor_func);
+    API_EXPORT void move_to_free(chunk_iter chunk_to_free);
+
+  public:
+    class ChunkHandle {
+        friend class Crate;
+        chunkhdr *chunkp;
+
+      protected:
+        ChunkHandle(chunkhdr *cp) : chunkp(cp){};
+
+      public:
+        ChunkHandle() : chunkp(nullptr) {} // null handle may only be assigned-to
+        ChunkHandle(ChunkHandle const &) = default;
+        ChunkHandle &operator=(ChunkHandle const &) = default;
+        friend inline bool operator==(ChunkHandle const &a, ChunkHandle const &b) { return a.chunkp == b.chunkp; }
+        std::pair<void *, size_t> get_memory_extent() const
+        {
+            size_t const len = chunkp->get_ptr(chunkp->alloc_count) - (uint8_t *)chunkp;
+            return {chunkp, len};
+        }
+    };
+
+    API_EXPORT Crate(); ///< Construct a new Crate
+    Crate(Crate const &) = delete;
+    Crate &operator=(Crate const &) = delete;
+
+    // get the preload handle for the first chunk
+    ChunkHandle first_chunk_handle() const
+    {
+        return ChunkHandle(m_chunks.empty() ? nullptr : hdr_of(const_cast<Crate &>(*this).m_chunks.front()));
+    }
+    // get the preload handle for the most recent chunk
+    ChunkHandle last_chunk_handle() const
+    {
+        return ChunkHandle(m_chunks.empty() ? nullptr : hdr_of(const_cast<Crate &>(*this).m_chunks.back()));
+    }
+    // 'raw mode'
+    ChunkHandle enable_raw_mode(unsigned bytes_needed);
+    API_EXPORT void enable_raw_mode();
+    void disable_raw_mode() { m_rawmode = false; }
+    bool raw_mode() const { return m_rawmode; }
+
+    // Note that the destructor doesn't do anything. You have to call clear() manually.
+    API_EXPORT ~Crate();
+    //! The number of objects in the crate.
+    size_t size() const { return m_records; }
+    //! The number of chunks in use
+    size_t chunk_count() const { return m_chunks.size(); }
+    //! The amount of space left in the current chunk, approximately.
+    /// DO NOT CALL unless chunk_count() > 0
+    size_t current_chunk_space_remain() const { return hdr_of(this->m_chunks.back())->space_avail(); }
+    //! Delete all objects. Does not necessarily free all storage to the
+    /// system; but all retained storage is available for re-use in the crate.
+    /// Note that this is no longer called by the destructor - it must be called explicitly.
+    API_EXPORT void clear(Graph *graph_in);
+    // Special entry for deserializing in segments.
+    // If it is possible to allocate, in current raw-mode chunk, everything from offset 'start'
+    // up to but not including 'limit', this is done, and the base address of that region is returned.
+    // otherwise does nothing and returns null.
+    API_EXPORT void *allocate_bulk(size_t start, size_t limit);
+
+    //! Construct an object of type T into the crate, using the
+    /// parameters of any constructor of T. It is acceptable for the
+    /// constructor to call the emplace method to add other objects to
+    /// the crate.
+    template <typename T, typename... Args> API_HIDDEN T *emplace(Args &&...args)
+    {
+        recposn const pos = add_record_slot(sizeof(T), alignof(T));
+        // construct the object
+        if constexpr (std::is_nothrow_constructible<T, Args...>::value) {
+            new (pos.objp) T(std::forward<Args>(args)...);
+        } else {
+            try {
+                new (pos.objp) T(std::forward<Args>(args)...);
+            } catch (const std::exception &e) {
+                recover_ctor_throw(pos);
+                throw;
+            }
+        }
+        if (pos.idx >= 0) {
+            // register destructor
+            if constexpr (!std::is_trivially_destructible<T>::value) {
+                // Obtain a callable '~T()' function.
+                // this typically generates a jump, or a small inline; lambda can
+                // be cast to a function pointer since it has no state.
+                auto dtor_func = [](Graph *graph_in, void *obj) {
+                    if constexpr (has_clear<T>) {
+                        static_cast<T *>(obj)->clear(graph_in);
+                    }
+                    static_cast<T *>(obj)->~T();
+                };
+                install_dtor(pos, (dtor_funcp)dtor_func);
+            } else {
+                ++m_records; // note, install_dtor does this too.
+            }
+        }
+        return static_cast<T *>(pos.objp);
+    }
+
+    using deserialize_op_func = void *(*)(void *, Deserz &);
+    using deserialize_dtor_func = void (*)(Graph *, void *);
+
+    // Alternate interface to cut down on template instantiations:
+    // init_func is used to initialize the memory, and dtor_func
+    // is used to register the destructor. It's up to the user
+    // to provide the correct size and alignment.
+
+    API_EXPORT void *emplace_explicit(Deserz &dctx, deserialize_op_func init_func, deserialize_dtor_func dtor_func,
+                                      size_align_code_t size_al);
+
+    //! Allocate 'n' of type T in the crate.
+    /// Will initially be garbage; T must be trivially destructible (unless waived)
+    template <typename T, bool DTOR_OK = false> T *alloc_array(size_t n)
+    {
+        static_assert(DTOR_OK || std::is_trivially_destructible<T>::value);
+        if (n == 0) return nullptr;
+        recposn const pos = add_record_slot(sizeof(T) * n, alignof(T));
+        if (pos.idx >= 0) m_records++;
+        return static_cast<T *>(pos.objp);
+    }
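A usage sketch for `Crate` under the documented contract (`clear()` must be called explicitly; the destructor does nothing). `Point` is a hypothetical payload type:

```cpp
#include "crate.h" // assumption: the header under review

struct Point {
    float x, y;
    Point(float xi, float yi) : x(xi), y(yi) {}
};

void crate_demo(Graph *g)
{
    hnnx::Crate crt;
    // Objects are packed into the crate's current chunk:
    Point *const p = crt.emplace<Point>(1.0f, 2.0f);
    // Uninitialized array of trivially-destructible elements:
    float *const scratch = crt.alloc_array<float>(64);
    (void)p;
    (void)scratch;
    crt.clear(g); // dtors run in reverse order; ~Crate() does NOT do this
}
```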
+    //! Allocate 'n' of type T in the crate.
+    /// Will be zero-filled; T must be trivially destructible.
+    template <typename T> T *alloc_array_zero(size_t n)
+    {
+        T *const res = alloc_array<T>(n);
+        if (n != 0) ::memset(res, 0, sizeof(T) * n);
+        return res;
+    }
+    //! Allocate 'n' of type T in the crate.
+    /// Will be "value constructed"; in case of things like int and pointer,
+    /// this means they will be zeroed.
+    ///
+    /// T must be trivially destructible.
+    template <typename T> T *alloc_array_value(size_t n)
+    {
+        T *res = alloc_array<T>(n);
+        if (n != 0) std::uninitialized_value_construct_n(res, n);
+        return res;
+    }
+};
+
+/*
+ * EJP: This seems silly, but I don't know how to get visibility into Graph into a templated Tensor because of include hell.
+ */
+
+API_EXPORT Crate *graph_crate(Graph &graph_in);
+
+//
+// replacement for std::vector<T>, for use in ops;
+//
+// limited options for constructor:
+//   (1) copy, or move, from std::vector<T> - need Graph *;
+//   (2) create with a given size, null-initialized; - need Graph *;
+//   (3) create empty, and then fill in later
+//       using init( Graph* , std::vector<T> const &)
+//       or init( Graph* , std::vector<T> &&)
+//       or init( Graph *, size )
+//       or init( Graph *, T const *ptr, size );
+//       or init_move( Graph *, T *ptr, size );
+//
+// With option 3, it is assumed that the 'init' is done during the constructor of
+// a host object - this is needed during deserialize, for instance.
+// the 'len' is 32 bits so this type occupies 2 pointers, vs. 3 for std::vector.
+//
+// If 'T' has a destructor, the cratevec's destructor will invoke that on
+// each element of the vector, in reverse order.
+// when the 'move-from' mechanisms to init from 'std::vector<T> &&' are used,
+// the supplied vector will not be cleared; but its elements will all be
+// 'moved-from'.
+
+template <typename T> class cratevec {
+    T *m_ptr;
+    unsigned m_len;
+    using vec_t = std::vector<T>;
+    static constexpr bool need_dtor = !std::is_trivially_destructible<T>::value;
+
+  public:
+    using iterator = T *;
+    using const_iterator = T const *;
+    using value_type = T;
+    using size_type = size_t;
+    using difference_type = ptrdiff_t;
+    using reference = T &;
+    using const_reference = T const &;
+
+    cratevec() : m_ptr(nullptr), m_len(0) {}
+    cratevec(Graph *g, vec_t const &v) : cratevec()
+    {
+        if (!v.empty()) init(g, v.data(), v.size());
+    }
+    cratevec(Graph *g, vec_t &&v) : cratevec()
+    {
+        if (!v.empty()) init_move(g, v.data(), v.size());
+    }
+    cratevec(Graph *g, size_t n) : cratevec() { init(g, n); }
+    cratevec(cratevec const &) = delete;
+    cratevec(cratevec &&) = delete;
+    ~cratevec()
+    {
+        if constexpr (need_dtor) {
+            if (m_len > 0) {
+                T *const ptr0 = m_ptr;
+                T *ptr = ptr0 + m_len;
+                do {
+                    ptr--;
+                    ptr->~T();
+                } while (ptr > ptr0);
+            }
+        }
+    }
+
+    cratevec &operator=(cratevec const &) = delete;
+    cratevec &operator=(cratevec &&) = delete;
+
+    void init(Graph *g, T const *data, size_t n)
+    {
+        assert(m_len == 0);
+        if (n) {
+            m_ptr = graph_crate(*g)->alloc_array<T, true>(n);
+            std::uninitialized_copy_n(data, n, m_ptr);
+            m_len = n;
+        }
+    }
+    void init_move(Graph *g, T *data, size_t n)
+    {
+        assert(m_len == 0);
+        if (n) {
+            m_ptr = graph_crate(*g)->alloc_array<T, true>(n);
+            std::uninitialized_move_n(data, n, m_ptr);
+            m_len = n;
+        }
+    }
+    // these methods get used during deserialize, so allow it to pass crate in directly.
+    void init(hnnx::Crate *const crate_p, size_t const n)
+    {
+        assert(m_len == 0);
+        if (n) {
+            m_ptr = crate_p->alloc_array<T, true>(n);
+            std::uninitialized_value_construct_n(m_ptr, n);
+            m_len = n;
+        }
+    }
+    // The DCrate version is defined in dcrate_inlines.h
+    void init(hnnx::DCrate *crate_p, size_t n);
+
+    void init(Graph *const g, size_t const n) { init(graph_crate(*g), n); }
+    void init(Graph *const g, vec_t const &v) { init(g, v.data(), v.size()); }
+    void init(Graph *const g, vec_t &&v) { init_move(g, v.data(), v.size()); }
+
+    iterator begin() noexcept { return m_ptr; }
+    iterator end() noexcept { return m_ptr + m_len; }
+    const_iterator begin() const noexcept { return m_ptr; }
+    const_iterator end() const noexcept { return m_ptr + m_len; }
+    const_iterator cbegin() const noexcept { return m_ptr; }
+    const_iterator cend() const noexcept { return m_ptr + m_len; }
+    size_type size() const noexcept { return m_len; }
+    T *data() noexcept { return m_ptr; }
+    T const *data() const noexcept { return m_ptr; }
+    bool empty() const noexcept { return m_len == 0; }
+    reference operator[](size_type idx) { return m_ptr[idx]; }
+    const_reference operator[](size_type idx) const { return m_ptr[idx]; }
+    reference at(size_type idx)
+    {
+        if (idx >= m_len) throw std::range_error("cratevec");
+        return m_ptr[idx];
+    }
+    const_reference at(size_type idx) const { return const_cast<cratevec &>(*this).at(idx); }
+    reference front() { return m_ptr[0]; }
+    const_reference front() const { return m_ptr[0]; }
+    reference back() { return m_ptr[m_len - 1]; }
+    const_reference back() const { return m_ptr[m_len - 1]; }
+};
+
+} // namespace hnnx
+
+POP_VISIBILITY()
+
+#endif /* CRATE_H_ */
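A sketch of cratevec option (3) from the comment above: construct empty, then `init` from a `std::vector`. `Graph` and `graph_crate` are the SDK's:

```cpp
#include <vector>
#include "crate.h" // assumption: the header under review

void cratevec_demo(Graph *g)
{
    std::vector<int> const src{1, 2, 3};
    hnnx::cratevec<int> cv;  // option (3): create empty...
    cv.init(g, src);         // ...then fill from a std::vector
    int const total = cv[0] + cv.back(); // behaves like a fixed-size vector
    (void)total;
}
```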
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/dcrate_inlines.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/dcrate_inlines.h
new file mode 100755
index 0000000000000..a48e7bc909904
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/dcrate_inlines.h
@@ -0,0 +1,101 @@
+//==============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef DCRATE_INLINES_H
+#define DCRATE_INLINES_H 1
+
+#include <cassert>
+#include <memory>
+#include <new>
+
+#include "macros_attribute.h"
+#include "deser_concurrent.h"
+#include "crate.h"
+
+namespace hnnx {
+
+// alloc 'amount' bytes with given alignment.
+inline void *DCrate::do_alloc(const size_t align, const size_t amount)
+{
+    size_t basep = size_t(nextp);
+    if (align > 4) {
+        basep = (basep + (align - 1)) & ~(align - 1);
+    }
+    size_t const next_base = basep + amount;
+    if (next_base > (size_t)limitp) return nullptr;
+    nextp = (void *)next_base; // update 'nextp' ...
+    return (void *)basep;
+}
+
+template <typename T> inline T *DCrate::alloc_array(const size_t n)
+{
+    if (nextp != nullptr) {
+        void *const allocp = do_alloc(alignof(T), sizeof(T) * n);
+        if (allocp) return (T *)allocp;
+    }
+    return cratep->alloc_array<T, true>(n);
+}
+
+template <typename T, typename... Args> inline T *DCrate::emplace(Args &&...args)
+{
+    if (nextp != nullptr) {
+        void *const allocp = do_alloc(alignof(T), sizeof(T));
+        if (allocp) {
+            new (allocp) T(std::forward<Args>(args)...);
+            return (T *)allocp;
+        }
+    }
+    return cratep->emplace<T>(std::forward<Args>(args)...);
+}
+
+template <>
+inline void *DCrate::emplace_explicit(Deserz &dctx, deserialize_op_func const init_func,
+                                      deserialize_dtor_func const dtor_func, size_align_code_t const size_al)
+{
+    if (nextp != nullptr) {
+        void *const allocp = do_alloc(size_al.align(), size_al.size());
+        if (allocp) {
+            init_func(allocp, dctx);
+            return allocp;
+        }
+    }
+    return cratep->emplace_explicit(dctx, init_func, dtor_func, size_al);
+}
+
+// this will be used in place of 'emplace' when the constructor parms
+// are just 'Deserz &'
+template <typename T> inline T *DCrate::emplace0(Deserz &dctx)
+{
+    deserialize_op_func const ctor = [](void *const ptr, Deserz &dctx) -> void * {
+        new (ptr) T(dctx);
+        return ptr;
+    };
+    if (nextp != nullptr) {
+        void *const allocp = do_alloc(alignof(T), sizeof(T));
+        if (allocp) {
+            (ctor)(allocp, dctx);
+            return (T *)allocp;
+        }
+    }
+    return (T *)cratep->emplace_explicit(dctx, ctor, nullptr, size_align_code_t::for_type<T>());
+}
+// init method of cratevec using 'DCrate' is defined here to avoid header inclusion madness.
+//
+template <typename T> inline void hnnx::cratevec<T>::init(hnnx::DCrate *crate_p, size_t n)
+{
+    assert(m_len == 0);
+    if (n) {
+        m_ptr = crate_p->alloc_array<T>(n);
+        std::uninitialized_value_construct_n(m_ptr, n);
+        m_len = n;
+    }
+}
+
+} // namespace hnnx
+
+#endif // DCRATE_INLINES_H
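The alignment step in `do_alloc`, isolated for clarity (it is applied only when `align > 4`, since the span is kept 4-aligned). The round-up idiom, assuming a power-of-two alignment:

```cpp
#include <cstddef>

constexpr size_t align_up(size_t p, size_t align)
{
    // Requires align to be a power of two.
    return (p + (align - 1)) & ~(align - 1);
}

static_assert(align_up(0x1005, 8) == 0x1008, "rounds up to next multiple");
static_assert(align_up(0x1008, 8) == 0x1008, "already aligned: unchanged");
```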
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deser_concurrent.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deser_concurrent.h
new file mode 100755
index 0000000000000..16db21a082cf1
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deser_concurrent.h
@@ -0,0 +1,288 @@
+//==============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef DESER_CONCURRENT_H
+#define DESER_CONCURRENT_H 1
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "deser_concurrent_defs.h"
+
+// this is intended to be included only in "deserialize.h"
+
+struct PreloadInfo;
+
+namespace hnnx {
+struct runlist_seg_descriptor;
+class Crate;
+class Deserz;
+class fixup_supplemental_recs;
+class InitTimeSchedule;
+
+// describes a 'span' of the deserialized data
+struct deser_segment_span {
+    void *base;
+    void *limit;
+};
+
+// This describes a partially-decoded segment; includes fixups.
+// This should stay small so we can place it inside Deserz, and std::move it
+// out (to keep the fixup list) when done with the segment.
+struct runlist_fixup_state {
+    unsigned segno = 0;
+    size_t *crate_begin = nullptr; // where the data starts in the crate
+    runlist_seg_descriptor *seg_desc = nullptr; // Corresponding 'runlist_seg_descriptor' for reference.
+    // The next three are copied from the runlist_auxdata_seg_desc
+    uint32_t base_tensor_index = 0; // first tensor index defined this segment
+    uint32_t base_blocktable_index = 0; // first blocktable index defined in this segment
+    uint32_t base_sharedobj_index = 0; // first 'shared_object' index defined in this segment
+    // fixup data
+    size_t *fixup_list_head = nullptr; // head of the 'fixup list', or null if none.
+    fixup_supplemental_recs *fixup_supplemental; // supplemental fixup list
+
+    runlist_fixup_state() = default;
+    ~runlist_fixup_state() = default;
+    runlist_fixup_state(runlist_fixup_state const &) = default;
+    // *Some* implementations of c++lib require this to have operator= (non-move)
+    // in order for std::vector containing it to be constructed via resize.
+    runlist_fixup_state &operator=(runlist_fixup_state const &) = default;
+    // the move-ctor and move-assign must leave the source with no fixup list,
+    // and segno = 0.
+    runlist_fixup_state(runlist_fixup_state &&from) { do_move_from(std::move(from)); }
+    runlist_fixup_state &operator=(runlist_fixup_state &&from)
+    {
+        do_move_from(std::move(from));
+        return *this;
+    }
+
+  private:
+    // this is used in move-constructor and move-assign; it will always leave 'from'
+    // with certain 'null' values to trap cases where we're using the wrong instance.
+    void do_move_from(runlist_fixup_state &&from)
+    {
+        segno = from.segno;
+        crate_begin = from.crate_begin;
+        seg_desc = from.seg_desc;
+        base_tensor_index = from.base_tensor_index;
+        base_blocktable_index = from.base_blocktable_index;
+        base_sharedobj_index = from.base_sharedobj_index;
+        fixup_list_head = from.fixup_list_head;
+        fixup_supplemental = from.fixup_supplemental;
+        from.segno = 0;
+        from.seg_desc = nullptr;
+        from.fixup_list_head = nullptr;
+    }
+};
+//
+// This contains 'supplemental' fixup records for a segment; there is one instance in each runlist_seg_descriptor,
+// and a pointer to it in the runlist_fixup_state. When the 'runlist_fixup_state' is moved in or out of the Deserz,
+// the pointer to this remains.
+// To avoid the overhead of vec_push_back, this has a static array into which values are recorded;
+// when this is full (or near full), all the records within are appended to the vector in a single operation.
+// At the end of the operation, any remaining records are appended to the vector, but only if the vector
+// is not empty (we can read the records out of the fixed array, if they all fit).
+//
+// The append() is not safe unless 'ensure_room_for' is checked first; you can e.g. do ensure_room_for(3)
+// ahead of doing up to 3 appends.
+// It is best to use a constant as parameter to ensure_room_for, i.e. ahead of code which may append
+// *up to* 4 values, use ensure_room_for(4); this simplifies the inline expansion of 'ensure_room_for',
+// and makes very little difference to performance compared to using the exact value.
+//
+class fixup_supplemental_recs {
+    static constexpr unsigned ARR_SIZE = 64;
+    unsigned num_in_arr = 0;
+    uint32_t fixed_arr[ARR_SIZE];
+    std::vector<uint32_t> var_arr;
+    unsigned n_vec = 0; // = var_arr.size()
+
+  public:
+    void clear();
+    unsigned constexpr size() const { return num_in_arr + n_vec; }
+    void reserve(unsigned const n) { var_arr.reserve(n); }
+    inline void ensure_room_for(unsigned const n)
+    {
+        assert(n <= ARR_SIZE);
+        if (num_in_arr > ARR_SIZE - n) flush_to_vec();
+    }
+    // append allowed only when preceded by 'ensure_room_for'
+    inline void append(uint32_t const val)
+    {
+        assert(num_in_arr < ARR_SIZE);
+        fixed_arr[num_in_arr++] = val;
+    }
+    // use instead of 'ensure_room_for(1); push_back(n)'
+    inline void push_back(uint32_t const val)
+    {
+        if (num_in_arr > ARR_SIZE - 1) flush_to_vec();
+        fixed_arr[num_in_arr++] = val;
+    }
+    // After all push_back() done, do a 'finish'
+    // and then get_limits() can be used to traverse the data.
+    void finish(); // flushes, but only if the vec is not empty.
+    std::pair<uint32_t const *, uint32_t const *> get_limits() const;
+
+  protected:
+    void flush_to_vec();
+};
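The `ensure_room_for`/`append` discipline described above, as a sketch; the count passed to `ensure_room_for` is a constant upper bound on the appends that follow:

```cpp
// Sketch, assuming deser_concurrent.h is included.
#include <cstdint>
#include "deser_concurrent.h"

void record_three(hnnx::fixup_supplemental_recs &recs, uint32_t a, uint32_t b, uint32_t c)
{
    recs.ensure_room_for(3); // constant bound: up to 3 appends may follow
    recs.append(a);
    recs.append(b);
    recs.append(c);
    // After all records are in: recs.finish(); then recs.get_limits() to traverse.
}
```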
+
+// An array of these (size N+1) is used to hold the
+// information used in deserializing each segment.
+// The [N+1] is partially used; some operations may use
+// e.g. arr[i+1].auxinfo.some_field to find out where something
+// ends for the current segment, using the start of the next segment;
+// so the [N-1] entry needs a successor.
+
+struct runlist_seg_descriptor {
+    runlist_auxdata_seg_desc auxinfo; // the data from the 'aux_data' record for this segment
+    runlist_fixup_state segfixup; // the deserialization state (moved in and out of Deserz as needed)
+    fixup_supplemental_recs fixup_supp; // fixup supplemental recs.
+    deser_segment_span span_to_deser = {};
+    // These are used to configure the last preload in each segment, which preloads a region
+    // which is either partially, or entirely, in the next segment. So, the first two entries
+    // below are actually set at the end of deserialization of the previous segment; the end_preload
+    // is set by the current segment deserialize.
+    // The information stored in [N] is for configuring
+    // the last preload in the last segment, with end_preload set to 'end of crate'; in this case
+    // start_preload could be <= the end of the crate, and then we don't configure it.
+    // likewise the information in [0] is only 'end_preload', which can be used to configure
+    // 'Graph::m_initial_preload' (it should go from start-of-crate to seg[0].end_preload).
+    // In some cases (hopefully, only in testing) we may have segments with no preloads in them,
+    // in which case null pointers will appear in some of these; the ChunkPreload ops need to be
+    // configured by getting info from adjacent segments.
+    PreloadInfo *prev_seg_final_preload{}; // points to the prev segment's final PreloadInfo
+    char *start_preload{}; // the preload start address for prev seg's final preload
+    char *end_preload{}; // end address for prev seg's final preload
+};
+
+// One instance of this is in Deserializer, called segments.
+// It is created 'empty', and populated when we encounter the valid
+// Aux Data record.
+//
+class DeserSegDescs {
+    unsigned n_segs = 0;
+    // points to an array of n_segs + 1, if n_segs > 0
+    std::unique_ptr<runlist_seg_descriptor[]> seg_arr;
+
+  public:
+    DeserSegDescs() = default;
+    ~DeserSegDescs() = default;
+    DeserSegDescs(DeserSegDescs const &) = delete;
+    DeserSegDescs(DeserSegDescs &&) = default;
+    DeserSegDescs &operator=(DeserSegDescs const &) = delete;
+    DeserSegDescs &operator=(DeserSegDescs &&) = default;
+
+    // these two are used to create the array
+    void set_size(unsigned const n); // used to create sized, empty array
+    runlist_seg_descriptor *data() { return seg_arr.get(); }
+
+    constexpr unsigned num_segs() const { return n_segs; }
+    constexpr bool is_active() const { return n_segs != 0; }
+    // note: 'i' may be 0 .. num_segs(); only can use when 'is_active'.
+    runlist_seg_descriptor &operator[](unsigned const i) { return seg_arr[i]; }
+    runlist_seg_descriptor const &operator[](unsigned const i) const { return seg_arr[i]; }
+
+    // We can add other data in here, to manage the concurrent deserialization.
+    unsigned n_threads = 0; // set when allocating the 'Deserz' array
+    std::vector<Deserz> deserz_arr; // sized as 'n_threads'.
+
+    // start-of-crate, rounded to a multiple of 32; Calculated before any multi-thread
+    // operations. Use to configure Graph::m_initial_preload.
+    void *crate_preload_start_boundary;
+    // end-of-crate, rounded up to multiple of 32. Calculated before any multi-thread
+    // operations. No 'ChunkPreloadOp' will exceed this.
+    void *crate_preload_final_boundary;
+
+    InitTimeSchedule *initSchedule;
+};
+
+// A 'DCrate' is a proxy object stored within Deserz.
+// It has some of the same methods as Crate; but if nextp is not null,
+// it will allocate into the space at 'nextp', limited by 'limitp'.
+// Otherwise it will use the Crate.
+// Most methods are defined as inlines in dcrate_inlines.h
+//
+class DCrate {
+    // these are either both null, or both non-null and 4-aligned.
+    void *nextp = nullptr;
+    void *limitp = nullptr;
+    Crate *cratep = nullptr;
+
+  public:
+    DCrate() {}
+    ~DCrate() {}
+    DCrate(DCrate const &) = default;
+    DCrate(DCrate &&) = default;
+    DCrate &operator=(DCrate const &) = default;
+    DCrate &operator=(DCrate &&) = default;
+    explicit DCrate(Crate &c) : cratep(&c) {}
+    void set_crate(Crate &c) { cratep = &c; }
+    Crate *crate() { return cratep; }
+    bool is_active() const { return nextp != nullptr; }
+
+    constexpr size_t bytes_remaining() const { return (char *)limitp - (char *)nextp; }
+    char *next_loc() { return (char *)nextp; }
+    std::pair<char *, char *> range_remain() { return {(char *)nextp, (char *)limitp}; }
+
+    void set_memory_range(void *base, unsigned len)
+    {
+        nextp = base;
+        limitp = (void *)((char *)base + len);
+    }
+    void remove_memory_range()
+    {
+        nextp = nullptr;
+        limitp = nullptr;
+    }
+
+    // Methods of Crate we want to support (See crate.h for more detail).
+    // Note that the constructors invoked in 'emplace' and 'emplace_explicit'
+    // can and will recursively call 'emplace' to construct their sub-objects.
+    template <typename T, typename... Args> T *emplace(Args &&...args);
+    // variant of 'emplace' which can use the 'emplace_explicit' call to avoid
+    // instantiating the constructor twice
+    template <typename T> T *emplace0(Deserz &dctx);
+    // (this is defined with 'template' args, only so it can be declared here without
+    // forward refs. All are pass-by-value. Only one specialization will be defined).
+    template <typename FI, typename FD, typename SA> void *emplace_explicit(Deserz &dctx, FI, FD, SA);
+    // array allocation, used to make all arrays in crate during deserialize.
+    template <typename T> T *alloc_array(size_t n);
+
+  private:
+    // reserve the specified data in the range, and return pointer to start; or
+    // return null if not possible.
+    void *do_alloc(size_t align, size_t amount);
+};
+
+// defines the encoding in the upper 3 bits of the last word of a 'multi-word' supplemental record
+// all must be 4..7, since a 0 in the msb indicates a 'short' record.
+
+constexpr unsigned SUPPFIXUP_CAT_tensor = 4;
+constexpr unsigned SUPPFIXUP_CAT_sharedobj = 5;
+constexpr unsigned SUPPFIXUP_CAT_blocktable = 6; // with indices packed in one word
+constexpr unsigned SUPPFIXUP_CAT_blocktable_full = 7; // .. in two words
+constexpr unsigned SUPPFIXUP_CAT_SHIFT = 29u;
+
+bool fixup_encode_for_blocktable(runlist_fixup_state &seginfo, uint32_t idx, uint32_t table_offs, void **ptrloc);
+
+// high-level operations in the 'deserialize by segments' code.
+
+GraphStatus do_multiseg_deser(Deserializer &dctx, size_t ref_deser_pos);
+GraphStatus segmentjob_deserialize_ops(Deserializer &dctx, unsigned segno, unsigned threadno);
+GraphStatus segmentjob_process_fixups(Deserializer &dctx, unsigned segno, unsigned threadno);
+GraphStatus segmentjob_compile_ops(Deserializer &dctx, unsigned segno, unsigned threadno);
+void resolve_chunk_preload_after_multiseg_deser(Deserializer &dctx);
+
+} // namespace hnnx
+
+#endif // DESER_CONCURRENT_H
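How `DCrate` degrades, as a sketch: while a memory range is installed, allocation is a pointer bump between `nextp` and `limitp`; otherwise every call forwards to the underlying `Crate`:

```cpp
#include <cstdint>
#include "dcrate_inlines.h" // assumption: pulls in the template definitions

void dcrate_demo(hnnx::DCrate &dc)
{
    // With set_memory_range() active this is a pointer bump inside [nextp, limitp);
    // when inactive, or when the span is exhausted, it falls through to Crate::alloc_array.
    uint32_t *const idxs = dc.alloc_array<uint32_t>(16);
    (void)idxs;
}
```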
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deser_concurrent_defs.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deser_concurrent_defs.h
new file mode 100755
index 0000000000000..3d72ed7d2de71
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deser_concurrent_defs.h
@@ -0,0 +1,97 @@
+//==============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef DESER_CONCURRENT_DEFS_H
+#define DESER_CONCURRENT_DEFS_H 1
+
+#include <cstddef>
+#include <cstdint>
+
+namespace hnnx {
+
+// NOTE: this file contains defs for concurrent deserialize which are needed on both decode and prepare
+// side; mostly just the format of the Aux Data records.
+// Defs needed only on decode side are in 'deser_concurrent.h', which #includes this file.
+
+constexpr unsigned DesConcur_MIN_SEGMENTS = 8; // can't have less than this number.
+
+// This is the number of runlist slots in the runlist_auxdata_seg_desc format.
+// It must be >= the actual number. This number is coded into the start of the AuxData
+// payload. If the number gets bigger, the reader of the aux-data
+// record will need to be able to cope with the older, smaller value.
+
+constexpr unsigned DesConcur_MAX_RUNLISTS = 4;
+
+// The 'Aux Data' record describing the runlist partition has a payload formed of
+// a runlist_auxdata_header, followed immediately by N+1 of runlist_auxdata_seg_desc.
+// The number N is in the header; there may be additional words after, which can be
+// ignored.
+//
+// Aux Data header record.
+// The 'record_version' is reserved to flag changes in the format, so that
+// if it changes, new skel can understand old records.
+// Currently, it has this format; most changes will expand one of the fields,
+// so following this may be adequate to capture version changes; if it is not,
+// add flags in the upper bits.
+//    bits 31..13 : reserved, 0
+//    bit 12: set if crate sizes are calculated based on 'dynamic tensor' sizes
+//    bits 11..8  length of the header in uint32's
+//    bits 7..3   length of 'segment' record, in uint32's
+//    bits 2..0   .. value of DesConcur_MAX_RUNLISTS
+//
+struct runlist_auxdata_header {
+    unsigned record_version; // see above
+    unsigned numsegs : 16; // number of segments; >= 8, likely <= 64 but who knows
+    unsigned hdrflags : 16; // reserved for flags
+    unsigned runlist_offset; // see below
+};
+
+// 'runlist_offset' is the offset, in u32's units, from the 'num_in_tensors' word
+// to the 'n_ops_total' word. This is needed by 'weight share' processing in order to
+// adjust the deser_offset values to accommodate changes in the encoding length of pointers.
+
+// The N segments are described by an array of N+1 of runlist_auxdata_seg_desc;
+// segment i is defined by arr[i] (start) and arr[i+1] (end).
+// An exception is 'crate_seg_len' - this may be less than arr[i+1].crate_offset - arr[i].crate_offset
+// due to padding.
+// In the final record arr[N]:
+//    - crate_seg_len is not used (0)
+//    - The *_list_posn records are the total length of the runlists
+//    - the four 'base_*_index' values are all 1 greater than any index used in the graph
+//
+struct runlist_auxdata_seg_desc {
+    uint32_t deser_offset; // where the input (pickle) data begins - reference point is the start of 'Runlist' as
+                           //    defined in docs/pickle_format.md, i.e. the location of 'n_ops_total' word
+    uint32_t crate_offset; // offset in crate
+    uint32_t crate_seg_len; // crate length needed (not used in final entry)
+    uint32_t runlist_posn[DesConcur_MAX_RUNLISTS]; // where the segment starts in Op* runlist
+    uint32_t execlist_posn[DesConcur_MAX_RUNLISTS]; // where the segment starts in 'execlist'
+    uint32_t base_opseq_index; // first 'op_sequence_marker' index used in the segment.
+    uint32_t base_tensor_index; // first tensor index defined this segment
+    uint32_t base_blocktable_index; // first blocktable index defined in this segment
+    uint32_t base_sharedobj_index; // first 'shared_object' index defined in this segment
+};
+
+// Bit in the header version indicating crate sizes allow for 'dynamic shapes'.
+// NOTE: if that gets backed out later, leave this here but remove it from DesConcur_AUXDATA_REC_VERSION
+//
+constexpr unsigned DesConcur_AUXDATA_REC_VERSION_DYNSHAPE_SIZES = 4096;
+
+constexpr unsigned DesConcur_AUXDATA_REC_VERSION = // composed of:
+        ((sizeof(runlist_auxdata_header) / sizeof(uint32_t)) * 256 // header size
+         + (sizeof(runlist_auxdata_seg_desc) / sizeof(uint32_t)) * 8 // seg desc len
+         + DesConcur_MAX_RUNLISTS) |
+        DesConcur_AUXDATA_REC_VERSION_DYNSHAPE_SIZES;
+
+// values to be used to 'grow' old crate estimate to compensate for 'dyn shape' mismatch
+constexpr unsigned DesConcur_CrateGrowPerTensor = 2; // number of words per 'tensor'
+constexpr unsigned DesConcur_CrateGrowPerShared = 2; // number of words per 'shared object'
+
+} // namespace hnnx
+
+#endif // DESER_CONCURRENT_DEFS_H
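Working out `DesConcur_AUXDATA_REC_VERSION` from the definitions above (both structs are pure `uint32_t` payloads, so the `sizeof` arithmetic is exact): the header is 3 words, the segment descriptor is 15 words:

```cpp
#include "deser_concurrent_defs.h" // assumption: the header under review

// header  = 3 u32 (version, packed bitfields, runlist_offset) -> 3 * 256 = 768
// segdesc = 15 u32 (3 scalars + 4 runlist + 4 execlist + 4 base_* indices) -> 15 * 8 = 120
// + DesConcur_MAX_RUNLISTS (4) = 892; OR'd with the dyn-shape bit (4096) -> 4988
static_assert(hnnx::DesConcur_AUXDATA_REC_VERSION == ((768 + 120 + 4) | 4096),
              "aux-data record version arithmetic");
```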
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deserialize_tensors.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deserialize_tensors.h
new file mode 100755
index 0000000000000..43f14039fd1ad
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deserialize_tensors.h
@@ -0,0 +1,68 @@
+//==============================================================================
+//
+// Copyright (c) 2021-2023 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef DESERIALIZE_TENSORS_H
+#define DESERIALIZE_TENSORS_H 1
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+#include "limits.h"
+#include "log.h"
+
+#include "forward_classes.h"
+#include "serdes_tensors.h"
+
+namespace hnnx {
+
+// see comment in serdes_tensors.h for overview of how this works.
+
+class Deserializer;
+
+class DeserTensorConn : public SerTensorConnDefs {
+    typedef unsigned tensor_idx;
+    typedef Tensor const *ptr_type;
+
+    // this collects all of the tensor_def we have seen. index is seq_index-1.
+    std::vector<ptr_type> defined_tensors;
+
+  public:
+    DeserTensorConn() {}
+    // process a tensor definition
+    void tensor_def(Deserz &, ptr_type);
+    // process n tensor refs.
+    void tensor_refs(Deserz &, ptr_type *ptrs, unsigned num);
+    // process a tensor ref
+    void tensor_ref(Deserz &dctx, ptr_type &ptr) { tensor_refs(dctx, &ptr, 1); }
+
+    // TODO: remove these two, we don't use them, and should not.
+    // read an identity (for use in subsequent need_fixup)
+    tensor_idx read_identity(Deserz &);
+    // apply the identity to 'fix' a tensor pointer (usually now, sometimes later)
+    void need_fixup(tensor_idx ident, ptr_type *dst);
+
+    // 'reserve' the defined tensors to avoid allocation overhead...
+    inline void reserve_tensors(const size_t n) { defined_tensors.reserve(n); }
+    // resize the 'defined tensors' table to its full capacity (specified).
+    // Used only in multi-thread deserialize, prior to deserializing the runlist.
+    inline void resize_tensordef_table(const size_t n) { defined_tensors.resize(n); }
+
+    // this is for use by 'reference fixup' code, in concurrent deserialize.
+    std::vector<ptr_type> const &get_defined_tensors() const { return defined_tensors; }
+
+  protected:
+    tensor_idx read_identity_inline(Deserz &);
+    void apply_fixup_inline(tensor_idx idx, ptr_type *dst);
+};
+
+} // namespace hnnx
+
+#endif // DESERIALIZE_TENSORS_H
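A sketch of the def/ref discipline `DeserTensorConn` implements (see serdes_tensors.h for the real protocol): inputs are resolved as references to earlier definitions, and each output is appended as a new definition:

```cpp
#include "deserialize_tensors.h" // assumption: the header under review

// Hypothetical helper: deserialize one op's tensor connectivity.
void deser_op_io(hnnx::DeserTensorConn &conn, hnnx::Deserz &dctx,
                 Tensor const *out, Tensor const **in, unsigned n_in)
{
    conn.tensor_refs(dctx, in, n_in); // inputs: resolve against defined_tensors
    conn.tensor_def(dctx, out);       // output: recorded at slot seq_index - 1
}
```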
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deserializer.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deserializer.h
new file mode 100755
index 0000000000000..7312ae8bdd948
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deserializer.h
@@ -0,0 +1,761 @@
+//==============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef DESERIALIZER_H
+#define DESERIALIZER_H 1
+
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <string_view>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+#include <vector>
+#include "limits.h"
+#include "dtype.h"
+#include "log.h"
+#include "allocator.h"
+#include "op_extra_info.h"
+
+#include "serialize_defs.h"
+#include "forward_classes.h"
+#include "deserialize_tensors.h"
+#include "macros_attribute.h"
+#include "const_extent_descriptor.h"
+#include "weak_linkage.h"
+#include "size_align_code.h"
+#include "deser_concurrent.h"
+#include "hexagon_nn_types.h"
+
+namespace hnnx {
+class DMA_Manager;
+class Crate;
+/**
+ * @brief \ref Serializer and \ref Deserializer modules that provide
+ * a mechanism to flatten (serialize) and reconstruct (deserialize)
+ * primitive and user-defined data types. The initial objective
+ * was to create an in-memory representation of the optimized
+ * \ref Graph on x86 which can then be reconstructed and executed on
+ * a qdsp target, essentially, a means to Graph caching.
+ *
+ */
+using tensor_deserializer_fn = uptr_Tensor (*)(Deserz &);
+
+using deserialize_op_func = void *(*)(void *, Deserz &); // Allocation function
+using deserialize_dtor_func = void (*)(Graph *, void *); // Deallocation function
+class SimpleOpBase;
+using deserialize_make_unique = std::unique_ptr<SimpleOpBase> (*)();
+
+struct op_deserializer_fn {
+    op_deserializer_fn(deserialize_op_func init_func_in, const size_align_code_t sizeal_in)
+        : init_func(init_func_in), size_align_code(sizeal_in)
+    {
+    }
+    op_deserializer_fn(deserialize_op_func init_func_in, deserialize_dtor_func dtor_func_in,
+                       const size_align_code_t sizeal_in)
+        : dtor_func(dtor_func_in), init_func(init_func_in), size_align_code(sizeal_in){};
+    op_deserializer_fn(const op_deserializer_fn &) = default;
+    op_deserializer_fn(op_deserializer_fn &&) = default;
+    op_deserializer_fn &operator=(const op_deserializer_fn &) = delete;
+    deserialize_dtor_func dtor_func = nullptr;
+    deserialize_op_func init_func = nullptr;
+    const size_align_code_t size_align_code{};
+    inline constexpr size_t get_size() const { return size_align_code.size(); }
+    inline constexpr size_t get_align() const { return size_align_code.align(); }
+};
+
+// here's a quick and dirty way to make these maps go faster: compare string_view starting with len;
+// and if the len is the same, then compare the middle character, and if that's the same,
+// use memcmp. This avoids getting slowed down by a lot of long common prefixes in the type names.
+// and we don't care about the weird ordering it generates.
+//
+struct trick_stringview_lt {
+    bool operator()(std::string_view const &a, std::string_view const &b) const
+    {
+        unsigned const na = a.size();
+        unsigned const nb = b.size();
+        if (na != nb) return na < nb;
+        char const *const pa = a.data();
+        char const *const pb = b.data();
+        if (pa == pb || na == 0) return false; // pa==pb is a common case.
+        unsigned const char_a = pa[na >> 1];
+        unsigned const char_b = pb[na >> 1];
+        if (char_a != char_b) return char_a < char_b;
+        return ::memcmp(pa, pb, na) < 0;
+    }
+};
+
+using op_deserializer_map_t = std::map<std::string_view, op_deserializer_fn, trick_stringview_lt>;
+using tensor_deserializer_map_t = std::map<std::string_view, tensor_deserializer_fn, trick_stringview_lt>;
+using cexdesc_deserializer_map = std::map;
+
+using const_extent_t = std::pair;
+using weight_buf_deserializer_map = std::map;
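The comparator gives a valid strict weak ordering, just not a lexicographic one; a standalone sanity check of the properties described in its comment:

```cpp
#include <cassert>
#include "deserializer.h" // assumption: the header under review

void trick_lt_demo()
{
    hnnx::trick_stringview_lt const lt;
    // Shorter strings always order first, regardless of content:
    assert(lt("zz", "aaa"));
    // Equal length: the middle character decides before any memcmp:
    assert(lt("aba", "aca"));
    // Equal views compare not-less in both directions (strict weak ordering):
    assert(!lt("abc", "abc"));
}
```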
+
+/**
+ * @brief Deserializer class to reverse the serialization
+ * process and reconstruct the data for specific types
+ *
+ */
+class Deserz : public DeSerError {
+    friend class Deserializer; // weirdly, sometimes a derived class needs to be a friend.
+    friend class DeserTensorConn;
+
+  protected:
+    Deserz(Deserializer *full_deser, char const *p, size_t n, Graph *g = nullptr);
+
+  public:
+    // I want to make this protected, but can't.
+    // Even code which has access to a protected copy_ctor
+    // of foo can't invoke .resize(n, foo_inst) on a std::vector. This
+    // seems like a defect in C++. Applies to various 'emplace' methods too;
+    // the 'emplace' can only ever use public ctors.
+    Deserz(Deserz const &) = default;
+
+  public:
+    virtual ~Deserz(); // please keep this as first virtual method declared.
+
+    // These three ONLY TO BE USED when setting up a Deserz to start processing a segment.
+    void setup_source_span(deser_segment_span const &);
+    void setup_dcrate_out(void *base, size_t len);
+    void setup_next_tensor_index(unsigned const idx) { next_tensordef_index = idx; }
+
+    typedef uint32_t object_identity_type;
+
+    // Note, various accessor methods are defined as inlines below 'class Deserializer'.
+    // true if this Deserz is really an instance of Deserializer.
+    constexpr bool is_base_deser() const;
+
+    using op_deserialize_fn_list_t = std::vector<op_deserializer_fn>;
+    using tensor_deserialize_fn_list_t = std::vector<tensor_deserializer_fn>;
+
+    op_deserialize_fn_list_t &get_op_deserialize_fn_list();
+    tensor_deserialize_fn_list_t &get_tensor_deserialize_fn_list();
+    std::vector &get_blocktable_link_table();
+    // when deserializing an op:
+    //   - call deserialize_tensor_ref (or _refs) on all the input tensor pointers
+    //   - pass all output tensor addresses to deserialize_tensor_def
+    // Sequence must match serialization; note that the deserialize-ctor of Tensor
+    // calls deserialize_tensor_def on itself; so there is no need to call it elsewhere,
+    // except for specialized types which are constructed otherwise during depickle (e.g.,
+    // types embedded in the Op).
+    //
+    // Some ops have multiple copies of some input tensor pointers; for these, it's possible
+    // to serialize just one reference, and then deserialize it using
+    //     auto id = deserialize_object_identity()   // <- corresponds to serialize_tensor_ref
+    //     need_tensor_fixup( id, &first_tensor_pointer);
+    //     (other deserialize activity can happen here)
+    //     need_tensor_fixup( id, &second_tensor_pointer);
+
+    void deserialize_tensor_def(Tensor const *tensor_ptr);
+    void deserialize_tensor_ref(Tensor const *&where);
+    void deserialize_tensor_refs(Tensor const **ptrs, unsigned n);
+    template <typename T> void deserialize_tensor_ref(T const *&where);
+    template <typename T> void deserialize_tensor_refs(T const **ptrs, unsigned n);
+    object_identity_type deserialize_object_identity();
+    void need_tensor_fixup(object_identity_type oid, Tensor const **where);
+
+    Graph &graph() const { return *graph_ptr; }
+    Crate *crate() { return d_crate.crate(); }
+    DCrate *dcrate() { return &d_crate; }
+    DeserSegDescs const &get_segments() const; // gets ref to associated 'segments' object
+    op_deserializer_map_t const &get_op_deser_map() const { return *op_deserializer_map; }
+
+    bool is_aligned_const_format() const;
+    bool has_pending_tensor_updates();
+
+    bool is_shared_dynamic_tensor_shape_format() const;
+
+    fa::RuntimeAllocator *allocator;
+    DCrate d_crate; // contains a crate pointer
+
+  protected:
+    // hoist pointers to these maps into Deserializer to avoid static lock overhead
+    op_deserializer_map_t const *op_deserializer_map;
+    tensor_deserializer_map_t const *tensor_deserializer_map;
+    Graph *graph_ptr{};
+    Deserializer *full_deser;
+
+    char const *bufstart; // start of current buffer
+    char const *bufend; // first byte we can't read
+    char const *bufp; // next to read
+    char const *buf_limit; // <= bufend; where 'fill_buffer' needs to be called.
+    size_t bytes_filled; // bytes previously filled
+
+    uint32_t op_flags;
+    OpExtraInfo op_extra_info;
+
+    unsigned next_tensordef_index = 1; // belongs to 'tensorconn' but needs to be in Deserz.
+    // 'format version'. Currently only ones used are 0 = classic, 1 = July/2023
+    // Only access through methods like .classic_format();
+    // This is changed to non-zero value based on seeing certain Aux Data records
+    // (which must appear before the allocator).
+    int format_version = 0;
+
+    // this is used in multi-thread decoding. It is important that
+    // it remains null-constructed if the object is really a base of Deserializer;
+    // it is only used in 'segment' Deserz instances.
+    runlist_fixup_state seg_fixup_state{};
+
+    /**
+     * @brief throws an error since deserializer detected
+     * deserialization on insufficient bytes i.e. an underflow
+     *
+     */
+    API_EXPORT virtual char const *fill_buffer(); // called for underflow on short operation
+
+    /**
+     * @brief Deserialize data of specified length and write into
+     * buffer provided by caller
+     *
+     * @param[out] p buffer to write to
+     * @param[in] len length of the \ref bufp to read from
+     * @param[in] align if true, skip input bytes to a boundary of 4
+     */
+    API_EXPORT virtual void deserialize_fread(void *p, size_t len, bool align);
+
+    /**
+     * @brief Get current position of buffer from which next data will be read
+     *
+     * @return size_t offset from buffer start
+     */
+    size_t buffer_offset() const { return bufp - bufstart; }
+    /**
+     * @brief Available buffer size remaining for deserialization
+     *
+     * @return size_t remaining bytes size
+     */
+    size_t buffer_remain() const { return bufend - bufp; }
+
+    /**
+     * @brief deserialize buffer for type T
+     *
+     * @retval T returns the deserialized value of type T
+     *
+     * Note: This is the templated API called by deserialize_T() functions
+     *
+     * Note: Cannot be used for more than 4 bytes, there is a specialized version to read u64.
+     */
+    template <typename T> T simple_deserialize()
+    {
+        static_assert(sizeof(T) <= 4, "can only read sizeof(T) <= 4");
+        constexpr size_t W = 4;
+        char const *curr_p = bufp;
+        if (curr_p >= buf_limit) {
+            curr_p = fill_buffer();
+        }
+        T const val = *(T const *)(curr_p);
+        bufp = curr_p + W;
+        return val;
+    }
+    // see comment above deserialize_shared_obj.
+    API_EXPORT std::pair<void const *, void const **> deserialize_shared_obj_func(void const **ptrloc);
+    API_EXPORT uint64_t deser_u64_slowpath();
+    void initial_l2fetch(); // called only from ctor
+
+  public:
+    inline constexpr bool classic_format() const { return format_version == 0; }
+    /**
+     * @brief deserialize data of type which calls simple_deserialize
+     *
+     * @param val data to deserialize
+     *
+     * Note: the below are the only types supported for deserialize_type
+     */
+    API_EXPORT uint64_t deserialize_uint64(); // inline later
+    inline float deserialize_float() { return simple_deserialize<float>(); }
+    inline uint32_t deserialize_uint32() { return simple_deserialize<uint32_t>(); }
+    inline NN_INT32_T deserialize_int32() { return simple_deserialize<NN_INT32_T>(); }
+    inline int16_t deserialize_int16() { return simple_deserialize<int16_t>(); }
+    inline uint16_t deserialize_uint16() { return simple_deserialize<uint16_t>(); }
+    inline int8_t deserialize_int8() { return simple_deserialize<int8_t>(); }
+    inline uint8_t deserialize_uint8() { return simple_deserialize<uint8_t>(); }
+
+    inline uint64_t deserialize_namesig() { return deserialize_uint64(); }
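One consequence of `simple_deserialize` worth noting: `W` is fixed at 4, so every scalar occupies a full 4-byte slot on the wire regardless of `sizeof(T)`. Sketch:

```cpp
#include <cstdint>
#include "deserializer.h" // assumption: the header under review

void read_two(hnnx::Deserz &dctx)
{
    // A uint8_t followed by a uint16_t consumes 8 bytes of pickle, not 3:
    uint8_t const a = dctx.deserialize_uint8();   // cursor advances by 4
    uint16_t const b = dctx.deserialize_uint16(); // cursor advances by 4
    (void)a;
    (void)b;
}
```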
+
+    // note, this is defined as an inline in deserializer.cc and not available elsewhere
+    tensor_deserializer_fn deserialize_tensor_identification(unsigned tensor_class_index);
+
+    // deserialize string
+    // **NOTE** will throw a runtime error if called on a Deserz which is not really a Deserializer.
+    API_EXPORT std::string_view deserialize_str();
+
+    uint32_t get_op_flags() const { return op_flags; };
+    void clear_op_flags() { op_flags = 0; };
+    void set_op_flags(uint32_t f) { op_flags = f; };
+
+    const OpExtraInfo &get_op_extra_info() const { return op_extra_info; };
+    void clear_extra_info() { op_extra_info.clear(); };
+    void set_op_extra_info(OpExtraInfo in_op_extra_info) { op_extra_info = in_op_extra_info; };
+
+    /**
+     * @brief deserialize buffer for specified size
+     *
+     * @param[in] alloc_size number of bytes to read from \ref bufp
+     * @param[out] ptr destination buffer for the read bytes
+     * @return size_t number of bytes actually read
+     */
+    API_EXPORT size_t deserialize_buf(size_t alloc_size, void *ptr);
+    /**
+     * @brief similar to deserialize_buf but first deserialize a
+     * uint32_t size of bytes that should match the alloc_size
+     *
+     * @param[in] alloc_size number of bytes to read from \ref bufp
+     * @param[out] ptr destination buffer for the read bytes
+     * @return size_t number of bytes actually read
+     */
+    API_EXPORT size_t deserialize_buf_withlen(size_t alloc_size, void *ptr);
+    // deserialize a pointer as 64 bits
+    inline void *deserialize_ptr() { return (void *)size_t(deserialize_uint64()); }
+
+    template <typename T> T deserialize_type();
+
+    template <typename T, size_t N> std::array<T, N> deserialize_array();
+
+    /**
+     * @brief convenience wrappers for deserialize functions that
+     * take in different number of arguments of uint32_t type
+     *
+     * @return std::tuple (first, second) uint32_t data deserialized
+     */
+    // convenience wrappers (to reduce inlined code size w/o much loss of speed)
+    API_EXPORT std::tuple<uint32_t, uint32_t> deserialize_uint32_x2();
+    API_EXPORT std::tuple<uint32_t, uint32_t, uint32_t> deserialize_uint32_x3();
+    API_EXPORT std::tuple<uint32_t, uint32_t, uint32_t, uint32_t> deserialize_uint32_x4();
+
+    API_EXPORT void deserialize_uint32_arr(uint32_t *p, size_t N);
+
+    // to reduce code size in the templates, we can deserialize arrays of
+    // N uint32 to sizet
+    API_EXPORT void deserialize_uint32_arr_sizet(size_t *p, size_t N);
+
+    /**
+     * @brief deserialize array containing uint32_t type data
+     *
+     * @tparam N size of the array
+     * @return std::array array containing the deserialized values
+     */
+    template <size_t N> std::array<size_t, N> deserialize_uint32_array_sizet()
+    {
+        std::array<size_t, N> res;
+        deserialize_uint32_arr_sizet(&res[0], N);
+        return res;
+    }
+
+    //
+    // This is used for shared objects like Shape and Interface.
+    // it deserializes the index, and decides if it's the first instance.
+    //  - must always pass the address which needs to point to it; though it
+    //    will not be set by this function.
+    //  - if retval.second is null, then the object was previously deserialized,
+    //    and retval.first is the pointer to it.
+    //  - otherwise, caller must deserialize the instance, and store the pointer
+    //    at *retval.second. retval.first will be null in this case.
+    // In scenarios where delayed resolution is used, the return may be {token,null}
+    // where 'token' is actually a delayed resolution token.
+    //
+    template <typename T>
+    std::pair<T const *, T const **> // see above
+    deserialize_shared_obj(T const **const loc)
+    {
+        auto const res = deserialize_shared_obj_func((void const **)loc);
+        return {(T const *)res.first, (T const **)res.second};
+    }
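The two-phase protocol from the comment above, as a sketch; `ShapeT` stands in for a shared type (e.g. Shape) that is assumed to be constructible from `Deserz &`, and `emplace0` is the DCrate helper defined in dcrate_inlines.h:

```cpp
#include "deserializer.h" // assumption: the header under review

template <typename ShapeT>
ShapeT const *read_shared(hnnx::Deserz &dctx, ShapeT const **loc)
{
    auto const res = dctx.deserialize_shared_obj<ShapeT>(loc);
    if (res.second == nullptr) return res.first; // seen before: pointer is ready
    // First occurrence: deserialize the payload and publish it for later refs.
    ShapeT const *const obj = dctx.dcrate()->template emplace0<ShapeT>(dctx);
    *res.second = obj;
    return obj;
}
```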
+
+    // Increment the current read position of the internal buffer without reading anything.
+    void deserialize_skip_words(size_t nwords);
+
+    // Apply the 'pointer fixups' contained within seg_info. This can
+    // be called with 'this' being any Deserz or Deserializer associated
+    // with the operation (it is only used to access tables in Deserializer).
+    // This can only be done on a given segment when all previous segments have
+    // been deserialized; so if we have one Deserz per thread, we need
+    // to 'move' the seg_info object out of it after completing the segment,
+    // and use it later to do the fixups.
+    // Returns true if ok, false if failed.
+    // Will leave the fixup list empty on success.
+    bool apply_segment_fixups(runlist_fixup_state &seg_info) const;
+
+    // Methods to move the 'seg_fixup_state' object in or out.
+    void install_seg_fixup_state(runlist_fixup_state &&src) { seg_fixup_state = std::move(src); }
+    runlist_fixup_state extract_seg_fixup_state() { return std::move(seg_fixup_state); }
+    void extract_seg_fixup_state_to(runlist_fixup_state &dest) { dest = std::move(seg_fixup_state); }
+
+    // and a read-only accessor
+    runlist_fixup_state const &fixup_state() const { return seg_fixup_state; }
+
+    // for Tensor::deserialize_blocktable
+    inline bool fixup_encode_for_blocktable(uint32_t const idx, uint32_t const table_offs, void **const ptrloc)
+    {
+        return hnnx::fixup_encode_for_blocktable(seg_fixup_state, idx, table_offs, ptrloc);
+    }
+};
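+
+// Sketch of the per-thread segment flow described above (everything other than the
+// Deserz methods is hypothetical): each worker moves its fixup state out after
+// finishing its segment, and applies it once all earlier segments are complete.
+//
+//     runlist_fixup_state st = worker_dctx.extract_seg_fixup_state();
+//     wait_until_previous_segments_done();   // ordering constraint noted above
+//     if (!worker_dctx.apply_segment_fixups(st)) {
+//         report_error("segment fixups failed");
+//     }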
+
+/////////////////
+
+class Deserializer : public Deserz {
+    friend class Deserz;
+
+  public:
+    /**
+     * @brief Construct a new Deserializer object
+     *
+     * @param[in] p buffer that needs to be deserialized
+     * @param[in] n length of the buffer
+     * @param[in] g pointer to the Graph object to deserialize (usually null, since the object
+     *            is being passed to the Graph::Graph ctor to deserialize; that ctor
+     *            must immediately call dctx.set_graph(*this) )
+     */
+    API_EXPORT Deserializer(char const *p, size_t n, Graph *g = nullptr);
+    API_EXPORT virtual ~Deserializer(); // please keep this as first virtual method declared.
+
+    void set_graph(Graph &g);
+
+    inline void deserialize_tensor_def(Tensor const *tensor_ptr) { tensorconn.tensor_def(*this, tensor_ptr); }
+    inline void deserialize_tensor_ref(Tensor const *&where) { tensorconn.tensor_ref(*this, where); }
+    inline void deserialize_tensor_refs(Tensor const **ptrs, unsigned n) { tensorconn.tensor_refs(*this, ptrs, n); }
+    inline void deserialize_pred_conditions(std::vector<bool> &pred_cond_list)
+    {
+        // get the number of items in the vector
+        uint32_t const num_of_objects = deserialize_uint32();
+        assert(num_of_objects <= UINT32_MAX);
+        if (num_of_objects > 0) {
+            pred_cond_list.resize(num_of_objects);
+
+            // TODO: remove this once we know how to update it at runtime.
+            // Currently setting it to true.
+            pred_cond_list.at(0) = 1;
+        }
+    }
+    template <typename T> inline void deserialize_tensor_ref(T const *&where)
+    {
+        static_assert(std::is_base_of<Tensor, T>::value);
+        tensorconn.tensor_ref(*this, *(Tensor const **)&where);
+    }
+    template <typename T> void deserialize_tensor_refs(T const **ptrs, unsigned n)
+    {
+        static_assert(std::is_base_of<Tensor, T>::value);
+        tensorconn.tensor_refs(*this, (Tensor const **)ptrs, n);
+    }
+    inline object_identity_type deserialize_object_identity() { return tensorconn.read_identity(*this); }
+
+    inline void need_tensor_fixup(object_identity_type oid, Tensor const **where) { tensorconn.need_fixup(oid, where); }
+    inline void resolve_fixups()
+    {
+        [[maybe_unused]] const object_identity_type newval = tensorconn.read_identity(*this);
+        assert(newval == 0);
+    }
+
+    constexpr bool is_aligned_const_format() const { return aligned_const_format_flag; }
+    void set_aligned_const_format(const bool v = true) { aligned_const_format_flag = v; }
+
+    constexpr bool is_shared_dynamic_tensor_shape_format() const { return shared_dynamic_tensor_shape; }
+    void set_shared_dynamic_tensor_shape_format(const bool v = true) { shared_dynamic_tensor_shape = v; }
+
+    PUSH_WARNING()
+    DISABLE_WARNING("-Wcast-qual", MSVC_NO_EQUIV)
+    // valid when the entire pickle, in const_extent format, is loaded as a single, persistent dma buffer
+    inline unsigned char *get_weight_pointer() { return ((unsigned char *)bufstart) + (4 * pickle_len_words); }
+    POP_WARNING()
+    inline size_t get_weight_size() { return (bufend - bufstart) - (4 * pickle_len_words); }
+
+    inline op_deserialize_fn_list_t &get_op_deserialize_fn_list() { return op_deserialize_fn_list; }
+    inline tensor_deserialize_fn_list_t &get_tensor_deserialize_fn_list() { return tensor_deserialize_fn_list; }
+
+    // Next 4 methods are used to support 'deserialize_by_segments'.
+    // 'get_forward_span' returns a 'deser_segment_span' (pair of pointers) for a region of deserialized data
+    // from 'ref + start' up to 'ref + end', where start and end (0 <= start < end) are byte offsets
+    // relative to some position 'ref' in the deserialized data, and 'ref' is the value which bytes_consumed()
+    // returned at that reference point. All should be multiples of 4.
+    deser_segment_span get_forward_span(size_t ref, size_t start, size_t end);
+    // used to get a reference point ('ref') for get_forward_span
+    size_t bytes_consumed() const { return bufp - bufstart; }
+    // used to skip past the last 'get_forward_span' we did
+    void skip_to_after_span(deser_segment_span const &);
+    // resize tables (tensor, shared_obj, linktable) according to info in final_segdesc
+    void resize_object_tables(runlist_auxdata_seg_desc const &final_desc);
+
+    uint32_t crate_size_according_to_segments() const;
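+
+    // Sketch of the forward-span protocol above (hypothetical driver; 'seg_start'
+    // and 'seg_end' would come from the segment descriptors):
+    //
+    //     size_t const ref = dctx.bytes_consumed();                  // reference point
+    //     deser_segment_span const sp = dctx.get_forward_span(ref, seg_start, seg_end);
+    //     deserialize_one_segment(sp);                               // consume [start, end)
+    //     dctx.skip_to_after_span(sp);                               // advance the main cursor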
+
+  protected:
+    std::vector<void const *> objindex; // index of pointers to shape, etc.
+    // the state of the 'tensor connectivity' deserialize engine.
+    DeserTensorConn tensorconn;
+    bool aligned_const_format_flag = false;
+    bool shared_dynamic_tensor_shape = false;
+
+    // this is used in 'deserialize_str', so it ideally should be in Deserz; but
+    // it's pretty large; so, put it here and forbid calling deserialize_str
+    // on a Deserz which is not really a Deserializer. We only use it to decode
+    // 'classic' pickles, so this is ok.
+    char name_buf[4096]; // used for string view
+
+    // do the reference fixups on a segment. Return true if OK.
+    // See Deserz::apply_segment_fixups for the public API.
+    static bool do_segment_fixups(runlist_fixup_state &seginfo, Deserz const &dctx0);
+
+  public:
+    inline constexpr bool classic_format() const { return format_version == 0; }
+    inline void set_format_2307() { format_version = 1; }
+
+    // This is called when a 'class index' Aux Data is encountered.
+    // It must deserialize exactly the indicated number of payload words.
+    // is_tensor = false for "Co" (op class index), and true for "Ct" (tensor class index)
+    API_EXPORT void auxdata_class_index(unsigned payload_words, bool is_tensor);
+    //
+    // called when an 'Nt' Aux data is encountered, which provides some array sizes for the
+    // deserialization.
+    // It must deserialize exactly the indicated number of payload words.
+    API_EXPORT void auxdata_temparr_sizes(unsigned payload_words);
+    // Called when an 'AuxTag_deserializeSegments' is encountered. If it likes
+    // the record, it will set up the 'segments' object.
+    API_EXPORT void auxdata_deserialize_segments(unsigned payload_words);
+
+    // called when a 'KS' Aux data is encountered, which provides a const_extent_descriptor.
+    // It must deserialize exactly the indicated number of payload words.
+    API_EXPORT int auxdata_read_const_extent_descriptor(const unsigned payload_words);
+    // helper for the above. payload_words is the length WITH PADDING
+    API_EXPORT int extract_const_extent_name(const unsigned payload_words, std::string &retVal);
+
+    // Extract a std::vector containing the 'const extent descriptor' table,
+    // from a given offset (in units of 32-bit words) relative to the start of the pickle,
+    // or from a separate pointer (if a separate buffer for the weights was passed in).
+    // This does not affect the current position.
+    // If there is a problem, it returns an empty vector; caller *must* check and report.
+    // This uses hnnx::const_extent_hdr_check to understand how much it should read,
+    // and to do a basic check.
+    API_EXPORT std::vector<uint32_t> extract_const_extent_table(size_t posn_in_words);
+    std::vector<uint32_t> extract_const_extent_table(hexagon_nn_wide_address_const_t weight_data,
+                                                     const size_t weight_size);
+    // given a destination char pointer, prefilled with nulls, fills it in with the name of the const_extent;
+    // caller must provide a destination of sufficient length
+    std::string name_from_weight_data(hexagon_nn_wide_address_const_t weight_data, const uint32_t weight_length);
+    // helper func for the above; indicates failure if the name is not present.
+    std::string get_name(hexagon_nn_wide_address_const_t weight_data, const uint32_t weight_length);
+    // given a set of weight_data buffers, stores them all in the appropriate map
+    void store_named_weight_bufs(const hexagon_nn_wide_address_const_t *const buffers, const uint64_t *const lengths,
+                                 const unsigned num_buffers);
+    //
+    // copy 'len' bytes of data at offset offs_bytes in the pickle into location dstp.
+    // returns true if it's possible. You can optionally pass a DMA_Manager to have it queued.
+    // offs_bytes is defined as uint64_t to support possible 'far' data on hexagon.
+    API_EXPORT bool extract_const_extent_data(uint64_t offs_bytes, size_t len, void *dstp, DMA_Manager *dma = nullptr);
+    // same, using an external const_extent
+    bool extract_const_extent_data(uint64_t offs_bytes, size_t len, void *dstp,
+                                   hexagon_nn_wide_address_const_t weight_data, const size_t weight_length);
+
+    // This extracts the 'objindex', when it is needed e.g. to 'patch' interfaces.
+    // Must be done only after deserializing, and can only be done once.
+    std::vector<void const *> extract_objindex() { return std::move(objindex); }
+
+    DeserSegDescs segments; // array of runlist_seg_descriptor, empty if not doing multiseg.
+
+    // this is used to pass the offset of the const-extent-descriptor (recorded as pickle_len)
+    // to the alloc->deserialize.
+    size_t pickle_len_words;
+
+    // OPTIONAL maps from weight buffer names to the descriptors and the buffers, respectively
+    cexdesc_deserializer_map named_cexdescs;
+    weight_buf_deserializer_map named_weight_bufs;
+
+    void *uncached_ptr;
+    uint32_t uncached_len;
+
+    std::vector<op_deserializer_fn> op_deserialize_fn_list;
+    std::vector<tensor_deserializer_fn> tensor_deserialize_fn_list;
+
+    // used to 'link' shared blocktables during deser.
+    std::vector<void *> blocktable_link_table;
+};
+
+/////////////////
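+
+// Sketch of pulling weight bytes out of a const-extent pickle (hypothetical caller;
+// 'offs_bytes'/'len' would come from the const extent descriptor table):
+//
+//     std::vector<unsigned char> dst(len);
+//     if (!deser.extract_const_extent_data(offs_bytes, len, dst.data())) {
+//         // offset/length out of range: caller must report the failure
+//     }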
+
+// true if this Deserz is really an instance of Deserializer.
+inline constexpr bool Deserz::is_base_deser() const
+{
+    return static_cast<Deserz const *>(full_deser) == this;
+}
+
+inline bool Deserz::is_aligned_const_format() const
+{
+    return full_deser->aligned_const_format_flag;
+}
+inline bool Deserz::is_shared_dynamic_tensor_shape_format() const
+{
+    return full_deser->shared_dynamic_tensor_shape;
+}
+inline Deserz::op_deserialize_fn_list_t &Deserz::get_op_deserialize_fn_list()
+{
+    return full_deser->op_deserialize_fn_list;
+}
+inline Deserz::tensor_deserialize_fn_list_t &Deserz::get_tensor_deserialize_fn_list()
+{
+    return full_deser->tensor_deserialize_fn_list;
+}
+inline std::vector<void *> &Deserz::get_blocktable_link_table()
+{
+    return full_deser->blocktable_link_table;
+}
+// For these in Deserz, we must call the corresponding methods on the
+// tensorconn in 'full_deser', but must pass 'this' as the first parameter.
+inline void Deserz::deserialize_tensor_def(Tensor const *const tensor_ptr)
+{
+    full_deser->tensorconn.tensor_def(*this, tensor_ptr);
+}
+inline void Deserz::deserialize_tensor_ref(Tensor const *&where)
+{
+    full_deser->tensorconn.tensor_ref(*this, where);
+}
+inline void Deserz::deserialize_tensor_refs(Tensor const **const ptrs, const unsigned n)
+{
+    full_deser->tensorconn.tensor_refs(*this, ptrs, n);
+}
+inline DeserSegDescs const &Deserz::get_segments() const
+{
+    return full_deser->segments;
+}
+
+// unaligned read of 64 bits (two 32-bit aligned reads)
+template <> inline uint64_t Deserz::simple_deserialize<uint64_t>()
+{
+    char const *const curr_p = bufp;
+    if (curr_p + 8u > buf_limit) {
+        return deser_u64_slowpath();
+    }
+    uint32_t const *const p = (uint32_t const *)(curr_p);
+    bufp = curr_p + 8u;
+    return p[0] + ((uint64_t)p[1] << 32);
+}
+inline uint64_t Deserz::deserialize_uint64()
+{
+    return simple_deserialize<uint64_t>();
+}
+
+template <> inline uint64_t Deserz::deserialize_type<uint64_t>()
+{
+    return deserialize_uint64();
+}
+template <> inline float Deserz::deserialize_type<float>()
+{
+    return deserialize_float();
+}
+// sometimes uint32_t is unsigned long, sometimes it's unsigned;
+// sometimes unsigned long is uint64. Hopefully this covers it all.
+#if ULONG_MAX == UINT_MAX
+template <> inline unsigned long Deserz::deserialize_type<unsigned long>()
+{
+    return deserialize_uint32();
+}
+template <> inline long Deserz::deserialize_type<long>()
+{
+    return deserialize_int32();
+}
+#endif
+template <> inline unsigned Deserz::deserialize_type<unsigned>()
+{
+    return deserialize_uint32();
+}
+template <> inline int Deserz::deserialize_type<int>()
+{
+    return deserialize_int32();
+}
+template <> inline int16_t Deserz::deserialize_type<int16_t>()
+{
+    return deserialize_int16();
+}
+template <> inline uint16_t Deserz::deserialize_type<uint16_t>()
+{
+    return deserialize_uint16();
+}
+template <> inline int8_t Deserz::deserialize_type<int8_t>()
+{
+    return deserialize_int8();
+}
+template <> inline uint8_t Deserz::deserialize_type<uint8_t>()
+{
+    return deserialize_uint8();
+}
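+
+// The deserialize_type<T> specializations above let templated code read any supported
+// scalar uniformly. A minimal sketch (hypothetical helper, not part of this header):
+//
+//     template <typename T> std::vector<T> deserialize_n(Deserz &dctx, size_t const n)
+//     {
+//         std::vector<T> v(n);
+//         for (size_t i = 0; i < n; ++i)
+//             v[i] = dctx.deserialize_type<T>();
+//         return v;
+//     }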
+
+// assert( dctx.deserialize_uint32() == SOME_CONST );
+// ... is not safe, since if you turn off asserts, it will no longer read the 4 bytes.
+// This macro allows that idiom to work: the read always happens, and only the check
+// compiles away.
+#define DESERIALIZE_ASSERT_UINT32(DCTX, VAL) \
+    do { \
+        uint32_t const tmp [[gnu::unused]] = (DCTX).deserialize_uint32(); \
+        assert(tmp == (VAL)); \
+    } while (0)
+
+#include "weak_linkage.h"
+PUSH_VISIBILITY(default)
+
+/**
+ * @brief Register the deserialization function for each \ref Op.
+ * TypicalOp and VariadicOp derived classes are instantiated via
+ * templates, hence the need to create a map of deserialize functions
+ * for each Op when they are generated at library initialization.
+ *
+ * @param[in] tinf Op type_info that is used to key the map
+ * @param[in] fn deserialize function
+ */
+API_EXPORT void deserialize_op_register(std::type_info const *tinf, const std::string_view type_tag,
+                                        const op_deserializer_fn &fn, bool is_external = false);
+/**
+ * @brief Register the deserialization function for each \ref Tensor.
+ * Since \ref Tensor derived classes are instantiated via templates, there
+ * is a need to create a map of deserialize functions for each Tensor at runtime.
+ *
+ * @param[in] type_tag Tensor type tag that is used to key the map
+ * @param[in] fn deserialize function
+ */
+API_FUNC_EXPORT void deserialize_tensor_register(std::type_info const &tinf, const char *type_tag,
+                                                 tensor_deserializer_fn fn);
+
+POP_VISIBILITY()
+
+// this is fully defined in serialize_register.h
+template <typename T> struct deserialize_tensor_using_constructor;
+
+// this is fully defined in serialize_register.h
+template <typename T> struct alloc_func_for_op;
+template <typename T> struct dealloc_func_for_op;
+
+//////////////////////
+// Forward decls of things defined in template_help.h
+//
+// contains_type< tuple<a, b, c...>, x >::value: true if x is in a, b, c...
+// no 'remove ref' etc. is done.
+template <typename TUP, typename X> struct contains_type;
+template <typename TUP, typename X> struct not_contains_type;
+template